diff --git a/MAINTAINERS b/MAINTAINERS index 4b65225d443ad783ff991cf15a8aa92641d3c167..db158767de20776f4cad9b62432fa5afabde84fd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13329,15 +13329,6 @@ S: Odd Fixes F: Documentation/devicetree/bindings/staging/iio/ F: drivers/staging/iio/ -STAGING - LUSTRE PARALLEL FILESYSTEM -M: Oleg Drokin -M: Andreas Dilger -M: James Simmons -L: lustre-devel@lists.lustre.org (moderated for non-subscribers) -W: http://wiki.lustre.org/ -S: Maintained -F: drivers/staging/lustre - STAGING - NVIDIA COMPLIANT EMBEDDED CONTROLLER INTERFACE (nvec) M: Marc Dietrich L: ac100@lists.launchpad.net (moderated for non-subscribers) diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index d5926f0d3f6cb9eb8978762bdf0d9fedffc7ff4f..1c357ef669aefdbce777d0480700e512788e7b82 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -84,8 +84,6 @@ source "drivers/staging/netlogic/Kconfig" source "drivers/staging/mt29f_spinand/Kconfig" -source "drivers/staging/lustre/Kconfig" - source "drivers/staging/dgnc/Kconfig" source "drivers/staging/gs_fpgaboot/Kconfig" diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 919753c3d3f6fa6118acb7ea7357aa2024b7dd8d..2edb9860931e42188e9b7680bf182d13443c3345 100644 --- a/drivers/staging/Makefile +++ b/drivers/staging/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_STAGING_BOARD) += board/ obj-$(CONFIG_LTE_GDM724X) += gdm724x/ obj-$(CONFIG_FIREWIRE_SERIAL) += fwserial/ obj-$(CONFIG_GOLDFISH) += goldfish/ -obj-$(CONFIG_LNET) += lustre/ obj-$(CONFIG_DGNC) += dgnc/ obj-$(CONFIG_MTD_SPINAND_MT29F) += mt29f_spinand/ obj-$(CONFIG_GS_FPGABOOT) += gs_fpgaboot/ diff --git a/drivers/staging/lustre/Kconfig b/drivers/staging/lustre/Kconfig deleted file mode 100644 index b7d81096eee9e87a533abbf42e1ecaab66b91190..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/Kconfig +++ /dev/null @@ -1,3 +0,0 @@ -source "drivers/staging/lustre/lnet/Kconfig" - -source "drivers/staging/lustre/lustre/Kconfig" diff --git a/drivers/staging/lustre/Makefile b/drivers/staging/lustre/Makefile deleted file mode 100644 index 95ffe337a80a96986abfaf2674ac2b9bf6cb2991..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-$(CONFIG_LNET) += lnet/ -obj-$(CONFIG_LUSTRE_FS) += lustre/ diff --git a/drivers/staging/lustre/README.txt b/drivers/staging/lustre/README.txt deleted file mode 100644 index 783959240490029153065a56505125f70224d3cd..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/README.txt +++ /dev/null @@ -1,83 +0,0 @@ -Lustre Parallel Filesystem Client -================================= - -The Lustre file system is an open-source, parallel file system -that supports many requirements of leadership class HPC simulation -environments. -Born from a research project at Carnegie Mellon University, -the Lustre file system is a widely-used option in HPC. -The Lustre file system provides a POSIX compliant file system interface, -can scale to thousands of clients, petabytes of storage and -hundreds of gigabytes per second of I/O bandwidth. - -Unlike shared disk storage cluster filesystems (e.g. OCFS2, GFS, GPFS), -Lustre has independent Metadata and Data servers that clients can access -in parallel to maximize performance. - -In order to use Lustre client you will need to download the "lustre-client" -package that contains the userspace tools from http://lustre.org/download/ - -You will need to install and configure your Lustre servers separately. 
- -Mount Syntax -============ -After you have installed the lustre-client tools, including the mount.lustre -binary, you can mount your Lustre filesystem with: - -mount -t lustre mgs:/fsname mnt - -where mgs is the host name or IP address of your Lustre MGS (management service) -and fsname is the name of the filesystem you would like to mount. - - -Mount Options -============= - - noflock - Disable POSIX file locking (applications trying to use - the functionality will get ENOSYS) - - localflock - Enable local flock support, using only client-local flock - (faster, for applications that require flock but do not run - on multiple nodes). - - flock - Enable cluster-global POSIX file locking coherent across all - client nodes. - - user_xattr, nouser_xattr - Support "user." extended attributes (or not) - - user_fid2path, nouser_fid2path - Enable FID to path translation by regular users (or not) - - checksum, nochecksum - Verify data consistency on the wire and in memory as it passes - between the layers (or not). - - lruresize, nolruresize - Allow the lock LRU to be controlled by memory pressure on the server - (or only 100 (default, controlled by the lru_size proc parameter) locks - per CPU per server on this client). - - lazystatfs, nolazystatfs - Do not block in statfs() if some of the servers are down. - - 32bitapi - Shrink inode numbers to fit into 32 bits. This is necessary - if you plan to re-export the Lustre filesystem from this client via - NFSv4. - - verbose, noverbose - Enable mount/umount console messages (or not) - -More Information -================ -You can get more information at the Lustre website: http://wiki.lustre.org/ - -Source for the userspace tools and out-of-tree client and server code -is available at: http://git.hpdd.intel.com/fs/lustre-release.git - -Latest binary packages: -http://lustre.org/download/
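The mount options above can also be exercised programmatically through the mount(2) syscall. This is a minimal illustrative sketch only; the MGS host "mgs01", fsname "lustre0" and mount point are made-up names, and in practice the mount(8) command shown above, which invokes the mount.lustre helper, is the normal path:

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* roughly: mount -t lustre -o flock,user_xattr mgs01:/lustre0 /mnt/lustre */
            if (mount("mgs01:/lustre0", "/mnt/lustre", "lustre", 0,
                      "flock,user_xattr") != 0) {
                    perror("mount lustre");
                    return 1;
            }
            return 0;
    }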
 diff --git a/drivers/staging/lustre/TODO b/drivers/staging/lustre/TODO deleted file mode 100644 index 5332cdb19bfaa56e249f07ee517ce3a4d72bb728..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/TODO +++ /dev/null @@ -1,302 +0,0 @@ -Currently all the work directed toward the Lustre upstream client is tracked -at the following link: - -https://jira.hpdd.intel.com/browse/LU-9679 - -Under this ticket you will see the following work items that need to be -addressed: - -****************************************************************************** -* libcfs cleanup -* -* https://jira.hpdd.intel.com/browse/LU-9859 -* -* Track all the cleanups and simplification of the libcfs module. Remove -* functions the kernel provides. Possibly integrate some of the functionality -* into the kernel proper. -* -****************************************************************************** - -https://jira.hpdd.intel.com/browse/LU-100086 - -LNET_MINOR conflicts with USERIO_MINOR - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8130 - -Fix and simplify libcfs hash handling - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8703 - -The current way we handle SMP is wrong. Platforms like ARM and KNL can have -core and NUMA setups with things like NUMA nodes with no cores. We need to -handle such cases. This work also greatly simplifies the Lustre SMP code. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9019 - -Replace the libcfs time API with standard kernel APIs. Also migrate away from -jiffies. We found that jiffies can vary across nodes, which can lead to corner -cases that break the file system due to nodes behaving inconsistently. -So move to time64_t and ktime_t as much as possible. - -****************************************************************************** -* Proper IB support for ko2iblnd -****************************************************************************** -https://jira.hpdd.intel.com/browse/LU-9179 - -Poor performance for the ko2iblnd driver. This is related to many of the -patches below that are missing from the Linux client. ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9886 - -Crash in upstream kiblnd_handle_early_rxs() ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10394 / LU-10526 / LU-10089 - -Default to using MEM_REG ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10459 - -throttle tx based on queue depth ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9943 - -correct WR fast reg accounting ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10291 - -remove concurrent_sends tunable ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10213 - -calculate qp max_send_wrs properly ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9810 - -use less CQ entries for each connection ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10129 / LU-9180 - -rework map_on_demand behavior ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10129 - -query device capabilities ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10015 - -fix race at kiblnd_connect_peer ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9983 - -allow for discontiguous fragments ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9500 - -Don't Page Align remote_addr with FastReg ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9448 - -handle empty CPTs ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9507 - -Don't Assert On Reconnect with MultiQP ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9472 - -Fix FastReg map/unmap for MLX5 ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9425 - -Turn on 2 sges by default ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8943 - -Enable Multiple OPA Endpoints between Nodes ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-5718 - -multiple sges for 
work request ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9094 - -kill timedout txs from ibp_tx_queue ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9094 - -reconnect peer for REJ_INVALID_SERVICE_ID ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8752 - -Stop MLX5 triggering a dump_cqe ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8874 - -Move ko2iblnd to latest RDMA changes ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8875 / LU-8874 - -Change to new RDMA done callback mechanism - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9164 / LU-8874 - -Incorporate RDMA map/unmap APIs into ko2iblnd - -****************************************************************************** -* sysfs/debugfs fixes -* -* https://jira.hpdd.intel.com/browse/LU-8066 -* -* The original migration to sysfs was done in haste without properly working -* utilities to test the changes. This covers the work to restore the proper -* behavior. Huge project to make this right. -* -****************************************************************************** - -https://jira.hpdd.intel.com/browse/LU-9431 - -The function class_process_proc_param was used for our mass updates of proc -tunables. It didn't work with sysfs and it was just ugly, so it was removed. -In the process the ability to mass-update thousands of clients was lost. This -work restores that in a sane way. - ------------------------------------------------------------------------------- -https://jira.hpdd.intel.com/browse/LU-9091 - -One of the major requests from users is the ability to pass parameters into a -sysfs file in various units. For example we can set max_pages_per_rpc, but its -value can vary across platforms due to different page sizes. So you should be -able to set this like max_pages_per_rpc=16MiB. The original code to handle this -was written before the string helpers were created, so it doesn't follow that -format, but it would be easy to migrate. Currently the string helpers do the -reverse of what we need: they convert bytes to a string, while we need to -convert a string to bytes (a sketch using an existing kernel helper follows -this file). - -****************************************************************************** -* Proper user land to kernel space interface for Lustre -* -* https://jira.hpdd.intel.com/browse/LU-9680 -* -****************************************************************************** - -https://jira.hpdd.intel.com/browse/LU-8915 - -Don't use the Linux list structure as a userland argument for lnet selftest. -This code is pretty poor quality and really needs to be reworked. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8834 - -The lustre ioctl LL_IOC_FUTIMES_3 is very generic. We need to either work with -other file systems with similar functionality and make a common syscall -interface, or rework our server code to do it for us automatically. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-6202 - -Clean up ioctl handling. We have many obsolete ioctls. Also the way we do -ioctls can be changed over to netlink.
This also has the benefit of working -better with HPC systems that do IO forwarding. Such systems don't handle -ioctls well. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9667 - -More cleanups by making our utilities use sysfs instead of ioctls for LNet. -It has also been requested that the remaining ioctls move to the netlink API. - -****************************************************************************** -* Misc -****************************************************************************** - ------------------------------------------------------------------------------- -https://jira.hpdd.intel.com/browse/LU-9855 - -Clean up obdclass preprocessor code. One of the major eyesores is the various -pointer redirections and macros used by the obdclass. This makes the code very -difficult to understand. Al Viro requested that this be cleaned up before -we leave staging. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9633 - -Migrate to sphinx kernel-doc style comments. Add documents in Documentation. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-6142 - -Fix any remaining coding style issues. Remove dead code. Enforce kernel code -style. Other minor misc cleanups... - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8837 - -Separate client/server functionality. Functions only used by the server can be -removed from the client. Most of this has been done, but we need an inspection -of the code to make sure. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-8964 - -Lustre client readahead/writeback control needs to better fit what the kernel -provides. This is currently being explored. We could end up replacing the CLIO -readahead abstraction with the kernel's own version. - ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9862 - -Patch that landed for LU-7890 leads to static checker errors ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-9868 - -dcache/namei fixes for lustre ------------------------------------------------------------------------------- - -https://jira.hpdd.intel.com/browse/LU-10467 - -Use the standard Linux wait_event macros (work by Neil Brown) - ------------------------------------------------------------------------------- - -Please send any patches to Greg Kroah-Hartman, Andreas Dilger, -James Simmons and Oleg Drokin.
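For the LU-9091 string-to-bytes item above, the kernel already ships a suitable helper: memparse() in lib/cmdline.c converts strings such as "16K" or "16M" into a byte count (single-letter K/M/G/T/P/E suffixes; the two-letter "MiB" spelling would still need a thin layer on top). A minimal sketch, with a made-up helper name:

    #include <linux/kernel.h>   /* memparse() */
    #include <linux/types.h>
    #include <linux/errno.h>

    /* Hypothetical helper: parse "16", "16K" or "16M" into bytes. */
    static int demo_str_to_bytes(const char *buf, u64 *bytes)
    {
            char *end;

            *bytes = memparse(buf, &end);
            if (end == buf)     /* no digits were consumed */
                    return -EINVAL;
            return 0;
    }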
 diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs.h b/drivers/staging/lustre/include/linux/libcfs/libcfs.h deleted file mode 100644 index edc7ed0dcb9420d473af62ab0f1b7f35ca8deee4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs.h +++ /dev/null @@ -1,76 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LIBCFS_LIBCFS_H__ -#define __LIBCFS_LIBCFS_H__ - -#include -#include -#include - -#include -#include -#include - -#define LIBCFS_VERSION "0.7.0" - -extern struct blocking_notifier_head libcfs_ioctl_list; -static inline int notifier_from_ioctl_errno(int err) -{ - if (err == -EINVAL) - return NOTIFY_OK; - return notifier_from_errno(err) | NOTIFY_STOP_MASK; -} - -int libcfs_setup(void); - -extern struct workqueue_struct *cfs_rehash_wq; - -void lustre_insert_debugfs(struct ctl_table *table); -int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, loff_t pos, - void __user *buffer, int len)); - -/* - * Memory - */ -#if BITS_PER_LONG == 32 -/* limit to lowmem on 32-bit systems */ -#define NUM_CACHEPAGES \ - min(totalram_pages, 1UL << (30 - PAGE_SHIFT) * 3 / 4) -#else -#define NUM_CACHEPAGES totalram_pages -#endif - -#endif /* __LIBCFS_LIBCFS_H__ */
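The notifier_from_ioctl_errno() helper above encodes a convention for handlers on the libcfs_ioctl_list chain: returning -EINVAL maps to NOTIFY_OK, meaning "not my command, keep walking the chain", while any other result (success included) carries NOTIFY_STOP_MASK and ends the chain. A sketch of a hypothetical handler following that convention; DEMO_IOC_PING and demo_do_ping() are made-up names:

    #include <linux/notifier.h>
    #include <linux/errno.h>

    static int demo_ioctl_handler(struct notifier_block *nb,
                                  unsigned long cmd, void *data)
    {
            if (cmd != DEMO_IOC_PING)
                    /* -EINVAL -> NOTIFY_OK: not ours, chain keeps walking */
                    return notifier_from_ioctl_errno(-EINVAL);

            /* 0 or any other errno -> NOTIFY_STOP_MASK: chain ends here */
            return notifier_from_ioctl_errno(demo_do_ping(data));
    }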
 diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h deleted file mode 100644 index 61641c41c492ba57f9c66f174844bd47f8269714..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_cpu.h +++ /dev/null @@ -1,434 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_cpu.h - * - * CPU partition - * . a CPU partition is a virtual processing unit - * - * . a CPU partition can present 1-N cores, or 1-N NUMA nodes; - * in other words, a CPU partition is a processor pool. - * - * CPU Partition Table (CPT) - * . a set of CPU partitions - * - * . There are two modes for CPT: CFS_CPU_MODE_NUMA and CFS_CPU_MODE_SMP - * - * . Users can specify the total number of CPU partitions while creating a - * CPT; CPU partition IDs always start from 0. - * - * Example: if there are 8 cores on the system, while creating a CPT - * with cpu_npartitions=4: - * core[0, 1] = partition[0], core[2, 3] = partition[1] - * core[4, 5] = partition[2], core[6, 7] = partition[3] - * - * cpu_npartitions=1: - * core[0, 1, ... 7] = partition[0] - * - * . Users can also specify CPU partitions by string pattern - * - * Examples: cpu_partitions="0[0,1], 1[2,3]" - * cpu_partitions="N 0[0-3], 1[4-8]" - * - * The first character "N" means the following numbers are NUMA IDs - * - * . NUMA allocators and CPU-affinity threads are built over CPU partitions, - * instead of HW CPUs or HW nodes. - * - * . By default, Lustre modules should refer to the global cfs_cpt_tab, - * instead of accessing HW CPUs directly, so the concurrency of Lustre can be - * configured by cpu_npartitions of the global cfs_cpt_tab - * - * . If cpu_npartitions=1 (all CPUs in one pool), Lustre should work the - * same way as 2.2 or earlier versions - * - * Author: liang@whamcloud.com - */ - -#ifndef __LIBCFS_CPU_H__ -#define __LIBCFS_CPU_H__ - -#include -#include -#include - -/* any CPU partition */ -#define CFS_CPT_ANY (-1) - -#ifdef CONFIG_SMP -/** virtual processing unit */ -struct cfs_cpu_partition { - /* CPUs mask for this partition */ - cpumask_var_t cpt_cpumask; - /* nodes mask for this partition */ - nodemask_t *cpt_nodemask; - /* spread rotor for NUMA allocator */ - unsigned int cpt_spread_rotor; -}; - - -/** descriptor for CPU partitions */ -struct cfs_cpt_table { - /* version, reserved for hotplug */ - unsigned int ctb_version; - /* spread rotor for NUMA allocator */ - unsigned int ctb_spread_rotor; - /* # of CPU partitions */ - unsigned int ctb_nparts; - /* partitions tables */ - struct cfs_cpu_partition *ctb_parts; - /* shadow HW CPU to CPU partition ID */ - int *ctb_cpu2cpt; - /* all cpus in this partition table */ - cpumask_var_t ctb_cpumask; - /* all nodes in this partition table */ - nodemask_t *ctb_nodemask; -}; - -extern struct cfs_cpt_table *cfs_cpt_tab; - -/** - * return cpumask of CPU partition \a cpt - */ -cpumask_var_t *cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt); -/** - * print string information of cpt-table - */ -int cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len); -/** - * return total number of CPU partitions in \a cptab - */ -int -cfs_cpt_number(struct cfs_cpt_table *cptab); -/** - * return number of HW cores or hyper-threadings in a CPU partition \a cpt - */ -int cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt); -/** - * is there any online CPU in CPU partition \a cpt - */ -int cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt); -/** - * return nodemask of CPU partition \a cpt - */ -nodemask_t *cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt); -/** - * shadow current HW processor ID to CPU-partition ID of \a cptab - */ -int cfs_cpt_current(struct cfs_cpt_table *cptab, int remap); -/** - * shadow HW processor ID \a cpu to CPU-partition ID by \a cptab - */ -int cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu); -/** - * bind current thread on a CPU-partition \a cpt of \a cptab - */ -int cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt); -/** - * add \a cpu to CPU partition \a cpt of \a cptab, return 1 for success, - * otherwise 0 is returned - */ -int cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu); -/** - * remove \a cpu from CPU partition \a cpt of \a cptab - */ -void cfs_cpt_unset_cpu(struct 
cfs_cpt_table *cptab, int cpt, int cpu); -/** - * add all cpus in \a mask to CPU partition \a cpt - * return 1 if successfully set all CPUs, otherwise return 0 - */ -int cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, - int cpt, cpumask_t *mask); -/** - * remove all cpus in \a mask from CPU partition \a cpt - */ -void cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, - int cpt, cpumask_t *mask); -/** - * add all cpus in NUMA node \a node to CPU partition \a cpt - * return 1 if successfully set all CPUs, otherwise return 0 - */ -int cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node); -/** - * remove all cpus in NUMA node \a node from CPU partition \a cpt - */ -void cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node); - -/** - * add all cpus in node mask \a mask to CPU partition \a cpt - * return 1 if successfully set all CPUs, otherwise return 0 - */ -int cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, - int cpt, nodemask_t *mask); -/** - * remove all cpus in node mask \a mask from CPU partition \a cpt - */ -void cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, - int cpt, nodemask_t *mask); -/** - * unset all cpus for CPU partition \a cpt - */ -void cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt); -/** - * convert partition id \a cpt to numa node id, if there are more than one - * nodes in this partition, it might return a different node id each time. - */ -int cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt); - -/** - * return number of HTs in the same core of \a cpu - */ -int cfs_cpu_ht_nsiblings(int cpu); - -int cfs_cpu_init(void); -void cfs_cpu_fini(void); - -#else /* !CONFIG_SMP */ -struct cfs_cpt_table; -#define cfs_cpt_tab ((struct cfs_cpt_table *)NULL) - -static inline cpumask_var_t * -cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) -{ - return NULL; -} - -static inline int -cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) -{ - return 0; -} -static inline int -cfs_cpt_number(struct cfs_cpt_table *cptab) -{ - return 1; -} - -static inline int -cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) -{ - return 1; -} - -static inline int -cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) -{ - return 1; -} - -static inline nodemask_t * -cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) -{ - return NULL; -} - -static inline int -cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - return 1; -} - -static inline void -cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ -} - -static inline int -cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) -{ - return 1; -} - -static inline void -cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) -{ -} - -static inline int -cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - return 1; -} - -static inline void -cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ -} - -static inline int -cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) -{ - return 1; -} - -static inline void -cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) -{ -} - -static inline void -cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) -{ -} - -static inline int -cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) -{ - return 0; -} - -static inline int -cfs_cpu_ht_nsiblings(int cpu) -{ - return 1; -} - -static inline int -cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) -{ - return 0; -} - -static inline int 
-cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) -{ - return 0; -} - -static inline int -cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) -{ - return 0; -} - -static inline int -cfs_cpu_init(void) -{ - return 0; -} - -static inline void cfs_cpu_fini(void) -{ -} - -#endif /* CONFIG_SMP */ - -/** - * destroy a CPU partition table - */ -void cfs_cpt_table_free(struct cfs_cpt_table *cptab); -/** - * create a cfs_cpt_table with \a ncpt number of partitions - */ -struct cfs_cpt_table *cfs_cpt_table_alloc(unsigned int ncpt); - -/* - * allocate per-cpu-partition data, returned value is an array of pointers, - * variable can be indexed by CPU ID. - * cptab != NULL: size of array is number of CPU partitions - * cptab == NULL: size of array is number of HW cores - */ -void *cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size); -/* - * destroy per-cpu-partition variable - */ -void cfs_percpt_free(void *vars); -int cfs_percpt_number(void *vars); - -#define cfs_percpt_for_each(var, i, vars) \ - for (i = 0; i < cfs_percpt_number(vars) && \ - ((var) = (vars)[i]) != NULL; i++) - -/* - * percpu partition lock - * - * There are some use-cases like this in Lustre: - * . each CPU partition has its own private data which is frequently changed, - * and mostly by the local CPU partition. - * . all CPU partitions share some global data, these data are rarely changed. - * - * LNet is a typical example. - * CPU partition lock is designed for this kind of use-case: - * . each CPU partition has its own private lock - * . change on private data just needs to take the private lock - * . read on shared data just needs to take _any_ of private locks - * . change on shared data needs to take _all_ private locks, - * which is slow and should be really rare. - */ -enum { - CFS_PERCPT_LOCK_EX = -1, /* negative */ -}; - -struct cfs_percpt_lock { - /* cpu-partition-table for this lock */ - struct cfs_cpt_table *pcl_cptab; - /* exclusively locked */ - unsigned int pcl_locked; - /* private lock table */ - spinlock_t **pcl_locks; -}; - -/* return number of private locks */ -#define cfs_percpt_lock_num(pcl) cfs_cpt_number(pcl->pcl_cptab) - -/* - * create a cpu-partition lock based on CPU partition table \a cptab, - * \a keys is an optional array of lockdep class keys for the private locks - */ -struct cfs_percpt_lock *cfs_percpt_lock_create(struct cfs_cpt_table *cptab, - struct lock_class_key *keys); -/* destroy a cpu-partition lock */ -void cfs_percpt_lock_free(struct cfs_percpt_lock *pcl); - -/* lock private lock \a index of \a pcl */ -void cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index); - -/* unlock private lock \a index of \a pcl */ -void cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index); - -#define CFS_PERCPT_LOCK_KEYS 256 - -/* NB: don't allocate keys dynamically, lockdep needs them to be in ".data" */ -#define cfs_percpt_lock_alloc(cptab) \ -({ \ - static struct lock_class_key ___keys[CFS_PERCPT_LOCK_KEYS]; \ - struct cfs_percpt_lock *___lk; \ - \ - if (cfs_cpt_number(cptab) > CFS_PERCPT_LOCK_KEYS) \ - ___lk = cfs_percpt_lock_create(cptab, NULL); \ - else \ - ___lk = cfs_percpt_lock_create(cptab, ___keys); \ - ___lk; \ -}) - -/** - * iterate over all CPU partitions in \a cptab - */ -#define cfs_cpt_for_each(i, cptab) \ - for (i = 0; i < cfs_cpt_number(cptab); i++) - -#endif /* __LIBCFS_CPU_H__ */
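The read-any/write-all rule described in the percpu partition lock comment above looks like this in use. A minimal sketch with made-up per-partition counters and error handling omitted; only the locking pattern is taken from the comment:

    static void demo_percpt_pattern(struct cfs_cpt_table *cptab)
    {
            struct cfs_percpt_lock *pcl = cfs_percpt_lock_alloc(cptab);
            u64 **counters = cfs_percpt_alloc(cptab, sizeof(u64));
            int cpt = cfs_cpt_current(cptab, 1);

            /* frequent, partition-local change: take only the private lock */
            cfs_percpt_lock(pcl, cpt);
            (*counters[cpt])++;
            cfs_percpt_unlock(pcl, cpt);

            /* rare change to shared state: takes every private lock */
            cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
            /* ... update data shared by all partitions ... */
            cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);

            cfs_percpt_free(counters);
            cfs_percpt_lock_free(pcl);
    }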
 diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h deleted file mode 100644 index 176fae7319e37ee7890982ad98f7157f9a649f7d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_crypto.h +++ /dev/null @@ -1,208 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - -/* - * Copyright 2012 Xyratex Technology Limited - */ - -#ifndef _LIBCFS_CRYPTO_H -#define _LIBCFS_CRYPTO_H - -#include -struct page; - -struct cfs_crypto_hash_type { - char *cht_name; /**< hash algorithm name, equal to - * format name for crypto api - */ - unsigned int cht_key; /**< init key by default (valid for - * 4-byte context like crc32, adler) - */ - unsigned int cht_size; /**< hash digest size */ -}; - -enum cfs_crypto_hash_alg { - CFS_HASH_ALG_NULL = 0, - CFS_HASH_ALG_ADLER32, - CFS_HASH_ALG_CRC32, - CFS_HASH_ALG_MD5, - CFS_HASH_ALG_SHA1, - CFS_HASH_ALG_SHA256, - CFS_HASH_ALG_SHA384, - CFS_HASH_ALG_SHA512, - CFS_HASH_ALG_CRC32C, - CFS_HASH_ALG_MAX, - CFS_HASH_ALG_UNKNOWN = 0xff -}; - -static struct cfs_crypto_hash_type hash_types[] = { - [CFS_HASH_ALG_NULL] = { - .cht_name = "null", - .cht_key = 0, - .cht_size = 0 - }, - [CFS_HASH_ALG_ADLER32] = { - .cht_name = "adler32", - .cht_key = 1, - .cht_size = 4 - }, - [CFS_HASH_ALG_CRC32] = { - .cht_name = "crc32", - .cht_key = ~0, - .cht_size = 4 - }, - [CFS_HASH_ALG_CRC32C] = { - .cht_name = "crc32c", - .cht_key = ~0, - .cht_size = 4 - }, - [CFS_HASH_ALG_MD5] = { - .cht_name = "md5", - .cht_key = 0, - .cht_size = 16 - }, - [CFS_HASH_ALG_SHA1] = { - .cht_name = "sha1", - .cht_key = 0, - .cht_size = 20 - }, - [CFS_HASH_ALG_SHA256] = { - .cht_name = "sha256", - .cht_key = 0, - .cht_size = 32 - }, - [CFS_HASH_ALG_SHA384] = { - .cht_name = "sha384", - .cht_key = 0, - .cht_size = 48 - }, - [CFS_HASH_ALG_SHA512] = { - .cht_name = "sha512", - .cht_key = 0, - .cht_size = 64 - }, - [CFS_HASH_ALG_MAX] = { - .cht_name = NULL, - .cht_key = 0, - .cht_size = 64 - }, -}; - -/* Maximum size of hash_types[].cht_size */ -#define CFS_CRYPTO_HASH_DIGESTSIZE_MAX 64 - -/** - * Return hash algorithm information for the specified algorithm identifier - * - * Hash information includes algorithm name, initial seed, hash size. 
 - * - * \retval cfs_crypto_hash_type for valid ID (CFS_HASH_ALG_*) - * \retval NULL for unknown algorithm identifier - */ -static inline const struct cfs_crypto_hash_type * -cfs_crypto_hash_type(enum cfs_crypto_hash_alg hash_alg) -{ - struct cfs_crypto_hash_type *ht; - - if (hash_alg < CFS_HASH_ALG_MAX) { - ht = &hash_types[hash_alg]; - if (ht->cht_name) - return ht; - } - return NULL; -} - -/** - * Return hash name for hash algorithm identifier - * - * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) - * - * \retval string name of known hash algorithm - * \retval "unknown" if hash algorithm is unknown - */ -static inline const char * -cfs_crypto_hash_name(enum cfs_crypto_hash_alg hash_alg) -{ - const struct cfs_crypto_hash_type *ht; - - ht = cfs_crypto_hash_type(hash_alg); - if (ht) - return ht->cht_name; - return "unknown"; -} - -/** - * Return digest size for hash algorithm type - * - * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) - * - * \retval hash algorithm digest size in bytes - * \retval 0 if hash algorithm type is unknown - */ -static inline int cfs_crypto_hash_digestsize(enum cfs_crypto_hash_alg hash_alg) -{ - const struct cfs_crypto_hash_type *ht; - - ht = cfs_crypto_hash_type(hash_alg); - if (ht) - return ht->cht_size; - return 0; -} - -/** - * Find hash algorithm ID for the specified algorithm name - * - * \retval hash algorithm ID for valid ID (CFS_HASH_ALG_*) - * \retval CFS_HASH_ALG_UNKNOWN for unknown algorithm name - */ -static inline unsigned char cfs_crypto_hash_alg(const char *algname) -{ - enum cfs_crypto_hash_alg hash_alg; - - for (hash_alg = 0; hash_alg < CFS_HASH_ALG_MAX; hash_alg++) - if (!strcmp(hash_types[hash_alg].cht_name, algname)) - return hash_alg; - - return CFS_HASH_ALG_UNKNOWN; -} - -int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, - const void *buf, unsigned int buf_len, - unsigned char *key, unsigned int key_len, - unsigned char *hash, unsigned int *hash_len); - -struct ahash_request * -cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, - unsigned char *key, unsigned int key_len); -int cfs_crypto_hash_update_page(struct ahash_request *desc, - struct page *page, unsigned int offset, - unsigned int len); -int cfs_crypto_hash_update(struct ahash_request *desc, const void *buf, - unsigned int buf_len); -int cfs_crypto_hash_final(struct ahash_request *desc, - unsigned char *hash, unsigned int *hash_len); -int cfs_crypto_register(void); -void cfs_crypto_unregister(void); -int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg); -#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h deleted file mode 100644 index 17534a76362ae5e4f70c1273beed4025c26ebdaf..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_debug.h +++ /dev/null @@ -1,207 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_debug.h - * - * Debug messages and assertions - * - */ - -#ifndef __LIBCFS_DEBUG_H__ -#define __LIBCFS_DEBUG_H__ - -#include -#include - -/* - * Debugging - */ -extern unsigned int libcfs_subsystem_debug; -extern unsigned int libcfs_stack; -extern unsigned int libcfs_debug; -extern unsigned int libcfs_printk; -extern unsigned int libcfs_console_ratelimit; -extern unsigned int libcfs_console_max_delay; -extern unsigned int libcfs_console_min_delay; -extern unsigned int libcfs_console_backoff; -extern unsigned int libcfs_debug_binary; -extern char libcfs_debug_file_path_arr[PATH_MAX]; - -int libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys); -int libcfs_debug_str2mask(int *mask, const char *str, int is_subsys); - -/* Has there been an LBUG? */ -extern unsigned int libcfs_catastrophe; -extern unsigned int libcfs_panic_on_lbug; - -/* Enable debug-checks on stack size - except on x86_64 */ -#if !defined(__x86_64__) -# ifdef __ia64__ -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_dwarf_cfa() & \ - (THREAD_SIZE - 1))) -# else -# define CDEBUG_STACK() (THREAD_SIZE - \ - ((unsigned long)__builtin_frame_address(0) & \ - (THREAD_SIZE - 1))) -# endif /* __ia64__ */ - -#define __CHECK_STACK(msgdata, mask, cdls) \ -do { \ - if (unlikely(CDEBUG_STACK() > libcfs_stack)) { \ - LIBCFS_DEBUG_MSG_DATA_INIT(msgdata, D_WARNING, NULL); \ - libcfs_stack = CDEBUG_STACK(); \ - libcfs_debug_msg(msgdata, \ - "maximum lustre stack %lu\n", \ - CDEBUG_STACK()); \ - (msgdata)->msg_mask = mask; \ - (msgdata)->msg_cdls = cdls; \ - dump_stack(); \ - /*panic("LBUG");*/ \ - } \ -} while (0) -#define CFS_CHECK_STACK(msgdata, mask, cdls) __CHECK_STACK(msgdata, mask, cdls) -#else /* __x86_64__ */ -#define CFS_CHECK_STACK(msgdata, mask, cdls) do {} while (0) -#define CDEBUG_STACK() (0L) -#endif /* __x86_64__ */ - -#ifndef DEBUG_SUBSYSTEM -# define DEBUG_SUBSYSTEM S_UNDEFINED -#endif - -#define CDEBUG_DEFAULT_MAX_DELAY (600 * HZ) /* jiffies */ -#define CDEBUG_DEFAULT_MIN_DELAY ((HZ + 1) / 2) /* jiffies */ -#define CDEBUG_DEFAULT_BACKOFF 2 -struct cfs_debug_limit_state { - unsigned long cdls_next; - unsigned int cdls_delay; - int cdls_count; -}; - -struct libcfs_debug_msg_data { - const char *msg_file; - const char *msg_fn; - int msg_subsys; - int msg_line; - int msg_mask; - struct cfs_debug_limit_state *msg_cdls; -}; - -#define LIBCFS_DEBUG_MSG_DATA_INIT(data, mask, cdls) \ -do { \ - (data)->msg_subsys = DEBUG_SUBSYSTEM; \ - (data)->msg_file = __FILE__; \ - (data)->msg_fn = __func__; \ - (data)->msg_line = __LINE__; \ - (data)->msg_cdls = (cdls); \ - (data)->msg_mask = (mask); \ -} while (0) - -#define LIBCFS_DEBUG_MSG_DATA_DECL(dataname, mask, cdls) \ - static struct libcfs_debug_msg_data dataname = { \ - .msg_subsys = DEBUG_SUBSYSTEM, \ - .msg_file = __FILE__, \ - .msg_fn = __func__, \ - .msg_line = __LINE__, \ - .msg_cdls = (cdls) }; \ - 
dataname.msg_mask = (mask) - -/** - * Filters out logging messages based on mask and subsystem. - */ -static inline int cfs_cdebug_show(unsigned int mask, unsigned int subsystem) -{ - return mask & D_CANTMASK || - ((libcfs_debug & mask) && (libcfs_subsystem_debug & subsystem)); -} - -#define __CDEBUG(cdls, mask, format, ...) \ -do { \ - static struct libcfs_debug_msg_data msgdata; \ - \ - CFS_CHECK_STACK(&msgdata, mask, cdls); \ - \ - if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ - LIBCFS_DEBUG_MSG_DATA_INIT(&msgdata, mask, cdls); \ - libcfs_debug_msg(&msgdata, format, ## __VA_ARGS__); \ - } \ -} while (0) - -#define CDEBUG(mask, format, ...) __CDEBUG(NULL, mask, format, ## __VA_ARGS__) - -#define CDEBUG_LIMIT(mask, format, ...) \ -do { \ - static struct cfs_debug_limit_state cdls; \ - \ - __CDEBUG(&cdls, mask, format, ## __VA_ARGS__); \ -} while (0) - -/* - * Lustre Error Checksum: calculates checksum - * of Hex number by XORing the nybbles. - */ -#define LERRCHKSUM(hexnum) (((hexnum) & 0xf) ^ ((hexnum) >> 4 & 0xf) ^ \ - ((hexnum) >> 8 & 0xf)) - -#define CWARN(format, ...) CDEBUG_LIMIT(D_WARNING, format, ## __VA_ARGS__) -#define CERROR(format, ...) CDEBUG_LIMIT(D_ERROR, format, ## __VA_ARGS__) -#define CNETERR(format, a...) CDEBUG_LIMIT(D_NETERROR, format, ## a) -#define CEMERG(format, ...) CDEBUG_LIMIT(D_EMERG, format, ## __VA_ARGS__) - -#define LCONSOLE(mask, format, ...) CDEBUG(D_CONSOLE | (mask), format, ## __VA_ARGS__) -#define LCONSOLE_INFO(format, ...) CDEBUG_LIMIT(D_CONSOLE, format, ## __VA_ARGS__) -#define LCONSOLE_WARN(format, ...) CDEBUG_LIMIT(D_CONSOLE | D_WARNING, format, ## __VA_ARGS__) -#define LCONSOLE_ERROR_MSG(errnum, format, ...) CDEBUG_LIMIT(D_CONSOLE | D_ERROR, \ - "%x-%x: " format, errnum, LERRCHKSUM(errnum), ## __VA_ARGS__) -#define LCONSOLE_ERROR(format, ...) LCONSOLE_ERROR_MSG(0x00, format, ## __VA_ARGS__) - -#define LCONSOLE_EMERG(format, ...) CDEBUG(D_CONSOLE | D_EMERG, format, ## __VA_ARGS__) - -int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, - const char *format1, ...) - __printf(2, 3); - -int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, - const char *format1, - va_list args, const char *format2, ...) - __printf(4, 5); - -/* other external symbols that tracefile provides: */ -int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, - const char __user *usr_buffer, int usr_buffer_nob); -int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, - const char *knl_buffer, char *append); - -#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" - -#endif /* __LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h deleted file mode 100644 index 8074e390b4d12b6ce1ee147475500eb28817a7fe..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_fail.h +++ /dev/null @@ -1,194 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Oracle Corporation, Inc. - */ - -#ifndef _LIBCFS_FAIL_H -#define _LIBCFS_FAIL_H - -#include -#include - -extern unsigned long cfs_fail_loc; -extern unsigned int cfs_fail_val; -extern int cfs_fail_err; - -extern wait_queue_head_t cfs_race_waitq; -extern int cfs_race_state; - -int __cfs_fail_check_set(u32 id, u32 value, int set); -int __cfs_fail_timeout_set(u32 id, u32 value, int ms, int set); - -enum { - CFS_FAIL_LOC_NOSET = 0, - CFS_FAIL_LOC_ORSET = 1, - CFS_FAIL_LOC_RESET = 2, - CFS_FAIL_LOC_VALUE = 3 -}; - -/* Failure injection control */ -#define CFS_FAIL_MASK_SYS 0x0000FF00 -#define CFS_FAIL_MASK_LOC (0x000000FF | CFS_FAIL_MASK_SYS) - -#define CFS_FAILED_BIT 30 -/* CFS_FAILED is 0x40000000 */ -#define CFS_FAILED BIT(CFS_FAILED_BIT) - -#define CFS_FAIL_ONCE_BIT 31 -/* CFS_FAIL_ONCE is 0x80000000 */ -#define CFS_FAIL_ONCE BIT(CFS_FAIL_ONCE_BIT) - -/* The following flags aren't made to be combined */ -#define CFS_FAIL_SKIP 0x20000000 /* skip N times then fail */ -#define CFS_FAIL_SOME 0x10000000 /* only fail N times */ -#define CFS_FAIL_RAND 0x08000000 /* fail 1/N of the times */ -#define CFS_FAIL_USR1 0x04000000 /* user flag */ - -#define CFS_FAULT 0x02000000 /* match any CFS_FAULT_CHECK */ - -static inline bool CFS_FAIL_PRECHECK(u32 id) -{ - return cfs_fail_loc && - ((cfs_fail_loc & CFS_FAIL_MASK_LOC) == (id & CFS_FAIL_MASK_LOC) || - (cfs_fail_loc & id & CFS_FAULT)); -} - -static inline int cfs_fail_check_set(u32 id, u32 value, - int set, int quiet) -{ - int ret = 0; - - if (unlikely(CFS_FAIL_PRECHECK(id))) { - ret = __cfs_fail_check_set(id, value, set); - if (ret) { - if (quiet) { - CDEBUG(D_INFO, "*** cfs_fail_loc=%x, val=%u***\n", - id, value); - } else { - LCONSOLE_INFO("*** cfs_fail_loc=%x, val=%u***\n", - id, value); - } - } - } - - return ret; -} - -/* If id hit cfs_fail_loc, return 1, otherwise return 0 */ -#define CFS_FAIL_CHECK(id) \ - cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 0) -#define CFS_FAIL_CHECK_QUIET(id) \ - cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET, 1) - -/* - * If id hit cfs_fail_loc and cfs_fail_val == (-1 or value) return 1, - * otherwise return 0 - */ -#define CFS_FAIL_CHECK_VALUE(id, value) \ - cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 0) -#define CFS_FAIL_CHECK_VALUE_QUIET(id, value) \ - cfs_fail_check_set(id, value, CFS_FAIL_LOC_VALUE, 1) - -/* - * If id hit cfs_fail_loc, cfs_fail_loc |= value and return 1, - * otherwise return 0 - */ -#define CFS_FAIL_CHECK_ORSET(id, value) \ - cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 0) -#define CFS_FAIL_CHECK_ORSET_QUIET(id, value) \ - cfs_fail_check_set(id, value, CFS_FAIL_LOC_ORSET, 1) - -/* - * If id hit cfs_fail_loc, cfs_fail_loc = value and return 1, - * otherwise return 0 - */ -#define CFS_FAIL_CHECK_RESET(id, value) \ - cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 0) -#define CFS_FAIL_CHECK_RESET_QUIET(id, value) \ - cfs_fail_check_set(id, value, CFS_FAIL_LOC_RESET, 1) - -static inline int 
cfs_fail_timeout_set(u32 id, u32 value, int ms, int set) -{ - if (unlikely(CFS_FAIL_PRECHECK(id))) - return __cfs_fail_timeout_set(id, value, ms, set); - return 0; -} - -/* If id hit cfs_fail_loc, sleep for seconds or milliseconds */ -#define CFS_FAIL_TIMEOUT(id, secs) \ - cfs_fail_timeout_set(id, 0, (secs) * 1000, CFS_FAIL_LOC_NOSET) - -#define CFS_FAIL_TIMEOUT_MS(id, ms) \ - cfs_fail_timeout_set(id, 0, ms, CFS_FAIL_LOC_NOSET) - -/* - * If id hit cfs_fail_loc, cfs_fail_loc |= value and - * sleep seconds or milliseconds - */ -#define CFS_FAIL_TIMEOUT_ORSET(id, value, secs) \ - cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_ORSET) - -#define CFS_FAIL_TIMEOUT_RESET(id, value, secs) \ - cfs_fail_timeout_set(id, value, (secs) * 1000, CFS_FAIL_LOC_RESET) - -#define CFS_FAIL_TIMEOUT_MS_ORSET(id, value, ms) \ - cfs_fail_timeout_set(id, value, ms, CFS_FAIL_LOC_ORSET) - -#define CFS_FAULT_CHECK(id) \ - CFS_FAIL_CHECK(CFS_FAULT | (id)) - -/* - * The idea here is to synchronise two threads to force a race. The - * first thread that calls this with a matching fail_loc is put to - * sleep. The next thread that calls with the same fail_loc wakes up - * the first and continues. - */ -static inline void cfs_race(u32 id) -{ - if (CFS_FAIL_PRECHECK(id)) { - if (unlikely(__cfs_fail_check_set(id, 0, CFS_FAIL_LOC_NOSET))) { - int rc; - - cfs_race_state = 0; - CERROR("cfs_race id %x sleeping\n", id); - rc = wait_event_interruptible(cfs_race_waitq, - !!cfs_race_state); - CERROR("cfs_fail_race id %x awake, rc=%d\n", id, rc); - } else { - CERROR("cfs_fail_race id %x waking\n", id); - cfs_race_state = 1; - wake_up(&cfs_race_waitq); - } - } -} - -#define CFS_RACE(id) cfs_race(id) - -#endif /* _LIBCFS_FAIL_H */ diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h deleted file mode 100644 index be315958a4b308c42d0b58fe8b1edff4e35fc627..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_hash.h +++ /dev/null @@ -1,869 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_hash.h - * - * Hashing routines - * - */ - -#ifndef __LIBCFS_HASH_H__ -#define __LIBCFS_HASH_H__ - -#include -#include -#include -#include - -/* - * Knuth recommends primes in approximately golden ratio to the maximum - * integer representable by a machine word for multiplicative hashing. 
- * Chuck Lever verified the effectiveness of this technique: - * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf - * - * These primes are chosen to be bit-sparse, that is, operations on - * them can use shifts and additions instead of multiplications for - * machines where multiplications are slow. - */ -/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ -#define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL - -/** disable debug */ -#define CFS_HASH_DEBUG_NONE 0 -/* - * record hash depth and output to console when it's too deep, - * computing overhead is low but consumes more memory - */ -#define CFS_HASH_DEBUG_1 1 -/** expensive, check key validation */ -#define CFS_HASH_DEBUG_2 2 - -#define CFS_HASH_DEBUG_LEVEL CFS_HASH_DEBUG_NONE - -struct cfs_hash_ops; -struct cfs_hash_lock_ops; -struct cfs_hash_hlist_ops; - -union cfs_hash_lock { - rwlock_t rw; /**< rwlock */ - spinlock_t spin; /**< spinlock */ -}; - -/** - * cfs_hash_bucket is a container of: - * - lock, counter ... - * - array of hash-head starting from hsb_head[0], hash-head can be one of - * . struct cfs_hash_head - * . struct cfs_hash_head_dep - * . struct cfs_hash_dhead - * . struct cfs_hash_dhead_dep - * which depends on the requirements of the user - * - some extra bytes (caller can require it while creating hash) - */ -struct cfs_hash_bucket { - union cfs_hash_lock hsb_lock; /**< bucket lock */ - u32 hsb_count; /**< current entries */ - u32 hsb_version; /**< change version */ - unsigned int hsb_index; /**< index of bucket */ - int hsb_depmax; /**< max depth on bucket */ - long hsb_head[0]; /**< hash-head array */ -}; - -/** - * cfs_hash bucket descriptor, it's normally on the stack of the caller - */ -struct cfs_hash_bd { - /* address of bucket */ - struct cfs_hash_bucket *bd_bucket; - /* offset in bucket */ - unsigned int bd_offset; -}; - -#define CFS_HASH_NAME_LEN 16 /**< default name length */ -#define CFS_HASH_BIGNAME_LEN 64 /**< bigname for param tree */ - -#define CFS_HASH_BKT_BITS 3 /**< default bits of bucket */ -#define CFS_HASH_BITS_MAX 30 /**< max bits of bucket */ -#define CFS_HASH_BITS_MIN CFS_HASH_BKT_BITS - -/** - * common hash attributes. - */ -enum cfs_hash_tag { - /** - * don't need any lock, caller will protect operations with its - * own lock. With this flag: - * . CFS_HASH_NO_BKTLOCK, CFS_HASH_RW_BKTLOCK, CFS_HASH_SPIN_BKTLOCK - * will be ignored. - * . 
Some functions will be disabled with this flag, e.g.: - * cfs_hash_for_each_empty, cfs_hash_rehash - */ - CFS_HASH_NO_LOCK = BIT(0), - /** no bucket lock, use one spinlock to protect the whole hash */ - CFS_HASH_NO_BKTLOCK = BIT(1), - /** rwlock to protect bucket */ - CFS_HASH_RW_BKTLOCK = BIT(2), - /** spinlock to protect bucket */ - CFS_HASH_SPIN_BKTLOCK = BIT(3), - /** always add new item to tail */ - CFS_HASH_ADD_TAIL = BIT(4), - /** hash-table doesn't have refcount on item */ - CFS_HASH_NO_ITEMREF = BIT(5), - /** big name for param-tree */ - CFS_HASH_BIGNAME = BIT(6), - /** track global count */ - CFS_HASH_COUNTER = BIT(7), - /** rehash item by new key */ - CFS_HASH_REHASH_KEY = BIT(8), - /** Enable dynamic hash resizing */ - CFS_HASH_REHASH = BIT(9), - /** can shrink hash-size */ - CFS_HASH_SHRINK = BIT(10), - /** assert hash is empty on exit */ - CFS_HASH_ASSERT_EMPTY = BIT(11), - /** record hlist depth */ - CFS_HASH_DEPTH = BIT(12), - /** - * rehash is always scheduled in a different thread, so current - * change on hash table is non-blocking - */ - CFS_HASH_NBLK_CHANGE = BIT(13), - /** - * NB, we typed hs_flags as u16, please change it - * if you need to extend >=16 flags - */ -}; - -/** most used attributes */ -#define CFS_HASH_DEFAULT (CFS_HASH_RW_BKTLOCK | \ - CFS_HASH_COUNTER | CFS_HASH_REHASH) - -/** - * cfs_hash is a general-purpose hash-table implementation. It can support: - * . two refcount modes - * hash-table with & without refcount - * . four lock modes - * nolock, one-spinlock, rw-bucket-lock, spin-bucket-lock - * . general operations - * lookup, add(add_tail or add_head), delete - * . rehash - * grows or shrinks - * . iteration - * locked iteration and unlocked iteration - * . bigname - * support long name hash - * . debug - * trace max searching depth - * - * Rehash: - * When the htable grows or shrinks, a separate task (cfs_hash_rehash_worker) - * is spawned to handle the rehash in the background. It's possible that other - * processes can concurrently perform additions, deletions, and lookups - * without being blocked on rehash completion, because rehash will release - * the global wrlock for each bucket. - * - * rehash and iteration can't run at the same time because it's too tricky - * to keep both of them safe and correct. - * As they are relatively rare operations: - * . if iteration is in progress while we try to launch rehash, then - * it just gives up; the iterator will launch the rehash at the end. - * . if rehash is in progress while we try to iterate the hash table, - * then we just wait (it shouldn't take long); in any case, nobody - * should expect iteration of the whole hash-table to be non-blocking. - * - * During rehashing, a (key,object) pair may be in one of two buckets, - * depending on whether the worker task has yet to transfer the object - * to its new location in the table. Lookups and deletions need to search both - * locations; additions must take care to only insert into the new bucket. 
- */ - -struct cfs_hash { - /** - * serialize with rehash, or serialize all operations if - * the hash-table has CFS_HASH_NO_BKTLOCK - */ - union cfs_hash_lock hs_lock; - /** hash operations */ - struct cfs_hash_ops *hs_ops; - /** hash lock operations */ - struct cfs_hash_lock_ops *hs_lops; - /** hash list operations */ - struct cfs_hash_hlist_ops *hs_hops; - /** hash buckets-table */ - struct cfs_hash_bucket **hs_buckets; - /** total number of items on this hash-table */ - atomic_t hs_count; - /** hash flags, see cfs_hash_tag for detail */ - u16 hs_flags; - /** # of extra-bytes for bucket, for user saving extended attributes */ - u16 hs_extra_bytes; - /** wants to iterate */ - u8 hs_iterating; - /** hash-table is dying */ - u8 hs_exiting; - /** current hash bits */ - u8 hs_cur_bits; - /** min hash bits */ - u8 hs_min_bits; - /** max hash bits */ - u8 hs_max_bits; - /** bits for rehash */ - u8 hs_rehash_bits; - /** bits for each bucket */ - u8 hs_bkt_bits; - /** resize min threshold */ - u16 hs_min_theta; - /** resize max threshold */ - u16 hs_max_theta; - /** resize count */ - u32 hs_rehash_count; - /** # of iterators (caller of cfs_hash_for_each_*) */ - u32 hs_iterators; - /** rehash workitem */ - struct work_struct hs_rehash_work; - /** refcount on this hash table */ - atomic_t hs_refcount; - /** rehash buckets-table */ - struct cfs_hash_bucket **hs_rehash_buckets; -#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 - /** serialize debug members */ - spinlock_t hs_dep_lock; - /** max depth */ - unsigned int hs_dep_max; - /** id of the deepest bucket */ - unsigned int hs_dep_bkt; - /** offset in the deepest bucket */ - unsigned int hs_dep_off; - /** bits when we found the max depth */ - unsigned int hs_dep_bits; - /** workitem to output max depth */ - struct work_struct hs_dep_work; -#endif - /** name of htable */ - char hs_name[0]; -}; - -struct cfs_hash_lock_ops { - /** lock the hash table */ - void (*hs_lock)(union cfs_hash_lock *lock, int exclusive); - /** unlock the hash table */ - void (*hs_unlock)(union cfs_hash_lock *lock, int exclusive); - /** lock the hash bucket */ - void (*hs_bkt_lock)(union cfs_hash_lock *lock, int exclusive); - /** unlock the hash bucket */ - void (*hs_bkt_unlock)(union cfs_hash_lock *lock, int exclusive); -}; - -struct cfs_hash_hlist_ops { - /** return hlist_head of hash-head of @bd */ - struct hlist_head *(*hop_hhead)(struct cfs_hash *hs, - struct cfs_hash_bd *bd); - /** return hash-head size */ - int (*hop_hhead_size)(struct cfs_hash *hs); - /** add @hnode to hash-head of @bd */ - int (*hop_hnode_add)(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode); - /** remove @hnode from hash-head of @bd */ - int (*hop_hnode_del)(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode); -}; - -struct cfs_hash_ops { - /** return hashed value from @key */ - unsigned int (*hs_hash)(struct cfs_hash *hs, const void *key, - unsigned int mask); - /** return key address of @hnode */ - void * (*hs_key)(struct hlist_node *hnode); - /** copy key from @hnode to @key */ - void (*hs_keycpy)(struct hlist_node *hnode, void *key); - /** - * compare @key with key of @hnode - * returns 1 on a match - */ - int (*hs_keycmp)(const void *key, struct hlist_node *hnode); - /** return object address of @hnode, i.e: container_of(...hnode) */ - void * (*hs_object)(struct hlist_node *hnode); - /** get refcount of item, always called with holding bucket-lock */ - void (*hs_get)(struct cfs_hash *hs, struct hlist_node *hnode); - /** release refcount of item */ - 
void (*hs_put)(struct cfs_hash *hs, struct hlist_node *hnode); - /** release refcount of item, always called with holding bucket-lock */ - void (*hs_put_locked)(struct cfs_hash *hs, - struct hlist_node *hnode); - /** it's called before removing of @hnode */ - void (*hs_exit)(struct cfs_hash *hs, struct hlist_node *hnode); -}; - -/** total number of buckets in @hs */ -#define CFS_HASH_NBKT(hs) \ - BIT((hs)->hs_cur_bits - (hs)->hs_bkt_bits) - -/** total number of buckets in @hs while rehashing */ -#define CFS_HASH_RH_NBKT(hs) \ - BIT((hs)->hs_rehash_bits - (hs)->hs_bkt_bits) - -/** number of hlist for in bucket */ -#define CFS_HASH_BKT_NHLIST(hs) BIT((hs)->hs_bkt_bits) - -/** total number of hlist in @hs */ -#define CFS_HASH_NHLIST(hs) BIT((hs)->hs_cur_bits) - -/** total number of hlist in @hs while rehashing */ -#define CFS_HASH_RH_NHLIST(hs) BIT((hs)->hs_rehash_bits) - -static inline int -cfs_hash_with_no_lock(struct cfs_hash *hs) -{ - /* caller will serialize all operations for this hash-table */ - return hs->hs_flags & CFS_HASH_NO_LOCK; -} - -static inline int -cfs_hash_with_no_bktlock(struct cfs_hash *hs) -{ - /* no bucket lock, one single lock to protect the hash-table */ - return hs->hs_flags & CFS_HASH_NO_BKTLOCK; -} - -static inline int -cfs_hash_with_rw_bktlock(struct cfs_hash *hs) -{ - /* rwlock to protect hash bucket */ - return hs->hs_flags & CFS_HASH_RW_BKTLOCK; -} - -static inline int -cfs_hash_with_spin_bktlock(struct cfs_hash *hs) -{ - /* spinlock to protect hash bucket */ - return hs->hs_flags & CFS_HASH_SPIN_BKTLOCK; -} - -static inline int -cfs_hash_with_add_tail(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_ADD_TAIL; -} - -static inline int -cfs_hash_with_no_itemref(struct cfs_hash *hs) -{ - /* - * hash-table doesn't keep refcount on item, - * item can't be removed from hash unless it's - * ZERO refcount - */ - return hs->hs_flags & CFS_HASH_NO_ITEMREF; -} - -static inline int -cfs_hash_with_bigname(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_BIGNAME; -} - -static inline int -cfs_hash_with_counter(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_COUNTER; -} - -static inline int -cfs_hash_with_rehash(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_REHASH; -} - -static inline int -cfs_hash_with_rehash_key(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_REHASH_KEY; -} - -static inline int -cfs_hash_with_shrink(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_SHRINK; -} - -static inline int -cfs_hash_with_assert_empty(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_ASSERT_EMPTY; -} - -static inline int -cfs_hash_with_depth(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_DEPTH; -} - -static inline int -cfs_hash_with_nblk_change(struct cfs_hash *hs) -{ - return hs->hs_flags & CFS_HASH_NBLK_CHANGE; -} - -static inline int -cfs_hash_is_exiting(struct cfs_hash *hs) -{ - /* cfs_hash_destroy is called */ - return hs->hs_exiting; -} - -static inline int -cfs_hash_is_rehashing(struct cfs_hash *hs) -{ - /* rehash is launched */ - return !!hs->hs_rehash_bits; -} - -static inline int -cfs_hash_is_iterating(struct cfs_hash *hs) -{ - /* someone is calling cfs_hash_for_each_* */ - return hs->hs_iterating || hs->hs_iterators; -} - -static inline int -cfs_hash_bkt_size(struct cfs_hash *hs) -{ - return offsetof(struct cfs_hash_bucket, hsb_head[0]) + - hs->hs_hops->hop_hhead_size(hs) * CFS_HASH_BKT_NHLIST(hs) + - hs->hs_extra_bytes; -} - -static inline unsigned -cfs_hash_id(struct cfs_hash *hs, const void *key, 
unsigned int mask) -{ - return hs->hs_ops->hs_hash(hs, key, mask); -} - -static inline void * -cfs_hash_key(struct cfs_hash *hs, struct hlist_node *hnode) -{ - return hs->hs_ops->hs_key(hnode); -} - -static inline void -cfs_hash_keycpy(struct cfs_hash *hs, struct hlist_node *hnode, void *key) -{ - if (hs->hs_ops->hs_keycpy) - hs->hs_ops->hs_keycpy(hnode, key); -} - -/** - * Returns 1 on a match, - */ -static inline int -cfs_hash_keycmp(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) -{ - return hs->hs_ops->hs_keycmp(key, hnode); -} - -static inline void * -cfs_hash_object(struct cfs_hash *hs, struct hlist_node *hnode) -{ - return hs->hs_ops->hs_object(hnode); -} - -static inline void -cfs_hash_get(struct cfs_hash *hs, struct hlist_node *hnode) -{ - return hs->hs_ops->hs_get(hs, hnode); -} - -static inline void -cfs_hash_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) -{ - return hs->hs_ops->hs_put_locked(hs, hnode); -} - -static inline void -cfs_hash_put(struct cfs_hash *hs, struct hlist_node *hnode) -{ - return hs->hs_ops->hs_put(hs, hnode); -} - -static inline void -cfs_hash_exit(struct cfs_hash *hs, struct hlist_node *hnode) -{ - if (hs->hs_ops->hs_exit) - hs->hs_ops->hs_exit(hs, hnode); -} - -static inline void cfs_hash_lock(struct cfs_hash *hs, int excl) -{ - hs->hs_lops->hs_lock(&hs->hs_lock, excl); -} - -static inline void cfs_hash_unlock(struct cfs_hash *hs, int excl) -{ - hs->hs_lops->hs_unlock(&hs->hs_lock, excl); -} - -static inline int cfs_hash_dec_and_lock(struct cfs_hash *hs, - atomic_t *condition) -{ - LASSERT(cfs_hash_with_no_bktlock(hs)); - return atomic_dec_and_lock(condition, &hs->hs_lock.spin); -} - -static inline void cfs_hash_bd_lock(struct cfs_hash *hs, - struct cfs_hash_bd *bd, int excl) -{ - hs->hs_lops->hs_bkt_lock(&bd->bd_bucket->hsb_lock, excl); -} - -static inline void cfs_hash_bd_unlock(struct cfs_hash *hs, - struct cfs_hash_bd *bd, int excl) -{ - hs->hs_lops->hs_bkt_unlock(&bd->bd_bucket->hsb_lock, excl); -} - -/** - * operations on cfs_hash bucket (bd: bucket descriptor), - * they are normally for hash-table without rehash - */ -void cfs_hash_bd_get(struct cfs_hash *hs, const void *key, - struct cfs_hash_bd *bd); - -static inline void -cfs_hash_bd_get_and_lock(struct cfs_hash *hs, const void *key, - struct cfs_hash_bd *bd, int excl) -{ - cfs_hash_bd_get(hs, key, bd); - cfs_hash_bd_lock(hs, bd, excl); -} - -static inline unsigned -cfs_hash_bd_index_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - return bd->bd_offset | (bd->bd_bucket->hsb_index << hs->hs_bkt_bits); -} - -static inline void -cfs_hash_bd_index_set(struct cfs_hash *hs, unsigned int index, - struct cfs_hash_bd *bd) -{ - bd->bd_bucket = hs->hs_buckets[index >> hs->hs_bkt_bits]; - bd->bd_offset = index & (CFS_HASH_BKT_NHLIST(hs) - 1U); -} - -static inline void * -cfs_hash_bd_extra_get(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - return (void *)bd->bd_bucket + - cfs_hash_bkt_size(hs) - hs->hs_extra_bytes; -} - -static inline u32 -cfs_hash_bd_version_get(struct cfs_hash_bd *bd) -{ - /* need hold cfs_hash_bd_lock */ - return bd->bd_bucket->hsb_version; -} - -static inline u32 -cfs_hash_bd_count_get(struct cfs_hash_bd *bd) -{ - /* need hold cfs_hash_bd_lock */ - return bd->bd_bucket->hsb_count; -} - -static inline int -cfs_hash_bd_depmax_get(struct cfs_hash_bd *bd) -{ - return bd->bd_bucket->hsb_depmax; -} - -static inline int -cfs_hash_bd_compare(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) -{ - if (bd1->bd_bucket->hsb_index != bd2->bd_bucket->hsb_index) - 
return bd1->bd_bucket->hsb_index - bd2->bd_bucket->hsb_index; - - if (bd1->bd_offset != bd2->bd_offset) - return bd1->bd_offset - bd2->bd_offset; - - return 0; -} - -void cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode); -void cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode); -void cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, - struct cfs_hash_bd *bd_new, - struct hlist_node *hnode); - -static inline int -cfs_hash_bd_dec_and_lock(struct cfs_hash *hs, struct cfs_hash_bd *bd, - atomic_t *condition) -{ - LASSERT(cfs_hash_with_spin_bktlock(hs)); - return atomic_dec_and_lock(condition, &bd->bd_bucket->hsb_lock.spin); -} - -static inline struct hlist_head * -cfs_hash_bd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - return hs->hs_hops->hop_hhead(hs, bd); -} - -struct hlist_node * -cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - const void *key); -struct hlist_node * -cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - const void *key); - -/** - * operations on cfs_hash bucket (bd: bucket descriptor), - * they are safe for hash-table with rehash - */ -void cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, - struct cfs_hash_bd *bds); -void cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, - int excl); -void cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, - int excl); - -static inline void -cfs_hash_dual_bd_get_and_lock(struct cfs_hash *hs, const void *key, - struct cfs_hash_bd *bds, int excl) -{ - cfs_hash_dual_bd_get(hs, key, bds); - cfs_hash_dual_bd_lock(hs, bds, excl); -} - -struct hlist_node * -cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - const void *key); -struct hlist_node * -cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - const void *key, struct hlist_node *hnode, - int insist_add); -struct hlist_node * -cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - const void *key, struct hlist_node *hnode); - -/* Hash init/cleanup functions */ -struct cfs_hash * -cfs_hash_create(char *name, unsigned int cur_bits, unsigned int max_bits, - unsigned int bkt_bits, unsigned int extra_bytes, - unsigned int min_theta, unsigned int max_theta, - struct cfs_hash_ops *ops, unsigned int flags); - -struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs); -void cfs_hash_putref(struct cfs_hash *hs); - -/* Hash addition functions */ -void cfs_hash_add(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode); -int cfs_hash_add_unique(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode); -void *cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode); - -/* Hash deletion functions */ -void *cfs_hash_del(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode); -void *cfs_hash_del_key(struct cfs_hash *hs, const void *key); - -/* Hash lookup/for_each functions */ -#define CFS_HASH_LOOP_HOG 1024 - -typedef int (*cfs_hash_for_each_cb_t)(struct cfs_hash *hs, - struct cfs_hash_bd *bd, - struct hlist_node *node, - void *data); -void * -cfs_hash_lookup(struct cfs_hash *hs, const void *key); -void -cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t cb, void *data); -void -cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t cb, - void *data); -int -cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t cb, - void 
*data, int start); -int -cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t cb, - void *data); -void -cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, - cfs_hash_for_each_cb_t cb, void *data); -typedef int (*cfs_hash_cond_opt_cb_t)(void *obj, void *data); -void -cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t cb, void *data); - -void -cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned int hindex, - cfs_hash_for_each_cb_t cb, void *data); -int cfs_hash_is_empty(struct cfs_hash *hs); -u64 cfs_hash_size_get(struct cfs_hash *hs); - -/* - * Rehash - Theta is calculated to be the average chained - * hash depth assuming a perfectly uniform hash function. - */ -void cfs_hash_rehash_cancel_locked(struct cfs_hash *hs); -void cfs_hash_rehash_cancel(struct cfs_hash *hs); -void cfs_hash_rehash(struct cfs_hash *hs, int do_rehash); -void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key, - void *new_key, struct hlist_node *hnode); - -#if CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 -/* Validate hnode references the correct key */ -static inline void -cfs_hash_key_validate(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode) -{ - LASSERT(cfs_hash_keycmp(hs, key, hnode)); -} - -/* Validate hnode is in the correct bucket */ -static inline void -cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - struct cfs_hash_bd bds[2]; - - cfs_hash_dual_bd_get(hs, cfs_hash_key(hs, hnode), bds); - LASSERT(bds[0].bd_bucket == bd->bd_bucket || - bds[1].bd_bucket == bd->bd_bucket); -} - -#else /* CFS_HASH_DEBUG_LEVEL > CFS_HASH_DEBUG_1 */ - -static inline void -cfs_hash_key_validate(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode) {} - -static inline void -cfs_hash_bucket_validate(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) {} - -#endif /* CFS_HASH_DEBUG_LEVEL */ - -#define CFS_HASH_THETA_BITS 10 -#define CFS_HASH_MIN_THETA BIT(CFS_HASH_THETA_BITS - 1) -#define CFS_HASH_MAX_THETA BIT(CFS_HASH_THETA_BITS + 1) - -/* Return integer component of theta */ -static inline int __cfs_hash_theta_int(int theta) -{ - return (theta >> CFS_HASH_THETA_BITS); -} - -/* Return a fractional value between 0 and 999 */ -static inline int __cfs_hash_theta_frac(int theta) -{ - return ((theta * 1000) >> CFS_HASH_THETA_BITS) - - (__cfs_hash_theta_int(theta) * 1000); -} - -static inline int __cfs_hash_theta(struct cfs_hash *hs) -{ - return (atomic_read(&hs->hs_count) << - CFS_HASH_THETA_BITS) >> hs->hs_cur_bits; -} - -static inline void -__cfs_hash_set_theta(struct cfs_hash *hs, int min, int max) -{ - LASSERT(min < max); - hs->hs_min_theta = (u16)min; - hs->hs_max_theta = (u16)max; -} - -/* Generic debug formatting routines mainly for proc handler */ -struct seq_file; -void cfs_hash_debug_header(struct seq_file *m); -void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m); - -/* - * Generic djb2 hash algorithm for character arrays. - */ -static inline unsigned -cfs_hash_djb2_hash(const void *key, size_t size, unsigned int mask) -{ - unsigned int i, hash = 5381; - - LASSERT(key); - - for (i = 0; i < size; i++) - hash = hash * 33 + ((char *)key)[i]; - - return (hash & mask); -} - -/* - * Generic u32 hash algorithm. - */ -static inline unsigned -cfs_hash_u32_hash(const u32 key, unsigned int mask) -{ - return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); -} - -/* - * Generic u64 hash algorithm. 
- */ -static inline unsigned -cfs_hash_u64_hash(const u64 key, unsigned int mask) -{ - return ((unsigned int)(key * CFS_GOLDEN_RATIO_PRIME_64) & mask); -} - -/** iterate over all buckets in @bds (array of struct cfs_hash_bd) */ -#define cfs_hash_for_each_bd(bds, n, i) \ - for (i = 0; i < n && (bds)[i].bd_bucket != NULL; i++) - -/** iterate over all buckets of @hs */ -#define cfs_hash_for_each_bucket(hs, bd, pos) \ - for (pos = 0; \ - pos < CFS_HASH_NBKT(hs) && \ - ((bd)->bd_bucket = (hs)->hs_buckets[pos]) != NULL; pos++) - -/** iterate over all hlist of bucket @bd */ -#define cfs_hash_bd_for_each_hlist(hs, bd, hlist) \ - for ((bd)->bd_offset = 0; \ - (bd)->bd_offset < CFS_HASH_BKT_NHLIST(hs) && \ - (hlist = cfs_hash_bd_hhead(hs, bd)) != NULL; \ - (bd)->bd_offset++) - -/* !__LIBCFS__HASH_H__ */ -#endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h deleted file mode 100644 index 491d5971d19925faf8aff744557c3e3ad40e67b2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ /dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_private.h - * - * Various defines for libcfs. - * - */ - -#ifndef __LIBCFS_PRIVATE_H__ -#define __LIBCFS_PRIVATE_H__ - -#ifndef DEBUG_SUBSYSTEM -# define DEBUG_SUBSYSTEM S_UNDEFINED -#endif - -#define LASSERTF(cond, fmt, ...) \ -do { \ - if (unlikely(!(cond))) { \ - LIBCFS_DEBUG_MSG_DATA_DECL(__msg_data, D_EMERG, NULL); \ - libcfs_debug_msg(&__msg_data, \ - "ASSERTION( %s ) failed: " fmt, #cond, \ - ## __VA_ARGS__); \ - lbug_with_loc(&__msg_data); \ - } \ -} while (0) - -#define LASSERT(cond) LASSERTF(cond, "\n") - -#ifdef CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK -/** - * This is for more expensive checks that one doesn't want to be enabled all - * the time. LINVRNT() has to be explicitly enabled by - * CONFIG_LUSTRE_DEBUG_EXPENSIVE_CHECK option. 
- */
-# define LINVRNT(exp) LASSERT(exp)
-#else
-# define LINVRNT(exp) ((void)sizeof !!(exp))
-#endif
-
-void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msg);
-
-#define LBUG() \
-do { \
-	LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \
-	lbug_with_loc(&msgdata); \
-} while (0)
-
-/*
- * Use #define rather than inline, as lnet_cpt_table() might
- * not be defined yet
- */
-#define kmalloc_cpt(size, flags, cpt) \
-	kmalloc_node(size, flags, cfs_cpt_spread_node(lnet_cpt_table(), cpt))
-
-#define kzalloc_cpt(size, flags, cpt) \
-	kmalloc_node(size, flags | __GFP_ZERO, \
-		     cfs_cpt_spread_node(lnet_cpt_table(), cpt))
-
-#define kvmalloc_cpt(size, flags, cpt) \
-	kvmalloc_node(size, flags, \
-		      cfs_cpt_spread_node(lnet_cpt_table(), cpt))
-
-#define kvzalloc_cpt(size, flags, cpt) \
-	kvmalloc_node(size, flags | __GFP_ZERO, \
-		      cfs_cpt_spread_node(lnet_cpt_table(), cpt))
-
-/******************************************************************************/
-
-void libcfs_debug_dumplog(void);
-int libcfs_debug_init(unsigned long bufsize);
-int libcfs_debug_cleanup(void);
-int libcfs_debug_clear_buffer(void);
-int libcfs_debug_mark_buffer(const char *text);
-
-/*
- * Allocate a variable array; the returned value is an array of pointers.
- * The caller specifies the length of the array via @count.
- */
-void *cfs_array_alloc(int count, unsigned int size);
-void cfs_array_free(void *vars);
-
-#define LASSERT_ATOMIC_ENABLED (1)
-
-#if LASSERT_ATOMIC_ENABLED
-
-/** assert value of @a is equal to @v */
-#define LASSERT_ATOMIC_EQ(a, v) \
-	LASSERTF(atomic_read(a) == v, "value: %d\n", atomic_read((a)))
-
-/** assert value of @a is not equal to @v */
-#define LASSERT_ATOMIC_NE(a, v) \
-	LASSERTF(atomic_read(a) != v, "value: %d\n", atomic_read((a)))
-
-/** assert value of @a is less than @v */
-#define LASSERT_ATOMIC_LT(a, v) \
-	LASSERTF(atomic_read(a) < v, "value: %d\n", atomic_read((a)))
-
-/** assert value of @a is less than or equal to @v */
-#define LASSERT_ATOMIC_LE(a, v) \
-	LASSERTF(atomic_read(a) <= v, "value: %d\n", atomic_read((a)))
-
-/** assert value of @a is greater than @v */
-#define LASSERT_ATOMIC_GT(a, v) \
-	LASSERTF(atomic_read(a) > v, "value: %d\n", atomic_read((a)))
-
-/** assert value of @a is greater than or equal to @v */
-#define LASSERT_ATOMIC_GE(a, v) \
-	LASSERTF(atomic_read(a) >= v, "value: %d\n", atomic_read((a)))
-
-/** assert value of @a is greater than @v1 and less than @v2 */
-#define LASSERT_ATOMIC_GT_LT(a, v1, v2) \
-do { \
-	int __v = atomic_read(a); \
-	LASSERTF(__v > v1 && __v < v2, "value: %d\n", __v); \
-} while (0)
-
-/** assert value of @a is greater than @v1 and less than or equal to @v2 */
-#define LASSERT_ATOMIC_GT_LE(a, v1, v2) \
-do { \
-	int __v = atomic_read(a); \
-	LASSERTF(__v > v1 && __v <= v2, "value: %d\n", __v); \
-} while (0)
-
-/** assert value of @a is greater than or equal to @v1 and less than @v2 */
-#define LASSERT_ATOMIC_GE_LT(a, v1, v2) \
-do { \
-	int __v = atomic_read(a); \
-	LASSERTF(__v >= v1 && __v < v2, "value: %d\n", __v); \
-} while (0)
-
-/** assert value of @a is greater than or equal to @v1 and less than or
- * equal to @v2
- */
-#define LASSERT_ATOMIC_GE_LE(a, v1, v2) \
-do { \
-	int __v = atomic_read(a); \
-	LASSERTF(__v >= v1 && __v <= v2, "value: %d\n", __v); \
-} while (0)
-
-#else /* !LASSERT_ATOMIC_ENABLED */
-
-#define LASSERT_ATOMIC_EQ(a, v) do {} while (0)
-#define LASSERT_ATOMIC_NE(a, v) do {} while (0)
-#define LASSERT_ATOMIC_LT(a, v) do {} while (0)
-#define LASSERT_ATOMIC_LE(a, v) do {} while (0)
-#define LASSERT_ATOMIC_GT(a, v) do {} while (0)
-#define LASSERT_ATOMIC_GE(a, v) do {} while (0)
-#define LASSERT_ATOMIC_GT_LT(a, v1, v2) do {} while (0)
-#define LASSERT_ATOMIC_GT_LE(a, v1, v2) do {} while (0)
-#define LASSERT_ATOMIC_GE_LT(a, v1, v2) do {} while (0)
-#define LASSERT_ATOMIC_GE_LE(a, v1, v2) do {} while (0)
-
-#endif /* LASSERT_ATOMIC_ENABLED */
-
-#define LASSERT_ATOMIC_ZERO(a) LASSERT_ATOMIC_EQ(a, 0)
-#define LASSERT_ATOMIC_POS(a) LASSERT_ATOMIC_GT(a, 0)
-
-/* implication */
-#define ergo(a, b) (!(a) || (b))
-/* logical equivalence */
-#define equi(a, b) (!!(a) == !!(b))
-
-#ifndef HAVE_CFS_SIZE_ROUND
-static inline size_t cfs_size_round(int val)
-{
-	return round_up(val, 8);
-}
-
-#define HAVE_CFS_SIZE_ROUND
-#endif
-
-#endif
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
deleted file mode 100644
index cd7c3ccb2dc0756f1aace380450bcb6a5dc5d705..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/include/linux/libcfs/libcfs_string.h
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * libcfs/include/libcfs/libcfs_string.h
- *
- * Generic string manipulation functions.
- *
- * Author: Nathan Rutman
- */
-
-#ifndef __LIBCFS_STRING_H__
-#define __LIBCFS_STRING_H__
-
-#include
-
-/* libcfs_string.c */
-/* Convert a text string to a bitmask */
-int cfs_str2mask(const char *str, const char *(*bit2str)(int bit),
-		 int *oldmask, int minmask, int allmask);
-/* trim leading and trailing space characters */
-char *cfs_firststr(char *str, size_t size);
-
-/**
- * Structure to represent NULL-less strings.
- */
-struct cfs_lstr {
-	char *ls_str;
-	int ls_len;
-};
-
-/*
- * Structure to represent a <range_expr> token of the syntax.
- */
-struct cfs_range_expr {
-	/*
-	 * Link to cfs_expr_list::el_exprs.
- */ - struct list_head re_link; - u32 re_lo; - u32 re_hi; - u32 re_stride; -}; - -struct cfs_expr_list { - struct list_head el_link; - struct list_head el_exprs; -}; - -int cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res); -int cfs_str2num_check(char *str, int nob, unsigned int *num, - unsigned int min, unsigned int max); -int cfs_expr_list_match(u32 value, struct cfs_expr_list *expr_list); -int cfs_expr_list_print(char *buffer, int count, - struct cfs_expr_list *expr_list); -int cfs_expr_list_values(struct cfs_expr_list *expr_list, - int max, u32 **values); -static inline void -cfs_expr_list_values_free(u32 *values, int num) -{ - /* - * This array is allocated by kvalloc(), so it shouldn't be freed - * by OBD_FREE() if it's called by module other than libcfs & LNet, - * otherwise we will see fake memory leak - */ - kvfree(values); -} - -void cfs_expr_list_free(struct cfs_expr_list *expr_list); -int cfs_expr_list_parse(char *str, int len, unsigned int min, unsigned int max, - struct cfs_expr_list **elpp); -void cfs_expr_list_free_list(struct list_head *list); - -#endif diff --git a/drivers/staging/lustre/include/linux/lnet/api.h b/drivers/staging/lustre/include/linux/lnet/api.h deleted file mode 100644 index dae2e4f0056c1a55fb2c3c61da8f8e9191a20346..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/lnet/api.h +++ /dev/null @@ -1,212 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011 - 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. - */ - -#ifndef __LNET_API_H__ -#define __LNET_API_H__ - -/** \defgroup lnet LNet - * - * The Lustre Networking subsystem. - * - * LNet is an asynchronous message-passing API, which provides an unreliable - * connectionless service that can't guarantee any order. It supports OFA IB, - * TCP/IP, and Cray Interconnects, and routes between heterogeneous networks. - * - * @{ - */ - -#include - -/** \defgroup lnet_init_fini Initialization and cleanup - * The LNet must be properly initialized before any LNet calls can be made. - * @{ - */ -int LNetNIInit(lnet_pid_t requested_pid); -int LNetNIFini(void); -/** @} lnet_init_fini */ - -/** \defgroup lnet_addr LNet addressing and basic types - * - * Addressing scheme and basic data types of LNet. - * - * The LNet API is memory-oriented, so LNet must be able to address not only - * end-points but also memory region within a process address space. - * An ::lnet_nid_t addresses an end-point. 
An ::lnet_pid_t identifies a process
- * in a node. A portal represents an opening in the address space of a
- * process. Match bits are the criteria used to identify a region of memory
- * inside a portal, and an offset specifies a position within that memory
- * region.
- *
- * LNet creates a table of portals for each process during initialization.
- * This table has MAX_PORTALS entries and its size can't be dynamically
- * changed. A portal stays empty until the owning process starts to add
- * memory regions to it. A portal is sometimes called an index because
- * it's an entry in the portals table of a process.
- *
- * \see LNetMEAttach
- * @{
- */
-int LNetGetId(unsigned int index, struct lnet_process_id *id);
-int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
-
-/** @} lnet_addr */
-
-/** \defgroup lnet_me Match entries
- *
- * A match entry (abbreviated as ME) describes a set of criteria to accept
- * incoming requests.
- *
- * A portal is essentially a match list plus a set of attributes. A match
- * list is a chain of MEs. Each ME includes a pointer to a memory descriptor
- * and a set of match criteria. The match criteria can be used to reject
- * incoming requests based on process ID or the match bits provided in the
- * request. MEs can be dynamically inserted into a match list by LNetMEAttach()
- * and LNetMEInsert(), and removed from the list by LNetMEUnlink().
- * @{
- */
-int LNetMEAttach(unsigned int portal,
-		 struct lnet_process_id match_id_in,
-		 __u64 match_bits_in,
-		 __u64 ignore_bits_in,
-		 enum lnet_unlink unlink_in,
-		 enum lnet_ins_pos pos_in,
-		 struct lnet_handle_me *handle_out);
-
-int LNetMEInsert(struct lnet_handle_me current_in,
-		 struct lnet_process_id match_id_in,
-		 __u64 match_bits_in,
-		 __u64 ignore_bits_in,
-		 enum lnet_unlink unlink_in,
-		 enum lnet_ins_pos position_in,
-		 struct lnet_handle_me *handle_out);
-
-int LNetMEUnlink(struct lnet_handle_me current_in);
-/** @} lnet_me */
-
-/** \defgroup lnet_md Memory descriptors
- *
- * A memory descriptor contains information about a region of a user's
- * memory (either in kernel or user space) and optionally points to an
- * event queue where information about the operations performed on the
- * memory descriptor is recorded. Memory descriptor is abbreviated as
- * MD and can be used interchangeably with the memory region it describes.
- *
- * The LNet API provides two operations to create MDs: LNetMDAttach()
- * and LNetMDBind(); one operation to unlink and release the resources
- * associated with an MD: LNetMDUnlink().
- * @{
- */
-int LNetMDAttach(struct lnet_handle_me current_in,
-		 struct lnet_md md_in,
-		 enum lnet_unlink unlink_in,
-		 struct lnet_handle_md *md_handle_out);
-
-int LNetMDBind(struct lnet_md md_in,
-	       enum lnet_unlink unlink_in,
-	       struct lnet_handle_md *md_handle_out);
-
-int LNetMDUnlink(struct lnet_handle_md md_in);
-/** @} lnet_md */
-
-/** \defgroup lnet_eq Events and event queues
- *
- * Event queues (abbreviated as EQ) are used to log operations performed on
- * local MDs. In particular, they signal the completion of a data transmission
- * into or out of an MD. They can also be used to hold acknowledgments for
- * completed PUT operations and indicate when an MD has been unlinked. Multiple
- * MDs can share a single EQ. An EQ may have an optional event handler
- * associated with it. If an event handler exists, it will be run for each
- * event that is deposited into the EQ.
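
As a concrete illustration of how MEs and MDs combine, here is a minimal sketch of posting a single receive buffer. The portal number and match bits are arbitrary, and types and constants such as struct lnet_md, LNET_MD_OP_PUT, LNET_NID_ANY, LNET_UNLINK and LNET_INS_AFTER are assumed to come from the LNet type headers rather than the declarations shown here.

static int post_recv_buffer(void *buf, unsigned int len,
			    struct lnet_handle_eq eqh,
			    struct lnet_handle_md *mdh)
{
	struct lnet_process_id any = {
		.nid = LNET_NID_ANY,	/* accept any peer */
		.pid = LNET_PID_ANY,
	};
	struct lnet_handle_me meh;
	struct lnet_md md;
	int rc;

	/* attach an ME on portal 4 matching match bits 0x1 exactly */
	rc = LNetMEAttach(4, any, 0x1ULL, 0, LNET_UNLINK, LNET_INS_AFTER,
			  &meh);
	if (rc)
		return rc;

	/* describe the buffer and hang it off the ME */
	memset(&md, 0, sizeof(md));
	md.start = buf;
	md.length = len;
	md.threshold = 1;		/* auto-unlink after one operation */
	md.options = LNET_MD_OP_PUT;	/* accept incoming PUTs */
	md.eq_handle = eqh;		/* completion reported via this EQ */

	rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh);
	if (rc)
		LNetMEUnlink(meh);	/* undo the ME on failure */
	return rc;
}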
- *
- * In addition to the lnet_handle_eq, the LNet API defines two types
- * associated with events: ::lnet_event_kind defines the kinds of events
- * that can be stored in an EQ, and lnet_event defines a structure that
- * holds the information about an event.
- *
- * There are five functions for dealing with EQs: LNetEQAlloc() is used to
- * create an EQ and allocate the resources needed, while LNetEQFree()
- * releases these resources and frees the EQ. LNetEQGet() retrieves the next
- * event from an EQ, and LNetEQWait() can be used to block a process until
- * an EQ has at least one event. LNetEQPoll() can be used to test or wait
- * on multiple EQs.
- * @{
- */
-int LNetEQAlloc(unsigned int count_in,
-		lnet_eq_handler_t handler,
-		struct lnet_handle_eq *handle_out);
-
-int LNetEQFree(struct lnet_handle_eq eventq_in);
-
-int LNetEQPoll(struct lnet_handle_eq *eventqs_in,
-	       int neq_in,
-	       int timeout_ms,
-	       int interruptible,
-	       struct lnet_event *event_out,
-	       int *which_eq_out);
-/** @} lnet_eq */
-
-/** \defgroup lnet_data Data movement operations
- *
- * The LNet API provides two data movement operations: LNetPut()
- * and LNetGet().
- * @{
- */
-int LNetPut(lnet_nid_t self,
-	    struct lnet_handle_md md_in,
-	    enum lnet_ack_req ack_req_in,
-	    struct lnet_process_id target_in,
-	    unsigned int portal_in,
-	    __u64 match_bits_in,
-	    unsigned int offset_in,
-	    __u64 hdr_data_in);
-
-int LNetGet(lnet_nid_t self,
-	    struct lnet_handle_md md_in,
-	    struct lnet_process_id target_in,
-	    unsigned int portal_in,
-	    __u64 match_bits_in,
-	    unsigned int offset_in);
-/** @} lnet_data */
-
-/** \defgroup lnet_misc Miscellaneous operations.
- * Miscellaneous operations.
- * @{
- */
-int LNetSetLazyPortal(int portal);
-int LNetClearLazyPortal(int portal);
-int LNetCtl(unsigned int cmd, void *arg);
-void LNetDebugPeer(struct lnet_process_id id);
-
-/** @} lnet_misc */
-
-/** @} lnet */
-#endif
diff --git a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h b/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
deleted file mode 100644
index 973c17a1c4a150783a037b0a21fd766feb607554..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/include/linux/lnet/lib-lnet.h
+++ /dev/null
@@ -1,652 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Seagate, Inc.
- * - * lnet/include/lnet/lib-lnet.h - */ - -#ifndef __LNET_LIB_LNET_H__ -#define __LNET_LIB_LNET_H__ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -extern struct lnet the_lnet; /* THE network */ - -#if (BITS_PER_LONG == 32) -/* 2 CPTs, allowing more CPTs might make us under memory pressure */ -#define LNET_CPT_MAX_BITS 1 - -#else /* 64-bit system */ -/* - * 256 CPTs for thousands of CPUs, allowing more CPTs might make us - * under risk of consuming all lh_cookie. - */ -#define LNET_CPT_MAX_BITS 8 -#endif /* BITS_PER_LONG == 32 */ - -/* max allowed CPT number */ -#define LNET_CPT_MAX (1 << LNET_CPT_MAX_BITS) - -#define LNET_CPT_NUMBER (the_lnet.ln_cpt_number) -#define LNET_CPT_BITS (the_lnet.ln_cpt_bits) -#define LNET_CPT_MASK ((1ULL << LNET_CPT_BITS) - 1) - -/** exclusive lock */ -#define LNET_LOCK_EX CFS_PERCPT_LOCK_EX - -/* need both kernel and user-land acceptor */ -#define LNET_ACCEPTOR_MIN_RESERVED_PORT 512 -#define LNET_ACCEPTOR_MAX_RESERVED_PORT 1023 - -static inline int lnet_is_route_alive(struct lnet_route *route) -{ - /* gateway is down */ - if (!route->lr_gateway->lp_alive) - return 0; - /* no NI status, assume it's alive */ - if ((route->lr_gateway->lp_ping_feats & - LNET_PING_FEAT_NI_STATUS) == 0) - return 1; - /* has NI status, check # down NIs */ - return route->lr_downis == 0; -} - -static inline int lnet_is_wire_handle_none(struct lnet_handle_wire *wh) -{ - return (wh->wh_interface_cookie == LNET_WIRE_HANDLE_COOKIE_NONE && - wh->wh_object_cookie == LNET_WIRE_HANDLE_COOKIE_NONE); -} - -static inline int lnet_md_exhausted(struct lnet_libmd *md) -{ - return (!md->md_threshold || - ((md->md_options & LNET_MD_MAX_SIZE) && - md->md_offset + md->md_max_size > md->md_length)); -} - -static inline int lnet_md_unlinkable(struct lnet_libmd *md) -{ - /* - * Should unlink md when its refcount is 0 and either: - * - md has been flagged for deletion (by auto unlink or - * LNetM[DE]Unlink, in the latter case md may not be exhausted). - * - auto unlink is on and md is exhausted. - */ - if (md->md_refcount) - return 0; - - if (md->md_flags & LNET_MD_FLAG_ZOMBIE) - return 1; - - return ((md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) && - lnet_md_exhausted(md)); -} - -#define lnet_cpt_table() (the_lnet.ln_cpt_table) -#define lnet_cpt_current() cfs_cpt_current(the_lnet.ln_cpt_table, 1) - -static inline int -lnet_cpt_of_cookie(__u64 cookie) -{ - unsigned int cpt = (cookie >> LNET_COOKIE_TYPE_BITS) & LNET_CPT_MASK; - - /* - * LNET_CPT_NUMBER doesn't have to be power2, which means we can - * get illegal cpt from it's invalid cookie - */ - return cpt < LNET_CPT_NUMBER ? 
cpt : cpt % LNET_CPT_NUMBER; -} - -static inline void -lnet_res_lock(int cpt) -{ - cfs_percpt_lock(the_lnet.ln_res_lock, cpt); -} - -static inline void -lnet_res_unlock(int cpt) -{ - cfs_percpt_unlock(the_lnet.ln_res_lock, cpt); -} - -static inline int -lnet_res_lock_current(void) -{ - int cpt = lnet_cpt_current(); - - lnet_res_lock(cpt); - return cpt; -} - -static inline void -lnet_net_lock(int cpt) -{ - cfs_percpt_lock(the_lnet.ln_net_lock, cpt); -} - -static inline void -lnet_net_unlock(int cpt) -{ - cfs_percpt_unlock(the_lnet.ln_net_lock, cpt); -} - -static inline int -lnet_net_lock_current(void) -{ - int cpt = lnet_cpt_current(); - - lnet_net_lock(cpt); - return cpt; -} - -#define LNET_LOCK() lnet_net_lock(LNET_LOCK_EX) -#define LNET_UNLOCK() lnet_net_unlock(LNET_LOCK_EX) - -#define lnet_ptl_lock(ptl) spin_lock(&(ptl)->ptl_lock) -#define lnet_ptl_unlock(ptl) spin_unlock(&(ptl)->ptl_lock) -#define lnet_eq_wait_lock() spin_lock(&the_lnet.ln_eq_wait_lock) -#define lnet_eq_wait_unlock() spin_unlock(&the_lnet.ln_eq_wait_lock) -#define lnet_ni_lock(ni) spin_lock(&(ni)->ni_lock) -#define lnet_ni_unlock(ni) spin_unlock(&(ni)->ni_lock) - -#define MAX_PORTALS 64 - -static inline struct lnet_libmd * -lnet_md_alloc(struct lnet_md *umd) -{ - struct lnet_libmd *md; - unsigned int size; - unsigned int niov; - - if (umd->options & LNET_MD_KIOV) { - niov = umd->length; - size = offsetof(struct lnet_libmd, md_iov.kiov[niov]); - } else { - niov = umd->options & LNET_MD_IOVEC ? umd->length : 1; - size = offsetof(struct lnet_libmd, md_iov.iov[niov]); - } - - md = kzalloc(size, GFP_NOFS); - - if (md) { - /* Set here in case of early free */ - md->md_options = umd->options; - md->md_niov = niov; - INIT_LIST_HEAD(&md->md_list); - } - - return md; -} - -struct lnet_libhandle *lnet_res_lh_lookup(struct lnet_res_container *rec, - __u64 cookie); -void lnet_res_lh_initialize(struct lnet_res_container *rec, - struct lnet_libhandle *lh); -static inline void -lnet_res_lh_invalidate(struct lnet_libhandle *lh) -{ - /* NB: cookie is still useful, don't reset it */ - list_del(&lh->lh_hash_chain); -} - -static inline void -lnet_eq2handle(struct lnet_handle_eq *handle, struct lnet_eq *eq) -{ - if (!eq) { - LNetInvalidateEQHandle(handle); - return; - } - - handle->cookie = eq->eq_lh.lh_cookie; -} - -static inline struct lnet_eq * -lnet_handle2eq(struct lnet_handle_eq *handle) -{ - struct lnet_libhandle *lh; - - lh = lnet_res_lh_lookup(&the_lnet.ln_eq_container, handle->cookie); - if (!lh) - return NULL; - - return lh_entry(lh, struct lnet_eq, eq_lh); -} - -static inline void -lnet_md2handle(struct lnet_handle_md *handle, struct lnet_libmd *md) -{ - handle->cookie = md->md_lh.lh_cookie; -} - -static inline struct lnet_libmd * -lnet_handle2md(struct lnet_handle_md *handle) -{ - /* ALWAYS called with resource lock held */ - struct lnet_libhandle *lh; - int cpt; - - cpt = lnet_cpt_of_cookie(handle->cookie); - lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], - handle->cookie); - if (!lh) - return NULL; - - return lh_entry(lh, struct lnet_libmd, md_lh); -} - -static inline struct lnet_libmd * -lnet_wire_handle2md(struct lnet_handle_wire *wh) -{ - /* ALWAYS called with resource lock held */ - struct lnet_libhandle *lh; - int cpt; - - if (wh->wh_interface_cookie != the_lnet.ln_interface_cookie) - return NULL; - - cpt = lnet_cpt_of_cookie(wh->wh_object_cookie); - lh = lnet_res_lh_lookup(the_lnet.ln_md_containers[cpt], - wh->wh_object_cookie); - if (!lh) - return NULL; - - return lh_entry(lh, struct lnet_libmd, md_lh); -} - 
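
A short sketch of the cookie round-trip implied by these helpers; this is hypothetical caller code, assuming 'md' has already been inserted into its CPT's resource container so that its cookie is valid.

static struct lnet_libmd *
md_handle_roundtrip(struct lnet_libmd *md)
{
	struct lnet_handle_md mdh;
	struct lnet_libmd *found;
	int cpt;

	/* object -> opaque handle: just publishes the cookie */
	lnet_md2handle(&mdh, md);

	/* handle -> object: the cookie encodes its CPT, and the lookup
	 * must run under that CPT's resource lock, per the comments on
	 * lnet_handle2md() above
	 */
	cpt = lnet_cpt_of_cookie(mdh.cookie);
	lnet_res_lock(cpt);
	found = lnet_handle2md(&mdh);	/* NULL if already invalidated */
	lnet_res_unlock(cpt);

	return found;
}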
-static inline void -lnet_me2handle(struct lnet_handle_me *handle, struct lnet_me *me) -{ - handle->cookie = me->me_lh.lh_cookie; -} - -static inline struct lnet_me * -lnet_handle2me(struct lnet_handle_me *handle) -{ - /* ALWAYS called with resource lock held */ - struct lnet_libhandle *lh; - int cpt; - - cpt = lnet_cpt_of_cookie(handle->cookie); - lh = lnet_res_lh_lookup(the_lnet.ln_me_containers[cpt], - handle->cookie); - if (!lh) - return NULL; - - return lh_entry(lh, struct lnet_me, me_lh); -} - -static inline void -lnet_peer_addref_locked(struct lnet_peer *lp) -{ - LASSERT(lp->lp_refcount > 0); - lp->lp_refcount++; -} - -void lnet_destroy_peer_locked(struct lnet_peer *lp); - -static inline void -lnet_peer_decref_locked(struct lnet_peer *lp) -{ - LASSERT(lp->lp_refcount > 0); - lp->lp_refcount--; - if (!lp->lp_refcount) - lnet_destroy_peer_locked(lp); -} - -static inline int -lnet_isrouter(struct lnet_peer *lp) -{ - return lp->lp_rtr_refcount ? 1 : 0; -} - -static inline void -lnet_ni_addref_locked(struct lnet_ni *ni, int cpt) -{ - LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); - LASSERT(*ni->ni_refs[cpt] >= 0); - - (*ni->ni_refs[cpt])++; -} - -static inline void -lnet_ni_addref(struct lnet_ni *ni) -{ - lnet_net_lock(0); - lnet_ni_addref_locked(ni, 0); - lnet_net_unlock(0); -} - -static inline void -lnet_ni_decref_locked(struct lnet_ni *ni, int cpt) -{ - LASSERT(cpt >= 0 && cpt < LNET_CPT_NUMBER); - LASSERT(*ni->ni_refs[cpt] > 0); - - (*ni->ni_refs[cpt])--; -} - -static inline void -lnet_ni_decref(struct lnet_ni *ni) -{ - lnet_net_lock(0); - lnet_ni_decref_locked(ni, 0); - lnet_net_unlock(0); -} - -void lnet_ni_free(struct lnet_ni *ni); -struct lnet_ni * -lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist); - -static inline int -lnet_nid2peerhash(lnet_nid_t nid) -{ - return hash_long(nid, LNET_PEER_HASH_BITS); -} - -static inline struct list_head * -lnet_net2rnethash(__u32 net) -{ - return &the_lnet.ln_remote_nets_hash[(LNET_NETNUM(net) + - LNET_NETTYP(net)) & - ((1U << the_lnet.ln_remote_nets_hbits) - 1)]; -} - -extern struct lnet_lnd the_lolnd; -extern int avoid_asym_router_failure; - -int lnet_cpt_of_nid_locked(lnet_nid_t nid); -int lnet_cpt_of_nid(lnet_nid_t nid); -struct lnet_ni *lnet_nid2ni_locked(lnet_nid_t nid, int cpt); -struct lnet_ni *lnet_net2ni_locked(__u32 net, int cpt); -struct lnet_ni *lnet_net2ni(__u32 net); - -extern int portal_rotor; - -int lnet_lib_init(void); -void lnet_lib_exit(void); - -int lnet_notify(struct lnet_ni *ni, lnet_nid_t peer, int alive, - unsigned long when); -void lnet_notify_locked(struct lnet_peer *lp, int notifylnd, int alive, - unsigned long when); -int lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway_nid, - unsigned int priority); -int lnet_check_routes(void); -int lnet_del_route(__u32 net, lnet_nid_t gw_nid); -void lnet_destroy_routes(void); -int lnet_get_route(int idx, __u32 *net, __u32 *hops, - lnet_nid_t *gateway, __u32 *alive, __u32 *priority); -int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg); - -void lnet_router_debugfs_init(void); -void lnet_router_debugfs_fini(void); -int lnet_rtrpools_alloc(int im_a_router); -void lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages); -int lnet_rtrpools_adjust(int tiny, int small, int large); -int lnet_rtrpools_enable(void); -void lnet_rtrpools_disable(void); -void lnet_rtrpools_free(int keep_pools); -struct lnet_remotenet *lnet_find_net_locked(__u32 net); -int lnet_dyn_add_ni(lnet_pid_t requested_pid, - struct lnet_ioctl_config_data *conf); -int 
lnet_dyn_del_ni(__u32 net); -int lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason); - -int lnet_islocalnid(lnet_nid_t nid); -int lnet_islocalnet(__u32 net); - -void lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, - unsigned int offset, unsigned int mlen); -void lnet_msg_detach_md(struct lnet_msg *msg, int status); -void lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev); -void lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type); -void lnet_msg_commit(struct lnet_msg *msg, int cpt); -void lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status); - -void lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev); -void lnet_prep_send(struct lnet_msg *msg, int type, - struct lnet_process_id target, unsigned int offset, - unsigned int len); -int lnet_send(lnet_nid_t nid, struct lnet_msg *msg, lnet_nid_t rtr_nid); -void lnet_return_tx_credits_locked(struct lnet_msg *msg); -void lnet_return_rx_credits_locked(struct lnet_msg *msg); -void lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp); -void lnet_drop_routed_msgs_locked(struct list_head *list, int cpt); - -/* portals functions */ -/* portals attributes */ -static inline int -lnet_ptl_is_lazy(struct lnet_portal *ptl) -{ - return !!(ptl->ptl_options & LNET_PTL_LAZY); -} - -static inline int -lnet_ptl_is_unique(struct lnet_portal *ptl) -{ - return !!(ptl->ptl_options & LNET_PTL_MATCH_UNIQUE); -} - -static inline int -lnet_ptl_is_wildcard(struct lnet_portal *ptl) -{ - return !!(ptl->ptl_options & LNET_PTL_MATCH_WILDCARD); -} - -static inline void -lnet_ptl_setopt(struct lnet_portal *ptl, int opt) -{ - ptl->ptl_options |= opt; -} - -static inline void -lnet_ptl_unsetopt(struct lnet_portal *ptl, int opt) -{ - ptl->ptl_options &= ~opt; -} - -/* match-table functions */ -struct list_head *lnet_mt_match_head(struct lnet_match_table *mtable, - struct lnet_process_id id, __u64 mbits); -struct lnet_match_table *lnet_mt_of_attach(unsigned int index, - struct lnet_process_id id, - __u64 mbits, __u64 ignore_bits, - enum lnet_ins_pos pos); -int lnet_mt_match_md(struct lnet_match_table *mtable, - struct lnet_match_info *info, struct lnet_msg *msg); - -/* portals match/attach functions */ -void lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, - struct list_head *matches, struct list_head *drops); -void lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md); -int lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg); - -/* initialized and finalize portals */ -int lnet_portals_create(void); -void lnet_portals_destroy(void); - -/* message functions */ -int lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, - lnet_nid_t fromnid, void *private, int rdma_req); -int lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg); -int lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg); - -void lnet_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, - int delayed, unsigned int offset, unsigned int mlen, - unsigned int rlen); -void lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, - int delayed, unsigned int offset, - unsigned int mlen, unsigned int rlen); - -struct lnet_msg *lnet_create_reply_msg(struct lnet_ni *ni, - struct lnet_msg *get_msg); -void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, - unsigned int len); - -void lnet_finalize(struct lnet_ni *ni, struct lnet_msg *msg, int rc); - -void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, - unsigned int nob); 
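
The portal attribute helpers above (lnet_ptl_is_lazy(), lnet_ptl_setopt() and friends) are plain bit tests and sets with no locking of their own. A minimal sketch of how a caller might mark a portal lazy, assuming (as the lnet_ptl_lock()/lnet_ptl_unlock() macros earlier suggest) that ptl_lock serializes option changes:

static void
mark_portal_lazy(struct lnet_portal *ptl)
{
	lnet_ptl_lock(ptl);
	if (!lnet_ptl_is_lazy(ptl))
		lnet_ptl_setopt(ptl, LNET_PTL_LAZY);
	lnet_ptl_unlock(ptl);
}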
-void lnet_drop_delayed_msg_list(struct list_head *head, char *reason); -void lnet_recv_delayed_msg_list(struct list_head *head); - -int lnet_msg_container_setup(struct lnet_msg_container *container, int cpt); -void lnet_msg_container_cleanup(struct lnet_msg_container *container); -void lnet_msg_containers_destroy(void); -int lnet_msg_containers_create(void); - -char *lnet_msgtyp2str(int type); -void lnet_print_hdr(struct lnet_hdr *hdr); -int lnet_fail_nid(lnet_nid_t nid, unsigned int threshold); - -/** \addtogroup lnet_fault_simulation @{ */ - -int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); -int lnet_fault_init(void); -void lnet_fault_fini(void); - -bool lnet_drop_rule_match(struct lnet_hdr *hdr); - -int lnet_delay_rule_add(struct lnet_fault_attr *attr); -int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); -int lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, - struct lnet_fault_stat *stat); -void lnet_delay_rule_reset(void); -void lnet_delay_rule_check(void); -bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg); - -/** @} lnet_fault_simulation */ - -void lnet_counters_get(struct lnet_counters *counters); -void lnet_counters_reset(void); - -unsigned int lnet_iov_nob(unsigned int niov, struct kvec *iov); -int lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, const struct kvec *src, - unsigned int offset, unsigned int len); - -unsigned int lnet_kiov_nob(unsigned int niov, struct bio_vec *iov); -int lnet_extract_kiov(int dst_niov, struct bio_vec *dst, - int src_niov, const struct bio_vec *src, - unsigned int offset, unsigned int len); - -void lnet_copy_iov2iter(struct iov_iter *to, - unsigned int nsiov, const struct kvec *siov, - unsigned int soffset, unsigned int nob); -void lnet_copy_kiov2iter(struct iov_iter *to, - unsigned int nkiov, const struct bio_vec *kiov, - unsigned int kiovoffset, unsigned int nob); - -void lnet_me_unlink(struct lnet_me *me); - -void lnet_md_unlink(struct lnet_libmd *md); -void lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd); - -void lnet_register_lnd(struct lnet_lnd *lnd); -void lnet_unregister_lnd(struct lnet_lnd *lnd); - -int lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, - __u32 local_ip, __u32 peer_ip, int peer_port); -void lnet_connect_console_error(int rc, lnet_nid_t peer_nid, - __u32 peer_ip, int port); -int lnet_count_acceptor_nis(void); -int lnet_acceptor_timeout(void); -int lnet_acceptor_port(void); - -int lnet_count_acceptor_nis(void); -int lnet_acceptor_port(void); - -int lnet_acceptor_start(void); -void lnet_acceptor_stop(void); - -int lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask); -int lnet_ipif_enumerate(char ***names); -void lnet_ipif_free_enumeration(char **names, int n); -int lnet_sock_setbuf(struct socket *socket, int txbufsize, int rxbufsize); -int lnet_sock_getbuf(struct socket *socket, int *txbufsize, int *rxbufsize); -int lnet_sock_getaddr(struct socket *socket, bool remote, __u32 *ip, int *port); -int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout); -int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout); - -int lnet_sock_listen(struct socket **sockp, __u32 ip, int port, int backlog); -int lnet_sock_accept(struct socket **newsockp, struct socket *sock); -int lnet_sock_connect(struct socket **sockp, int *fatal, - __u32 local_ip, int local_port, - __u32 peer_ip, int peer_port); -void libcfs_sock_release(struct socket *sock); - -int lnet_peers_start_down(void); -int 
lnet_peer_buffer_credits(struct lnet_ni *ni); - -int lnet_router_checker_start(void); -void lnet_router_checker_stop(void); -void lnet_router_ni_update_locked(struct lnet_peer *gw, __u32 net); -void lnet_swap_pinginfo(struct lnet_ping_info *info); - -int lnet_parse_ip2nets(char **networksp, char *ip2nets); -int lnet_parse_routes(char *route_str, int *im_a_router); -int lnet_parse_networks(struct list_head *nilist, char *networks); -int lnet_net_unique(__u32 net, struct list_head *nilist); - -int lnet_nid2peer_locked(struct lnet_peer **lpp, lnet_nid_t nid, int cpt); -struct lnet_peer *lnet_find_peer_locked(struct lnet_peer_table *ptable, - lnet_nid_t nid); -void lnet_peer_tables_cleanup(struct lnet_ni *ni); -void lnet_peer_tables_destroy(void); -int lnet_peer_tables_create(void); -void lnet_debug_peer(lnet_nid_t nid); -int lnet_get_peer_info(__u32 peer_index, __u64 *nid, - char alivness[LNET_MAX_STR_LEN], - __u32 *cpt_iter, __u32 *refcount, - __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, - __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, - __u32 *peer_tx_qnob); - -static inline void -lnet_peer_set_alive(struct lnet_peer *lp) -{ - lp->lp_last_query = jiffies; - lp->lp_last_alive = jiffies; - if (!lp->lp_alive) - lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); -} - -#endif diff --git a/drivers/staging/lustre/include/linux/lnet/lib-types.h b/drivers/staging/lustre/include/linux/lnet/lib-types.h deleted file mode 100644 index cfe8ee424e944426dd672f72fa033be1e3006c4d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/lnet/lib-types.h +++ /dev/null @@ -1,666 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. 
- * - * lnet/include/lnet/lib-types.h - */ - -#ifndef __LNET_LIB_TYPES_H__ -#define __LNET_LIB_TYPES_H__ - -#include -#include -#include -#include - -#include -#include - -/* Max payload size */ -#define LNET_MAX_PAYLOAD CONFIG_LNET_MAX_PAYLOAD -#if (LNET_MAX_PAYLOAD < LNET_MTU) -# error "LNET_MAX_PAYLOAD too small - error in configure --with-max-payload-mb" -#elif (LNET_MAX_PAYLOAD > (PAGE_SIZE * LNET_MAX_IOV)) -# error "LNET_MAX_PAYLOAD too large - error in configure --with-max-payload-mb" -#endif - -/* forward refs */ -struct lnet_libmd; - -struct lnet_msg { - struct list_head msg_activelist; - struct list_head msg_list; /* Q for credits/MD */ - - struct lnet_process_id msg_target; - /* where is it from, it's only for building event */ - lnet_nid_t msg_from; - __u32 msg_type; - - /* committed for sending */ - unsigned int msg_tx_committed:1; - /* CPT # this message committed for sending */ - unsigned int msg_tx_cpt:15; - /* committed for receiving */ - unsigned int msg_rx_committed:1; - /* CPT # this message committed for receiving */ - unsigned int msg_rx_cpt:15; - /* queued for tx credit */ - unsigned int msg_tx_delayed:1; - /* queued for RX buffer */ - unsigned int msg_rx_delayed:1; - /* ready for pending on RX delay list */ - unsigned int msg_rx_ready_delay:1; - - unsigned int msg_vmflush:1; /* VM trying to free memory */ - unsigned int msg_target_is_router:1; /* sending to a router */ - unsigned int msg_routing:1; /* being forwarded */ - unsigned int msg_ack:1; /* ack on finalize (PUT) */ - unsigned int msg_sending:1; /* outgoing message */ - unsigned int msg_receiving:1; /* being received */ - unsigned int msg_txcredit:1; /* taken an NI send credit */ - unsigned int msg_peertxcredit:1; /* taken a peer send credit */ - unsigned int msg_rtrcredit:1; /* taken a global router credit */ - unsigned int msg_peerrtrcredit:1; /* taken a peer router credit */ - unsigned int msg_onactivelist:1; /* on the activelist */ - unsigned int msg_rdma_get:1; - - struct lnet_peer *msg_txpeer; /* peer I'm sending to */ - struct lnet_peer *msg_rxpeer; /* peer I received from */ - - void *msg_private; - struct lnet_libmd *msg_md; - - unsigned int msg_len; - unsigned int msg_wanted; - unsigned int msg_offset; - unsigned int msg_niov; - struct kvec *msg_iov; - struct bio_vec *msg_kiov; - - struct lnet_event msg_ev; - struct lnet_hdr msg_hdr; -}; - -struct lnet_libhandle { - struct list_head lh_hash_chain; - __u64 lh_cookie; -}; - -#define lh_entry(ptr, type, member) \ - ((type *)((char *)(ptr) - (char *)(&((type *)0)->member))) - -struct lnet_eq { - struct list_head eq_list; - struct lnet_libhandle eq_lh; - unsigned long eq_enq_seq; - unsigned long eq_deq_seq; - unsigned int eq_size; - lnet_eq_handler_t eq_callback; - struct lnet_event *eq_events; - int **eq_refs; /* percpt refcount for EQ */ -}; - -struct lnet_me { - struct list_head me_list; - struct lnet_libhandle me_lh; - struct lnet_process_id me_match_id; - unsigned int me_portal; - unsigned int me_pos; /* hash offset in mt_hash */ - __u64 me_match_bits; - __u64 me_ignore_bits; - enum lnet_unlink me_unlink; - struct lnet_libmd *me_md; -}; - -struct lnet_libmd { - struct list_head md_list; - struct lnet_libhandle md_lh; - struct lnet_me *md_me; - char *md_start; - unsigned int md_offset; - unsigned int md_length; - unsigned int md_max_size; - int md_threshold; - int md_refcount; - unsigned int md_options; - unsigned int md_flags; - void *md_user_ptr; - struct lnet_eq *md_eq; - unsigned int md_niov; /* # frags */ - union { - struct kvec 
iov[LNET_MAX_IOV];
-		struct bio_vec kiov[LNET_MAX_IOV];
-	} md_iov;
-};
-
-#define LNET_MD_FLAG_ZOMBIE		BIT(0)
-#define LNET_MD_FLAG_AUTO_UNLINK	BIT(1)
-#define LNET_MD_FLAG_ABORTED		BIT(2)
-
-struct lnet_test_peer {
-	/* info about peers we are trying to fail */
-	struct list_head tp_list;	/* ln_test_peers */
-	lnet_nid_t tp_nid;		/* matching nid */
-	unsigned int tp_threshold;	/* # failures to simulate */
-};
-
-#define LNET_COOKIE_TYPE_MD	1
-#define LNET_COOKIE_TYPE_ME	2
-#define LNET_COOKIE_TYPE_EQ	3
-#define LNET_COOKIE_TYPE_BITS	2
-#define LNET_COOKIE_MASK	((1ULL << LNET_COOKIE_TYPE_BITS) - 1ULL)
-
-struct lnet_ni;	/* forward ref */
-
-struct lnet_lnd {
-	/* fields managed by portals */
-	struct list_head lnd_list;	/* stash in the LND table */
-	int lnd_refcount;		/* # active instances */
-
-	/* fields initialised by the LND */
-	__u32 lnd_type;
-
-	int (*lnd_startup)(struct lnet_ni *ni);
-	void (*lnd_shutdown)(struct lnet_ni *ni);
-	int (*lnd_ctl)(struct lnet_ni *ni, unsigned int cmd, void *arg);
-
-	/*
-	 * In data movement APIs below, payload buffers are described as a set
-	 * of 'niov' fragments which are...
-	 * EITHER
-	 *	in virtual memory (struct iovec *iov != NULL)
-	 * OR
-	 *	in pages (kernel only: struct bio_vec *kiov != NULL).
-	 * The LND may NOT overwrite these fragment descriptors.
-	 * An 'offset' may specify a byte offset within the set of
-	 * fragments to start from.
-	 */
-
-	/*
-	 * Start sending a preformatted message. 'private' is NULL for PUT and
-	 * GET messages; otherwise this is a response to an incoming message
-	 * and 'private' is the 'private' passed to lnet_parse(). Return
-	 * non-zero for immediate failure, otherwise complete later with
-	 * lnet_finalize()
-	 */
-	int (*lnd_send)(struct lnet_ni *ni, void *private,
-			struct lnet_msg *msg);
-
-	/*
-	 * Start receiving 'mlen' bytes of payload data, skipping the following
-	 * 'rlen' - 'mlen' bytes. 'private' is the 'private' passed to
-	 * lnet_parse(). Return non-zero for immediate failure, otherwise
-	 * complete later with lnet_finalize(). This also gives back a receive
-	 * credit if the LND does flow control.
-	 */
-	int (*lnd_recv)(struct lnet_ni *ni, void *private, struct lnet_msg *msg,
-			int delayed, struct iov_iter *to, unsigned int rlen);
-
-	/*
-	 * lnet_parse() has had to delay processing of this message
-	 * (e.g. waiting for a forwarding buffer or send credits). Give the
-	 * LND a chance to free urgently needed resources. If called, return 0
-	 * for success and do NOT give back a receive credit; that has to wait
-	 * until lnd_recv() gets called. On failure return < 0 and
-	 * release resources; lnd_recv() will not be called. 
- */ - int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, - struct lnet_msg *msg, void **new_privatep); - - /* notification of peer health */ - void (*lnd_notify)(struct lnet_ni *ni, lnet_nid_t peer, int alive); - - /* query of peer aliveness */ - void (*lnd_query)(struct lnet_ni *ni, lnet_nid_t peer, - unsigned long *when); - - /* accept a new connection */ - int (*lnd_accept)(struct lnet_ni *ni, struct socket *sock); -}; - -struct lnet_tx_queue { - int tq_credits; /* # tx credits free */ - int tq_credits_min; /* lowest it's been */ - int tq_credits_max; /* total # tx credits */ - struct list_head tq_delayed; /* delayed TXs */ -}; - -struct lnet_ni { - spinlock_t ni_lock; - struct list_head ni_list; /* chain on ln_nis */ - struct list_head ni_cptlist; /* chain on ln_nis_cpt */ - int ni_maxtxcredits; /* # tx credits */ - /* # per-peer send credits */ - int ni_peertxcredits; - /* # per-peer router buffer credits */ - int ni_peerrtrcredits; - /* seconds to consider peer dead */ - int ni_peertimeout; - int ni_ncpts; /* number of CPTs */ - __u32 *ni_cpts; /* bond NI on some CPTs */ - lnet_nid_t ni_nid; /* interface's NID */ - void *ni_data; /* instance-specific data */ - struct lnet_lnd *ni_lnd; /* procedural interface */ - struct lnet_tx_queue **ni_tx_queues; /* percpt TX queues */ - int **ni_refs; /* percpt reference count */ - time64_t ni_last_alive;/* when I was last alive */ - struct lnet_ni_status *ni_status; /* my health status */ - /* per NI LND tunables */ - struct lnet_ioctl_config_lnd_tunables *ni_lnd_tunables; - /* equivalent interfaces to use */ - char *ni_interfaces[LNET_MAX_INTERFACES]; - /* original net namespace */ - struct net *ni_net_ns; -}; - -#define LNET_PROTO_PING_MATCHBITS 0x8000000000000000LL - -/* - * NB: value of these features equal to LNET_PROTO_PING_VERSION_x - * of old LNet, so there shouldn't be any compatibility issue - */ -#define LNET_PING_FEAT_INVAL (0) /* no feature */ -#define LNET_PING_FEAT_BASE BIT(0) /* just a ping */ -#define LNET_PING_FEAT_NI_STATUS BIT(1) /* return NI status */ -#define LNET_PING_FEAT_RTE_DISABLED BIT(2) /* Routing enabled */ - -#define LNET_PING_FEAT_MASK (LNET_PING_FEAT_BASE | \ - LNET_PING_FEAT_NI_STATUS) - -/* router checker data, per router */ -#define LNET_MAX_RTR_NIS 16 -#define LNET_PINGINFO_SIZE offsetof(struct lnet_ping_info, pi_ni[LNET_MAX_RTR_NIS]) -struct lnet_rc_data { - /* chain on the_lnet.ln_zombie_rcd or ln_deathrow_rcd */ - struct list_head rcd_list; - struct lnet_handle_md rcd_mdh; /* ping buffer MD */ - struct lnet_peer *rcd_gateway; /* reference to gateway */ - struct lnet_ping_info *rcd_pinginfo; /* ping buffer */ -}; - -struct lnet_peer { - struct list_head lp_hashlist; /* chain on peer hash */ - struct list_head lp_txq; /* messages blocking for - * tx credits - */ - struct list_head lp_rtrq; /* messages blocking for - * router credits - */ - struct list_head lp_rtr_list; /* chain on router list */ - int lp_txcredits; /* # tx credits available */ - int lp_mintxcredits; /* low water mark */ - int lp_rtrcredits; /* # router credits */ - int lp_minrtrcredits; /* low water mark */ - unsigned int lp_alive:1; /* alive/dead? */ - unsigned int lp_notify:1; /* notification outstanding? */ - unsigned int lp_notifylnd:1;/* outstanding notification - * for LND? 
-					 */
-	unsigned int lp_notifying:1;	/* some thread is handling
-					 * notification
-					 */
-	unsigned int lp_ping_notsent;	/* SEND event outstanding
-					 * from ping
-					 */
-	int lp_alive_count;		/* # times router went
-					 * dead<->alive
-					 */
-	long lp_txqnob;			/* bytes queued for sending
-					 */
-	unsigned long lp_timestamp;	/* time of last aliveness
-					 * news
-					 */
-	unsigned long lp_ping_timestamp;/* time of last ping
-					 * attempt
-					 */
-	unsigned long lp_ping_deadline; /* != 0 if ping reply
-					 * expected
-					 */
-	unsigned long lp_last_alive;	/* when I was last alive */
-	unsigned long lp_last_query;	/* when lp_ni was queried
-					 * last time
-					 */
-	struct lnet_ni *lp_ni;		/* interface peer is on */
-	lnet_nid_t lp_nid;		/* peer's NID */
-	int lp_refcount;		/* # refs */
-	int lp_cpt;			/* CPT this peer attached on */
-	/* # refs from lnet_route::lr_gateway */
-	int lp_rtr_refcount;
-	/* returned RC ping features */
-	unsigned int lp_ping_feats;
-	struct list_head lp_routes;	/* routers on this peer */
-	struct lnet_rc_data *lp_rcd;	/* router checker state */
-};
-
-/* peer hash size */
-#define LNET_PEER_HASH_BITS	9
-#define LNET_PEER_HASH_SIZE	(1 << LNET_PEER_HASH_BITS)
-
-/* peer hash table */
-struct lnet_peer_table {
-	int pt_version;		/* /proc validity stamp */
-	int pt_number;		/* # peers extant */
-	/* # zombies to go to deathrow (and not there yet) */
-	int pt_zombies;
-	struct list_head pt_deathrow;	/* zombie peers */
-	struct list_head *pt_hash;	/* NID->peer hash */
-};
-
-/*
- * peer aliveness is enabled only on routers for peers in a network where the
- * lnet_ni::ni_peertimeout has been set to a positive value
- */
-#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing && \
-					 (lp)->lp_ni->ni_peertimeout > 0)
-
-struct lnet_route {
-	struct list_head lr_list;	/* chain on net */
-	struct list_head lr_gwlist;	/* chain on gateway */
-	struct lnet_peer *lr_gateway;	/* router node */
-	__u32 lr_net;			/* remote network number */
-	int lr_seq;			/* sequence for round-robin */
-	unsigned int lr_downis;		/* number of down NIs */
-	__u32 lr_hops;			/* how far I am */
-	unsigned int lr_priority;	/* route priority */
-};
-
-#define LNET_REMOTE_NETS_HASH_DEFAULT	(1U << 7)
-#define LNET_REMOTE_NETS_HASH_MAX	(1U << 16)
-#define LNET_REMOTE_NETS_HASH_SIZE	(1 << the_lnet.ln_remote_nets_hbits)
-
-struct lnet_remotenet {
-	struct list_head lrn_list;	/* chain on
-					 * ln_remote_nets_hash
-					 */
-	struct list_head lrn_routes;	/* routes to me */
-	__u32 lrn_net;			/* my net number */
-};
-
-/** lnet message has credit and can be submitted to lnd for send/receive */
-#define LNET_CREDIT_OK		0
-/** lnet message is waiting for credit */
-#define LNET_CREDIT_WAIT	1
-
-struct lnet_rtrbufpool {
-	struct list_head rbp_bufs;	/* my free buffer pool */
-	struct list_head rbp_msgs;	/* messages blocking
-					 * for a buffer
-					 */
-	int rbp_npages;			/* # pages in each buffer */
-	/* requested number of buffers */
-	int rbp_req_nbuffers;
-	/* # buffers actually allocated */
-	int rbp_nbuffers;
-	int rbp_credits;		/* # free buffers
-					 * blocked messages
-					 */
-	int rbp_mincredits;		/* low water mark */
-};
-
-struct lnet_rtrbuf {
-	struct list_head rb_list;	/* chain on rbp_bufs */
-	struct lnet_rtrbufpool *rb_pool;	/* owning pool */
-	struct bio_vec rb_kiov[0];	/* the buffer space */
-};
-
-#define LNET_PEER_HASHSIZE	503	/* prime!
*/ - -#define LNET_TINY_BUF_IDX 0 -#define LNET_SMALL_BUF_IDX 1 -#define LNET_LARGE_BUF_IDX 2 - -/* # different router buffer pools */ -#define LNET_NRBPOOLS (LNET_LARGE_BUF_IDX + 1) - -enum lnet_match_flags { - /* Didn't match anything */ - LNET_MATCHMD_NONE = BIT(0), - /* Matched OK */ - LNET_MATCHMD_OK = BIT(1), - /* Must be discarded */ - LNET_MATCHMD_DROP = BIT(2), - /* match and buffer is exhausted */ - LNET_MATCHMD_EXHAUSTED = BIT(3), - /* match or drop */ - LNET_MATCHMD_FINISH = (LNET_MATCHMD_OK | LNET_MATCHMD_DROP), -}; - -/* Options for lnet_portal::ptl_options */ -#define LNET_PTL_LAZY BIT(0) -#define LNET_PTL_MATCH_UNIQUE BIT(1) /* unique match, for RDMA */ -#define LNET_PTL_MATCH_WILDCARD BIT(2) /* wildcard match, request portal */ - -/* parameter for matching operations (GET, PUT) */ -struct lnet_match_info { - __u64 mi_mbits; - struct lnet_process_id mi_id; - unsigned int mi_opc; - unsigned int mi_portal; - unsigned int mi_rlength; - unsigned int mi_roffset; -}; - -/* ME hash of RDMA portal */ -#define LNET_MT_HASH_BITS 8 -#define LNET_MT_HASH_SIZE (1 << LNET_MT_HASH_BITS) -#define LNET_MT_HASH_MASK (LNET_MT_HASH_SIZE - 1) -/* - * we allocate (LNET_MT_HASH_SIZE + 1) entries for lnet_match_table::mt_hash, - * the last entry is reserved for MEs with ignore-bits - */ -#define LNET_MT_HASH_IGNORE LNET_MT_HASH_SIZE -/* - * __u64 has 2^6 bits, so need 2^(LNET_MT_HASH_BITS - LNET_MT_BITS_U64) which - * is 4 __u64s as bit-map, and add an extra __u64 (only use one bit) for the - * ME-list with ignore-bits, which is mtable::mt_hash[LNET_MT_HASH_IGNORE] - */ -#define LNET_MT_BITS_U64 6 /* 2^6 bits */ -#define LNET_MT_EXHAUSTED_BITS (LNET_MT_HASH_BITS - LNET_MT_BITS_U64) -#define LNET_MT_EXHAUSTED_BMAP ((1 << LNET_MT_EXHAUSTED_BITS) + 1) - -/* portal match table */ -struct lnet_match_table { - /* reserved for upcoming patches, CPU partition ID */ - unsigned int mt_cpt; - unsigned int mt_portal; /* portal index */ - /* - * match table is set as "enabled" if there's non-exhausted MD - * attached on mt_mhash, it's only valid for wildcard portal - */ - unsigned int mt_enabled; - /* bitmap to flag whether MEs on mt_hash are exhausted or not */ - __u64 mt_exhausted[LNET_MT_EXHAUSTED_BMAP]; - struct list_head *mt_mhash; /* matching hash */ -}; - -/* these are only useful for wildcard portal */ -/* Turn off message rotor for wildcard portals */ -#define LNET_PTL_ROTOR_OFF 0 -/* round-robin dispatch all PUT messages for wildcard portals */ -#define LNET_PTL_ROTOR_ON 1 -/* round-robin dispatch routed PUT message for wildcard portals */ -#define LNET_PTL_ROTOR_RR_RT 2 -/* dispatch routed PUT message by hashing source NID for wildcard portals */ -#define LNET_PTL_ROTOR_HASH_RT 3 - -struct lnet_portal { - spinlock_t ptl_lock; - unsigned int ptl_index; /* portal ID, reserved */ - /* flags on this portal: lazy, unique... 
*/ - unsigned int ptl_options; - /* list of messages which are stealing buffer */ - struct list_head ptl_msg_stealing; - /* messages blocking for MD */ - struct list_head ptl_msg_delayed; - /* Match table for each CPT */ - struct lnet_match_table **ptl_mtables; - /* spread rotor of incoming "PUT" */ - unsigned int ptl_rotor; - /* # active entries for this portal */ - int ptl_mt_nmaps; - /* array of active entries' cpu-partition-id */ - int ptl_mt_maps[0]; -}; - -#define LNET_LH_HASH_BITS 12 -#define LNET_LH_HASH_SIZE (1ULL << LNET_LH_HASH_BITS) -#define LNET_LH_HASH_MASK (LNET_LH_HASH_SIZE - 1) - -/* resource container (ME, MD, EQ) */ -struct lnet_res_container { - unsigned int rec_type; /* container type */ - __u64 rec_lh_cookie; /* cookie generator */ - struct list_head rec_active; /* active resource list */ - struct list_head *rec_lh_hash; /* handle hash */ -}; - -/* message container */ -struct lnet_msg_container { - int msc_init; /* initialized or not */ - /* max # threads finalizing */ - int msc_nfinalizers; - /* msgs waiting to complete finalizing */ - struct list_head msc_finalizing; - struct list_head msc_active; /* active message list */ - /* threads doing finalization */ - void **msc_finalizers; -}; - -/* Router Checker states */ -#define LNET_RC_STATE_SHUTDOWN 0 /* not started */ -#define LNET_RC_STATE_RUNNING 1 /* started up OK */ -#define LNET_RC_STATE_STOPPING 2 /* telling thread to stop */ - -struct lnet { - /* CPU partition table of LNet */ - struct cfs_cpt_table *ln_cpt_table; - /* number of CPTs in ln_cpt_table */ - unsigned int ln_cpt_number; - unsigned int ln_cpt_bits; - - /* protect LNet resources (ME/MD/EQ) */ - struct cfs_percpt_lock *ln_res_lock; - /* # portals */ - int ln_nportals; - /* the vector of portals */ - struct lnet_portal **ln_portals; - /* percpt ME containers */ - struct lnet_res_container **ln_me_containers; - /* percpt MD container */ - struct lnet_res_container **ln_md_containers; - - /* Event Queue container */ - struct lnet_res_container ln_eq_container; - wait_queue_head_t ln_eq_waitq; - spinlock_t ln_eq_wait_lock; - unsigned int ln_remote_nets_hbits; - - /* protect NI, peer table, credits, routers, rtrbuf... 
*/ - struct cfs_percpt_lock *ln_net_lock; - /* percpt message containers for active/finalizing/freed message */ - struct lnet_msg_container **ln_msg_containers; - struct lnet_counters **ln_counters; - struct lnet_peer_table **ln_peer_tables; - /* failure simulation */ - struct list_head ln_test_peers; - struct list_head ln_drop_rules; - struct list_head ln_delay_rules; - - struct list_head ln_nis; /* LND instances */ - /* NIs bond on specific CPT(s) */ - struct list_head ln_nis_cpt; - /* dying LND instances */ - struct list_head ln_nis_zombie; - struct lnet_ni *ln_loni; /* the loopback NI */ - - /* remote networks with routes to them */ - struct list_head *ln_remote_nets_hash; - /* validity stamp */ - __u64 ln_remote_nets_version; - /* list of all known routers */ - struct list_head ln_routers; - /* validity stamp */ - __u64 ln_routers_version; - /* percpt router buffer pools */ - struct lnet_rtrbufpool **ln_rtrpools; - - struct lnet_handle_md ln_ping_target_md; - struct lnet_handle_eq ln_ping_target_eq; - struct lnet_ping_info *ln_ping_info; - - /* router checker startup/shutdown state */ - int ln_rc_state; - /* router checker's event queue */ - struct lnet_handle_eq ln_rc_eqh; - /* rcd still pending on net */ - struct list_head ln_rcd_deathrow; - /* rcd ready for free */ - struct list_head ln_rcd_zombie; - /* serialise startup/shutdown */ - struct completion ln_rc_signal; - - struct mutex ln_api_mutex; - struct mutex ln_lnd_mutex; - struct mutex ln_delay_mutex; - /* Have I called LNetNIInit myself? */ - int ln_niinit_self; - /* LNetNIInit/LNetNIFini counter */ - int ln_refcount; - /* shutdown in progress */ - int ln_shutdown; - - int ln_routing; /* am I a router? */ - lnet_pid_t ln_pid; /* requested pid */ - /* uniquely identifies this ni in this epoch */ - __u64 ln_interface_cookie; - /* registered LNDs */ - struct list_head ln_lnds; - - /* test protocol compatibility flags */ - int ln_testprotocompat; - - /* - * 0 - load the NIs from the mod params - * 1 - do not load the NIs from the mod params - * Reverse logic to ensure that other calls to LNetNIInit - * need no change - */ - bool ln_nis_from_mod_params; - - /* - * waitq for router checker. As long as there are no routes in - * the list, the router checker will sleep on this queue. when - * routes are added the thread will wake up - */ - wait_queue_head_t ln_rc_waitq; - -}; - -#endif diff --git a/drivers/staging/lustre/include/linux/lnet/socklnd.h b/drivers/staging/lustre/include/linux/lnet/socklnd.h deleted file mode 100644 index 6bd1bca190a378193de56a8f3d12790b3f0ecd55..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/linux/lnet/socklnd.h +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
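/*
 * Illustrative sketch, not from the tree: how the ln_api_mutex and
 * ln_refcount fields of struct lnet above were typically used.
 * "the_lnet" is the global instance (it is also referenced by the
 * lnet_peer_aliveness_enabled() macro earlier in this header); actually
 * bringing the stack up is elided here.
 */
#include <linux/mutex.h>
#include <linux/errno.h>

extern struct lnet the_lnet;

static int example_lnet_addref(void)
{
	int rc = 0;

	mutex_lock(&the_lnet.ln_api_mutex);
	if (the_lnet.ln_refcount > 0)
		the_lnet.ln_refcount++;	/* already initialized: take a ref */
	else
		rc = -ENETDOWN;		/* real code would start LNet here */
	mutex_unlock(&the_lnet.ln_api_mutex);

	return rc;
}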
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012 - 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. - * - * lnet/include/lnet/socklnd.h - */ -#ifndef __LNET_LNET_SOCKLND_H__ -#define __LNET_LNET_SOCKLND_H__ - -#include -#include - -struct ksock_hello_msg { - __u32 kshm_magic; /* magic number of socklnd message */ - __u32 kshm_version; /* version of socklnd message */ - lnet_nid_t kshm_src_nid; /* sender's nid */ - lnet_nid_t kshm_dst_nid; /* destination nid */ - lnet_pid_t kshm_src_pid; /* sender's pid */ - lnet_pid_t kshm_dst_pid; /* destination pid */ - __u64 kshm_src_incarnation; /* sender's incarnation */ - __u64 kshm_dst_incarnation; /* destination's incarnation */ - __u32 kshm_ctype; /* connection type */ - __u32 kshm_nips; /* # IP addrs */ - __u32 kshm_ips[0]; /* IP addrs */ -} WIRE_ATTR; - -struct ksock_lnet_msg { - struct lnet_hdr ksnm_hdr; /* lnet hdr */ - - /* - * ksnm_payload is removed because of winnt compiler's limitation: - * zero-sized array can only be placed at the tail of [nested] - * structure definitions. lnet payload will be stored just after - * the body of structure ksock_lnet_msg_t - */ -} WIRE_ATTR; - -struct ksock_msg { - __u32 ksm_type; /* type of socklnd message */ - __u32 ksm_csum; /* checksum if != 0 */ - __u64 ksm_zc_cookies[2]; /* Zero-Copy request/ACK cookie */ - union { - struct ksock_lnet_msg lnetmsg; /* lnet message, it's empty if - * it's NOOP - */ - } WIRE_ATTR ksm_u; -} WIRE_ATTR; - -#define KSOCK_MSG_NOOP 0xC0 /* ksm_u empty */ -#define KSOCK_MSG_LNET 0xC1 /* lnet msg */ - -/* - * We need to know this number to parse hello msg from ksocklnd in - * other LND (usocklnd, for example) - */ -#define KSOCK_PROTO_V2 2 -#define KSOCK_PROTO_V3 3 - -#endif diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_debug.h b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_debug.h deleted file mode 100644 index c4d9472b374f7ee93de8cbe39ba44dad6221547a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_debug.h +++ /dev/null @@ -1,149 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2014, Intel Corporation. 
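/*
 * Illustrative sketch, not part of socklnd.h: kshm_ips[] is a
 * zero-length array, so the wire size of a hello message depends on
 * kshm_nips.  The magic/version values below are plausible
 * placeholders, and a real LND managed its own buffers rather than
 * calling kzalloc().
 */
#include <linux/slab.h>

static struct ksock_hello_msg *example_alloc_hello(__u32 nips)
{
	size_t nob = offsetof(struct ksock_hello_msg, kshm_ips[nips]);
	struct ksock_hello_msg *hello = kzalloc(nob, GFP_KERNEL);

	if (hello) {
		hello->kshm_magic = LNET_PROTO_MAGIC;	/* assumption */
		hello->kshm_version = KSOCK_PROTO_V3;
		hello->kshm_nips = nips;
	}
	return hello;
}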
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_debug.h - * - * Debug messages and assertions - * - */ - -#ifndef __UAPI_LIBCFS_DEBUG_H__ -#define __UAPI_LIBCFS_DEBUG_H__ - -/** - * Format for debug message headers - */ -struct ptldebug_header { - __u32 ph_len; - __u32 ph_flags; - __u32 ph_subsys; - __u32 ph_mask; - __u16 ph_cpu_id; - __u16 ph_type; - /* time_t overflow in 2106 */ - __u32 ph_sec; - __u64 ph_usec; - __u32 ph_stack; - __u32 ph_pid; - __u32 ph_extern_pid; - __u32 ph_line_num; -} __attribute__((packed)); - -#define PH_FLAG_FIRST_RECORD 1 - -/* Debugging subsystems (32 bits, non-overlapping) */ -#define S_UNDEFINED 0x00000001 -#define S_MDC 0x00000002 -#define S_MDS 0x00000004 -#define S_OSC 0x00000008 -#define S_OST 0x00000010 -#define S_CLASS 0x00000020 -#define S_LOG 0x00000040 -#define S_LLITE 0x00000080 -#define S_RPC 0x00000100 -#define S_MGMT 0x00000200 -#define S_LNET 0x00000400 -#define S_LND 0x00000800 /* ALL LNDs */ -#define S_PINGER 0x00001000 -#define S_FILTER 0x00002000 -#define S_LIBCFS 0x00004000 -#define S_ECHO 0x00008000 -#define S_LDLM 0x00010000 -#define S_LOV 0x00020000 -#define S_LQUOTA 0x00040000 -#define S_OSD 0x00080000 -#define S_LFSCK 0x00100000 -#define S_SNAPSHOT 0x00200000 -/* unused */ -#define S_LMV 0x00800000 /* b_new_cmd */ -/* unused */ -#define S_SEC 0x02000000 /* upcall cache */ -#define S_GSS 0x04000000 /* b_new_cmd */ -/* unused */ -#define S_MGC 0x10000000 -#define S_MGS 0x20000000 -#define S_FID 0x40000000 /* b_new_cmd */ -#define S_FLD 0x80000000 /* b_new_cmd */ - -#define LIBCFS_DEBUG_SUBSYS_NAMES { \ - "undefined", "mdc", "mds", "osc", "ost", "class", "log", \ - "llite", "rpc", "mgmt", "lnet", "lnd", "pinger", "filter", \ - "libcfs", "echo", "ldlm", "lov", "lquota", "osd", "lfsck", \ - "snapshot", "", "lmv", "", "sec", "gss", "", "mgc", "mgs", \ - "fid", "fld", NULL } - -/* Debugging masks (32 bits, non-overlapping) */ -#define D_TRACE 0x00000001 /* ENTRY/EXIT markers */ -#define D_INODE 0x00000002 -#define D_SUPER 0x00000004 -#define D_EXT2 0x00000008 /* anything from ext2_debug */ -#define D_MALLOC 0x00000010 /* print malloc, free information */ -#define D_CACHE 0x00000020 /* cache-related items */ -#define D_INFO 0x00000040 /* general information */ -#define D_IOCTL 0x00000080 /* ioctl related information */ -#define D_NETERROR 0x00000100 /* network errors */ -#define D_NET 0x00000200 /* network communications */ -#define D_WARNING 0x00000400 /* CWARN(...) == CDEBUG (D_WARNING, ...) */ -#define D_BUFFS 0x00000800 -#define D_OTHER 0x00001000 -#define D_DENTRY 0x00002000 -#define D_NETTRACE 0x00004000 -#define D_PAGE 0x00008000 /* bulk page handling */ -#define D_DLMTRACE 0x00010000 -#define D_ERROR 0x00020000 /* CERROR(...) == CDEBUG (D_ERROR, ...) */ -#define D_EMERG 0x00040000 /* CEMERG(...) == CDEBUG (D_EMERG, ...) 
*/ -#define D_HA 0x00080000 /* recovery and failover */ -#define D_RPCTRACE 0x00100000 /* for distributed debugging */ -#define D_VFSTRACE 0x00200000 -#define D_READA 0x00400000 /* read-ahead */ -#define D_MMAP 0x00800000 -#define D_CONFIG 0x01000000 -#define D_CONSOLE 0x02000000 -#define D_QUOTA 0x04000000 -#define D_SEC 0x08000000 -#define D_LFSCK 0x10000000 /* For both OI scrub and LFSCK */ -#define D_HSM 0x20000000 -#define D_SNAPSHOT 0x40000000 /* snapshot */ -#define D_LAYOUT 0x80000000 - -#define LIBCFS_DEBUG_MASKS_NAMES { \ - "trace", "inode", "super", "ext2", "malloc", "cache", "info", \ - "ioctl", "neterror", "net", "warning", "buffs", "other", \ - "dentry", "nettrace", "page", "dlmtrace", "error", "emerg", \ - "ha", "rpctrace", "vfstrace", "reada", "mmap", "config", \ - "console", "quota", "sec", "lfsck", "hsm", "snapshot", "layout",\ - NULL } - -#define D_CANTMASK (D_ERROR | D_EMERG | D_WARNING | D_CONSOLE) - -#define LIBCFS_DEBUG_FILE_PATH_DEFAULT "/tmp/lustre-log" - -#endif /* __UAPI_LIBCFS_DEBUG_H__ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h b/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h deleted file mode 100644 index cce6b58e3682ed77ab1b96cdade56362573e8ecc..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/libcfs_ioctl.h +++ /dev/null @@ -1,141 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/include/libcfs/libcfs_ioctl.h - * - * Low-level ioctl data structures. Kernel ioctl functions declared here, - * and user space functions are in libcfs/util/ioctl.h. 
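/*
 * Illustrative example, not from the file: the D_* values are disjoint
 * bits, so masks are combined and tested with plain bitwise ops, and
 * D_CANTMASK keeps errors, emergencies, warnings and console messages
 * visible no matter what mask the administrator sets.
 */
static inline int example_mask_allows(__u32 debug_mask, __u32 msg_bits)
{
	return (msg_bits & (debug_mask | D_CANTMASK)) != 0;
}

/* e.g. a mask that adds network tracing on top of the unmaskable bits */
#define EXAMPLE_DEBUG_MASK	(D_NET | D_NETERROR | D_NETTRACE)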
- * - */ - -#ifndef __LIBCFS_IOCTL_H__ -#define __LIBCFS_IOCTL_H__ - -#include -#include - -#define LIBCFS_IOCTL_VERSION 0x0001000a -#define LIBCFS_IOCTL_VERSION2 0x0001000b - -struct libcfs_ioctl_hdr { - __u32 ioc_len; - __u32 ioc_version; -}; - -/** max size to copy from userspace */ -#define LIBCFS_IOC_DATA_MAX (128 * 1024) - -struct libcfs_ioctl_data { - struct libcfs_ioctl_hdr ioc_hdr; - - __u64 ioc_nid; - __u64 ioc_u64[1]; - - __u32 ioc_flags; - __u32 ioc_count; - __u32 ioc_net; - __u32 ioc_u32[7]; - - __u32 ioc_inllen1; - char *ioc_inlbuf1; - __u32 ioc_inllen2; - char *ioc_inlbuf2; - - __u32 ioc_plen1; /* buffers in userspace */ - void __user *ioc_pbuf1; - __u32 ioc_plen2; /* buffers in userspace */ - void __user *ioc_pbuf2; - - char ioc_bulk[0]; -}; - -struct libcfs_debug_ioctl_data { - struct libcfs_ioctl_hdr hdr; - unsigned int subs; - unsigned int debug; -}; - -/* 'f' ioctls are defined in lustre_ioctl.h and lustre_user.h except for: */ -#define LIBCFS_IOC_DEBUG_MASK _IOWR('f', 250, long) -#define IOCTL_LIBCFS_TYPE long - -#define IOC_LIBCFS_TYPE ('e') -#define IOC_LIBCFS_MIN_NR 30 -/* libcfs ioctls */ -/* IOC_LIBCFS_PANIC obsolete in 2.8.0, was _IOWR('e', 30, IOCTL_LIBCFS_TYPE) */ -#define IOC_LIBCFS_CLEAR_DEBUG _IOWR('e', 31, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_MARK_DEBUG _IOWR('e', 32, IOCTL_LIBCFS_TYPE) -/* IOC_LIBCFS_MEMHOG obsolete in 2.8.0, was _IOWR('e', 36, IOCTL_LIBCFS_TYPE) */ -/* lnet ioctls */ -#define IOC_LIBCFS_GET_NI _IOWR('e', 50, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_FAIL_NID _IOWR('e', 51, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_NOTIFY_ROUTER _IOWR('e', 55, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_UNCONFIGURE _IOWR('e', 56, IOCTL_LIBCFS_TYPE) -/* IOC_LIBCFS_PORTALS_COMPATIBILITY _IOWR('e', 57, IOCTL_LIBCFS_TYPE) */ -#define IOC_LIBCFS_LNET_DIST _IOWR('e', 58, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_CONFIGURE _IOWR('e', 59, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_TESTPROTOCOMPAT _IOWR('e', 60, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_PING _IOWR('e', 61, IOCTL_LIBCFS_TYPE) -/* IOC_LIBCFS_DEBUG_PEER _IOWR('e', 62, IOCTL_LIBCFS_TYPE) */ -#define IOC_LIBCFS_LNETST _IOWR('e', 63, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_LNET_FAULT _IOWR('e', 64, IOCTL_LIBCFS_TYPE) -/* lnd ioctls */ -#define IOC_LIBCFS_REGISTER_MYNID _IOWR('e', 70, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_CLOSE_CONNECTION _IOWR('e', 71, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_PUSH_CONNECTION _IOWR('e', 72, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_GET_CONN _IOWR('e', 73, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_DEL_PEER _IOWR('e', 74, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_ADD_PEER _IOWR('e', 75, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_GET_PEER _IOWR('e', 76, IOCTL_LIBCFS_TYPE) -/* ioctl 77 is free for use */ -#define IOC_LIBCFS_ADD_INTERFACE _IOWR('e', 78, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_DEL_INTERFACE _IOWR('e', 79, IOCTL_LIBCFS_TYPE) -#define IOC_LIBCFS_GET_INTERFACE _IOWR('e', 80, IOCTL_LIBCFS_TYPE) - -/* - * DLC Specific IOCTL numbers. - * In order to maintain backward compatibility with any possible external - * tools which might be accessing the IOCTL numbers, a new group of IOCTL - * number have been allocated. 
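/*
 * Userspace-style sketch, not part of the header: every libcfs ioctl
 * payload leads with struct libcfs_ioctl_hdr carrying its version and
 * total length.  "/dev/lnet" is LNET_DEV_PATH from lnetctl.h further
 * below in this patch; error handling is minimal.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>

static int example_ping(lnet_nid_t nid)
{
	struct libcfs_ioctl_data data;
	int fd, rc;

	memset(&data, 0, sizeof(data));
	data.ioc_hdr.ioc_version = LIBCFS_IOCTL_VERSION;
	data.ioc_hdr.ioc_len = sizeof(data);
	data.ioc_nid = nid;

	fd = open("/dev/lnet", O_RDWR);
	if (fd < 0)
		return -1;
	rc = ioctl(fd, IOC_LIBCFS_PING, &data);
	close(fd);
	return rc;
}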
- */ -#define IOCTL_CONFIG_SIZE struct lnet_ioctl_config_data -#define IOC_LIBCFS_ADD_ROUTE _IOWR(IOC_LIBCFS_TYPE, 81, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_DEL_ROUTE _IOWR(IOC_LIBCFS_TYPE, 82, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_ROUTE _IOWR(IOC_LIBCFS_TYPE, 83, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_ADD_NET _IOWR(IOC_LIBCFS_TYPE, 84, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_DEL_NET _IOWR(IOC_LIBCFS_TYPE, 85, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_NET _IOWR(IOC_LIBCFS_TYPE, 86, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_CONFIG_RTR _IOWR(IOC_LIBCFS_TYPE, 87, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_ADD_BUF _IOWR(IOC_LIBCFS_TYPE, 88, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_BUF _IOWR(IOC_LIBCFS_TYPE, 89, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_PEER_INFO _IOWR(IOC_LIBCFS_TYPE, 90, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_GET_LNET_STATS _IOWR(IOC_LIBCFS_TYPE, 91, IOCTL_CONFIG_SIZE) -#define IOC_LIBCFS_MAX_NR 91 - -#endif /* __LIBCFS_IOCTL_H__ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h b/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h deleted file mode 100644 index c1619f411d81aaddb60aabf01d0e210e578a5481..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/lnet-dlc.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * LGPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. - * - * LGPL HEADER END - * - */ -/* - * Copyright (c) 2014, Intel Corporation. 
- */ -/* - * Author: Amir Shehata - */ - -#ifndef LNET_DLC_H -#define LNET_DLC_H - -#include -#include - -#define MAX_NUM_SHOW_ENTRIES 32 -#define LNET_MAX_STR_LEN 128 -#define LNET_MAX_SHOW_NUM_CPT 128 -#define LNET_UNDEFINED_HOPS ((__u32)(-1)) - -struct lnet_ioctl_config_lnd_cmn_tunables { - __u32 lct_version; - __u32 lct_peer_timeout; - __u32 lct_peer_tx_credits; - __u32 lct_peer_rtr_credits; - __u32 lct_max_tx_credits; -}; - -struct lnet_ioctl_config_o2iblnd_tunables { - __u32 lnd_version; - __u32 lnd_peercredits_hiw; - __u32 lnd_map_on_demand; - __u32 lnd_concurrent_sends; - __u32 lnd_fmr_pool_size; - __u32 lnd_fmr_flush_trigger; - __u32 lnd_fmr_cache; - __u16 lnd_conns_per_peer; - __u16 pad; -}; - -struct lnet_ioctl_config_lnd_tunables { - struct lnet_ioctl_config_lnd_cmn_tunables lt_cmn; - union { - struct lnet_ioctl_config_o2iblnd_tunables lt_o2ib; - } lt_tun_u; -}; - -struct lnet_ioctl_net_config { - char ni_interfaces[LNET_MAX_INTERFACES][LNET_MAX_STR_LEN]; - __u32 ni_status; - __u32 ni_cpts[LNET_MAX_SHOW_NUM_CPT]; - char cfg_bulk[0]; -}; - -#define LNET_TINY_BUF_IDX 0 -#define LNET_SMALL_BUF_IDX 1 -#define LNET_LARGE_BUF_IDX 2 - -/* # different router buffer pools */ -#define LNET_NRBPOOLS (LNET_LARGE_BUF_IDX + 1) - -struct lnet_ioctl_pool_cfg { - struct { - __u32 pl_npages; - __u32 pl_nbuffers; - __u32 pl_credits; - __u32 pl_mincredits; - } pl_pools[LNET_NRBPOOLS]; - __u32 pl_routing; -}; - -struct lnet_ioctl_config_data { - struct libcfs_ioctl_hdr cfg_hdr; - - __u32 cfg_net; - __u32 cfg_count; - __u64 cfg_nid; - __u32 cfg_ncpts; - - union { - struct { - __u32 rtr_hop; - __u32 rtr_priority; - __u32 rtr_flags; - } cfg_route; - struct { - char net_intf[LNET_MAX_STR_LEN]; - __s32 net_peer_timeout; - __s32 net_peer_tx_credits; - __s32 net_peer_rtr_credits; - __s32 net_max_tx_credits; - __u32 net_cksum_algo; - __u32 net_interface_count; - } cfg_net; - struct { - __u32 buf_enable; - __s32 buf_tiny; - __s32 buf_small; - __s32 buf_large; - } cfg_buffers; - } cfg_config_u; - - char cfg_bulk[0]; -}; - -struct lnet_ioctl_peer { - struct libcfs_ioctl_hdr pr_hdr; - __u32 pr_count; - __u32 pr_pad; - __u64 pr_nid; - - union { - struct { - char cr_aliveness[LNET_MAX_STR_LEN]; - __u32 cr_refcount; - __u32 cr_ni_peer_tx_credits; - __u32 cr_peer_tx_credits; - __u32 cr_peer_rtr_credits; - __u32 cr_peer_min_rtr_credits; - __u32 cr_peer_tx_qnob; - __u32 cr_ncpt; - } pr_peer_credits; - } pr_lnd_u; -}; - -struct lnet_ioctl_lnet_stats { - struct libcfs_ioctl_hdr st_hdr; - struct lnet_counters st_cntrs; -}; - -#endif /* LNET_DLC_H */ diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/lnet-types.h b/drivers/staging/lustre/include/uapi/linux/lnet/lnet-types.h deleted file mode 100644 index 1be9b7aa7326d3d9b28e60a11df892d4a2b049d9..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/lnet-types.h +++ /dev/null @@ -1,669 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
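/*
 * Sketch of the DLC usage, not from the tree: adding a route fills the
 * cfg_route arm of the union above and hands the structure to
 * IOC_LIBCFS_ADD_ROUTE.  Whether LIBCFS_IOCTL_VERSION or the VERSION2
 * constant applies here is an assumption; values are made up.
 */
#include <string.h>

static void example_fill_route(struct lnet_ioctl_config_data *cfg,
			       __u32 remote_net, __u64 gw_nid)
{
	memset(cfg, 0, sizeof(*cfg));
	cfg->cfg_hdr.ioc_version = LIBCFS_IOCTL_VERSION;
	cfg->cfg_hdr.ioc_len = sizeof(*cfg);
	cfg->cfg_net = remote_net;
	cfg->cfg_nid = gw_nid;			/* gateway for that net */
	cfg->cfg_config_u.cfg_route.rtr_hop = 1;
	cfg->cfg_config_u.cfg_route.rtr_priority = 0;
	/* then: ioctl(fd, IOC_LIBCFS_ADD_ROUTE, cfg) */
}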
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012 - 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. - */ - -#ifndef __LNET_TYPES_H__ -#define __LNET_TYPES_H__ - -#include -#include - -/** \addtogroup lnet - * @{ - */ - -#define LNET_VERSION "0.6.0" - -/** \addtogroup lnet_addr - * @{ - */ - -/** Portal reserved for LNet's own use. - * \see lustre/include/lustre/lustre_idl.h for Lustre portal assignments. - */ -#define LNET_RESERVED_PORTAL 0 - -/** - * Address of an end-point in an LNet network. - * - * A node can have multiple end-points and hence multiple addresses. - * An LNet network can be a simple network (e.g. tcp0) or a network of - * LNet networks connected by LNet routers. Therefore an end-point address - * has two parts: network ID, and address within a network. - * - * \see LNET_NIDNET, LNET_NIDADDR, and LNET_MKNID. - */ -typedef __u64 lnet_nid_t; -/** - * ID of a process in a node. Shortened as PID to distinguish from - * lnet_process_id, the global process ID. - */ -typedef __u32 lnet_pid_t; - -/** wildcard NID that matches any end-point address */ -#define LNET_NID_ANY ((lnet_nid_t)(-1)) -/** wildcard PID that matches any lnet_pid_t */ -#define LNET_PID_ANY ((lnet_pid_t)(-1)) - -#define LNET_PID_RESERVED 0xf0000000 /* reserved bits in PID */ -#define LNET_PID_USERFLAG 0x80000000 /* set in userspace peers */ -#define LNET_PID_LUSTRE 12345 - -#define LNET_TIME_FOREVER (-1) - -/* how an LNET NID encodes net:address */ -/** extract the address part of an lnet_nid_t */ - -static inline __u32 LNET_NIDADDR(lnet_nid_t nid) -{ - return nid & 0xffffffff; -} - -static inline __u32 LNET_NIDNET(lnet_nid_t nid) -{ - return (nid >> 32) & 0xffffffff; -} - -static inline lnet_nid_t LNET_MKNID(__u32 net, __u32 addr) -{ - return (((__u64)net) << 32) | addr; -} - -static inline __u32 LNET_NETNUM(__u32 net) -{ - return net & 0xffff; -} - -static inline __u32 LNET_NETTYP(__u32 net) -{ - return (net >> 16) & 0xffff; -} - -static inline __u32 LNET_MKNET(__u32 type, __u32 num) -{ - return (type << 16) | num; -} - -#define WIRE_ATTR __packed - -/* Packed version of lnet_process_id to transfer via network */ -struct lnet_process_id_packed { - /* node id / process id */ - lnet_nid_t nid; - lnet_pid_t pid; -} WIRE_ATTR; - -/* - * The wire handle's interface cookie only matches one network interface in - * one epoch (i.e. new cookie when the interface restarts or the node - * reboots). The object cookie only matches one object on that interface - * during that object's lifetime (i.e. no cookie re-use). - */ -struct lnet_handle_wire { - __u64 wh_interface_cookie; - __u64 wh_object_cookie; -} WIRE_ATTR; - -enum lnet_msg_type { - LNET_MSG_ACK = 0, - LNET_MSG_PUT, - LNET_MSG_GET, - LNET_MSG_REPLY, - LNET_MSG_HELLO, -}; - -/* - * The variant fields of the portals message header are aligned on an 8 - * byte boundary in the message header. Note that all types used in these - * wire structs MUST be fixed size and the smaller types are placed at the - * end. 
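/*
 * Worked example, not part of the header: building the NID
 * "192.168.1.2@tcp0" from the helpers above.  EXAMPLE_SOCKLND_TYPE
 * stands in for the socklnd LND type number defined elsewhere in the
 * tree.
 */
#define EXAMPLE_SOCKLND_TYPE	2	/* assumption for illustration */

static inline lnet_nid_t example_make_tcp_nid(void)
{
	__u32 addr = (192 << 24) | (168 << 16) | (1 << 8) | 2;
	__u32 net = LNET_MKNET(EXAMPLE_SOCKLND_TYPE, 0);	/* "tcp0" */

	/* LNET_NIDNET()/LNET_NIDADDR() recover net and addr again */
	return LNET_MKNID(net, addr);
}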
- */ -struct lnet_ack { - struct lnet_handle_wire dst_wmd; - __u64 match_bits; - __u32 mlength; -} WIRE_ATTR; - -struct lnet_put { - struct lnet_handle_wire ack_wmd; - __u64 match_bits; - __u64 hdr_data; - __u32 ptl_index; - __u32 offset; -} WIRE_ATTR; - -struct lnet_get { - struct lnet_handle_wire return_wmd; - __u64 match_bits; - __u32 ptl_index; - __u32 src_offset; - __u32 sink_length; -} WIRE_ATTR; - -struct lnet_reply { - struct lnet_handle_wire dst_wmd; -} WIRE_ATTR; - -struct lnet_hello { - __u64 incarnation; - __u32 type; -} WIRE_ATTR; - -struct lnet_hdr { - lnet_nid_t dest_nid; - lnet_nid_t src_nid; - lnet_pid_t dest_pid; - lnet_pid_t src_pid; - __u32 type; /* enum lnet_msg_type */ - __u32 payload_length; /* payload data to follow */ - /*<------__u64 aligned------->*/ - union { - struct lnet_ack ack; - struct lnet_put put; - struct lnet_get get; - struct lnet_reply reply; - struct lnet_hello hello; - } msg; -} WIRE_ATTR; - -/* - * A HELLO message contains a magic number and protocol version - * code in the header's dest_nid, the peer's NID in the src_nid, and - * LNET_MSG_HELLO in the type field. All other common fields are zero - * (including payload_size; i.e. no payload). - * This is for use by byte-stream LNDs (e.g. TCP/IP) to check the peer is - * running the same protocol and to find out its NID. These LNDs should - * exchange HELLO messages when a connection is first established. Individual - * LNDs can put whatever else they fancy in struct lnet_hdr::msg. - */ -struct lnet_magicversion { - __u32 magic; /* LNET_PROTO_TCP_MAGIC */ - __u16 version_major; /* increment on incompatible change */ - __u16 version_minor; /* increment on compatible change */ -} WIRE_ATTR; - -/* PROTO MAGIC for LNDs */ -#define LNET_PROTO_IB_MAGIC 0x0be91b91 -#define LNET_PROTO_GNI_MAGIC 0xb00fbabe /* ask Kim */ -#define LNET_PROTO_TCP_MAGIC 0xeebc0ded -#define LNET_PROTO_ACCEPTOR_MAGIC 0xacce7100 -#define LNET_PROTO_PING_MAGIC 0x70696E67 /* 'ping' */ - -/* Placeholder for a future "unified" protocol across all LNDs */ -/* - * Current LNDs that receive a request with this magic will respond with a - * "stub" reply using their current protocol - */ -#define LNET_PROTO_MAGIC 0x45726963 /* ! */ - -#define LNET_PROTO_TCP_VERSION_MAJOR 1 -#define LNET_PROTO_TCP_VERSION_MINOR 0 - -/* Acceptor connection request */ -struct lnet_acceptor_connreq { - __u32 acr_magic; /* PTL_ACCEPTOR_PROTO_MAGIC */ - __u32 acr_version; /* protocol version */ - __u64 acr_nid; /* target NID */ -} WIRE_ATTR; - -#define LNET_PROTO_ACCEPTOR_VERSION 1 - -struct lnet_ni_status { - lnet_nid_t ns_nid; - __u32 ns_status; - __u32 ns_unused; -} WIRE_ATTR; - -struct lnet_ping_info { - __u32 pi_magic; - __u32 pi_features; - lnet_pid_t pi_pid; - __u32 pi_nnis; - struct lnet_ni_status pi_ni[0]; -} WIRE_ATTR; - -struct lnet_counters { - __u32 msgs_alloc; - __u32 msgs_max; - __u32 errors; - __u32 send_count; - __u32 recv_count; - __u32 route_count; - __u32 drop_count; - __u64 send_length; - __u64 recv_length; - __u64 route_length; - __u64 drop_length; -} WIRE_ATTR; - -#define LNET_NI_STATUS_UP 0x15aac0de -#define LNET_NI_STATUS_DOWN 0xdeadface -#define LNET_NI_STATUS_INVALID 0x00000000 - -#define LNET_MAX_INTERFACES 16 - -/** - * Objects maintained by the LNet are accessed through handles. Handle types - * have names of the form lnet_handle_xx, where xx is one of the two letter - * object type codes ('eq' for event queue, 'md' for memory descriptor, and - * 'me' for match entry). 
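/*
 * Note in code form, not from the file: struct lnet_ping_info above
 * ends in a zero-length array, so a buffer for n interfaces is sized
 * with offsetof() -- the same pattern as the LNET_PINGINFO_SIZE macro
 * in lib-types.h earlier in this patch.
 */
#include <linux/stddef.h>

static inline size_t example_pinginfo_size(__u32 nnis)
{
	return offsetof(struct lnet_ping_info, pi_ni[nnis]);
}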
Each type of object is given a unique handle type
- * to enhance type checking.
- */
-#define LNET_WIRE_HANDLE_COOKIE_NONE (-1)
-
-struct lnet_handle_eq {
-	u64 cookie;
-};
-
-/**
- * Invalidate eq handle @h.
- */
-static inline void LNetInvalidateEQHandle(struct lnet_handle_eq *h)
-{
-	h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE;
-}
-
-/**
- * Check whether eq handle @h is invalid.
- *
- * @return 1 if handle is invalid, 0 if valid.
- */
-static inline int LNetEQHandleIsInvalid(struct lnet_handle_eq h)
-{
-	return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie);
-}
-
-struct lnet_handle_md {
-	u64 cookie;
-};
-
-/**
- * Invalidate md handle @h.
- */
-static inline void LNetInvalidateMDHandle(struct lnet_handle_md *h)
-{
-	h->cookie = LNET_WIRE_HANDLE_COOKIE_NONE;
-}
-
-/**
- * Check whether md handle @h is invalid.
- *
- * @return 1 if handle is invalid, 0 if valid.
- */
-static inline int LNetMDHandleIsInvalid(struct lnet_handle_md h)
-{
-	return (LNET_WIRE_HANDLE_COOKIE_NONE == h.cookie);
-}
-
-struct lnet_handle_me {
-	u64 cookie;
-};
-
-/**
- * Global process ID.
- */
-struct lnet_process_id {
-	/** node id */
-	lnet_nid_t nid;
-	/** process id */
-	lnet_pid_t pid;
-};
-/** @} lnet_addr */
-
-/** \addtogroup lnet_me
- * @{
- */
-
-/**
- * Specifies whether the match entry or memory descriptor should be unlinked
- * automatically (LNET_UNLINK) or not (LNET_RETAIN).
- */
-enum lnet_unlink {
-	LNET_RETAIN = 0,
-	LNET_UNLINK
-};
-
-/**
- * Values of the type lnet_ins_pos are used to control where a new match
- * entry is inserted. The value LNET_INS_BEFORE is used to insert the new
- * entry before the current entry or before the head of the list. The value
- * LNET_INS_AFTER is used to insert the new entry after the current entry
- * or after the last item in the list.
- */
-enum lnet_ins_pos {
-	/** insert ME before current position or head of the list */
-	LNET_INS_BEFORE,
-	/** insert ME after current position or tail of the list */
-	LNET_INS_AFTER,
-	/** attach ME at tail of local CPU partition ME list */
-	LNET_INS_LOCAL
-};
-
-/** @} lnet_me */
-
-/** \addtogroup lnet_md
- * @{
- */
-
-/**
- * Defines the visible parts of a memory descriptor. Values of this type
- * are used to initialize memory descriptors.
- */
-struct lnet_md {
-	/**
-	 * Specify the memory region associated with the memory descriptor.
-	 * If the options field has:
-	 * - LNET_MD_KIOV bit set: The start field points to the starting
-	 * address of an array of struct bio_vec and the length field specifies
-	 * the number of entries in the array. The length can't be bigger
-	 * than LNET_MAX_IOV. The struct bio_vec is used to describe page-based
-	 * fragments that are not necessarily mapped in virtual memory.
-	 * - LNET_MD_IOVEC bit set: The start field points to the starting
-	 * address of an array of struct iovec and the length field specifies
-	 * the number of entries in the array. The length can't be bigger
-	 * than LNET_MAX_IOV. The struct iovec is used to describe fragments
-	 * that have virtual addresses.
-	 * - Otherwise: The memory region is contiguous. The start field
-	 * specifies the starting address for the memory region and the
-	 * length field specifies its length.
-	 *
-	 * When the memory region is fragmented, all fragments but the first
-	 * one must start on page boundary, and all but the last must end on
-	 * page boundary.
-	 */
-	void *start;
-	unsigned int length;
-	/**
-	 * Specifies the maximum number of operations that can be performed
-	 * on the memory descriptor. 
An operation is any action that could - * possibly generate an event. In the usual case, the threshold value - * is decremented for each operation on the MD. When the threshold - * drops to zero, the MD becomes inactive and does not respond to - * operations. A threshold value of LNET_MD_THRESH_INF indicates that - * there is no bound on the number of operations that may be applied - * to a MD. - */ - int threshold; - /** - * Specifies the largest incoming request that the memory descriptor - * should respond to. When the unused portion of a MD (length - - * local offset) falls below this value, the MD becomes inactive and - * does not respond to further operations. This value is only used - * if the LNET_MD_MAX_SIZE option is set. - */ - int max_size; - /** - * Specifies the behavior of the memory descriptor. A bitwise OR - * of the following values can be used: - * - LNET_MD_OP_PUT: The LNet PUT operation is allowed on this MD. - * - LNET_MD_OP_GET: The LNet GET operation is allowed on this MD. - * - LNET_MD_MANAGE_REMOTE: The offset used in accessing the memory - * region is provided by the incoming request. By default, the - * offset is maintained locally. When maintained locally, the - * offset is incremented by the length of the request so that - * the next operation (PUT or GET) will access the next part of - * the memory region. Note that only one offset variable exists - * per memory descriptor. If both PUT and GET operations are - * performed on a memory descriptor, the offset is updated each time. - * - LNET_MD_TRUNCATE: The length provided in the incoming request can - * be reduced to match the memory available in the region (determined - * by subtracting the offset from the length of the memory region). - * By default, if the length in the incoming operation is greater - * than the amount of memory available, the operation is rejected. - * - LNET_MD_ACK_DISABLE: An acknowledgment should not be sent for - * incoming PUT operations, even if requested. By default, - * acknowledgments are sent for PUT operations that request an - * acknowledgment. Acknowledgments are never sent for GET operations. - * The data sent in the REPLY serves as an implicit acknowledgment. - * - LNET_MD_KIOV: The start and length fields specify an array of - * struct bio_vec. - * - LNET_MD_IOVEC: The start and length fields specify an array of - * struct iovec. - * - LNET_MD_MAX_SIZE: The max_size field is valid. - * - * Note: - * - LNET_MD_KIOV or LNET_MD_IOVEC allows for a scatter/gather - * capability for memory descriptors. They can't be both set. - * - When LNET_MD_MAX_SIZE is set, the total length of the memory - * region (i.e. sum of all fragment lengths) must not be less than - * \a max_size. - */ - unsigned int options; - /** - * A user-specified value that is associated with the memory - * descriptor. The value does not need to be a pointer, but must fit - * in the space used by a pointer. This value is recorded in events - * associated with operations on this MD. - */ - void *user_ptr; - /** - * A handle for the event queue used to log the operations performed on - * the memory region. If this argument is a NULL handle (i.e. nullified - * by LNetInvalidateHandle()), operations performed on this memory - * descriptor are not logged. - */ - struct lnet_handle_eq eq_handle; -}; - -/* - * Max Transfer Unit (minimum supported everywhere). - * CAVEAT EMPTOR, with multinet (i.e. routers forwarding between networks) - * these limits are system wide and not interface-local. 
- */ -#define LNET_MTU_BITS 20 -#define LNET_MTU (1 << LNET_MTU_BITS) - -/** limit on the number of fragments in discontiguous MDs */ -#define LNET_MAX_IOV 256 - -/** - * Options for the MD structure. See lnet_md::options. - */ -#define LNET_MD_OP_PUT (1 << 0) -/** See lnet_md::options. */ -#define LNET_MD_OP_GET (1 << 1) -/** See lnet_md::options. */ -#define LNET_MD_MANAGE_REMOTE (1 << 2) -/* unused (1 << 3) */ -/** See lnet_md::options. */ -#define LNET_MD_TRUNCATE (1 << 4) -/** See lnet_md::options. */ -#define LNET_MD_ACK_DISABLE (1 << 5) -/** See lnet_md::options. */ -#define LNET_MD_IOVEC (1 << 6) -/** See lnet_md::options. */ -#define LNET_MD_MAX_SIZE (1 << 7) -/** See lnet_md::options. */ -#define LNET_MD_KIOV (1 << 8) - -/* For compatibility with Cray Portals */ -#define LNET_MD_PHYS 0 - -/** Infinite threshold on MD operations. See lnet_md::threshold */ -#define LNET_MD_THRESH_INF (-1) - -/** @} lnet_md */ - -/** \addtogroup lnet_eq - * @{ - */ - -/** - * Six types of events can be logged in an event queue. - */ -enum lnet_event_kind { - /** An incoming GET operation has completed on the MD. */ - LNET_EVENT_GET = 1, - /** - * An incoming PUT operation has completed on the MD. The - * underlying layers will not alter the memory (on behalf of this - * operation) once this event has been logged. - */ - LNET_EVENT_PUT, - /** - * A REPLY operation has completed. This event is logged after the - * data (if any) from the REPLY has been written into the MD. - */ - LNET_EVENT_REPLY, - /** An acknowledgment has been received. */ - LNET_EVENT_ACK, - /** - * An outgoing send (PUT or GET) operation has completed. This event - * is logged after the entire buffer has been sent and it is safe for - * the caller to reuse the buffer. - * - * Note: - * - The LNET_EVENT_SEND doesn't guarantee message delivery. It can - * happen even when the message has not yet been put out on wire. - * - It's unsafe to assume that in an outgoing GET operation - * the LNET_EVENT_SEND event would happen before the - * LNET_EVENT_REPLY event. The same holds for LNET_EVENT_SEND and - * LNET_EVENT_ACK events in an outgoing PUT operation. - */ - LNET_EVENT_SEND, - /** - * A MD has been unlinked. Note that LNetMDUnlink() does not - * necessarily trigger an LNET_EVENT_UNLINK event. - * \see LNetMDUnlink - */ - LNET_EVENT_UNLINK, -}; - -#define LNET_SEQ_GT(a, b) (((signed long)((a) - (b))) > 0) - -/** - * Information about an event on a MD. - */ -struct lnet_event { - /** The identifier (nid, pid) of the target. */ - struct lnet_process_id target; - /** The identifier (nid, pid) of the initiator. */ - struct lnet_process_id initiator; - /** - * The NID of the immediate sender. If the request has been forwarded - * by routers, this is the NID of the last hop; otherwise it's the - * same as the initiator. - */ - lnet_nid_t sender; - /** Indicates the type of the event. */ - enum lnet_event_kind type; - /** The portal table index specified in the request */ - unsigned int pt_index; - /** A copy of the match bits specified in the request. */ - __u64 match_bits; - /** The length (in bytes) specified in the request. */ - unsigned int rlength; - /** - * The length (in bytes) of the data that was manipulated by the - * operation. For truncated operations, the manipulated length will be - * the number of bytes specified by the MD (possibly with an offset, - * see lnet_md). For all other operations, the manipulated length - * will be the length of the requested operation, i.e. rlength. 
- */ - unsigned int mlength; - /** - * The handle to the MD associated with the event. The handle may be - * invalid if the MD has been unlinked. - */ - struct lnet_handle_md md_handle; - /** - * A snapshot of the state of the MD immediately after the event has - * been processed. In particular, the threshold field in md will - * reflect the value of the threshold after the operation occurred. - */ - struct lnet_md md; - /** - * 64 bits of out-of-band user data. Only valid for LNET_EVENT_PUT. - * \see LNetPut - */ - __u64 hdr_data; - /** - * Indicates the completion status of the operation. It's 0 for - * successful operations, otherwise it's an error code. - */ - int status; - /** - * Indicates whether the MD has been unlinked. Note that: - * - An event with unlinked set is the last event on the MD. - * - This field is also set for an explicit LNET_EVENT_UNLINK event. - * \see LNetMDUnlink - */ - int unlinked; - /** - * The displacement (in bytes) into the memory region that the - * operation used. The offset can be determined by the operation for - * a remote managed MD or by the local MD. - * \see lnet_md::options - */ - unsigned int offset; - /** - * The sequence number for this event. Sequence numbers are unique - * to each event. - */ - volatile unsigned long sequence; -}; - -/** - * Event queue handler function type. - * - * The EQ handler runs for each event that is deposited into the EQ. The - * handler is supplied with a pointer to the event that triggered the - * handler invocation. - * - * The handler must not block, must be reentrant, and must not call any LNet - * API functions. It should return as quickly as possible. - */ -typedef void (*lnet_eq_handler_t)(struct lnet_event *event); -#define LNET_EQ_HANDLER_NONE NULL -/** @} lnet_eq */ - -/** \addtogroup lnet_data - * @{ - */ - -/** - * Specify whether an acknowledgment should be sent by target when the PUT - * operation completes (i.e., when the data has been written to a MD of the - * target process). - * - * \see lnet_md::options for the discussion on LNET_MD_ACK_DISABLE by which - * acknowledgments can be disabled for a MD. - */ -enum lnet_ack_req { - /** Request an acknowledgment */ - LNET_ACK_REQ, - /** Request that no acknowledgment should be generated. */ - LNET_NOACK_REQ -}; -/** @} lnet_data */ - -/** @} lnet */ -#endif diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/lnetctl.h b/drivers/staging/lustre/include/uapi/linux/lnet/lnetctl.h deleted file mode 100644 index cccb32dd28f2c8351931129633ea175c27d82a1c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/lnetctl.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * This file is part of Portals, http://www.sf.net/projects/lustre/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
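/*
 * Sketch of an event handler honouring the constraints above, not from
 * the file: it must not block, must be reentrant and must not call back
 * into the LNet API, so it only records the outcome.  "example_errors"
 * is a made-up counter; real users would pass richer state via
 * md.user_ptr.
 */
#include <linux/atomic.h>

static atomic_t example_errors = ATOMIC_INIT(0);

static void example_eq_handler(struct lnet_event *event)
{
	if (event->status)
		atomic_inc(&example_errors);	/* operation failed */

	if (event->unlinked) {
		/*
		 * Last event on this MD: its buffer may be reclaimed,
		 * but defer any heavy work to thread context.
		 */
	}
}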
- *
- * header for lnet ioctl
- */
-#ifndef _LNETCTL_H_
-#define _LNETCTL_H_
-
-#include
-
-/** \addtogroup lnet_fault_simulation
- * @{
- */
-
-enum {
-	LNET_CTL_DROP_ADD,
-	LNET_CTL_DROP_DEL,
-	LNET_CTL_DROP_RESET,
-	LNET_CTL_DROP_LIST,
-	LNET_CTL_DELAY_ADD,
-	LNET_CTL_DELAY_DEL,
-	LNET_CTL_DELAY_RESET,
-	LNET_CTL_DELAY_LIST,
-};
-
-#define LNET_ACK_BIT	(1 << 0)
-#define LNET_PUT_BIT	(1 << 1)
-#define LNET_GET_BIT	(1 << 2)
-#define LNET_REPLY_BIT	(1 << 3)
-
-/** ioctl parameter for LNet fault simulation */
-struct lnet_fault_attr {
-	/**
-	 * source NID of drop rule
-	 * LNET_NID_ANY is wildcard for all sources
-	 * 255.255.255.255@net is wildcard for all addresses from @net
-	 */
-	lnet_nid_t fa_src;
-	/** destination NID of drop rule, see \a fa_src for details */
-	lnet_nid_t fa_dst;
-	/**
-	 * Portal mask to drop, -1 means all portals, for example:
-	 * fa_ptl_mask = (1 << LDLM_CB_REQUEST_PORTAL) |
-	 *		 (1 << LDLM_CANCEL_REQUEST_PORTAL)
-	 *
-	 * If it is non-zero then only PUT and GET will be filtered,
-	 * otherwise there is no portal filter and all matched messages
-	 * will be checked.
-	 */
-	__u64 fa_ptl_mask;
-	/**
-	 * message types to drop, for example:
-	 * fa_msg_mask = LNET_ACK_BIT | LNET_PUT_BIT
-	 *
-	 * If it is non-zero then only the specified message types are
-	 * filtered, otherwise all message types will be checked.
-	 */
-	__u32 fa_msg_mask;
-	union {
-		/** message drop simulation */
-		struct {
-			/** drop rate of this rule */
-			__u32 da_rate;
-			/**
-			 * time interval of message drop, mutually
-			 * exclusive with da_rate
-			 */
-			__u32 da_interval;
-		} drop;
-		/** message latency simulation */
-		struct {
-			__u32 la_rate;
-			/**
-			 * time interval of message delay, mutually
-			 * exclusive with la_rate
-			 */
-			__u32 la_interval;
-			/** latency to delay */
-			__u32 la_latency;
-		} delay;
-		__u64 space[8];
-	} u;
-};
-
-/** fault simulation stats */
-struct lnet_fault_stat {
-	/** total # matched messages */
-	__u64 fs_count;
-	/** # dropped LNET_MSG_PUT by this rule */
-	__u64 fs_put;
-	/** # dropped LNET_MSG_ACK by this rule */
-	__u64 fs_ack;
-	/** # dropped LNET_MSG_GET by this rule */
-	__u64 fs_get;
-	/** # dropped LNET_MSG_REPLY by this rule */
-	__u64 fs_reply;
-	union {
-		struct {
-			/** total # dropped messages */
-			__u64 ds_dropped;
-		} drop;
-		struct {
-			/** total # delayed messages */
-			__u64 ls_delayed;
-		} delay;
-		__u64 space[8];
-	} u;
-};
-
-/** @} lnet_fault_simulation */
-
-#define LNET_DEV_ID 0
-#define LNET_DEV_PATH "/dev/lnet"
-
-#endif
diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/lnetst.h b/drivers/staging/lustre/include/uapi/linux/lnet/lnetst.h
deleted file mode 100644
index a4f9ff01d45819c61114fc1c55647ffeecccbf0a..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/include/uapi/linux/lnet/lnetst.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
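/*
 * Illustrative sketch, not part of the original header: composing a rule
 * that drops roughly 1 in 100 PUT messages between any two NIDs. The
 * LNET_NID_ANY wildcard comes from the LNet types header (not shown), and
 * handing the attr to the kernel (ioctl plumbing for LNET_CTL_DROP_ADD on
 * LNET_DEV_PATH) lives outside this header; both are assumptions here.
 */
static inline void example_fill_drop_rule(struct lnet_fault_attr *attr)
{
	memset(attr, 0, sizeof(*attr));		/* assumes the usual memory helpers */
	attr->fa_src = LNET_NID_ANY;		/* wildcard source */
	attr->fa_dst = LNET_NID_ANY;		/* wildcard destination */
	attr->fa_msg_mask = LNET_PUT_BIT;	/* filter PUTs only */
	attr->u.drop.da_rate = 100;		/* drop 1 of every 100 matches */
}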
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011 - 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. - * - * lnet/include/lnet/lnetst.h - * - * Author: Liang Zhen - */ - -#ifndef __LNET_ST_H__ -#define __LNET_ST_H__ - -#include - -#define LST_FEAT_NONE (0) -#define LST_FEAT_BULK_LEN (1 << 0) /* enable variable page size */ - -#define LST_FEATS_EMPTY (LST_FEAT_NONE) -#define LST_FEATS_MASK (LST_FEAT_NONE | LST_FEAT_BULK_LEN) - -#define LST_NAME_SIZE 32 /* max name buffer length */ - -#define LSTIO_DEBUG 0xC00 /* debug */ -#define LSTIO_SESSION_NEW 0xC01 /* create session */ -#define LSTIO_SESSION_END 0xC02 /* end session */ -#define LSTIO_SESSION_INFO 0xC03 /* query session */ -#define LSTIO_GROUP_ADD 0xC10 /* add group */ -#define LSTIO_GROUP_LIST 0xC11 /* list all groups in session */ -#define LSTIO_GROUP_INFO 0xC12 /* query default information of - * specified group - */ -#define LSTIO_GROUP_DEL 0xC13 /* delete group */ -#define LSTIO_NODES_ADD 0xC14 /* add nodes to specified group */ -#define LSTIO_GROUP_UPDATE 0xC15 /* update group */ -#define LSTIO_BATCH_ADD 0xC20 /* add batch */ -#define LSTIO_BATCH_START 0xC21 /* start batch */ -#define LSTIO_BATCH_STOP 0xC22 /* stop batch */ -#define LSTIO_BATCH_DEL 0xC23 /* delete batch */ -#define LSTIO_BATCH_LIST 0xC24 /* show all batches in the session */ -#define LSTIO_BATCH_INFO 0xC25 /* show defail of specified batch */ -#define LSTIO_TEST_ADD 0xC26 /* add test (to batch) */ -#define LSTIO_BATCH_QUERY 0xC27 /* query batch status */ -#define LSTIO_STAT_QUERY 0xC30 /* get stats */ - -struct lst_sid { - lnet_nid_t ses_nid; /* nid of console node */ - __u64 ses_stamp; /* time stamp */ -}; /*** session id */ - -extern struct lst_sid LST_INVALID_SID; - -struct lst_bid { - __u64 bat_id; /* unique id in session */ -}; /*** batch id (group of tests) */ - -/* Status of test node */ -#define LST_NODE_ACTIVE 0x1 /* node in this session */ -#define LST_NODE_BUSY 0x2 /* node is taken by other session */ -#define LST_NODE_DOWN 0x4 /* node is down */ -#define LST_NODE_UNKNOWN 0x8 /* node not in session */ - -struct lstcon_node_ent { - struct lnet_process_id nde_id; /* id of node */ - int nde_state; /* state of node */ -}; /*** node entry, for list_group command */ - -struct lstcon_ndlist_ent { - int nle_nnode; /* # of nodes */ - int nle_nactive; /* # of active nodes */ - int nle_nbusy; /* # of busy nodes */ - int nle_ndown; /* # of down nodes */ - int nle_nunknown; /* # of unknown nodes */ -}; /*** node_list entry, for list_batch command */ - -struct lstcon_test_ent { - int tse_type; /* test type */ - int tse_loop; /* loop count */ - int tse_concur; /* concurrency of test */ -}; /* test summary entry, for - * list_batch command - */ - -struct lstcon_batch_ent { - int bae_state; /* batch status */ - int bae_timeout; /* batch timeout */ - int bae_ntest; /* # of tests in the batch */ -}; /* batch summary entry, for - * list_batch command - */ - -struct lstcon_test_batch_ent { - struct lstcon_ndlist_ent tbe_cli_nle; /* client (group) node_list - * entry - */ - struct lstcon_ndlist_ent tbe_srv_nle; /* server (group) node_list - * entry - */ - union { - struct lstcon_test_ent tbe_test; /* 
test entry */ - struct lstcon_batch_ent tbe_batch;/* batch entry */ - } u; -}; /* test/batch verbose information entry, - * for list_batch command - */ - -struct lstcon_rpc_ent { - struct list_head rpe_link; /* link chain */ - struct lnet_process_id rpe_peer; /* peer's id */ - struct timeval rpe_stamp; /* time stamp of RPC */ - int rpe_state; /* peer's state */ - int rpe_rpc_errno; /* RPC errno */ - - struct lst_sid rpe_sid; /* peer's session id */ - int rpe_fwk_errno; /* framework errno */ - int rpe_priv[4]; /* private data */ - char rpe_payload[0]; /* private reply payload */ -}; - -struct lstcon_trans_stat { - int trs_rpc_stat[4]; /* RPCs stat (0: total 1: failed - * 2: finished - * 4: reserved - */ - int trs_rpc_errno; /* RPC errno */ - int trs_fwk_stat[8]; /* framework stat */ - int trs_fwk_errno; /* errno of the first remote error */ - void *trs_fwk_private; /* private framework stat */ -}; - -static inline int -lstcon_rpc_stat_total(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_rpc_stat[0] : stat->trs_rpc_stat[0]; -} - -static inline int -lstcon_rpc_stat_success(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_rpc_stat[1] : stat->trs_rpc_stat[1]; -} - -static inline int -lstcon_rpc_stat_failure(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_rpc_stat[2] : stat->trs_rpc_stat[2]; -} - -static inline int -lstcon_sesop_stat_success(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; -} - -static inline int -lstcon_sesop_stat_failure(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; -} - -static inline int -lstcon_sesqry_stat_active(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; -} - -static inline int -lstcon_sesqry_stat_busy(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; -} - -static inline int -lstcon_sesqry_stat_unknown(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; -} - -static inline int -lstcon_tsbop_stat_success(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; -} - -static inline int -lstcon_tsbop_stat_failure(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; -} - -static inline int -lstcon_tsbqry_stat_idle(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; -} - -static inline int -lstcon_tsbqry_stat_run(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; -} - -static inline int -lstcon_tsbqry_stat_failure(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[2] : stat->trs_fwk_stat[2]; -} - -static inline int -lstcon_statqry_stat_success(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[0] : stat->trs_fwk_stat[0]; -} - -static inline int -lstcon_statqry_stat_failure(struct lstcon_trans_stat *stat, int inc) -{ - return inc ? ++stat->trs_fwk_stat[1] : stat->trs_fwk_stat[1]; -} - -/* create a session */ -struct lstio_session_new_args { - int lstio_ses_key; /* IN: local key */ - int lstio_ses_timeout; /* IN: session timeout */ - int lstio_ses_force; /* IN: force create ? 
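/*
 * Usage sketch, not part of the original header: each accessor above is
 * both a reader (inc == 0) and an incrementer (inc != 0). As the accessors
 * implement it, trs_rpc_stat[0] holds the total, [1] the successes and
 * [2] the failures.
 */
static inline void example_account_rpc(struct lstcon_trans_stat *stat, int rc)
{
	lstcon_rpc_stat_total(stat, 1);
	if (!rc)
		lstcon_rpc_stat_success(stat, 1);
	else
		lstcon_rpc_stat_failure(stat, 1);
}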
*/ - /** IN: session features */ - unsigned int lstio_ses_feats; - struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ - int lstio_ses_nmlen; /* IN: name length */ - char __user *lstio_ses_namep; /* IN: session name */ -}; - -/* query current session */ -struct lstio_session_info_args { - struct lst_sid __user *lstio_ses_idp; /* OUT: session id */ - int __user *lstio_ses_keyp; /* OUT: local key */ - /** OUT: session features */ - unsigned int __user *lstio_ses_featp; - struct lstcon_ndlist_ent __user *lstio_ses_ndinfo;/* OUT: */ - int lstio_ses_nmlen; /* IN: name length */ - char __user *lstio_ses_namep; /* OUT: session name */ -}; - -/* delete a session */ -struct lstio_session_end_args { - int lstio_ses_key; /* IN: session key */ -}; - -#define LST_OPC_SESSION 1 -#define LST_OPC_GROUP 2 -#define LST_OPC_NODES 3 -#define LST_OPC_BATCHCLI 4 -#define LST_OPC_BATCHSRV 5 - -struct lstio_debug_args { - int lstio_dbg_key; /* IN: session key */ - int lstio_dbg_type; /* IN: debug - * session|batch| - * group|nodes list - */ - int lstio_dbg_flags; /* IN: reserved debug - * flags - */ - int lstio_dbg_timeout; /* IN: timeout of - * debug - */ - int lstio_dbg_nmlen; /* IN: len of name */ - char __user *lstio_dbg_namep; /* IN: name of - * group|batch - */ - int lstio_dbg_count; /* IN: # of test nodes - * to debug - */ - struct lnet_process_id __user *lstio_dbg_idsp; /* IN: id of test - * nodes - */ - struct list_head __user *lstio_dbg_resultp; /* OUT: list head of - * result buffer - */ -}; - -struct lstio_group_add_args { - int lstio_grp_key; /* IN: session key */ - int lstio_grp_nmlen; /* IN: name length */ - char __user *lstio_grp_namep; /* IN: group name */ -}; - -struct lstio_group_del_args { - int lstio_grp_key; /* IN: session key */ - int lstio_grp_nmlen; /* IN: name length */ - char __user *lstio_grp_namep; /* IN: group name */ -}; - -#define LST_GROUP_CLEAN 1 /* remove inactive nodes in the group */ -#define LST_GROUP_REFRESH 2 /* refresh inactive nodes - * in the group - */ -#define LST_GROUP_RMND 3 /* delete nodes from the group */ - -struct lstio_group_update_args { - int lstio_grp_key; /* IN: session key */ - int lstio_grp_opc; /* IN: OPC */ - int lstio_grp_args; /* IN: arguments */ - int lstio_grp_nmlen; /* IN: name length */ - char __user *lstio_grp_namep; /* IN: group name */ - int lstio_grp_count; /* IN: # of nodes id */ - struct lnet_process_id __user *lstio_grp_idsp; /* IN: array of nodes */ - struct list_head __user *lstio_grp_resultp; /* OUT: list head of - * result buffer - */ -}; - -struct lstio_group_nodes_args { - int lstio_grp_key; /* IN: session key */ - int lstio_grp_nmlen; /* IN: name length */ - char __user *lstio_grp_namep; /* IN: group name */ - int lstio_grp_count; /* IN: # of nodes */ - /** OUT: session features */ - unsigned int __user *lstio_grp_featp; - struct lnet_process_id __user *lstio_grp_idsp; /* IN: nodes */ - struct list_head __user *lstio_grp_resultp; /* OUT: list head of - * result buffer - */ -}; - -struct lstio_group_list_args { - int lstio_grp_key; /* IN: session key */ - int lstio_grp_idx; /* IN: group idx */ - int lstio_grp_nmlen; /* IN: name len */ - char __user *lstio_grp_namep; /* OUT: name */ -}; - -struct lstio_group_info_args { - int lstio_grp_key; /* IN: session key */ - int lstio_grp_nmlen; /* IN: name len */ - char __user *lstio_grp_namep; /* IN: name */ - struct lstcon_ndlist_ent __user *lstio_grp_entp;/* OUT: description - * of group - */ - int __user *lstio_grp_idxp; /* IN/OUT: node index */ - int __user *lstio_grp_ndentp; /* 
IN/OUT: # of nodent */ - struct lstcon_node_ent __user *lstio_grp_dentsp;/* OUT: nodent array */ -}; - -#define LST_DEFAULT_BATCH "batch" /* default batch name */ - -struct lstio_batch_add_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: batch name */ -}; - -struct lstio_batch_del_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: batch name */ -}; - -struct lstio_batch_run_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_timeout; /* IN: timeout for - * the batch - */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: batch name */ - struct list_head __user *lstio_bat_resultp; /* OUT: list head of - * result buffer - */ -}; - -struct lstio_batch_stop_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_force; /* IN: abort unfinished - * test RPC - */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: batch name */ - struct list_head __user *lstio_bat_resultp; /* OUT: list head of - * result buffer - */ -}; - -struct lstio_batch_query_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_testidx; /* IN: test index */ - int lstio_bat_client; /* IN: we testing - * client? - */ - int lstio_bat_timeout; /* IN: timeout for - * waiting - */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: batch name */ - struct list_head __user *lstio_bat_resultp; /* OUT: list head of - * result buffer - */ -}; - -struct lstio_batch_list_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_idx; /* IN: index */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: batch name */ -}; - -struct lstio_batch_info_args { - int lstio_bat_key; /* IN: session key */ - int lstio_bat_nmlen; /* IN: name length */ - char __user *lstio_bat_namep; /* IN: name */ - int lstio_bat_server; /* IN: query server - * or not - */ - int lstio_bat_testidx; /* IN: test index */ - struct lstcon_test_batch_ent __user *lstio_bat_entp;/* OUT: batch ent */ - - int __user *lstio_bat_idxp; /* IN/OUT: index of node */ - int __user *lstio_bat_ndentp; /* IN/OUT: # of nodent */ - struct lstcon_node_ent __user *lstio_bat_dentsp;/* array of nodent */ -}; - -/* add stat in session */ -struct lstio_stat_args { - int lstio_sta_key; /* IN: session key */ - int lstio_sta_timeout; /* IN: timeout for - * stat request - */ - int lstio_sta_nmlen; /* IN: group name - * length - */ - char __user *lstio_sta_namep; /* IN: group name */ - int lstio_sta_count; /* IN: # of pid */ - struct lnet_process_id __user *lstio_sta_idsp; /* IN: pid */ - struct list_head __user *lstio_sta_resultp; /* OUT: list head of - * result buffer - */ -}; - -enum lst_test_type { - LST_TEST_BULK = 1, - LST_TEST_PING = 2 -}; - -/* create a test in a batch */ -#define LST_MAX_CONCUR 1024 /* Max concurrency of test */ - -struct lstio_test_args { - int lstio_tes_key; /* IN: session key */ - int lstio_tes_bat_nmlen; /* IN: batch name len */ - char __user *lstio_tes_bat_name; /* IN: batch name */ - int lstio_tes_type; /* IN: test type */ - int lstio_tes_oneside; /* IN: one sided test */ - int lstio_tes_loop; /* IN: loop count */ - int lstio_tes_concur; /* IN: concurrency */ - - int lstio_tes_dist; /* IN: node distribution in - * destination groups - */ - int lstio_tes_span; /* IN: node span in - * destination groups - */ - int lstio_tes_sgrp_nmlen; 
/* IN: source group - * name length - */ - char __user *lstio_tes_sgrp_name; /* IN: group name */ - int lstio_tes_dgrp_nmlen; /* IN: destination group - * name length - */ - char __user *lstio_tes_dgrp_name; /* IN: group name */ - - int lstio_tes_param_len; /* IN: param buffer len */ - void __user *lstio_tes_param; /* IN: parameter for specified - * test: lstio_bulk_param_t, - * lstio_ping_param_t, - * ... more - */ - int __user *lstio_tes_retp; /* OUT: private returned - * value - */ - struct list_head __user *lstio_tes_resultp;/* OUT: list head of - * result buffer - */ -}; - -enum lst_brw_type { - LST_BRW_READ = 1, - LST_BRW_WRITE = 2 -}; - -enum lst_brw_flags { - LST_BRW_CHECK_NONE = 1, - LST_BRW_CHECK_SIMPLE = 2, - LST_BRW_CHECK_FULL = 3 -}; - -struct lst_test_bulk_param { - int blk_opc; /* bulk operation code */ - int blk_size; /* size (bytes) */ - int blk_time; /* time of running the test*/ - int blk_flags; /* reserved flags */ - int blk_cli_off; /* bulk offset on client */ - int blk_srv_off; /* reserved: bulk offset on server */ -}; - -struct lst_test_ping_param { - int png_size; /* size of ping message */ - int png_time; /* time */ - int png_loop; /* loop */ - int png_flags; /* reserved flags */ -}; - -struct srpc_counters { - __u32 errors; - __u32 rpcs_sent; - __u32 rpcs_rcvd; - __u32 rpcs_dropped; - __u32 rpcs_expired; - __u64 bulk_get; - __u64 bulk_put; -} WIRE_ATTR; - -struct sfw_counters { - /** milliseconds since current session started */ - __u32 running_ms; - __u32 active_batches; - __u32 zombie_sessions; - __u32 brw_errors; - __u32 ping_errors; -} WIRE_ATTR; - -#endif diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/nidstr.h b/drivers/staging/lustre/include/uapi/linux/lnet/nidstr.h deleted file mode 100644 index 882074ed6021283b743c6057b72dbfb35b8b7e3c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/nidstr.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -#ifndef _LNET_NIDSTRINGS_H -#define _LNET_NIDSTRINGS_H - -#include - -/** - * Lustre Network Driver types. - */ -enum { - /* - * Only add to these values (i.e. don't ever change or redefine them): - * network addresses depend on them... 
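/*
 * Illustrative sketch, not part of the original header: a bulk-read test
 * parameter block as it would be referenced from lstio_test_args via
 * lstio_tes_param/lstio_tes_param_len. The concrete values, and using
 * blk_flags to carry the check mode, are assumptions for illustration.
 */
static inline void example_fill_bulk_param(struct lst_test_bulk_param *p)
{
	p->blk_opc = LST_BRW_READ;		/* bulk operation code */
	p->blk_size = 4096;			/* bytes per transfer */
	p->blk_time = 0;			/* run by loop count, not time */
	p->blk_flags = LST_BRW_CHECK_SIMPLE;	/* light data verification */
	p->blk_cli_off = 0;
	p->blk_srv_off = 0;			/* reserved */
}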
- */ - QSWLND = 1, - SOCKLND = 2, - GMLND = 3, - PTLLND = 4, - O2IBLND = 5, - CIBLND = 6, - OPENIBLND = 7, - IIBLND = 8, - LOLND = 9, - RALND = 10, - VIBLND = 11, - MXLND = 12, - GNILND = 13, - GNIIPLND = 14, -}; - -struct list_head; - -#define LNET_NIDSTR_COUNT 1024 /* # of nidstrings */ -#define LNET_NIDSTR_SIZE 32 /* size of each one (see below for usage) */ - -/* support decl needed by both kernel and user space */ -char *libcfs_next_nidstring(void); -int libcfs_isknown_lnd(__u32 lnd); -char *libcfs_lnd2modname(__u32 lnd); -char *libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size); -static inline char *libcfs_lnd2str(__u32 lnd) -{ - return libcfs_lnd2str_r(lnd, libcfs_next_nidstring(), - LNET_NIDSTR_SIZE); -} - -int libcfs_str2lnd(const char *str); -char *libcfs_net2str_r(__u32 net, char *buf, size_t buf_size); -static inline char *libcfs_net2str(__u32 net) -{ - return libcfs_net2str_r(net, libcfs_next_nidstring(), - LNET_NIDSTR_SIZE); -} - -char *libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size); -static inline char *libcfs_nid2str(lnet_nid_t nid) -{ - return libcfs_nid2str_r(nid, libcfs_next_nidstring(), - LNET_NIDSTR_SIZE); -} - -__u32 libcfs_str2net(const char *str); -lnet_nid_t libcfs_str2nid(const char *str); -int libcfs_str2anynid(lnet_nid_t *nid, const char *str); -char *libcfs_id2str(struct lnet_process_id id); -void cfs_free_nidlist(struct list_head *list); -int cfs_parse_nidlist(char *str, int len, struct list_head *list); -int cfs_print_nidlist(char *buffer, int count, struct list_head *list); -int cfs_match_nid(lnet_nid_t nid, struct list_head *list); - -int cfs_ip_addr_parse(char *str, int len, struct list_head *list); -int cfs_ip_addr_match(__u32 addr, struct list_head *list); -bool cfs_nidrange_is_contiguous(struct list_head *nidlist); -void cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, - char *max_nid, size_t nidstr_length); - -struct netstrfns { - __u32 nf_type; - char *nf_name; - char *nf_modname; - void (*nf_addr2str)(__u32 addr, char *str, size_t size); - int (*nf_str2addr)(const char *str, int nob, __u32 *addr); - int (*nf_parse_addrlist)(char *str, int len, - struct list_head *list); - int (*nf_print_addrlist)(char *buffer, int count, - struct list_head *list); - int (*nf_match_addr)(__u32 addr, struct list_head *list); - bool (*nf_is_contiguous)(struct list_head *nidlist); - void (*nf_min_max)(struct list_head *nidlist, __u32 *min_nid, - __u32 *max_nid); -}; - -#endif /* _LNET_NIDSTRINGS_H */ diff --git a/drivers/staging/lustre/include/uapi/linux/lnet/socklnd.h b/drivers/staging/lustre/include/uapi/linux/lnet/socklnd.h deleted file mode 100644 index 6453e053fa99d2c59d4c816a9be19ed9c86aa051..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lnet/socklnd.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
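/*
 * Usage sketch, not part of the original header: the *_r helpers write
 * into a caller-supplied buffer, while the inline wrappers above cycle
 * through LNET_NIDSTR_COUNT shared static buffers via
 * libcfs_next_nidstring(). Prefer the _r form when the string must stay
 * valid across further calls.
 */
static inline void example_format_nid(lnet_nid_t nid,
				      char *buf, size_t buf_size)
{
	libcfs_nid2str_r(nid, buf, buf_size);
	/* buf now holds something like "192.168.0.1@tcp" (format assumed) */
}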
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * #defines shared between socknal implementation and utilities - */ -#ifndef __UAPI_LNET_SOCKLND_H__ -#define __UAPI_LNET_SOCKLND_H__ - -#define SOCKLND_CONN_NONE (-1) -#define SOCKLND_CONN_ANY 0 -#define SOCKLND_CONN_CONTROL 1 -#define SOCKLND_CONN_BULK_IN 2 -#define SOCKLND_CONN_BULK_OUT 3 -#define SOCKLND_CONN_NTYPES 4 - -#define SOCKLND_CONN_ACK SOCKLND_CONN_BULK_IN - -#endif diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_cfg.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_cfg.h deleted file mode 100644 index 11b51d93f64c1d09eb506bda42b167c700d9059d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_cfg.h +++ /dev/null @@ -1,261 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _UAPI_LUSTRE_CFG_H_ -#define _UAPI_LUSTRE_CFG_H_ - -#include -#include -#include - -/** \defgroup cfg cfg - * - * @{ - */ - -/* - * 1cf6 - * lcfG - */ -#define LUSTRE_CFG_VERSION 0x1cf60001 -#define LUSTRE_CFG_MAX_BUFCOUNT 8 - -#define LCFG_HDR_SIZE(count) \ - __ALIGN_KERNEL(offsetof(struct lustre_cfg, lcfg_buflens[(count)]), 8) - -/** If the LCFG_REQUIRED bit is set in a configuration command, - * then the client is required to understand this parameter - * in order to mount the filesystem. If it does not understand - * a REQUIRED command the client mount will fail. 
- */ -#define LCFG_REQUIRED 0x0001000 - -enum lcfg_command_type { - LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ - LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ - LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ - LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup - */ - LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ - LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from - * a niduuid - */ - LCFG_MOUNTOPT = 0x00cf007, /**< create a profile - * (mdc, osc) - */ - LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ - LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ - LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ - LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to - * an obd - */ - LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ - LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ - LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ - LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ - LCFG_MARKER = 0x00cf010, /**< metadata about next - * cfg rec - */ - LCFG_LOG_START = 0x00ce011, /**< mgc only, process a - * cfg log - */ - LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ - LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, - * inactive - */ - LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ - LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ - LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ - LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ - LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ - LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ - LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ - LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ - LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre - * cleanup cleanup - */ - LCFG_SET_PARAM = 0x00ce032, /**< use set_param syntax to set - * a proc parameters - */ -}; - -struct lustre_cfg_bufs { - void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; - __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; - __u32 lcfg_bufcount; -}; - -struct lustre_cfg { - __u32 lcfg_version; - __u32 lcfg_command; - - __u32 lcfg_num; - __u32 lcfg_flags; - __u64 lcfg_nid; - __u32 lcfg_nal; /* not used any more */ - - __u32 lcfg_bufcount; - __u32 lcfg_buflens[0]; -}; - -enum cfg_record_type { - PORTALS_CFG_TYPE = 1, - LUSTRE_CFG_TYPE = 123, -}; - -#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ - ((lcfg)->lcfg_bufcount <= (idx) ? 0 : (lcfg)->lcfg_buflens[(idx)]) - -static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, - __u32 index, void *buf, __u32 buflen) -{ - if (index >= LUSTRE_CFG_MAX_BUFCOUNT) - return; - - if (!bufs) - return; - - if (bufs->lcfg_bufcount <= index) - bufs->lcfg_bufcount = index + 1; - - bufs->lcfg_buf[index] = buf; - bufs->lcfg_buflen[index] = buflen; -} - -static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, - __u32 index, char *str) -{ - lustre_cfg_bufs_set(bufs, index, str, str ? 
strlen(str) + 1 : 0); -} - -static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, - char *name) -{ - memset((bufs), 0, sizeof(*bufs)); - if (name) - lustre_cfg_bufs_set_string(bufs, 0, name); -} - -static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, __u32 index) -{ - __u32 i; - size_t offset; - __u32 bufcount; - - if (!lcfg) - return NULL; - - bufcount = lcfg->lcfg_bufcount; - if (index >= bufcount) - return NULL; - - offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); - for (i = 0; i < index; i++) - offset += __ALIGN_KERNEL(lcfg->lcfg_buflens[i], 8); - return (char *)lcfg + offset; -} - -static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, - struct lustre_cfg *lcfg) -{ - __u32 i; - - bufs->lcfg_bufcount = lcfg->lcfg_bufcount; - for (i = 0; i < bufs->lcfg_bufcount; i++) { - bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; - bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); - } -} - -static inline __u32 lustre_cfg_len(__u32 bufcount, __u32 *buflens) -{ - __u32 i; - __u32 len; - - len = LCFG_HDR_SIZE(bufcount); - for (i = 0; i < bufcount; i++) - len += __ALIGN_KERNEL(buflens[i], 8); - - return __ALIGN_KERNEL(len, 8); -} - -static inline void lustre_cfg_init(struct lustre_cfg *lcfg, int cmd, - struct lustre_cfg_bufs *bufs) -{ - char *ptr; - __u32 i; - - lcfg->lcfg_version = LUSTRE_CFG_VERSION; - lcfg->lcfg_command = cmd; - lcfg->lcfg_bufcount = bufs->lcfg_bufcount; - - ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); - for (i = 0; i < lcfg->lcfg_bufcount; i++) { - lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; - if (bufs->lcfg_buf[i]) { - memcpy(ptr, bufs->lcfg_buf[i], bufs->lcfg_buflen[i]); - ptr += __ALIGN_KERNEL(bufs->lcfg_buflen[i], 8); - } - } -} - -static inline int lustre_cfg_sanity_check(void *buf, size_t len) -{ - struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; - - if (!lcfg) - return -EINVAL; - - /* check that the first bits of the struct are valid */ - if (len < LCFG_HDR_SIZE(0)) - return -EINVAL; - - if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) - return -EINVAL; - - if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) - return -EINVAL; - - /* check that the buflens are valid */ - if (len < LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) - return -EINVAL; - - /* make sure all the pointers point inside the data */ - if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) - return -EINVAL; - - return 0; -} - -/** @} cfg */ - -#endif /* _UAPI_LUSTRE_CFG_H_ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_fid.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_fid.h deleted file mode 100644 index 2e7a8d103777d297dd2a77b7f8bc267be14ae45f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_fid.h +++ /dev/null @@ -1,293 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
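/*
 * Illustrative sketch, not part of the original header: sizing an
 * LCFG_PARAM record with the helpers above. The obdname/param arguments
 * are hypothetical; the caller would allocate the returned number of
 * bytes and then call lustre_cfg_init(lcfg, LCFG_PARAM, bufs).
 */
static inline __u32 example_param_record_size(struct lustre_cfg_bufs *bufs,
					      char *obdname, char *param)
{
	lustre_cfg_bufs_reset(bufs, obdname);		/* buffer 0: target name */
	lustre_cfg_bufs_set_string(bufs, 1, param);	/* buffer 1: "key=value" */
	return lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen);
}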
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2014, Intel Corporation. - * - * Copyright 2016 Cray Inc, all rights reserved. - * Author: Ben Evans. - * - * all fid manipulation functions go here - * - * FIDS are globally unique within a Lustre filessytem, and are made up - * of three parts: sequence, Object ID, and version. - * - */ -#ifndef _UAPI_LUSTRE_FID_H_ -#define _UAPI_LUSTRE_FID_H_ - -#include - -/** returns fid object sequence */ -static inline __u64 fid_seq(const struct lu_fid *fid) -{ - return fid->f_seq; -} - -/** returns fid object id */ -static inline __u32 fid_oid(const struct lu_fid *fid) -{ - return fid->f_oid; -} - -/** returns fid object version */ -static inline __u32 fid_ver(const struct lu_fid *fid) -{ - return fid->f_ver; -} - -static inline void fid_zero(struct lu_fid *fid) -{ - memset(fid, 0, sizeof(*fid)); -} - -static inline __u64 fid_ver_oid(const struct lu_fid *fid) -{ - return (__u64)fid_ver(fid) << 32 | fid_oid(fid); -} - -static inline bool fid_seq_is_mdt0(__u64 seq) -{ - return seq == FID_SEQ_OST_MDT0; -} - -static inline bool fid_seq_is_mdt(__u64 seq) -{ - return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL; -}; - -static inline bool fid_seq_is_echo(__u64 seq) -{ - return seq == FID_SEQ_ECHO; -} - -static inline bool fid_is_echo(const struct lu_fid *fid) -{ - return fid_seq_is_echo(fid_seq(fid)); -} - -static inline bool fid_seq_is_llog(__u64 seq) -{ - return seq == FID_SEQ_LLOG; -} - -static inline bool fid_is_llog(const struct lu_fid *fid) -{ - /* file with OID == 0 is not llog but contains last oid */ - return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0; -} - -static inline bool fid_seq_is_rsvd(__u64 seq) -{ - return seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD; -}; - -static inline bool fid_seq_is_special(__u64 seq) -{ - return seq == FID_SEQ_SPECIAL; -}; - -static inline bool fid_seq_is_local_file(__u64 seq) -{ - return seq == FID_SEQ_LOCAL_FILE || - seq == FID_SEQ_LOCAL_NAME; -}; - -static inline bool fid_seq_is_root(__u64 seq) -{ - return seq == FID_SEQ_ROOT; -} - -static inline bool fid_seq_is_dot(__u64 seq) -{ - return seq == FID_SEQ_DOT_LUSTRE; -} - -static inline bool fid_seq_is_default(__u64 seq) -{ - return seq == FID_SEQ_LOV_DEFAULT; -} - -static inline bool fid_is_mdt0(const struct lu_fid *fid) -{ - return fid_seq_is_mdt0(fid_seq(fid)); -} - -/** - * Check if a fid is igif or not. - * \param fid the fid to be tested. - * \return true if the fid is an igif; otherwise false. - */ -static inline bool fid_seq_is_igif(__u64 seq) -{ - return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX; -} - -static inline bool fid_is_igif(const struct lu_fid *fid) -{ - return fid_seq_is_igif(fid_seq(fid)); -} - -/** - * Check if a fid is idif or not. - * \param fid the fid to be tested. - * \return true if the fid is an idif; otherwise false. 
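/*
 * Usage sketch, not part of the original header: round-tripping an inode
 * number/generation pair through the IGIF helpers defined just below
 * (lu_igif_build, lu_igif_ino, lu_igif_gen). Only inode numbers inside
 * the IGIF SEQ range ([FID_SEQ_IGIF, FID_SEQ_IGIF_MAX], defined in
 * lustre_idl.h) make fid_is_igif() return true.
 */
static inline bool example_igif_roundtrip(__u32 ino, __u32 gen)
{
	struct lu_fid fid;

	lu_igif_build(&fid, ino, gen);	/* seq <- ino, oid <- gen, ver <- 0 */
	return fid_is_igif(&fid) &&
	       lu_igif_ino(&fid) == ino && lu_igif_gen(&fid) == gen;
}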
- */ -static inline bool fid_seq_is_idif(__u64 seq) -{ - return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX; -} - -static inline bool fid_is_idif(const struct lu_fid *fid) -{ - return fid_seq_is_idif(fid_seq(fid)); -} - -static inline bool fid_is_local_file(const struct lu_fid *fid) -{ - return fid_seq_is_local_file(fid_seq(fid)); -} - -static inline bool fid_seq_is_norm(__u64 seq) -{ - return (seq >= FID_SEQ_NORMAL); -} - -static inline bool fid_is_norm(const struct lu_fid *fid) -{ - return fid_seq_is_norm(fid_seq(fid)); -} - -/* convert an OST objid into an IDIF FID SEQ number */ -static inline __u64 fid_idif_seq(__u64 id, __u32 ost_idx) -{ - return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff); -} - -/* convert a packed IDIF FID into an OST objid */ -static inline __u64 fid_idif_id(__u64 seq, __u32 oid, __u32 ver) -{ - return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid; -} - -static inline __u32 idif_ost_idx(__u64 seq) -{ - return (seq >> 16) & 0xffff; -} - -/* extract ost index from IDIF FID */ -static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid) -{ - return idif_ost_idx(fid_seq(fid)); -} - -/** - * Get inode number from an igif. - * \param fid an igif to get inode number from. - * \return inode number for the igif. - */ -static inline ino_t lu_igif_ino(const struct lu_fid *fid) -{ - return fid_seq(fid); -} - -/** - * Get inode generation from an igif. - * \param fid an igif to get inode generation from. - * \return inode generation for the igif. - */ -static inline __u32 lu_igif_gen(const struct lu_fid *fid) -{ - return fid_oid(fid); -} - -/** - * Build igif from the inode number/generation. - */ -static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) -{ - fid->f_seq = ino; - fid->f_oid = gen; - fid->f_ver = 0; -} - -/* - * Fids are transmitted across network (in the sender byte-ordering), - * and stored on disk in big-endian order. - */ -static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) -{ - dst->f_seq = __cpu_to_le64(fid_seq(src)); - dst->f_oid = __cpu_to_le32(fid_oid(src)); - dst->f_ver = __cpu_to_le32(fid_ver(src)); -} - -static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) -{ - dst->f_seq = __le64_to_cpu(fid_seq(src)); - dst->f_oid = __le32_to_cpu(fid_oid(src)); - dst->f_ver = __le32_to_cpu(fid_ver(src)); -} - -static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) -{ - dst->f_seq = __cpu_to_be64(fid_seq(src)); - dst->f_oid = __cpu_to_be32(fid_oid(src)); - dst->f_ver = __cpu_to_be32(fid_ver(src)); -} - -static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) -{ - dst->f_seq = __be64_to_cpu(fid_seq(src)); - dst->f_oid = __be32_to_cpu(fid_oid(src)); - dst->f_ver = __be32_to_cpu(fid_ver(src)); -} - -static inline bool fid_is_sane(const struct lu_fid *fid) -{ - return fid && ((fid_seq(fid) >= FID_SEQ_START && !fid_ver(fid)) || - fid_is_igif(fid) || fid_is_idif(fid) || - fid_seq_is_rsvd(fid_seq(fid))); -} - -static inline bool lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) -{ - return !memcmp(f0, f1, sizeof(*f0)); -} - -static inline int lu_fid_cmp(const struct lu_fid *f0, - const struct lu_fid *f1) -{ - if (fid_seq(f0) != fid_seq(f1)) - return fid_seq(f0) > fid_seq(f1) ? 1 : -1; - - if (fid_oid(f0) != fid_oid(f1)) - return fid_oid(f0) > fid_oid(f1) ? 1 : -1; - - if (fid_ver(f0) != fid_ver(f1)) - return fid_ver(f0) > fid_ver(f1) ? 
1 : -1; - - return 0; -} -#endif diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_fiemap.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_fiemap.h deleted file mode 100644 index d375a476f5eadb4a2b180f13d8c23513df32fdfd..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_fiemap.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * FIEMAP data structures and flags. This header file will be used until - * fiemap.h is available in the upstream kernel. - * - * Author: Kalpak Shah - * Author: Andreas Dilger - */ - -#ifndef _LUSTRE_FIEMAP_H -#define _LUSTRE_FIEMAP_H - -#include -#include - -/* XXX: We use fiemap_extent::fe_reserved[0] */ -#define fe_device fe_reserved[0] - -static inline size_t fiemap_count_to_size(size_t extent_count) -{ - return sizeof(struct fiemap) + extent_count * - sizeof(struct fiemap_extent); -} - -static inline unsigned int fiemap_size_to_count(size_t array_size) -{ - return (array_size - sizeof(struct fiemap)) / - sizeof(struct fiemap_extent); -} - -#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ - -#ifdef FIEMAP_FLAGS_COMPAT -#undef FIEMAP_FLAGS_COMPAT -#endif - -/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ -#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ -#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. - * Sets NO_DIRECT flag - */ - -#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_idl.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_idl.h deleted file mode 100644 index 6c7e3992d646c34658560db01d125d4868c8322b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_idl.h +++ /dev/null @@ -1,2690 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
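/*
 * Usage sketch, not part of the original fiemap header: the two sizing
 * helpers above are inverses, so a buffer sized for n extents maps back
 * to a count of n.
 */
static inline size_t example_fiemap_buffer_bytes(unsigned int n_extents)
{
	size_t size = fiemap_count_to_size(n_extents);

	/* fiemap_size_to_count(size) == n_extents by construction */
	return size;
}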
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Lustre wire protocol definitions. - */ - -/** \defgroup lustreidl lustreidl - * - * Lustre wire protocol definitions. - * - * ALL structs passing over the wire should be declared here. Structs - * that are used in interfaces with userspace should go in lustre_user.h. - * - * All structs being declared here should be built from simple fixed-size - * types (__u8, __u16, __u32, __u64) or be built from other types or - * structs also declared in this file. Similarly, all flags and magic - * values in those structs should also be declared here. This ensures - * that the Lustre wire protocol is not influenced by external dependencies. - * - * The only other acceptable items in this file are VERY SIMPLE accessor - * functions to avoid callers grubbing inside the structures. Nothing that - * depends on external functions or definitions should be in here. - * - * Structs must be properly aligned to put 64-bit values on an 8-byte - * boundary. Any structs being added here must also be added to - * utils/wirecheck.c and "make newwiretest" run to regenerate the - * utils/wiretest.c sources. This allows us to verify that wire structs - * have the proper alignment/size on all architectures. - * - * DO NOT CHANGE any of the structs, flags, values declared here and used - * in released Lustre versions. Some structs may have padding fields that - * can be used. Some structs might allow addition at the end (verify this - * in the code to ensure that new/old clients that see this larger struct - * do not fail, otherwise you need to implement protocol compatibility). - * - * @{ - */ - -#ifndef _LUSTRE_IDL_H_ -#define _LUSTRE_IDL_H_ - -#include -#include - -#include -/* Defn's shared with user-space. 
*/ -#include -#include - -/* - * GENERAL STUFF - */ -/* FOO_REQUEST_PORTAL is for incoming requests on the FOO - * FOO_REPLY_PORTAL is for incoming replies on the FOO - * FOO_BULK_PORTAL is for incoming bulk on the FOO - */ - -/* Lustre service names are following the format - * service name + MDT + seq name - */ -#define LUSTRE_MDT_MAXNAMELEN 80 - -#define CONNMGR_REQUEST_PORTAL 1 -#define CONNMGR_REPLY_PORTAL 2 -/*#define OSC_REQUEST_PORTAL 3 */ -#define OSC_REPLY_PORTAL 4 -/*#define OSC_BULK_PORTAL 5 */ -#define OST_IO_PORTAL 6 -#define OST_CREATE_PORTAL 7 -#define OST_BULK_PORTAL 8 -/*#define MDC_REQUEST_PORTAL 9 */ -#define MDC_REPLY_PORTAL 10 -/*#define MDC_BULK_PORTAL 11 */ -#define MDS_REQUEST_PORTAL 12 -/*#define MDS_REPLY_PORTAL 13 */ -#define MDS_BULK_PORTAL 14 -#define LDLM_CB_REQUEST_PORTAL 15 -#define LDLM_CB_REPLY_PORTAL 16 -#define LDLM_CANCEL_REQUEST_PORTAL 17 -#define LDLM_CANCEL_REPLY_PORTAL 18 -/*#define PTLBD_REQUEST_PORTAL 19 */ -/*#define PTLBD_REPLY_PORTAL 20 */ -/*#define PTLBD_BULK_PORTAL 21 */ -#define MDS_SETATTR_PORTAL 22 -#define MDS_READPAGE_PORTAL 23 -#define OUT_PORTAL 24 - -#define MGC_REPLY_PORTAL 25 -#define MGS_REQUEST_PORTAL 26 -#define MGS_REPLY_PORTAL 27 -#define OST_REQUEST_PORTAL 28 -#define FLD_REQUEST_PORTAL 29 -#define SEQ_METADATA_PORTAL 30 -#define SEQ_DATA_PORTAL 31 -#define SEQ_CONTROLLER_PORTAL 32 -#define MGS_BULK_PORTAL 33 - -/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, - * n8851@cray.com - */ - -/* packet types */ -#define PTL_RPC_MSG_REQUEST 4711 -#define PTL_RPC_MSG_ERR 4712 -#define PTL_RPC_MSG_REPLY 4713 - -/* DON'T use swabbed values of MAGIC as magic! */ -#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 -#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B - -#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 - -#define PTLRPC_MSG_VERSION 0x00000003 -#define LUSTRE_VERSION_MASK 0xffff0000 -#define LUSTRE_OBD_VERSION 0x00010000 -#define LUSTRE_MDS_VERSION 0x00020000 -#define LUSTRE_OST_VERSION 0x00030000 -#define LUSTRE_DLM_VERSION 0x00040000 -#define LUSTRE_LOG_VERSION 0x00050000 -#define LUSTRE_MGS_VERSION 0x00060000 - -/** - * Describes a range of sequence, lsr_start is included but lsr_end is - * not in the range. - * Same structure is used in fld module where lsr_index field holds mdt id - * of the home mdt. - */ -struct lu_seq_range { - __u64 lsr_start; - __u64 lsr_end; - __u32 lsr_index; - __u32 lsr_flags; -}; - -struct lu_seq_range_array { - __u32 lsra_count; - __u32 lsra_padding; - struct lu_seq_range lsra_lsr[0]; -}; - -#define LU_SEQ_RANGE_MDT 0x0 -#define LU_SEQ_RANGE_OST 0x1 -#define LU_SEQ_RANGE_ANY 0x3 - -#define LU_SEQ_RANGE_MASK 0x3 - -/** \defgroup lu_fid lu_fid - * @{ - */ - -/** - * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat. - * Deprecated since HSM and SOM attributes are now stored in separate on-disk - * xattr. - */ -enum lma_compat { - LMAC_HSM = 0x00000001, -/* LMAC_SOM = 0x00000002, obsolete since 2.8.0 */ - LMAC_NOT_IN_OI = 0x00000004, /* the object does NOT need OI mapping */ - LMAC_FID_ON_OST = 0x00000008, /* For OST-object, its OI mapping is - * under /O//d. - */ -}; - -/** - * Masks for all features that should be supported by a Lustre version to - * access a specific file. - * This information is stored in lustre_mdt_attrs::lma_incompat. 
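/*
 * Illustrative helpers, not part of the original header: lu_seq_range is
 * documented above as half-open (lsr_start included, lsr_end excluded),
 * so a containment test looks like this; the size assertion reflects the
 * fixed-size wire layout rule quoted above (2 x __u64 + 2 x __u32).
 */
static inline bool example_seq_in_range(const struct lu_seq_range *range,
					__u64 seq)
{
	return seq >= range->lsr_start && seq < range->lsr_end;
}

typedef char example_lu_seq_range_size_check[sizeof(struct lu_seq_range) == 24 ? 1 : -1];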
- */ -enum lma_incompat { - LMAI_RELEASED = 0x00000001, /* file is released */ - LMAI_AGENT = 0x00000002, /* agent inode */ - LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object - * is on the remote MDT - */ -}; - -#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT) - -/** - * fid constants - */ -enum { - /** LASTID file has zero OID */ - LUSTRE_FID_LASTID_OID = 0UL, - /** initial fid id value */ - LUSTRE_FID_INIT_OID = 1UL -}; - -/* copytool uses a 32b bitmask field to encode archive-Ids during register - * with MDT thru kuc. - * archive num = 0 => all - * archive num from 1 to 32 - */ -#define LL_HSM_MAX_ARCHIVE (sizeof(__u32) * 8) - -/** - * Note that reserved SEQ numbers below 12 will conflict with ldiskfs - * inodes in the IGIF namespace, so these reserved SEQ numbers can be - * used for other purposes and not risk collisions with existing inodes. - * - * Different FID Format - * http://wiki.old.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs - */ -enum fid_seq { - FID_SEQ_OST_MDT0 = 0, - FID_SEQ_LLOG = 1, /* unnamed llogs */ - FID_SEQ_ECHO = 2, - FID_SEQ_OST_MDT1 = 3, - FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */ - FID_SEQ_LLOG_NAME = 10, /* named llogs */ - FID_SEQ_RSVD = 11, - FID_SEQ_IGIF = 12, - FID_SEQ_IGIF_MAX = 0x0ffffffffULL, - FID_SEQ_IDIF = 0x100000000ULL, - FID_SEQ_IDIF_MAX = 0x1ffffffffULL, - /* Normal FID sequence starts from this value, i.e. 1<<33 */ - FID_SEQ_START = 0x200000000ULL, - /* sequence for local pre-defined FIDs listed in local_oid */ - FID_SEQ_LOCAL_FILE = 0x200000001ULL, - FID_SEQ_DOT_LUSTRE = 0x200000002ULL, - /* sequence is used for local named objects FIDs generated - * by local_object_storage library - */ - FID_SEQ_LOCAL_NAME = 0x200000003ULL, - /* Because current FLD will only cache the fid sequence, instead - * of oid on the client side, if the FID needs to be exposed to - * clients sides, it needs to make sure all of fids under one - * sequence will be located in one MDT. - */ - FID_SEQ_SPECIAL = 0x200000004ULL, - FID_SEQ_QUOTA = 0x200000005ULL, - FID_SEQ_QUOTA_GLB = 0x200000006ULL, - FID_SEQ_ROOT = 0x200000007ULL, /* Located on MDT0 */ - FID_SEQ_NORMAL = 0x200000400ULL, - FID_SEQ_LOV_DEFAULT = 0xffffffffffffffffULL -}; - -#define OBIF_OID_MAX_BITS 32 -#define OBIF_MAX_OID (1ULL << OBIF_OID_MAX_BITS) -#define OBIF_OID_MASK ((1ULL << OBIF_OID_MAX_BITS) - 1) -#define IDIF_OID_MAX_BITS 48 -#define IDIF_MAX_OID (1ULL << IDIF_OID_MAX_BITS) -#define IDIF_OID_MASK ((1ULL << IDIF_OID_MAX_BITS) - 1) - -/** OID for FID_SEQ_SPECIAL */ -enum special_oid { - /* Big Filesystem Lock to serialize rename operations */ - FID_OID_SPECIAL_BFL = 1UL, -}; - -/** OID for FID_SEQ_DOT_LUSTRE */ -enum dot_lustre_oid { - FID_OID_DOT_LUSTRE = 1UL, - FID_OID_DOT_LUSTRE_OBF = 2UL, -}; - -/** OID for FID_SEQ_ROOT */ -enum root_oid { - FID_OID_ROOT = 1UL, - FID_OID_ECHO_ROOT = 2UL, -}; - -/** @} lu_fid */ - -/** \defgroup lu_dir lu_dir - * @{ - */ - -/** - * Enumeration of possible directory entry attributes. - * - * Attributes follow directory entry header in the order they appear in this - * enumeration. - */ -enum lu_dirent_attrs { - LUDA_FID = 0x0001, - LUDA_TYPE = 0x0002, - LUDA_64BITHASH = 0x0004, -}; - -/** - * Layout of readdir pages, as transmitted on wire. - */ -struct lu_dirent { - /** valid if LUDA_FID is set. */ - struct lu_fid lde_fid; - /** a unique entry identifier: a hash or an offset. */ - __u64 lde_hash; - /** total record length, including all attributes. 
- */
-	__u16 lde_reclen;
-	/** name length */
-	__u16 lde_namelen;
-	/** optional variable size attributes following this entry.
-	 * taken from enum lu_dirent_attrs.
-	 */
-	__u32 lde_attrs;
-	/** name is followed by the attributes indicated in ->lde_attrs, in
-	 * their natural order. After the last attribute, padding bytes are
-	 * added to make ->lde_reclen a multiple of 8.
-	 */
-	char lde_name[0];
-};
-
-/*
- * Definitions of optional directory entry attribute formats.
- *
- * Individual attributes do not have their length encoded in a generic way.
- * It is assumed that the consumer of an attribute knows its format. This
- * means that it is impossible to skip over an unknown attribute, except by
- * skipping over all remaining attributes (by using ->lde_reclen), which is
- * not too constraining, because new server versions will append new
- * attributes at the end of an entry.
- */
-
-/**
- * Fid directory attribute: a fid of an object referenced by the entry. This
- * will almost always be requested by the client and supplied by the server.
- *
- * Aligned to 8 bytes.
- */
-/* To have compatibility with 1.8, let's keep the fid in the lu_dirent
- * struct.
- */
-
-/**
- * File type.
- *
- * Aligned to 2 bytes.
- */
-struct luda_type {
-	__u16 lt_type;
-};
-
-#ifndef IFSHIFT
-#define IFSHIFT 12
-#endif
-
-#ifndef IFTODT
-#define IFTODT(type) (((type) & S_IFMT) >> IFSHIFT)
-#endif
-#ifndef DTTOIF
-#define DTTOIF(dirtype) ((dirtype) << IFSHIFT)
-#endif
-
-struct lu_dirpage {
-	__le64 ldp_hash_start;
-	__le64 ldp_hash_end;
-	__le32 ldp_flags;
-	__le32 ldp_pad0;
-	struct lu_dirent ldp_entries[0];
-};
-
-enum lu_dirpage_flags {
-	/**
-	 * dirpage contains no entry.
-	 */
-	LDF_EMPTY = 1 << 0,
-	/**
-	 * last entry's lde_hash equals ldp_hash_end.
-	 */
-	LDF_COLLIDE = 1 << 1
-};
-
-static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp)
-{
-	if (__le32_to_cpu(dp->ldp_flags) & LDF_EMPTY)
-		return NULL;
-	else
-		return dp->ldp_entries;
-}
-
-static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent)
-{
-	struct lu_dirent *next;
-
-	if (__le16_to_cpu(ent->lde_reclen) != 0)
-		next = ((void *)ent) + __le16_to_cpu(ent->lde_reclen);
-	else
-		next = NULL;
-
-	return next;
-}
-
-static inline size_t lu_dirent_calc_size(size_t namelen, __u16 attr)
-{
-	size_t size;
-
-	if (attr & LUDA_TYPE) {
-		const size_t align = sizeof(struct luda_type) - 1;
-
-		size = (sizeof(struct lu_dirent) + namelen + align) & ~align;
-		size += sizeof(struct luda_type);
-	} else {
-		size = sizeof(struct lu_dirent) + namelen;
-	}
-
-	return (size + 7) & ~7;
-}
-
-#define MDS_DIR_END_OFF 0xfffffffffffffffeULL
-
-/**
- * MDS_READPAGE page size
- *
- * This is the directory page size packed in the MDS_READPAGE RPC. It's
- * different from PAGE_SIZE because the client needs to access the struct
- * lu_dirpage header packed at the beginning of the "page"; without a fixed
- * page size there would be no way to find the lu_dirpage header if client
- * and server PAGE_SIZE differ.
- */ -#define LU_PAGE_SHIFT 12 -#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) -#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) - -#define LU_PAGE_COUNT (1 << (PAGE_SHIFT - LU_PAGE_SHIFT)) - -/** @} lu_dir */ - -struct lustre_handle { - __u64 cookie; -}; - -#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL - -static inline bool lustre_handle_is_used(const struct lustre_handle *lh) -{ - return lh->cookie != 0ull; -} - -static inline bool lustre_handle_equal(const struct lustre_handle *lh1, - const struct lustre_handle *lh2) -{ - return lh1->cookie == lh2->cookie; -} - -static inline void lustre_handle_copy(struct lustre_handle *tgt, - const struct lustre_handle *src) -{ - tgt->cookie = src->cookie; -} - -/* flags for lm_flags */ -#define MSGHDR_AT_SUPPORT 0x1 -#define MSGHDR_CKSUM_INCOMPAT18 0x2 - -#define lustre_msg lustre_msg_v2 -/* we depend on this structure to be 8-byte aligned */ -/* this type is only endian-adjusted in lustre_unpack_msg() */ -struct lustre_msg_v2 { - __u32 lm_bufcount; - __u32 lm_secflvr; - __u32 lm_magic; - __u32 lm_repsize; - __u32 lm_cksum; - __u32 lm_flags; - __u32 lm_padding_2; - __u32 lm_padding_3; - __u32 lm_buflens[0]; -}; - -/* without gss, ptlrpc_body is put at the first buffer. */ -#define PTLRPC_NUM_VERSIONS 4 - -struct ptlrpc_body_v3 { - struct lustre_handle pb_handle; - __u32 pb_type; - __u32 pb_version; - __u32 pb_opc; - __u32 pb_status; - __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ - __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ - __u16 pb_padding0; - __u32 pb_padding1; - __u64 pb_last_committed; - __u64 pb_transno; - __u32 pb_flags; - __u32 pb_op_flags; - __u32 pb_conn_cnt; - __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ - __u32 pb_service_time; /* for rep, actual service time */ - __u32 pb_limit; - __u64 pb_slv; - /* VBR: pre-versions */ - __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; - __u64 pb_mbits; /**< match bits for bulk request */ - /* padding for future needs */ - __u64 pb_padding64_0; - __u64 pb_padding64_1; - __u64 pb_padding64_2; - char pb_jobid[LUSTRE_JOBID_SIZE]; -}; - -#define ptlrpc_body ptlrpc_body_v3 - -struct ptlrpc_body_v2 { - struct lustre_handle pb_handle; - __u32 pb_type; - __u32 pb_version; - __u32 pb_opc; - __u32 pb_status; - __u64 pb_last_xid; /* highest replied XID without lower unreplied XID */ - __u16 pb_tag; /* virtual slot idx for multiple modifying RPCs */ - __u16 pb_padding0; - __u32 pb_padding1; - __u64 pb_last_committed; - __u64 pb_transno; - __u32 pb_flags; - __u32 pb_op_flags; - __u32 pb_conn_cnt; - __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ - __u32 pb_service_time; /* for rep, actual service time, also used for - * net_latency of req - */ - __u32 pb_limit; - __u64 pb_slv; - /* VBR: pre-versions */ - __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; - __u64 pb_mbits; /**< unused in V2 */ - /* padding for future needs */ - __u64 pb_padding64_0; - __u64 pb_padding64_1; - __u64 pb_padding64_2; -}; - -/* message body offset for lustre_msg_v2 */ -/* ptlrpc body offset in all request/reply messages */ -#define MSG_PTLRPC_BODY_OFF 0 - -/* normal request/reply message record offset */ -#define REQ_REC_OFF 1 -#define REPLY_REC_OFF 1 - -/* ldlm request message body offset */ -#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ -#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ - -/* ldlm intent lock message body offset */ -#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ -#define DLM_INTENT_REC_OFF 3 /* intent lock record 
offset */
-
-/* ldlm reply message body offset */
-#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */
-#define DLM_REPLY_REC_OFF 2 /* reply record offset */
-
-/** only use in req->rq_{req,rep}_swab_mask */
-#define MSG_PTLRPC_HEADER_OFF 31
-
-/* Flags that are operation-specific go in the top 16 bits. */
-#define MSG_OP_FLAG_MASK 0xffff0000
-#define MSG_OP_FLAG_SHIFT 16
-
-/* Flags that apply to all requests are in the bottom 16 bits */
-#define MSG_GEN_FLAG_MASK 0x0000ffff
-#define MSG_LAST_REPLAY 0x0001
-#define MSG_RESENT 0x0002
-#define MSG_REPLAY 0x0004
-/* #define MSG_AT_SUPPORT 0x0008
- * This was used in early prototypes of adaptive timeouts, and while there
- * shouldn't be any users of that code there also isn't a need for using
- * these bits. Defer usage until at least 1.10 to avoid potential conflicts.
- */
-#define MSG_DELAY_REPLAY 0x0010
-#define MSG_VERSION_REPLAY 0x0020
-#define MSG_REQ_REPLAY_DONE 0x0040
-#define MSG_LOCK_REPLAY_DONE 0x0080
-
-/*
- * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT)
- */
-
-#define MSG_CONNECT_RECOVERING 0x00000001
-#define MSG_CONNECT_RECONNECT 0x00000002
-#define MSG_CONNECT_REPLAYABLE 0x00000004
-/*#define MSG_CONNECT_PEER 0x8 */
-#define MSG_CONNECT_LIBCLIENT 0x00000010
-#define MSG_CONNECT_INITIAL 0x00000020
-#define MSG_CONNECT_ASYNC 0x00000040
-#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */
-#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */
-
-/* Connect flags */
-#define OBD_CONNECT_RDONLY 0x1ULL /* client has read-only access */
-#define OBD_CONNECT_INDEX 0x2ULL /* connect specific LOV idx */
-#define OBD_CONNECT_MDS 0x4ULL /* connect from MDT to OST */
-#define OBD_CONNECT_GRANT 0x8ULL /* OSC gets grant at connect */
-#define OBD_CONNECT_SRVLOCK 0x10ULL /* server takes locks for cli */
-#define OBD_CONNECT_VERSION 0x20ULL /* Lustre versions in ocd */
-#define OBD_CONNECT_REQPORTAL 0x40ULL /* separate non-IO req portal */
-#define OBD_CONNECT_ACL 0x80ULL /* access control lists */
-#define OBD_CONNECT_XATTR 0x100ULL /* client uses extended attrs */
-#define OBD_CONNECT_LARGE_ACL 0x200ULL /* more than 32 ACL entries */
-#define OBD_CONNECT_TRUNCLOCK 0x400ULL /* locks on server for punch */
-#define OBD_CONNECT_TRANSNO 0x800ULL /* replay sends init transno */
-#define OBD_CONNECT_IBITS 0x1000ULL /* support for inodebits locks */
-#define OBD_CONNECT_JOIN 0x2000ULL /* files can be concatenated.
-					 * We no longer support JOIN FILE;
-					 * this flag is reserved to prevent
-					 * the bit from being reused.
-					 */
-#define OBD_CONNECT_ATTRFID 0x4000ULL /* server can GetAttr by fid */
-#define OBD_CONNECT_NODEVOH 0x8000ULL /* no open hndl on specl nodes */
-#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /* remote client, never used
-					 * in production. Removed in
-					 * 2.9. Keep this flag to
-					 * avoid reuse.
-					 */
-#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /* remote client by force,
-					 * never used in production.
-					 * Removed in 2.9. Keep this
-					 * flag to avoid reuse.
-					 */
-#define OBD_CONNECT_BRW_SIZE 0x40000ULL /* max bytes per rpc */
-#define OBD_CONNECT_QUOTA64 0x80000ULL /* not used since 2.4 */
-#define OBD_CONNECT_MDS_CAPA 0x100000ULL /* MDS capability */
-#define OBD_CONNECT_OSS_CAPA 0x200000ULL /* OSS capability */
-#define OBD_CONNECT_CANCELSET 0x400000ULL /* early batched cancels */
-#define OBD_CONNECT_SOM 0x800000ULL /* size on MDS */
-#define OBD_CONNECT_AT 0x1000000ULL /* client uses AT */
-#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /* LRU resize feature.
*/ -#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ -#define OBD_CONNECT_REAL 0x8000000ULL /* obsolete since 2.8 */ -#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ -#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ -#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ -#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ -#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ -#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ -#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ -#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ -#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ -#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ -#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits - * directory hash - */ -#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ -#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ -#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ -#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ -#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS - * RPC error properly - */ -#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for - * finer space reservation - */ -#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 - * policy and 2.x server - */ -#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ -#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ -#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ -#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ -#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ -#define OBD_CONNECT_FLOCK_DEAD 0x8000000000000ULL/* flock deadlock detection */ -#define OBD_CONNECT_DISP_STRIPE 0x10000000000000ULL/*create stripe disposition*/ -#define OBD_CONNECT_OPEN_BY_FID 0x20000000000000ULL /* open by fid won't pack - * name in request - */ -#define OBD_CONNECT_LFSCK 0x40000000000000ULL/* support online LFSCK */ -#define OBD_CONNECT_UNLINK_CLOSE 0x100000000000000ULL/* close file in unlink */ -#define OBD_CONNECT_MULTIMODRPCS 0x200000000000000ULL /* support multiple modify - * RPCs in parallel - */ -#define OBD_CONNECT_DIR_STRIPE 0x400000000000000ULL/* striped DNE dir */ -#define OBD_CONNECT_SUBTREE 0x800000000000000ULL /* fileset mount */ -#define OBD_CONNECT_LOCK_AHEAD 0x1000000000000000ULL /* lock ahead */ -/** bulk matchbits is sent within ptlrpc_body */ -#define OBD_CONNECT_BULK_MBITS 0x2000000000000000ULL -#define OBD_CONNECT_OBDOPACK 0x4000000000000000ULL /* compact OUT obdo */ -#define OBD_CONNECT_FLAGS2 0x8000000000000000ULL /* second flags word */ - -/* XXX README XXX: - * Please DO NOT add flag values here before first ensuring that this same - * flag value is not in use on some other branch. Please clear any such - * changes with senior engineers before starting to use a new flag. Then, - * submit a small patch against EVERY branch that ONLY adds the new flag, - * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the - * flag to check_obd_connect_data(), and updates wiretests accordingly, so it - * can be approved and landed easily to reserve the flag for future use. 
- */ - -/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS - * connection. It is a temporary bug fix for Imperative Recovery interop - * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for - * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. - */ -#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS - -#define OCD_HAS_FLAG(ocd, flg) \ - (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) - -/* Features required for this version of the client to work with server */ -#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ - OBD_CONNECT_FULL20) - -/* This structure is used for both request and reply. - * - * If we eventually have separate connect data for different types, which we - * almost certainly will, then perhaps we stick a union in here. - */ -struct obd_connect_data { - __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ - __u32 ocd_version; /* lustre release version number */ - __u32 ocd_grant; /* initial cache grant amount (bytes) */ - __u32 ocd_index; /* LOV index to connect to */ - __u32 ocd_brw_size; /* Maximum BRW size in bytes */ - __u64 ocd_ibits_known; /* inode bits this client understands */ - __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ - __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ - __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ - __u32 ocd_unused; /* also fix lustre_swab_connect */ - __u64 ocd_transno; /* first transno from client to be replayed */ - __u32 ocd_group; /* MDS group on OST */ - __u32 ocd_cksum_types; /* supported checksum algorithms */ - __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ - __u32 ocd_instance; /* instance # of this target */ - __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ - /* Fields after ocd_maxbytes are only accessible by the receiver - * if the corresponding flag in ocd_connect_flags is set. Accessing - * any field after ocd_maxbytes on the receiver without a valid flag - * may result in out-of-bound memory access and kernel oops. - */ - __u16 ocd_maxmodrpcs; /* Maximum modify RPCs in parallel */ - __u16 padding0; /* added 2.1.0. also fix lustre_swab_connect */ - __u32 padding1; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 ocd_connect_flags2; - __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ - __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ -}; - -/* XXX README XXX: - * Please DO NOT use any fields here before first ensuring that this same - * field is not in use on some other branch. Please clear any such changes - * with senior engineers before starting to use a new field. 
Then, submit
- * a small patch against EVERY branch that ONLY adds the new field along
- * with the matching OBD_CONNECT flag, so that it can be approved and
- * landed easily to reserve the flag for future use.
- */
-
-/*
- * Supported checksum algorithms. Up to 32 checksum types are supported.
- * (32-bit mask stored in obd_connect_data::ocd_cksum_types)
- * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new
- * algorithm and also the OBD_FL_CKSUM* flags.
- */
-enum cksum_type {
-	OBD_CKSUM_CRC32 = 0x00000001,
-	OBD_CKSUM_ADLER = 0x00000002,
-	OBD_CKSUM_CRC32C = 0x00000004,
-};
-
-/*
- * OST requests: OBDO & OBD request records
- */
-
-/* opcodes */
-enum ost_cmd {
-	OST_REPLY = 0, /* reply ? */
-	OST_GETATTR = 1,
-	OST_SETATTR = 2,
-	OST_READ = 3,
-	OST_WRITE = 4,
-	OST_CREATE = 5,
-	OST_DESTROY = 6,
-	OST_GET_INFO = 7,
-	OST_CONNECT = 8,
-	OST_DISCONNECT = 9,
-	OST_PUNCH = 10,
-	OST_OPEN = 11,
-	OST_CLOSE = 12,
-	OST_STATFS = 13,
-	OST_SYNC = 16,
-	OST_SET_INFO = 17,
-	OST_QUOTACHECK = 18, /* not used since 2.4 */
-	OST_QUOTACTL = 19,
-	OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */
-	OST_LAST_OPC
-};
-#define OST_FIRST_OPC OST_REPLY
-
-enum obdo_flags {
-	OBD_FL_INLINEDATA = 0x00000001,
-	OBD_FL_OBDMDEXISTS = 0x00000002,
-	OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */
-	OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */
-	OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id */
-	OBD_FL_RECREATE_OBJS = 0x00000020, /* recreate missing obj */
-	OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */
-	OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */
-	OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */
-	OBD_FL_CREATE_CROW = 0x00000400, /* object should be created on write */
-	OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */
-	OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */
-	OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */
-	OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */
-	OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */
-	OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */
-	OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrinks the grant */
-	OBD_FL_MMAP = 0x00040000, /* object is mmapped on the client.
-				   * XXX: obsolete - reserved for old
-				   * clients prior to 2.2
-				   */
-	OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */
-	OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */
-	OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */
-	OBD_FL_SHORT_IO = 0x00400000, /* short io request */
-
-	/* Note that while these checksum values are currently separate bits,
-	 * in 2.x we can actually allow all values from 1-31 if we wanted.
-	 */
-	OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER |
-			   OBD_FL_CKSUM_CRC32C,
-
-	/* mask for local-only flag, which won't be sent over network */
-	OBD_FL_LOCAL_MASK = 0xF0000000,
-};
-
-/*
- * All LOV EA magics should have the same postfix. If some new version of
- * Lustre introduces a new LOV EA magic, then when downgrading to an old
- * Lustre, even though the old system does not recognize the new magic,
- * it can still distinguish the corrupted cases by checking
- * the magic's postfix.
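- *
- * For example (illustrative note, not from the original comment): even if
- * an old client reads an unknown future magic such as 0x0BD60BD0, the
- * test (magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC below still identifies
- * the blob as a LOV EA rather than random corruption.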
- */ -#define LOV_MAGIC_MAGIC 0x0BD0 -#define LOV_MAGIC_MASK 0xFFFF - -#define LOV_MAGIC_V1 (0x0BD10000 | LOV_MAGIC_MAGIC) -#define LOV_MAGIC_JOIN_V1 (0x0BD20000 | LOV_MAGIC_MAGIC) -#define LOV_MAGIC_V3 (0x0BD30000 | LOV_MAGIC_MAGIC) -#define LOV_MAGIC_MIGRATE (0x0BD40000 | LOV_MAGIC_MAGIC) -/* reserved for specifying OSTs */ -#define LOV_MAGIC_SPECIFIC (0x0BD50000 | LOV_MAGIC_MAGIC) -#define LOV_MAGIC LOV_MAGIC_V1 - -/* - * magic for fully defined striping - * the idea is that we should have different magics for striping "hints" - * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct - * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, - * we can't just change it w/o long way preparation, but we still need a - * mechanism to allow LOD to differentiate hint versus ready striping. - * so, at the moment we do a trick: MDT knows what to expect from request - * depending on the case (replay uses ready striping, non-replay req uses - * hints), so MDT replaces magic with appropriate one and now LOD can - * easily understand what's inside -bzzz - */ -#define LOV_MAGIC_V1_DEF 0x0CD10BD0 -#define LOV_MAGIC_V3_DEF 0x0CD30BD0 - -#define lov_pattern(pattern) (pattern & ~LOV_PATTERN_F_MASK) -#define lov_pattern_flags(pattern) (pattern & LOV_PATTERN_F_MASK) - -#define lov_ost_data lov_ost_data_v1 -struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ - struct ost_id l_ost_oi; /* OST object ID */ - __u32 l_ost_gen; /* generation of this l_ost_idx */ - __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ -}; - -#define lov_mds_md lov_mds_md_v1 -struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ - __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* LOV object ID */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - /* lmm_stripe_count used to be __u32 */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - __u16 lmm_layout_gen; /* layout generation number */ - struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -}; - -#define MAX_MD_SIZE \ - (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) -#define MIN_MD_SIZE \ - (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) - -#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" -#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" -#define XATTR_USER_PREFIX "user." -#define XATTR_TRUSTED_PREFIX "trusted." -#define XATTR_SECURITY_PREFIX "security." -#define XATTR_LUSTRE_PREFIX "lustre." 
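-
-/* A minimal sketch (added for illustration, not part of the original
- * header) of the magic/postfix convention described above; the helper
- * name lov_magic_is_lov_ea() is hypothetical.
- */
-static inline bool lov_magic_is_lov_ea(__u32 magic)
-{
-	return (magic & LOV_MAGIC_MASK) == LOV_MAGIC_MAGIC;
-}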
- -#define XATTR_NAME_LOV "trusted.lov" -#define XATTR_NAME_LMA "trusted.lma" -#define XATTR_NAME_LMV "trusted.lmv" -#define XATTR_NAME_DEFAULT_LMV "trusted.dmv" -#define XATTR_NAME_LINK "trusted.link" -#define XATTR_NAME_FID "trusted.fid" -#define XATTR_NAME_VERSION "trusted.version" -#define XATTR_NAME_SOM "trusted.som" -#define XATTR_NAME_HSM "trusted.hsm" -#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace" - -struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ - __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* LOV object ID */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - /* lmm_stripe_count used to be __u32 */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - __u16 lmm_layout_gen; /* layout generation number */ - char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* must be 32bit aligned */ - struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -}; - -static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) -{ - if (lmm_magic == LOV_MAGIC_V3) - return sizeof(struct lov_mds_md_v3) + - stripes * sizeof(struct lov_ost_data_v1); - else - return sizeof(struct lov_mds_md_v1) + - stripes * sizeof(struct lov_ost_data_v1); -} - -static inline __u32 -lov_mds_md_max_stripe_count(size_t buf_size, __u32 lmm_magic) -{ - switch (lmm_magic) { - case LOV_MAGIC_V1: { - struct lov_mds_md_v1 lmm; - - if (buf_size < sizeof(lmm)) - return 0; - - return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); - } - case LOV_MAGIC_V3: { - struct lov_mds_md_v3 lmm; - - if (buf_size < sizeof(lmm)) - return 0; - - return (buf_size - sizeof(lmm)) / sizeof(lmm.lmm_objects[0]); - } - default: - return 0; - } -} - -#define OBD_MD_FLID (0x00000001ULL) /* object ID */ -#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ -#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ -#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ -#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ -#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ -#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ -#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ -#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ -#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ -#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ -#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ -#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ -#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ -/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. 
used until 1.6.5 */ -#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ -#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ -#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ -#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ -#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ -#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ -/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */ -/* OBD_MD_FLCOOKIE (0x00800000ULL) obsolete in 2.8 */ -#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ -#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ -#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ - /* ->mds if epoch opens or closes - */ -#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ -#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ -#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ -#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ -#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ - -#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ -#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ -#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ -#define OBD_MD_TSTATE (0x0000000800000000ULL) /* transient state field */ - -#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ -#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ -#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ -#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ -/* OBD_MD_FLRMTPERM (0x0000010000000000ULL) remote perm, obsolete */ -#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ -#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ -#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ -#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ -#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes - * under lock; for xattr - * requests means the - * client holds the lock - */ -#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ - -/* OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) lfs lsetfacl, obsolete */ -/* OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) lfs lgetfacl, obsolete */ -/* OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) lfs rsetfacl, obsolete */ -/* OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) lfs rgetfacl, obsolete */ - -#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ -#define OBD_MD_CLOSE_INTENT_EXECED (0x0020000000000000ULL) /* close intent - * executed - */ - -#define OBD_MD_DEFAULT_MEA (0x0040000000000000ULL) /* default MEA */ - -#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ - OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ - OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ - OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ - OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP) - -#define OBD_MD_FLXATTRALL (OBD_MD_FLXATTR | OBD_MD_FLXATTRLS) - -/* don't forget obdo_fid which is way down at the bottom so it can - * come after the definition of llog_cookie - */ - -enum hss_valid { - HSS_SETMASK = 0x01, - HSS_CLEARMASK = 0x02, - HSS_ARCHIVE_ID = 0x04, -}; - -struct hsm_state_set { - __u32 hss_valid; - __u32 hss_archive_id; - __u64 hss_setmask; - __u64 hss_clearmask; -}; - -/* ost_body.data values 
for OST_BRW */ - -#define OBD_BRW_READ 0x01 -#define OBD_BRW_WRITE 0x02 -#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) -#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous - * transfer and is not accounted in - * the grant. - */ -#define OBD_BRW_CHECK 0x10 -#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ -#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ -#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ -#define OBD_BRW_NOQUOTA 0x100 -#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ -#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ -#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ -#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ -#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ -#define OBD_BRW_SOFT_SYNC 0x4000 /* This flag notifies the server - * that the client is running low on - * space for unstable pages; asking - * it to sync quickly - */ - -#define OBD_OBJECT_EOF LUSTRE_EOF - -#define OST_MIN_PRECREATE 32 -#define OST_MAX_PRECREATE 20000 - -struct obd_ioobj { - struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ - __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, - * now (PTLRPC_BULK_OPS_COUNT - 1) in - * high 16 bits in 2.4 and later - */ - __u32 ioo_bufcnt; /* number of niobufs for this object */ -}; - -/* - * NOTE: IOOBJ_MAX_BRW_BITS defines the _offset_ of the max_brw field in - * ioo_max_brw, NOT the maximum number of bits in PTLRPC_BULK_OPS_BITS. - * That said, ioo_max_brw is a 32-bit field so the limit is also 16 bits. - */ -#define IOOBJ_MAX_BRW_BITS 16 -#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) -#define ioobj_max_brw_set(ioo, num) \ -do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) - -/* multiple of 8 bytes => can array */ -struct niobuf_remote { - __u64 rnb_offset; - __u32 rnb_len; - __u32 rnb_flags; -}; - -/* lock value block communicated between the filter and llite */ - -/* OST_LVB_ERR_INIT is needed because the return code in rc is - * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
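- *
- * Worked example (added for illustration): for rc = -2, blocks becomes
- * 0xffbadbad80000000 - 2 = 0xffbadbad7ffffffe; masking that with
- * 0xffbadbad00000000 still yields the mask, so the OST_LVB_IS_ERR()
- * test below holds and OST_LVB_GET_ERR() recovers -2. Without the
- * 0x80000000 bias the subtraction would borrow into the upper word and
- * the mask test would fail.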
- */ -#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL -#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL -#define OST_LVB_IS_ERR(blocks) \ - ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) -#define OST_LVB_SET_ERR(blocks, rc) \ - do { blocks = OST_LVB_ERR_INIT + rc; } while (0) -#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) - -struct ost_lvb_v1 { - __u64 lvb_size; - __s64 lvb_mtime; - __s64 lvb_atime; - __s64 lvb_ctime; - __u64 lvb_blocks; -}; - -struct ost_lvb { - __u64 lvb_size; - __s64 lvb_mtime; - __s64 lvb_atime; - __s64 lvb_ctime; - __u64 lvb_blocks; - __u32 lvb_mtime_ns; - __u32 lvb_atime_ns; - __u32 lvb_ctime_ns; - __u32 lvb_padding; -}; - -/* - * lquota data structures - */ - -/* The lquota_id structure is a union of all the possible identifier types that - * can be used with quota, this includes: - * - 64-bit user ID - * - 64-bit group ID - * - a FID which can be used for per-directory quota in the future - */ -union lquota_id { - struct lu_fid qid_fid; /* FID for per-directory quota */ - __u64 qid_uid; /* user identifier */ - __u64 qid_gid; /* group identifier */ -}; - -/* quotactl management */ -struct obd_quotactl { - __u32 qc_cmd; - __u32 qc_type; /* see Q_* flag below */ - __u32 qc_id; - __u32 qc_stat; - struct obd_dqinfo qc_dqinfo; - struct obd_dqblk qc_dqblk; -}; - -#define Q_COPY(out, in, member) (out)->member = (in)->member - -#define QCTL_COPY(out, in) \ -do { \ - Q_COPY(out, in, qc_cmd); \ - Q_COPY(out, in, qc_type); \ - Q_COPY(out, in, qc_id); \ - Q_COPY(out, in, qc_stat); \ - Q_COPY(out, in, qc_dqinfo); \ - Q_COPY(out, in, qc_dqblk); \ -} while (0) - -/* Data structures associated with the quota locks */ - -/* Glimpse descriptor used for the index & per-ID quota locks */ -struct ldlm_gl_lquota_desc { - union lquota_id gl_id; /* quota ID subject to the glimpse */ - __u64 gl_flags; /* see LQUOTA_FL* below */ - __u64 gl_ver; /* new index version */ - __u64 gl_hardlimit; /* new hardlimit or qunit value */ - __u64 gl_softlimit; /* new softlimit */ - __u64 gl_time; - __u64 gl_pad2; -}; - -/* quota glimpse flags */ -#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ - -/* LVB used with quota (global and per-ID) locks */ -struct lquota_lvb { - __u64 lvb_flags; /* see LQUOTA_FL* above */ - __u64 lvb_id_may_rel; /* space that might be released later */ - __u64 lvb_id_rel; /* space released by the slave for this ID */ - __u64 lvb_id_qunit; /* current qunit value */ - __u64 lvb_pad1; -}; - -/* op codes */ -enum quota_cmd { - QUOTA_DQACQ = 601, - QUOTA_DQREL = 602, - QUOTA_LAST_OPC -}; -#define QUOTA_FIRST_OPC QUOTA_DQACQ - -/* - * MDS REQ RECORDS - */ - -/* opcodes */ -enum mds_cmd { - MDS_GETATTR = 33, - MDS_GETATTR_NAME = 34, - MDS_CLOSE = 35, - MDS_REINT = 36, - MDS_READPAGE = 37, - MDS_CONNECT = 38, - MDS_DISCONNECT = 39, - MDS_GETSTATUS = 40, - MDS_STATFS = 41, - MDS_PIN = 42, /* obsolete, never used in a release */ - MDS_UNPIN = 43, /* obsolete, never used in a release */ - MDS_SYNC = 44, - MDS_DONE_WRITING = 45, /* obsolete since 2.8.0 */ - MDS_SET_INFO = 46, - MDS_QUOTACHECK = 47, /* not used since 2.4 */ - MDS_QUOTACTL = 48, - MDS_GETXATTR = 49, - MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ - MDS_WRITEPAGE = 51, - MDS_IS_SUBDIR = 52, /* obsolete, never used in a release */ - MDS_GET_INFO = 53, - MDS_HSM_STATE_GET = 54, - MDS_HSM_STATE_SET = 55, - MDS_HSM_ACTION = 56, - MDS_HSM_PROGRESS = 57, - MDS_HSM_REQUEST = 58, - MDS_HSM_CT_REGISTER = 59, - MDS_HSM_CT_UNREGISTER = 60, - MDS_SWAP_LAYOUTS = 61, - MDS_LAST_OPC -}; - 
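-/* Hypothetical sketch (not in the original header): dispatch code can
- * bound-check an incoming opcode against the enum above in this style.
- */
-static inline bool mds_opc_in_range(__u32 opc)
-{
-	return opc >= MDS_GETATTR && opc < MDS_LAST_OPC;
-}
-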
-#define MDS_FIRST_OPC MDS_GETATTR
-
-/*
- * Do not exceed 63
- */
-
-enum mdt_reint_cmd {
-	REINT_SETATTR = 1,
-	REINT_CREATE = 2,
-	REINT_LINK = 3,
-	REINT_UNLINK = 4,
-	REINT_RENAME = 5,
-	REINT_OPEN = 6,
-	REINT_SETXATTR = 7,
-	REINT_RMENTRY = 8,
-	REINT_MIGRATE = 9,
-	REINT_MAX
-};
-
-/* the disposition of the intent outlines what was executed */
-#define DISP_IT_EXECD 0x00000001
-#define DISP_LOOKUP_EXECD 0x00000002
-#define DISP_LOOKUP_NEG 0x00000004
-#define DISP_LOOKUP_POS 0x00000008
-#define DISP_OPEN_CREATE 0x00000010
-#define DISP_OPEN_OPEN 0x00000020
-#define DISP_ENQ_COMPLETE 0x00400000 /* obsolete and unused */
-#define DISP_ENQ_OPEN_REF 0x00800000
-#define DISP_ENQ_CREATE_REF 0x01000000
-#define DISP_OPEN_LOCK 0x02000000
-#define DISP_OPEN_LEASE 0x04000000
-#define DISP_OPEN_STRIPE 0x08000000
-#define DISP_OPEN_DENY 0x10000000
-
-/* INODE LOCK PARTS */
-#define MDS_INODELOCK_LOOKUP 0x000001 /* for namespace, dentry etc; also
-				       * was used to protect permission
-				       * (mode, owner, group, etc.)
-				       * before 2.4.
-				       */
-#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */
-#define MDS_INODELOCK_OPEN 0x000004 /* for opened files */
-#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */
-
-/* The PERM bit was added in 2.4. It is used to protect permissions (mode,
- * owner, group, acl, etc.) separately from the LOOKUP lock, because for
- * remote directories (in DNE) these locks will be granted by different
- * MDTs (different ldlm namespaces).
- *
- * For a local directory, the MDT will always grant UPDATE_LOCK|PERM_LOCK
- * together. For a remote directory, the master MDT, where the remote
- * directory is, will grant UPDATE_LOCK|PERM_LOCK, and the remote MDT,
- * where the name entry is, will grant LOOKUP_LOCK.
- */
-#define MDS_INODELOCK_PERM 0x000010
-#define MDS_INODELOCK_XATTR 0x000020 /* extended attributes */
-
-#define MDS_INODELOCK_MAXSHIFT 5
-/* This FULL lock is useful to take on unlink sort of operations */
-#define MDS_INODELOCK_FULL ((1 << (MDS_INODELOCK_MAXSHIFT + 1)) - 1)
-
-/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2],
- * but was moved into name[1] along with the OID to avoid consuming the
- * name[2,3] fields that need to be used for the quota id (also a FID).
- */
-enum {
-	LUSTRE_RES_ID_SEQ_OFF = 0,
-	LUSTRE_RES_ID_VER_OID_OFF = 1,
-	LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */
-	LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2,
-	LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3,
-	LUSTRE_RES_ID_HSH_OFF = 3
-};
-
-#define MDS_STATUS_CONN 1
-#define MDS_STATUS_LOV 2
-
-/* these should be identical to their EXT4_*_FL counterparts, they are
- * redefined here only to avoid dragging in fs/ext4/ext4.h
- */
-#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */
-#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */
-#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */
-#define LUSTRE_NODUMP_FL 0x00000040 /* do not dump file */
-#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */
-#define LUSTRE_INDEX_FL 0x00001000 /* hash-indexed directory */
-#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */
-#define LUSTRE_TOPDIR_FL 0x00020000 /* Top of directory hierarchies */
-#define LUSTRE_DIRECTIO_FL 0x00100000 /* Use direct i/o */
-#define LUSTRE_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
-
-/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values
- * for the client inode i_flags.
The LUSTRE_*_FL are the Lustre wire - * protocol equivalents of LDISKFS_*_FL values stored on disk, while - * the S_* flags are kernel-internal values that change between kernel - * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. - * See b=16526 for a full history. - */ -static inline int ll_ext_to_inode_flags(int flags) -{ - return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | - ((flags & LUSTRE_NOATIME_FL) ? S_NOATIME : 0) | - ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | - ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | - ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); -} - -static inline int ll_inode_to_ext_flags(int iflags) -{ - return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | - ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | - ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | - ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | - ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); -} - -/* 64 possible states */ -enum md_transient_state { - MS_RESTORE = (1 << 0), /* restore is running */ -}; - -struct mdt_body { - struct lu_fid mbo_fid1; - struct lu_fid mbo_fid2; - struct lustre_handle mbo_handle; - __u64 mbo_valid; - __u64 mbo_size; /* Offset, in the case of MDS_READPAGE */ - __s64 mbo_mtime; - __s64 mbo_atime; - __s64 mbo_ctime; - __u64 mbo_blocks; /* XID, in the case of MDS_READPAGE */ - __u64 mbo_ioepoch; - __u64 mbo_t_state; /* transient file state defined in - * enum md_transient_state - * was "ino" until 2.4.0 - */ - __u32 mbo_fsuid; - __u32 mbo_fsgid; - __u32 mbo_capability; - __u32 mbo_mode; - __u32 mbo_uid; - __u32 mbo_gid; - __u32 mbo_flags; /* LUSTRE_*_FL file attributes */ - __u32 mbo_rdev; - __u32 mbo_nlink; /* #bytes to read in the case of MDS_READPAGE */ - __u32 mbo_unused2; /* was "generation" until 2.4.0 */ - __u32 mbo_suppgid; - __u32 mbo_eadatasize; - __u32 mbo_aclsize; - __u32 mbo_max_mdsize; - __u32 mbo_unused3; /* was max_cookiesize until 2.8 */ - __u32 mbo_uid_h; /* high 32-bits of uid, for FUID */ - __u32 mbo_gid_h; /* high 32-bits of gid, for FUID */ - __u32 mbo_padding_5; /* also fix lustre_swab_mdt_body */ - __u64 mbo_padding_6; - __u64 mbo_padding_7; - __u64 mbo_padding_8; - __u64 mbo_padding_9; - __u64 mbo_padding_10; -}; /* 216 */ - -struct mdt_ioepoch { - struct lustre_handle mio_handle; - __u64 mio_unused1; /* was ioepoch */ - __u32 mio_unused2; /* was flags */ - __u32 mio_padding; -}; - -/* permissions for md_perm.mp_perm */ -enum { - CFS_SETUID_PERM = 0x01, - CFS_SETGID_PERM = 0x02, - CFS_SETGRP_PERM = 0x04, -}; - -struct mdt_rec_setattr { - __u32 sa_opcode; - __u32 sa_cap; - __u32 sa_fsuid; - __u32 sa_fsuid_h; - __u32 sa_fsgid; - __u32 sa_fsgid_h; - __u32 sa_suppgid; - __u32 sa_suppgid_h; - __u32 sa_padding_1; - __u32 sa_padding_1_h; - struct lu_fid sa_fid; - __u64 sa_valid; - __u32 sa_uid; - __u32 sa_gid; - __u64 sa_size; - __u64 sa_blocks; - __s64 sa_mtime; - __s64 sa_atime; - __s64 sa_ctime; - __u32 sa_attr_flags; - __u32 sa_mode; - __u32 sa_bias; /* some operation flags */ - __u32 sa_padding_3; - __u32 sa_padding_4; - __u32 sa_padding_5; -}; - -/* - * Attribute flags used in mdt_rec_setattr::sa_valid. - * The kernel's #defines for ATTR_* should not be used over the network - * since the client and MDS may run different kernels (see bug 13828) - * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. 
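- *
- * For example (illustrative, not from the original comment): a chmod
- * that also updates mtime would send
- * sa_valid = MDS_ATTR_MODE | MDS_ATTR_MTIME, never the kernel's
- * ATTR_MODE | ATTR_MTIME values.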
- */
-#define MDS_ATTR_MODE 0x1ULL /* = 1 */
-#define MDS_ATTR_UID 0x2ULL /* = 2 */
-#define MDS_ATTR_GID 0x4ULL /* = 4 */
-#define MDS_ATTR_SIZE 0x8ULL /* = 8 */
-#define MDS_ATTR_ATIME 0x10ULL /* = 16 */
-#define MDS_ATTR_MTIME 0x20ULL /* = 32 */
-#define MDS_ATTR_CTIME 0x40ULL /* = 64 */
-#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */
-#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */
-#define MDS_ATTR_FORCE 0x200ULL /* = 512, not a change, but force the change */
-#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */
-#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */
-#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */
-#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */
-#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path,
-					 * i.e. O_TRUNC
-					 */
-#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */
-
-#define MDS_FMODE_CLOSED 00000000
-#define MDS_FMODE_EXEC 00000004
-/* MDS_FMODE_EPOCH 01000000 obsolete since 2.8.0 */
-/* MDS_FMODE_TRUNC 02000000 obsolete since 2.8.0 */
-/* MDS_FMODE_SOM 04000000 obsolete since 2.8.0 */
-
-#define MDS_OPEN_CREATED 00000010
-#define MDS_OPEN_CROSS 00000020
-
-#define MDS_OPEN_CREAT 00000100
-#define MDS_OPEN_EXCL 00000200
-#define MDS_OPEN_TRUNC 00001000
-#define MDS_OPEN_APPEND 00002000
-#define MDS_OPEN_SYNC 00010000
-#define MDS_OPEN_DIRECTORY 00200000
-
-#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */
-#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */
-#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */
-#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file.
-					 * We no longer support JOIN FILE;
-					 * this flag is reserved to prevent
-					 * the bit from being reused.
-					 */
-
-#define MDS_OPEN_LOCK 04000000000 /* this open requires open lock */
-#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */
-#define MDS_OPEN_HAS_OBJS 020000000000 /* just set the EA, the objects exist */
-#define MDS_OPEN_NORESTORE 0100000000000ULL /* do not restore file at open */
-#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* new stripe needed (restripe or
-					 * hsm restore)
-					 */
-#define MDS_OPEN_VOLATILE 0400000000000ULL /* file is volatile = created
-					 * unlinked
-					 */
-#define MDS_OPEN_LEASE 01000000000000ULL /* open the file and grant lease
-					 * delegation; succeeds if it's not
-					 * being opened with conflict mode.
- */ -#define MDS_OPEN_RELEASE 02000000000000ULL /* Open the file for HSM release */ - -#define MDS_OPEN_FL_INTERNAL (MDS_OPEN_HAS_EA | MDS_OPEN_HAS_OBJS | \ - MDS_OPEN_OWNEROVERRIDE | MDS_OPEN_LOCK | \ - MDS_OPEN_BY_FID | MDS_OPEN_LEASE | \ - MDS_OPEN_RELEASE) - -enum mds_op_bias { - MDS_CHECK_SPLIT = 1 << 0, - MDS_CROSS_REF = 1 << 1, - MDS_VTX_BYPASS = 1 << 2, - MDS_PERM_BYPASS = 1 << 3, -/* MDS_SOM = 1 << 4, obsolete since 2.8.0 */ - MDS_QUOTA_IGNORE = 1 << 5, - MDS_CLOSE_CLEANUP = 1 << 6, - MDS_KEEP_ORPHAN = 1 << 7, - MDS_RECOV_OPEN = 1 << 8, - MDS_DATA_MODIFIED = 1 << 9, - MDS_CREATE_VOLATILE = 1 << 10, - MDS_OWNEROVERRIDE = 1 << 11, - MDS_HSM_RELEASE = 1 << 12, - MDS_RENAME_MIGRATE = 1 << 13, - MDS_CLOSE_LAYOUT_SWAP = 1 << 14, -}; - -/* instance of mdt_reint_rec */ -struct mdt_rec_create { - __u32 cr_opcode; - __u32 cr_cap; - __u32 cr_fsuid; - __u32 cr_fsuid_h; - __u32 cr_fsgid; - __u32 cr_fsgid_h; - __u32 cr_suppgid1; - __u32 cr_suppgid1_h; - __u32 cr_suppgid2; - __u32 cr_suppgid2_h; - struct lu_fid cr_fid1; - struct lu_fid cr_fid2; - struct lustre_handle cr_old_handle; /* handle in case of open replay */ - __s64 cr_time; - __u64 cr_rdev; - __u64 cr_ioepoch; - __u64 cr_padding_1; /* rr_blocks */ - __u32 cr_mode; - __u32 cr_bias; - /* use of helpers set/get_mrc_cr_flags() is needed to access - * 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to - * extend cr_flags size without breaking 1.8 compat - */ - __u32 cr_flags_l; /* for use with open, low 32 bits */ - __u32 cr_flags_h; /* for use with open, high 32 bits */ - __u32 cr_umask; /* umask for create */ - __u32 cr_padding_4; /* rr_padding_4 */ -}; - -/* instance of mdt_reint_rec */ -struct mdt_rec_link { - __u32 lk_opcode; - __u32 lk_cap; - __u32 lk_fsuid; - __u32 lk_fsuid_h; - __u32 lk_fsgid; - __u32 lk_fsgid_h; - __u32 lk_suppgid1; - __u32 lk_suppgid1_h; - __u32 lk_suppgid2; - __u32 lk_suppgid2_h; - struct lu_fid lk_fid1; - struct lu_fid lk_fid2; - __s64 lk_time; - __u64 lk_padding_1; /* rr_atime */ - __u64 lk_padding_2; /* rr_ctime */ - __u64 lk_padding_3; /* rr_size */ - __u64 lk_padding_4; /* rr_blocks */ - __u32 lk_bias; - __u32 lk_padding_5; /* rr_mode */ - __u32 lk_padding_6; /* rr_flags */ - __u32 lk_padding_7; /* rr_padding_2 */ - __u32 lk_padding_8; /* rr_padding_3 */ - __u32 lk_padding_9; /* rr_padding_4 */ -}; - -/* instance of mdt_reint_rec */ -struct mdt_rec_unlink { - __u32 ul_opcode; - __u32 ul_cap; - __u32 ul_fsuid; - __u32 ul_fsuid_h; - __u32 ul_fsgid; - __u32 ul_fsgid_h; - __u32 ul_suppgid1; - __u32 ul_suppgid1_h; - __u32 ul_suppgid2; - __u32 ul_suppgid2_h; - struct lu_fid ul_fid1; - struct lu_fid ul_fid2; - __s64 ul_time; - __u64 ul_padding_2; /* rr_atime */ - __u64 ul_padding_3; /* rr_ctime */ - __u64 ul_padding_4; /* rr_size */ - __u64 ul_padding_5; /* rr_blocks */ - __u32 ul_bias; - __u32 ul_mode; - __u32 ul_padding_6; /* rr_flags */ - __u32 ul_padding_7; /* rr_padding_2 */ - __u32 ul_padding_8; /* rr_padding_3 */ - __u32 ul_padding_9; /* rr_padding_4 */ -}; - -/* instance of mdt_reint_rec */ -struct mdt_rec_rename { - __u32 rn_opcode; - __u32 rn_cap; - __u32 rn_fsuid; - __u32 rn_fsuid_h; - __u32 rn_fsgid; - __u32 rn_fsgid_h; - __u32 rn_suppgid1; - __u32 rn_suppgid1_h; - __u32 rn_suppgid2; - __u32 rn_suppgid2_h; - struct lu_fid rn_fid1; - struct lu_fid rn_fid2; - __s64 rn_time; - __u64 rn_padding_1; /* rr_atime */ - __u64 rn_padding_2; /* rr_ctime */ - __u64 rn_padding_3; /* rr_size */ - __u64 rn_padding_4; /* rr_blocks */ - __u32 rn_bias; /* some operation flags */ - __u32 rn_mode; /* cross-ref 
rename has mode */ - __u32 rn_padding_5; /* rr_flags */ - __u32 rn_padding_6; /* rr_padding_2 */ - __u32 rn_padding_7; /* rr_padding_3 */ - __u32 rn_padding_8; /* rr_padding_4 */ -}; - -/* instance of mdt_reint_rec */ -struct mdt_rec_setxattr { - __u32 sx_opcode; - __u32 sx_cap; - __u32 sx_fsuid; - __u32 sx_fsuid_h; - __u32 sx_fsgid; - __u32 sx_fsgid_h; - __u32 sx_suppgid1; - __u32 sx_suppgid1_h; - __u32 sx_suppgid2; - __u32 sx_suppgid2_h; - struct lu_fid sx_fid; - __u64 sx_padding_1; /* These three are rr_fid2 */ - __u32 sx_padding_2; - __u32 sx_padding_3; - __u64 sx_valid; - __s64 sx_time; - __u64 sx_padding_5; /* rr_ctime */ - __u64 sx_padding_6; /* rr_size */ - __u64 sx_padding_7; /* rr_blocks */ - __u32 sx_size; - __u32 sx_flags; - __u32 sx_padding_8; /* rr_flags */ - __u32 sx_padding_9; /* rr_padding_2 */ - __u32 sx_padding_10; /* rr_padding_3 */ - __u32 sx_padding_11; /* rr_padding_4 */ -}; - -/* - * mdt_rec_reint is the template for all mdt_reint_xxx structures. - * Do NOT change the size of various members, otherwise the value - * will be broken in lustre_swab_mdt_rec_reint(). - * - * If you add new members in other mdt_reint_xxx structures and need to use the - * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. - */ -struct mdt_rec_reint { - __u32 rr_opcode; - __u32 rr_cap; - __u32 rr_fsuid; - __u32 rr_fsuid_h; - __u32 rr_fsgid; - __u32 rr_fsgid_h; - __u32 rr_suppgid1; - __u32 rr_suppgid1_h; - __u32 rr_suppgid2; - __u32 rr_suppgid2_h; - struct lu_fid rr_fid1; - struct lu_fid rr_fid2; - __s64 rr_mtime; - __s64 rr_atime; - __s64 rr_ctime; - __u64 rr_size; - __u64 rr_blocks; - __u32 rr_bias; - __u32 rr_mode; - __u32 rr_flags; - __u32 rr_flags_h; - __u32 rr_umask; - __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ -}; - -/* lmv structures */ -struct lmv_desc { - __u32 ld_tgt_count; /* how many MDS's */ - __u32 ld_active_tgt_count; /* how many active */ - __u32 ld_default_stripe_count; /* how many objects are used */ - __u32 ld_pattern; /* default hash pattern */ - __u64 ld_default_hash_size; - __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ - __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ - __u32 ld_qos_maxage; /* in second */ - __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ - __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ - struct obd_uuid ld_uuid; -}; - -/* LMV layout EA, and it will be stored both in master and slave object */ -struct lmv_mds_md_v1 { - __u32 lmv_magic; - __u32 lmv_stripe_count; - __u32 lmv_master_mdt_index; /* On master object, it is master - * MDT index, on slave object, it - * is stripe index of the slave obj - */ - __u32 lmv_hash_type; /* dir stripe policy, i.e. indicate - * which hash function to be used, - * Note: only lower 16 bits is being - * used for now. Higher 16 bits will - * be used to mark the object status, - * for example migrating or dead. 
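-					 *
-					 * For instance (added note): a dir
-					 * being migrated carries its hash
-					 * function in the low 16 bits plus
-					 * the LMV_HASH_FLAG_MIGRATION bit
-					 * (defined below) in the high bits.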
- */ - __u32 lmv_layout_version; /* Used for directory restriping */ - __u32 lmv_padding1; - __u64 lmv_padding2; - __u64 lmv_padding3; - char lmv_pool_name[LOV_MAXPOOLNAME + 1];/* pool name */ - struct lu_fid lmv_stripe_fids[0]; /* FIDs for each stripe */ -}; - -#define LMV_MAGIC_V1 0x0CD20CD0 /* normal stripe lmv magic */ -#define LMV_MAGIC LMV_MAGIC_V1 - -/* #define LMV_USER_MAGIC 0x0CD30CD0 */ -#define LMV_MAGIC_STRIPE 0x0CD40CD0 /* magic for dir sub_stripe */ - -/* - *Right now only the lower part(0-16bits) of lmv_hash_type is being used, - * and the higher part will be the flag to indicate the status of object, - * for example the object is being migrated. And the hash function - * might be interpreted differently with different flags. - */ -#define LMV_HASH_TYPE_MASK 0x0000ffff - -#define LMV_HASH_FLAG_MIGRATION 0x80000000 -#define LMV_HASH_FLAG_DEAD 0x40000000 - -/** - * The FNV-1a hash algorithm is as follows: - * hash = FNV_offset_basis - * for each octet_of_data to be hashed - * hash = hash XOR octet_of_data - * hash = hash × FNV_prime - * return hash - * http://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function#FNV-1a_hash - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-reference-source - * FNV_prime is 2^40 + 2^8 + 0xb3 = 0x100000001b3ULL - **/ -#define LUSTRE_FNV_1A_64_PRIME 0x100000001b3ULL -#define LUSTRE_FNV_1A_64_OFFSET_BIAS 0xcbf29ce484222325ULL -static inline __u64 lustre_hash_fnv_1a_64(const void *buf, size_t size) -{ - __u64 hash = LUSTRE_FNV_1A_64_OFFSET_BIAS; - const unsigned char *p = buf; - size_t i; - - for (i = 0; i < size; i++) { - hash ^= p[i]; - hash *= LUSTRE_FNV_1A_64_PRIME; - } - - return hash; -} - -union lmv_mds_md { - __u32 lmv_magic; - struct lmv_mds_md_v1 lmv_md_v1; - struct lmv_user_md lmv_user_md; -}; - -static inline ssize_t lmv_mds_md_size(int stripe_count, unsigned int lmm_magic) -{ - ssize_t len = -EINVAL; - - switch (lmm_magic) { - case LMV_MAGIC_V1: { - struct lmv_mds_md_v1 *lmm1; - - len = sizeof(*lmm1); - len += stripe_count * sizeof(lmm1->lmv_stripe_fids[0]); - break; } - default: - break; - } - return len; -} - -static inline int lmv_mds_md_stripe_count_get(const union lmv_mds_md *lmm) -{ - switch (__le32_to_cpu(lmm->lmv_magic)) { - case LMV_MAGIC_V1: - return __le32_to_cpu(lmm->lmv_md_v1.lmv_stripe_count); - case LMV_USER_MAGIC: - return __le32_to_cpu(lmm->lmv_user_md.lum_stripe_count); - default: - return -EINVAL; - } -} - -enum fld_rpc_opc { - FLD_QUERY = 900, - FLD_READ = 901, - FLD_LAST_OPC, - FLD_FIRST_OPC = FLD_QUERY -}; - -enum seq_rpc_opc { - SEQ_QUERY = 700, - SEQ_LAST_OPC, - SEQ_FIRST_OPC = SEQ_QUERY -}; - -enum seq_op { - SEQ_ALLOC_SUPER = 0, - SEQ_ALLOC_META = 1 -}; - -enum fld_op { - FLD_CREATE = 0, - FLD_DELETE = 1, - FLD_LOOKUP = 2, -}; - -/* - * LOV data structures - */ - -#define LOV_MAX_UUID_BUFFER_SIZE 8192 -/* The size of the buffer the lov/mdc reserves for the - * array of UUIDs returned by the MDS. 
With the current - * protocol, this will limit the max number of OSTs per LOV - */ - -#define LOV_DESC_MAGIC 0xB0CCDE5C -#define LOV_DESC_QOS_MAXAGE_DEFAULT 5 /* Seconds */ -#define LOV_DESC_STRIPE_SIZE_DEFAULT (1 << LNET_MTU_BITS) - -/* LOV settings descriptor (should only contain static info) */ -struct lov_desc { - __u32 ld_tgt_count; /* how many OBD's */ - __u32 ld_active_tgt_count; /* how many active */ - __u32 ld_default_stripe_count; /* how many objects are used */ - __u32 ld_pattern; /* default PATTERN_RAID0 */ - __u64 ld_default_stripe_size; /* in bytes */ - __u64 ld_default_stripe_offset; /* in bytes */ - __u32 ld_padding_0; /* unused */ - __u32 ld_qos_maxage; /* in second */ - __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ - __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ - struct obd_uuid ld_uuid; -}; - -#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ - -/* - * LDLM requests: - */ -/* opcodes -- MUST be distinct from OST/MDS opcodes */ -enum ldlm_cmd { - LDLM_ENQUEUE = 101, - LDLM_CONVERT = 102, - LDLM_CANCEL = 103, - LDLM_BL_CALLBACK = 104, - LDLM_CP_CALLBACK = 105, - LDLM_GL_CALLBACK = 106, - LDLM_SET_INFO = 107, - LDLM_LAST_OPC -}; -#define LDLM_FIRST_OPC LDLM_ENQUEUE - -#define RES_NAME_SIZE 4 -struct ldlm_res_id { - __u64 name[RES_NAME_SIZE]; -}; - -#define DLDLMRES "[%#llx:%#llx:%#llx].%llx" -#define PLDLMRES(res) (res)->lr_name.name[0], (res)->lr_name.name[1], \ - (res)->lr_name.name[2], (res)->lr_name.name[3] - -/* lock types */ -enum ldlm_mode { - LCK_MINMODE = 0, - LCK_EX = 1, - LCK_PW = 2, - LCK_PR = 4, - LCK_CW = 8, - LCK_CR = 16, - LCK_NL = 32, - LCK_GROUP = 64, - LCK_COS = 128, - LCK_MAXMODE -}; - -#define LCK_MODE_NUM 8 - -enum ldlm_type { - LDLM_PLAIN = 10, - LDLM_EXTENT = 11, - LDLM_FLOCK = 12, - LDLM_IBITS = 13, - LDLM_MAX_TYPE -}; - -#define LDLM_MIN_TYPE LDLM_PLAIN - -struct ldlm_extent { - __u64 start; - __u64 end; - __u64 gid; -}; - -struct ldlm_inodebits { - __u64 bits; -}; - -struct ldlm_flock_wire { - __u64 lfw_start; - __u64 lfw_end; - __u64 lfw_owner; - __u32 lfw_padding; - __u32 lfw_pid; -}; - -/* it's important that the fields of the ldlm_extent structure match - * the first fields of the ldlm_flock structure because there is only - * one ldlm_swab routine to process the ldlm_policy_data_t union. if - * this ever changes we will need to swab the union differently based - * on the resource type. 
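- *
- * Concretely (illustrative note, not in the original comment): both
- * ldlm_extent and ldlm_flock_wire above start with three __u64 fields
- * (start/end/gid vs. lfw_start/lfw_end/lfw_owner), so one swab of that
- * common prefix is valid for either member of the union.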
- */ - -union ldlm_wire_policy_data { - struct ldlm_extent l_extent; - struct ldlm_flock_wire l_flock; - struct ldlm_inodebits l_inodebits; -}; - -union ldlm_gl_desc { - struct ldlm_gl_lquota_desc lquota_desc; -}; - -enum ldlm_intent_flags { - IT_OPEN = 0x00000001, - IT_CREAT = 0x00000002, - IT_OPEN_CREAT = 0x00000003, - IT_READDIR = 0x00000004, - IT_GETATTR = 0x00000008, - IT_LOOKUP = 0x00000010, - IT_UNLINK = 0x00000020, - IT_TRUNC = 0x00000040, - IT_GETXATTR = 0x00000080, - IT_EXEC = 0x00000100, - IT_PIN = 0x00000200, - IT_LAYOUT = 0x00000400, - IT_QUOTA_DQACQ = 0x00000800, - IT_QUOTA_CONN = 0x00001000, - IT_SETXATTR = 0x00002000, -}; - -struct ldlm_intent { - __u64 opc; -}; - -struct ldlm_resource_desc { - enum ldlm_type lr_type; - __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */ - struct ldlm_res_id lr_name; -}; - -struct ldlm_lock_desc { - struct ldlm_resource_desc l_resource; - enum ldlm_mode l_req_mode; - enum ldlm_mode l_granted_mode; - union ldlm_wire_policy_data l_policy_data; -}; - -#define LDLM_LOCKREQ_HANDLES 2 -#define LDLM_ENQUEUE_CANCEL_OFF 1 - -struct ldlm_request { - __u32 lock_flags; - __u32 lock_count; - struct ldlm_lock_desc lock_desc; - struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; -}; - -struct ldlm_reply { - __u32 lock_flags; - __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ - struct ldlm_lock_desc lock_desc; - struct lustre_handle lock_handle; - __u64 lock_policy_res1; - __u64 lock_policy_res2; -}; - -#define ldlm_flags_to_wire(flags) ((__u32)(flags)) -#define ldlm_flags_from_wire(flags) ((__u64)(flags)) - -/* - * Opcodes for mountconf (mgs and mgc) - */ -enum mgs_cmd { - MGS_CONNECT = 250, - MGS_DISCONNECT, - MGS_EXCEPTION, /* node died, etc. */ - MGS_TARGET_REG, /* whenever target starts up */ - MGS_TARGET_DEL, - MGS_SET_INFO, - MGS_CONFIG_READ, - MGS_LAST_OPC -}; -#define MGS_FIRST_OPC MGS_CONNECT - -#define MGS_PARAM_MAXLEN 1024 -#define KEY_SET_INFO "set_info" - -struct mgs_send_param { - char mgs_param[MGS_PARAM_MAXLEN]; -}; - -/* We pass this info to the MGS so it can write config logs */ -#define MTI_NAME_MAXLEN 64 -#define MTI_PARAM_MAXLEN 4096 -#define MTI_NIDS_MAX 32 -struct mgs_target_info { - __u32 mti_lustre_ver; - __u32 mti_stripe_index; - __u32 mti_config_ver; - __u32 mti_flags; - __u32 mti_nid_count; - __u32 mti_instance; /* Running instance of target */ - char mti_fsname[MTI_NAME_MAXLEN]; - char mti_svname[MTI_NAME_MAXLEN]; - char mti_uuid[sizeof(struct obd_uuid)]; - __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ - char mti_params[MTI_PARAM_MAXLEN]; -}; - -struct mgs_nidtbl_entry { - __u64 mne_version; /* table version of this entry */ - __u32 mne_instance; /* target instance # */ - __u32 mne_index; /* target index */ - __u32 mne_length; /* length of this entry - by bytes */ - __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ - __u8 mne_nid_type; /* type of nid(mbz). for ipv6. */ - __u8 mne_nid_size; /* size of each NID, by bytes */ - __u8 mne_nid_count; /* # of NIDs in buffer */ - union { - lnet_nid_t nids[0]; /* variable size buffer for NIDs. 
*/ - } u; -}; - -struct mgs_config_body { - char mcb_name[MTI_NAME_MAXLEN]; /* logname */ - __u64 mcb_offset; /* next index of config log to request */ - __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ - __u8 mcb_reserved; - __u8 mcb_bits; /* bits unit size of config log */ - __u32 mcb_units; /* # of units for bulk transfer */ -}; - -struct mgs_config_res { - __u64 mcr_offset; /* index of last config log */ - __u64 mcr_size; /* size of the log */ -}; - -/* Config marker flags (in config log) */ -#define CM_START 0x01 -#define CM_END 0x02 -#define CM_SKIP 0x04 -#define CM_UPGRADE146 0x08 -#define CM_EXCLUDE 0x10 -#define CM_START_SKIP (CM_START | CM_SKIP) - -struct cfg_marker { - __u32 cm_step; /* aka config version */ - __u32 cm_flags; - __u32 cm_vers; /* lustre release version number */ - __u32 cm_padding; /* 64 bit align */ - __s64 cm_createtime; /*when this record was first created */ - __s64 cm_canceltime; /*when this record is no longer valid*/ - char cm_tgtname[MTI_NAME_MAXLEN]; - char cm_comment[MTI_NAME_MAXLEN]; -}; - -/* - * Opcodes for multiple servers. - */ - -enum obd_cmd { - OBD_PING = 400, - OBD_LOG_CANCEL, - OBD_QC_CALLBACK, /* not used since 2.4 */ - OBD_IDX_READ, - OBD_LAST_OPC -}; -#define OBD_FIRST_OPC OBD_PING - -/** - * llog contexts indices. - * - * There is compatibility problem with indexes below, they are not - * continuous and must keep their numbers for compatibility needs. - * See LU-5218 for details. - */ -enum llog_ctxt_id { - LLOG_CONFIG_ORIG_CTXT = 0, - LLOG_CONFIG_REPL_CTXT = 1, - LLOG_MDS_OST_ORIG_CTXT = 2, - LLOG_MDS_OST_REPL_CTXT = 3, /* kept just to avoid re-assignment */ - LLOG_SIZE_ORIG_CTXT = 4, - LLOG_SIZE_REPL_CTXT = 5, - LLOG_TEST_ORIG_CTXT = 8, - LLOG_TEST_REPL_CTXT = 9, /* kept just to avoid re-assignment */ - LLOG_CHANGELOG_ORIG_CTXT = 12, /**< changelog generation on mdd */ - LLOG_CHANGELOG_REPL_CTXT = 13, /**< changelog access on clients */ - /* for multiple changelog consumers */ - LLOG_CHANGELOG_USER_ORIG_CTXT = 14, - LLOG_AGENT_ORIG_CTXT = 15, /**< agent requests generation on cdt */ - LLOG_MAX_CTXTS -}; - -/** Identifier for a single log object */ -struct llog_logid { - struct ost_id lgl_oi; - __u32 lgl_ogen; -} __packed; - -/** Records written to the CATALOGS list */ -#define CATLIST "CATALOGS" -struct llog_catid { - struct llog_logid lci_logid; - __u32 lci_padding1; - __u32 lci_padding2; - __u32 lci_padding3; -} __packed; - -/* Log data record types - there is no specific reason that these need to - * be related to the RPC opcodes, but no reason not to (may be handy later?) 
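- *
- * As an example of that encoding (added for illustration): MDS_UNLINK_REC
- * below is LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_UNLINK, so
- * the originating RPC opcode and REINT sub-opcode can be read back out of
- * the record type itself.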
- */ -#define LLOG_OP_MAGIC 0x10600000 -#define LLOG_OP_MASK 0xfff00000 - -enum llog_op_type { - LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, - OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, - /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ - MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | - REINT_UNLINK, /* obsolete after 2.5.0 */ - MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | - REINT_UNLINK, - /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ - MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | - REINT_SETATTR, - OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, - /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ - LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, - /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ - CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, - CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, - HSM_AGENT_REC = LLOG_OP_MAGIC | 0x80000, - LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, - LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, -}; - -#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ - (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) - -/** Log record header - stored in little endian order. - * Each record must start with this struct, end with a llog_rec_tail, - * and be a multiple of 256 bits in size. - */ -struct llog_rec_hdr { - __u32 lrh_len; - __u32 lrh_index; - __u32 lrh_type; - __u32 lrh_id; -}; - -struct llog_rec_tail { - __u32 lrt_len; - __u32 lrt_index; -}; - -/* Where data follows just after the header */ -#define REC_DATA(ptr) \ - ((void *)((char *)ptr + sizeof(struct llog_rec_hdr))) - -#define REC_DATA_LEN(rec) \ - (rec->lrh_len - sizeof(struct llog_rec_hdr) - \ - sizeof(struct llog_rec_tail)) - -struct llog_logid_rec { - struct llog_rec_hdr lid_hdr; - struct llog_logid lid_id; - __u32 lid_padding1; - __u64 lid_padding2; - __u64 lid_padding3; - struct llog_rec_tail lid_tail; -} __packed; - -struct llog_unlink_rec { - struct llog_rec_hdr lur_hdr; - __u64 lur_oid; - __u32 lur_oseq; - __u32 lur_count; - struct llog_rec_tail lur_tail; -} __packed; - -struct llog_unlink64_rec { - struct llog_rec_hdr lur_hdr; - struct lu_fid lur_fid; - __u32 lur_count; /* to destroy the lost precreated objects */ - __u32 lur_padding1; - __u64 lur_padding2; - __u64 lur_padding3; - struct llog_rec_tail lur_tail; -} __packed; - -struct llog_setattr64_rec { - struct llog_rec_hdr lsr_hdr; - struct ost_id lsr_oi; - __u32 lsr_uid; - __u32 lsr_uid_h; - __u32 lsr_gid; - __u32 lsr_gid_h; - __u64 lsr_valid; - struct llog_rec_tail lsr_tail; -} __packed; - -struct llog_size_change_rec { - struct llog_rec_hdr lsc_hdr; - struct ll_fid lsc_fid; - __u32 lsc_ioepoch; - __u32 lsc_padding1; - __u64 lsc_padding2; - __u64 lsc_padding3; - struct llog_rec_tail lsc_tail; -} __packed; - -/* changelog llog name, needed by client replicators */ -#define CHANGELOG_CATALOG "changelog_catalog" - -struct changelog_setinfo { - __u64 cs_recno; - __u32 cs_id; -} __packed; - -/** changelog record */ -struct llog_changelog_rec { - struct llog_rec_hdr cr_hdr; - struct changelog_rec cr; /**< Variable length field */ - struct llog_rec_tail cr_do_not_use; /**< for_sizeof_only */ -} __packed; - -struct llog_changelog_user_rec { - struct llog_rec_hdr cur_hdr; - __u32 cur_id; - __u32 cur_padding; - __u64 cur_endrec; - struct llog_rec_tail cur_tail; -} __packed; - -enum agent_req_status { - ARS_WAITING, - ARS_STARTED, - ARS_FAILED, - ARS_CANCELED, - ARS_SUCCEED, -}; - -static inline const char *agent_req_status2name(const enum agent_req_status ars) -{ - switch (ars) { - case 
ARS_WAITING: - return "WAITING"; - case ARS_STARTED: - return "STARTED"; - case ARS_FAILED: - return "FAILED"; - case ARS_CANCELED: - return "CANCELED"; - case ARS_SUCCEED: - return "SUCCEED"; - default: - return "UNKNOWN"; - } -} - -struct llog_agent_req_rec { - struct llog_rec_hdr arr_hdr; /**< record header */ - __u32 arr_status; /**< status of the request */ - /* must match enum - * agent_req_status - */ - __u32 arr_archive_id; /**< backend archive number */ - __u64 arr_flags; /**< req flags */ - __u64 arr_compound_id;/**< compound cookie */ - __u64 arr_req_create; /**< req. creation time */ - __u64 arr_req_change; /**< req. status change time */ - struct hsm_action_item arr_hai; /**< req. to the agent */ - struct llog_rec_tail arr_tail; /**< record tail for_sizeof_only */ -} __packed; - -/* Old llog gen for compatibility */ -struct llog_gen { - __u64 mnt_cnt; - __u64 conn_cnt; -} __packed; - -struct llog_gen_rec { - struct llog_rec_hdr lgr_hdr; - struct llog_gen lgr_gen; - __u64 padding1; - __u64 padding2; - __u64 padding3; - struct llog_rec_tail lgr_tail; -}; - -/* flags for the logs */ -enum llog_flag { - LLOG_F_ZAP_WHEN_EMPTY = 0x1, - LLOG_F_IS_CAT = 0x2, - LLOG_F_IS_PLAIN = 0x4, - LLOG_F_EXT_JOBID = 0x8, - LLOG_F_IS_FIXSIZE = 0x10, - - /* - * Note: Flags covered by LLOG_F_EXT_MASK will be inherited from - * catalog to plain log, so do not add LLOG_F_IS_FIXSIZE here, - * because the catalog record is usually fixed size, but its plain - * log record can be variable - */ - LLOG_F_EXT_MASK = LLOG_F_EXT_JOBID, -}; - -/* On-disk header structure of each log object, stored in little endian order */ -#define LLOG_MIN_CHUNK_SIZE 8192 -#define LLOG_HEADER_SIZE (96) /* sizeof (llog_log_hdr) + - * sizeof(llh_tail) - sizeof(llh_bitmap) - */ -#define LLOG_BITMAP_BYTES (LLOG_MIN_CHUNK_SIZE - LLOG_HEADER_SIZE) -#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ - -/* flags for the logs */ -struct llog_log_hdr { - struct llog_rec_hdr llh_hdr; - __s64 llh_timestamp; - __u32 llh_count; - __u32 llh_bitmap_offset; - __u32 llh_size; - __u32 llh_flags; - __u32 llh_cat_idx; - /* for a catalog the first plain slot is next to it */ - struct obd_uuid llh_tgtuuid; - __u32 llh_reserved[LLOG_HEADER_SIZE / sizeof(__u32) - 23]; - /* These fields must always be at the end of the llog_log_hdr. - * Note: llh_bitmap size is variable because llog chunk size could be - * bigger than LLOG_MIN_CHUNK_SIZE, i.e. sizeof(llog_log_hdr) > 8192 - * bytes, and the real size is stored in llh_hdr.lrh_len, which means - * llh_tail should only be referred to by LLOG_HDR_TAIL(). - * But this structure is also used by the client/server llog interface - * (see llog_client.c), so it will be kept in its original form to avoid - * compatibility issues. 
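- * As an added illustration (the numbers follow from the field layout
- * above; they are not in the original header): with the minimum
- * 8192-byte chunk, llh_bitmap_offset is 88, so LLOG_HDR_BITMAP_SIZE()
- * below yields (8192 - 88 - 8) * 8 = 64768 usable bits, and
- * LLOG_HDR_TAIL() points at byte offset 8184.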
- */ - __u32 llh_bitmap[LLOG_BITMAP_BYTES / sizeof(__u32)]; - struct llog_rec_tail llh_tail; -} __packed; - -#undef LLOG_HEADER_SIZE -#undef LLOG_BITMAP_BYTES - -#define LLOG_HDR_BITMAP_SIZE(llh) (__u32)((llh->llh_hdr.lrh_len - \ - llh->llh_bitmap_offset - \ - sizeof(llh->llh_tail)) * 8) -#define LLOG_HDR_BITMAP(llh) (__u32 *)((char *)(llh) + \ - (llh)->llh_bitmap_offset) -#define LLOG_HDR_TAIL(llh) ((struct llog_rec_tail *)((char *)llh + \ - llh->llh_hdr.lrh_len - \ - sizeof(llh->llh_tail))) - -/** log cookies are used to reference a specific log file and a record - * therein - */ -struct llog_cookie { - struct llog_logid lgc_lgl; - __u32 lgc_subsys; - __u32 lgc_index; - __u32 lgc_padding; -} __packed; - -/** llog protocol */ -enum llogd_rpc_ops { - LLOG_ORIGIN_HANDLE_CREATE = 501, - LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, - LLOG_ORIGIN_HANDLE_READ_HEADER = 503, - LLOG_ORIGIN_HANDLE_WRITE_REC = 504, - LLOG_ORIGIN_HANDLE_CLOSE = 505, - LLOG_ORIGIN_CONNECT = 506, - LLOG_CATINFO = 507, /* deprecated */ - LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, - LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ - LLOG_LAST_OPC, - LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE -}; - -struct llogd_body { - struct llog_logid lgd_logid; - __u32 lgd_ctxt_idx; - __u32 lgd_llh_flags; - __u32 lgd_index; - __u32 lgd_saved_index; - __u32 lgd_len; - __u64 lgd_cur_offset; -} __packed; - -struct llogd_conn_body { - struct llog_gen lgdc_gen; - struct llog_logid lgdc_logid; - __u32 lgdc_ctxt_idx; -} __packed; - -/* Note: 64-bit types are 64-bit aligned in structure */ -struct obdo { - __u64 o_valid; /* hot fields in this obdo */ - struct ost_id o_oi; - __u64 o_parent_seq; - __u64 o_size; /* o_size-o_blocks == ost_lvb */ - __s64 o_mtime; - __s64 o_atime; - __s64 o_ctime; - __u64 o_blocks; /* brw: cli sent cached bytes */ - __u64 o_grant; - - /* 32-bit fields start here: keep an even number of them via padding */ - __u32 o_blksize; /* optimal IO blocksize */ - __u32 o_mode; /* brw: cli sent cache remain */ - __u32 o_uid; - __u32 o_gid; - __u32 o_flags; - __u32 o_nlink; /* brw: checksum */ - __u32 o_parent_oid; - __u32 o_misc; /* brw: o_dropped */ - - __u64 o_ioepoch; /* epoch in ost writes */ - __u32 o_stripe_idx; /* holds stripe idx */ - __u32 o_parent_ver; - struct lustre_handle o_handle; /* brw: lock handle to prolong locks - */ - struct llog_cookie o_lcookie; /* destroy: unlink cookie from MDS, - * obsolete in 2.8, reused in OSP - */ - __u32 o_uid_h; - __u32 o_gid_h; - - __u64 o_data_version; /* getattr: sum of iversion for - * each stripe. 
- * brw: grant space consumed on - * the client for the write - */ - __u64 o_padding_4; - __u64 o_padding_5; - __u64 o_padding_6; -}; - -#define o_dirty o_blocks -#define o_undirty o_mode -#define o_dropped o_misc -#define o_cksum o_nlink -#define o_grant_used o_data_version - -/* request structure for OSTs */ -struct ost_body { - struct obdo oa; -}; - -/* Key for FIEMAP to be used in get_info calls */ -struct ll_fiemap_info_key { - char lfik_name[8]; - struct obdo lfik_oa; - struct fiemap lfik_fiemap; -}; - -/* security opcodes */ -enum sec_cmd { - SEC_CTX_INIT = 801, - SEC_CTX_INIT_CONT = 802, - SEC_CTX_FINI = 803, - SEC_LAST_OPC, - SEC_FIRST_OPC = SEC_CTX_INIT -}; - -/* - * capa related definitions - */ -#define CAPA_HMAC_MAX_LEN 64 -#define CAPA_HMAC_KEY_MAX_LEN 56 - -/* NB take care when changing the sequence of elements in this struct, - * because the offset info is used in find_capa() - */ -struct lustre_capa { - struct lu_fid lc_fid; /** fid */ - __u64 lc_opc; /** operations allowed */ - __u64 lc_uid; /** file owner */ - __u64 lc_gid; /** file group */ - __u32 lc_flags; /** HMAC algorithm & flags */ - __u32 lc_keyid; /** key# used for the capability */ - __u32 lc_timeout; /** capa timeout value (sec) */ -/* FIXME: y2038 time_t overflow: */ - __u32 lc_expiry; /** expiry time (sec) */ - __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ -} __packed; - -/** lustre_capa::lc_opc */ -enum { - CAPA_OPC_BODY_WRITE = 1 << 0, /**< write object data */ - CAPA_OPC_BODY_READ = 1 << 1, /**< read object data */ - CAPA_OPC_INDEX_LOOKUP = 1 << 2, /**< lookup object fid */ - CAPA_OPC_INDEX_INSERT = 1 << 3, /**< insert object fid */ - CAPA_OPC_INDEX_DELETE = 1 << 4, /**< delete object fid */ - CAPA_OPC_OSS_WRITE = 1 << 5, /**< write oss object data */ - CAPA_OPC_OSS_READ = 1 << 6, /**< read oss object data */ - CAPA_OPC_OSS_TRUNC = 1 << 7, /**< truncate oss object */ - CAPA_OPC_OSS_DESTROY = 1 << 8, /**< destroy oss object */ - CAPA_OPC_META_WRITE = 1 << 9, /**< write object meta data */ - CAPA_OPC_META_READ = 1 << 10, /**< read object meta data */ -}; - -#define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) -#define CAPA_OPC_MDS_ONLY \ - (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ - CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) -#define CAPA_OPC_OSS_ONLY \ - (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ - CAPA_OPC_OSS_DESTROY) -#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY -#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) - -struct lustre_capa_key { - __u64 lk_seq; /**< mds# */ - __u32 lk_keyid; /**< key# */ - __u32 lk_padding; - __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ -} __packed; - -/** The link ea holds 1 \a link_ea_entry for each hardlink */ -#define LINK_EA_MAGIC 0x11EAF1DFUL -struct link_ea_header { - __u32 leh_magic; - __u32 leh_reccount; - __u64 leh_len; /* total size */ - __u32 leh_overflow_time; - __u32 leh_padding; -}; - -/** Hardlink data is name and parent fid. 
- * Stored in this crazy struct for maximum packing and endian-neutrality - */ -struct link_ea_entry { - /** __u16 stored big-endian, unaligned */ - unsigned char lee_reclen[2]; - unsigned char lee_parent_fid[sizeof(struct lu_fid)]; - char lee_name[0]; -} __packed; - -/** fid2path request/reply structure */ -struct getinfo_fid2path { - struct lu_fid gf_fid; - __u64 gf_recno; - __u32 gf_linkno; - __u32 gf_pathlen; - char gf_path[0]; -} __packed; - -/** path2parent request/reply structures */ -struct getparent { - struct lu_fid gp_fid; /**< parent FID */ - __u32 gp_linkno; /**< hardlink number */ - __u32 gp_name_size; /**< size of the name field */ - char gp_name[0]; /**< zero-terminated link name */ -} __packed; - -enum { - LAYOUT_INTENT_ACCESS = 0, - LAYOUT_INTENT_READ = 1, - LAYOUT_INTENT_WRITE = 2, - LAYOUT_INTENT_GLIMPSE = 3, - LAYOUT_INTENT_TRUNC = 4, - LAYOUT_INTENT_RELEASE = 5, - LAYOUT_INTENT_RESTORE = 6 -}; - -/* enqueue layout lock with intent */ -struct layout_intent { - __u32 li_opc; /* intent operation for enqueue, read, write etc */ - __u32 li_flags; - __u64 li_start; - __u64 li_end; -}; - -/** - * On the wire version of hsm_progress structure. - * - * Contains the userspace hsm_progress and some internal fields. - */ -struct hsm_progress_kernel { - /* Field taken from struct hsm_progress */ - struct lu_fid hpk_fid; - __u64 hpk_cookie; - struct hsm_extent hpk_extent; - __u16 hpk_flags; - __u16 hpk_errval; /* positive val */ - __u32 hpk_padding1; - /* Additional fields */ - __u64 hpk_data_version; - __u64 hpk_padding2; -} __packed; - -/** layout swap request structure - * fid1 and fid2 are in mdt_body - */ -struct mdc_swap_layouts { - __u64 msl_flags; -} __packed; - -struct close_data { - struct lustre_handle cd_handle; - struct lu_fid cd_fid; - __u64 cd_data_version; - __u64 cd_reserved[8]; -}; - -#endif -/** @} lustreidl */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ioctl.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ioctl.h deleted file mode 100644 index 6e4e109fb874b2af559e7dae65a10f2b75b38339..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ioctl.h +++ /dev/null @@ -1,229 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -#ifndef _UAPI_LUSTRE_IOCTL_H_ -#define _UAPI_LUSTRE_IOCTL_H_ - -#include -#include -#include -#include - -#if !defined(__KERNEL__) && !defined(LUSTRE_UTILS) -# error This file is for Lustre internal use only. 
-#endif - -enum md_echo_cmd { - ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ - ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ - ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ - ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ - ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ - ECHO_MD_GETATTR = 6, /* Getattr on MDT */ - ECHO_MD_SETATTR = 7, /* Setattr on MDT */ - ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ -}; - -#define OBD_DEV_ID 1 -#define OBD_DEV_NAME "obd" -#define OBD_DEV_PATH "/dev/" OBD_DEV_NAME - -#define OBD_IOCTL_VERSION 0x00010004 -#define OBD_DEV_BY_DEVNAME 0xffffd0de - -struct obd_ioctl_data { - __u32 ioc_len; - __u32 ioc_version; - - union { - __u64 ioc_cookie; - __u64 ioc_u64_1; - }; - union { - __u32 ioc_conn1; - __u32 ioc_u32_1; - }; - union { - __u32 ioc_conn2; - __u32 ioc_u32_2; - }; - - struct obdo ioc_obdo1; - struct obdo ioc_obdo2; - - __u64 ioc_count; - __u64 ioc_offset; - __u32 ioc_dev; - __u32 ioc_command; - - __u64 ioc_nid; - __u32 ioc_nal; - __u32 ioc_type; - - /* buffers the kernel will treat as user pointers */ - __u32 ioc_plen1; - char __user *ioc_pbuf1; - __u32 ioc_plen2; - char __user *ioc_pbuf2; - - /* inline buffers for various arguments */ - __u32 ioc_inllen1; - char *ioc_inlbuf1; - __u32 ioc_inllen2; - char *ioc_inlbuf2; - __u32 ioc_inllen3; - char *ioc_inlbuf3; - __u32 ioc_inllen4; - char *ioc_inlbuf4; - - char ioc_bulk[0]; -}; - -struct obd_ioctl_hdr { - __u32 ioc_len; - __u32 ioc_version; -}; - -static inline __u32 obd_ioctl_packlen(struct obd_ioctl_data *data) -{ - __u32 len = __ALIGN_KERNEL(sizeof(*data), 8); - - len += __ALIGN_KERNEL(data->ioc_inllen1, 8); - len += __ALIGN_KERNEL(data->ioc_inllen2, 8); - len += __ALIGN_KERNEL(data->ioc_inllen3, 8); - len += __ALIGN_KERNEL(data->ioc_inllen4, 8); - - return len; -} - -/* - * OBD_IOC_DATA_TYPE is only for compatibility reasons with older - * Linux Lustre user tools. New ioctls should NOT use this macro as - * the ioctl "size". Instead the ioctl should get a "size" argument - * which is the actual data type used by the ioctl, to ensure the - * ioctl interface is versioned correctly. 
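- * As a hedged illustration (the name and number here are invented, not
- * part of this header), a new ioctl would instead be declared against
- * its real payload type:
- * #define OBD_IOC_FOO _IOWR('f', 233, struct obd_foo_data)
- * so that the encoded ioctl size changes whenever the payload type does.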
- */ -#define OBD_IOC_DATA_TYPE long - -/* IOC_LDLM_TEST _IOWR('f', 40, long) */ -/* IOC_LDLM_DUMP _IOWR('f', 41, long) */ -/* IOC_LDLM_REGRESS_START _IOWR('f', 42, long) */ -/* IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) */ - -#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) -#define OBD_IOC_DESTROY _IOW('f', 104, OBD_IOC_DATA_TYPE) -/* OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) */ - -#define OBD_IOC_SETATTR _IOW('f', 107, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETATTR _IOWR('f', 108, OBD_IOC_DATA_TYPE) -#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) -#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) -#define OBD_IOC_SYNC _IOW('f', 114, OBD_IOC_DATA_TYPE) -/* OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) */ - -/* OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) */ -#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) -#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) -#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) -#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) -#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME -#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) -#define OBD_IOC_CLIENT_RECOVER _IOW('f', 133, OBD_IOC_DATA_TYPE) -#define OBD_IOC_PING_TARGET _IOW('f', 136, OBD_IOC_DATA_TYPE) - -/* OBD_IOC_DEC_FS_USE_COUNT _IO('f', 139) */ -#define OBD_IOC_NO_TRANSNO _IOW('f', 140, OBD_IOC_DATA_TYPE) -#define OBD_IOC_SET_READONLY _IOW('f', 141, OBD_IOC_DATA_TYPE) -#define OBD_IOC_ABORT_RECOVERY _IOR('f', 142, OBD_IOC_DATA_TYPE) -/* OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) */ -#define OBD_GET_VERSION _IOWR('f', 144, OBD_IOC_DATA_TYPE) -/* OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_CLOSE_UUID _IOWR('f', 147, OBD_IOC_DATA_TYPE) */ -#define OBD_IOC_CHANGELOG_SEND _IOW('f', 148, OBD_IOC_DATA_TYPE) -#define OBD_IOC_GETDEVICE _IOWR('f', 149, OBD_IOC_DATA_TYPE) -#define OBD_IOC_FID2PATH _IOWR('f', 150, OBD_IOC_DATA_TYPE) -/* lustre/lustre_user.h 151-153 */ -/* OBD_IOC_LOV_SETSTRIPE 154 LL_IOC_LOV_SETSTRIPE */ -/* OBD_IOC_LOV_GETSTRIPE 155 LL_IOC_LOV_GETSTRIPE */ -/* OBD_IOC_LOV_SETEA 156 LL_IOC_LOV_SETEA */ -/* lustre/lustre_user.h 157-159 */ -/* OBD_IOC_QUOTACHECK _IOW('f', 160, int) */ -/* OBD_IOC_POLL_QUOTACHECK _IOR('f', 161, struct if_quotacheck *) */ -#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) -/* lustre/lustre_user.h 163-176 */ -#define OBD_IOC_CHANGELOG_REG _IOW('f', 177, struct obd_ioctl_data) -#define OBD_IOC_CHANGELOG_DEREG _IOW('f', 178, struct obd_ioctl_data) -#define OBD_IOC_CHANGELOG_CLEAR _IOW('f', 179, struct obd_ioctl_data) -/* OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) */ -#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) -/* OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) */ -/* OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) */ -#define OBD_IOC_PARAM _IOW('f', 187, 
OBD_IOC_DATA_TYPE) -#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) -#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) - -#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) -#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) -/* OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) */ -#define OBD_IOC_NODEMAP _IOWR('f', 197, OBD_IOC_DATA_TYPE) - -/* ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) */ -/* ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) */ -/* ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) */ -/* ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) */ - -#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) - -/* lustre/lustre_user.h 212-217 */ -#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) -#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) -#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) -#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) -#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) -#define OBD_IOC_QUERY_LFSCK _IOR('f', 232, struct obd_ioctl_data) -/* lustre/lustre_user.h 240-249 */ -/* LIBCFS_IOC_DEBUG_MASK 250 */ - -#define IOC_OSC_SET_ACTIVE _IOWR('h', 21, void *) - -#endif /* _UAPI_LUSTRE_IOCTL_H_ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h deleted file mode 100644 index 94dadbe8e069e2a699cfd1da5d6a585c5b1089f0..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_kernelcomm.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2013, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * Author: Nathan Rutman - * - * Kernel <-> userspace communication routines. - * The definitions below are used in the kernel and userspace. - */ - -#ifndef __UAPI_LUSTRE_KERNELCOMM_H__ -#define __UAPI_LUSTRE_KERNELCOMM_H__ - -#include - -/* KUC message header. - * All current and future KUC messages should use this header. - * To avoid having to include Lustre headers from libcfs, define this here. 
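- * A sketch of the expected consumer loop (added here for clarity, not in
- * the original header): read sizeof(struct kuc_hdr) bytes, verify that
- * kuc_magic == KUC_MAGIC, then read the remaining
- * kuc_msglen - sizeof(struct kuc_hdr) payload bytes and dispatch on
- * kuc_transport/kuc_msgtype.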
- */ -struct kuc_hdr { - __u16 kuc_magic; - /* Each new Lustre feature should use a different transport */ - __u8 kuc_transport; - __u8 kuc_flags; - /* Message type or opcode, transport-specific */ - __u16 kuc_msgtype; - /* Including header */ - __u16 kuc_msglen; -} __aligned(sizeof(__u64)); - -#define KUC_CHANGELOG_MSG_MAXSIZE (sizeof(struct kuc_hdr) + CR_MAXSIZE) - -#define KUC_MAGIC 0x191C /*Lustre9etLinC */ - -/* kuc_msgtype values are defined in each transport */ -enum kuc_transport_type { - KUC_TRANSPORT_GENERIC = 1, - KUC_TRANSPORT_HSM = 2, - KUC_TRANSPORT_CHANGELOG = 3, -}; - -enum kuc_generic_message_type { - KUC_MSG_SHUTDOWN = 1, -}; - -/* KUC Broadcast Groups. This determines which userspace process hears which - * messages. Multiple transports may be used within a group, or multiple - * groups may use the same transport. Broadcast - * groups need not be used if e.g. a UID is specified instead; - * use group 0 to signify unicast. - */ -#define KUC_GRP_HSM 0x02 -#define KUC_GRP_MAX KUC_GRP_HSM - -#define LK_FLG_STOP 0x01 -#define LK_NOFD -1U - -/* kernelcomm control structure, passed from userspace to kernel */ -struct lustre_kernelcomm { - __u32 lk_wfd; - __u32 lk_rfd; - __u32 lk_uid; - __u32 lk_group; - __u32 lk_data; - __u32 lk_flags; -} __packed; - -#endif /* __UAPI_LUSTRE_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ostid.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ostid.h deleted file mode 100644 index 3343b602219b1106eab4bb9eb53aed0ebc3bc1af..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ostid.h +++ /dev/null @@ -1,236 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2014, Intel Corporation. - * - * Copyright 2015 Cray Inc, all rights reserved. - * Author: Ben Evans. 
- * - * Define ost_id associated functions - */ - -#ifndef _UAPI_LUSTRE_OSTID_H_ -#define _UAPI_LUSTRE_OSTID_H_ - -#include -#include - -static inline __u64 lmm_oi_id(const struct ost_id *oi) -{ - return oi->oi.oi_id; -} - -static inline __u64 lmm_oi_seq(const struct ost_id *oi) -{ - return oi->oi.oi_seq; -} - -static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) -{ - oi->oi.oi_seq = seq; -} - -static inline void lmm_oi_set_id(struct ost_id *oi, __u64 oid) -{ - oi->oi.oi_id = oid; -} - -static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, - const struct ost_id *src_oi) -{ - dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); - dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); -} - -static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, - const struct ost_id *src_oi) -{ - dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); - dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); -} - -/* extract OST sequence (group) from a wire ost_id (id/seq) pair */ -static inline __u64 ostid_seq(const struct ost_id *ostid) -{ - if (fid_seq_is_mdt0(ostid->oi.oi_seq)) - return FID_SEQ_OST_MDT0; - - if (fid_seq_is_default(ostid->oi.oi_seq)) - return FID_SEQ_LOV_DEFAULT; - - if (fid_is_idif(&ostid->oi_fid)) - return FID_SEQ_OST_MDT0; - - return fid_seq(&ostid->oi_fid); -} - -/* extract OST objid from a wire ost_id (id/seq) pair */ -static inline __u64 ostid_id(const struct ost_id *ostid) -{ - if (fid_seq_is_mdt0(ostid->oi.oi_seq)) - return ostid->oi.oi_id & IDIF_OID_MASK; - - if (fid_seq_is_default(ostid->oi.oi_seq)) - return ostid->oi.oi_id; - - if (fid_is_idif(&ostid->oi_fid)) - return fid_idif_id(fid_seq(&ostid->oi_fid), - fid_oid(&ostid->oi_fid), 0); - - return fid_oid(&ostid->oi_fid); -} - -static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) -{ - if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { - oi->oi.oi_seq = seq; - } else { - oi->oi_fid.f_seq = seq; - /* - * Note: if f_oid + f_ver is zero, we need to init it - * to be 1; otherwise, ostid_seq will treat this - * as an old ostid (oi_seq == 0) - */ - if (!oi->oi_fid.f_oid && !oi->oi_fid.f_ver) - oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; - } -} - -static inline void ostid_set_seq_mdt0(struct ost_id *oi) -{ - ostid_set_seq(oi, FID_SEQ_OST_MDT0); -} - -static inline void ostid_set_seq_echo(struct ost_id *oi) -{ - ostid_set_seq(oi, FID_SEQ_ECHO); -} - -static inline void ostid_set_seq_llog(struct ost_id *oi) -{ - ostid_set_seq(oi, FID_SEQ_LLOG); -} - -static inline void ostid_cpu_to_le(const struct ost_id *src_oi, - struct ost_id *dst_oi) -{ - if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { - dst_oi->oi.oi_id = __cpu_to_le64(src_oi->oi.oi_id); - dst_oi->oi.oi_seq = __cpu_to_le64(src_oi->oi.oi_seq); - } else { - fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); - } -} - -static inline void ostid_le_to_cpu(const struct ost_id *src_oi, - struct ost_id *dst_oi) -{ - if (fid_seq_is_mdt0(src_oi->oi.oi_seq)) { - dst_oi->oi.oi_id = __le64_to_cpu(src_oi->oi.oi_id); - dst_oi->oi.oi_seq = __le64_to_cpu(src_oi->oi.oi_seq); - } else { - fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); - } -} - -/** - * Sigh, because pre-2.4 uses - * struct lov_mds_md_v1 { - * ........ - * __u64 lmm_object_id; - * __u64 lmm_object_seq; - * ...... - * } - * to identify the LOV(MDT) object, and lmm_object_seq will - * be normal_fid, which makes it hard to combine these conversions - * into ostid_to_fid(), 
so we will do the lmm_oi/fid conversion separately. - * - * We can tell the lmm_oi apart this way: - * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 - * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL - * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, - * lmm_oi.f_ver = 0 - * - * But currently lmm_oi/lsm_oi does not have any "real" usages, - * except for printing some information, and the user can always - * get the real FID from LMA; besides, this multiple-case check might - * make swabbing more complicated. So we will keep using id/seq for lmm_oi. - */ - -static inline void fid_to_lmm_oi(const struct lu_fid *fid, - struct ost_id *oi) -{ - oi->oi.oi_id = fid_oid(fid); - oi->oi.oi_seq = fid_seq(fid); -} - -/** - * Unpack an OST object id/seq (group) into a FID. This is needed for - * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper - * FIDs. Note that if an id/seq is already in FID/IDIF format it will - * be passed through unchanged. Only legacy OST objects in "group 0" - * will be mapped into the IDIF namespace so that they can fit into the - * struct lu_fid fields without loss. - */ -static inline int ostid_to_fid(struct lu_fid *fid, const struct ost_id *ostid, - __u32 ost_idx) -{ - __u64 seq = ostid_seq(ostid); - - if (ost_idx > 0xffff) - return -EBADF; - - if (fid_seq_is_mdt0(seq)) { - __u64 oid = ostid_id(ostid); - - /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" - * that we map into the IDIF namespace. It allows up to 2^48 - * objects per OST, as this is the object namespace that has - * been in production for years. This can handle create rates - * of 1M objects/s/OST for 9 years, or combinations thereof. - */ - if (oid >= IDIF_MAX_OID) - return -EBADF; - - fid->f_seq = fid_idif_seq(oid, ost_idx); - /* truncate to 32 bits by assignment */ - fid->f_oid = oid; - /* in theory, not currently used */ - fid->f_ver = oid >> 48; - } else if (!fid_seq_is_default(seq)) { - /* This is either an IDIF object, which identifies objects - * across all OSTs, or a regular FID. The IDIF namespace - * maps legacy OST objects into the FID namespace. In both - * cases, we just pass the FID through, no conversion needed. - */ - if (ostid->oi_fid.f_ver) - return -EBADF; - - *fid = ostid->oi_fid; - } - - return 0; -} -#endif /* _UAPI_LUSTRE_OSTID_H_ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_param.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_param.h deleted file mode 100644 index 1eab2ceca3380c371b0fdab1ed2166fb7e8539b8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_param.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * User-settable parameter keys - * - * Author: Nathan Rutman - */ - -#ifndef _UAPI_LUSTRE_PARAM_H_ -#define _UAPI_LUSTRE_PARAM_H_ - -/** \defgroup param param - * - * @{ - */ - -/****************** User-settable parameter keys *********************/ -/* e.g. - * tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda - * lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 - * ... testfs-MDT0000.lov.stripesize=4M - * ... testfs-OST0000.ost.client_cache_seconds=15 - * ... testfs.sys.timeout= - * ... testfs.llite.max_read_ahead_mb=16 - */ - -/* System global or special params not handled in obd's proc - * See mgs_write_log_sys() - */ -#define PARAM_TIMEOUT "timeout=" /* global */ -#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ -#define PARAM_AT_MIN "at_min=" /* global */ -#define PARAM_AT_MAX "at_max=" /* global */ -#define PARAM_AT_EXTRA "at_extra=" /* global */ -#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ -#define PARAM_AT_HISTORY "at_history=" /* global */ -#define PARAM_JOBID_VAR "jobid_var=" /* global */ -#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ -#define PARAM_FAILNODE "failover.node=" /* add failover nid */ -#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ -#define PARAM_ACTIVE "active=" /* activate/deactivate */ -#define PARAM_NETWORK "network=" /* bind on nid */ -#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ - -/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ -#define PARAM_OST "ost." -#define PARAM_OSD "osd." -#define PARAM_OSC "osc." -#define PARAM_MDT "mdt." -#define PARAM_HSM "mdt.hsm." -#define PARAM_MDD "mdd." -#define PARAM_MDC "mdc." -#define PARAM_LLITE "llite." -#define PARAM_LOV "lov." -#define PARAM_LOD "lod." -#define PARAM_OSP "osp." -#define PARAM_SYS "sys." /* global */ -#define PARAM_SRPC "srpc." -#define PARAM_SRPC_FLVR "srpc.flavor." -#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" -#define PARAM_SEC "security." -#define PARAM_QUOTA "quota." /* global */ - -/** @} param */ - -#endif /* _UAPI_LUSTRE_PARAM_H_ */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_user.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_user.h deleted file mode 100644 index 69387f36d1f1ea1e8d26b46b336fd341da33475e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_user.h +++ /dev/null @@ -1,1327 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre/lustre_user.h - * - * Lustre public user-space interface definitions. - */ - -#ifndef _LUSTRE_USER_H -#define _LUSTRE_USER_H - -/** \defgroup lustreuser lustreuser - * - * @{ - */ - -#ifdef __KERNEL__ -# include -# include -# include -# include /* snprintf() */ -# include -#else /* !__KERNEL__ */ -# define NEED_QUOTA_DEFS -# include /* snprintf() */ -# include -# include -# include -#endif /* __KERNEL__ */ -#include - -/* - * We need to always use 64bit version because the structure - * is shared across entire cluster where 32bit and 64bit machines - * are co-existing. - */ -#if __BITS_PER_LONG != 64 || defined(__ARCH_WANT_STAT64) -typedef struct stat64 lstat_t; -#define lstat_f lstat64 -#define fstat_f fstat64 -#define fstatat_f fstatat64 -#else -typedef struct stat lstat_t; -#define lstat_f lstat -#define fstat_f fstat -#define fstatat_f fstatat -#endif - -#define HAVE_LOV_USER_MDS_DATA - -#define LUSTRE_EOF 0xffffffffffffffffULL - -/* for statfs() */ -#define LL_SUPER_MAGIC 0x0BD00BD0 - -#ifndef FSFILT_IOC_GETFLAGS -#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) -#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) -#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) -#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) -#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) -#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) -#endif - -/* FIEMAP flags supported by Lustre */ -#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) - -enum obd_statfs_state { - OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ - OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ - OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */ - OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ - OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ -}; - -struct obd_statfs { - __u64 os_type; - __u64 os_blocks; - __u64 os_bfree; - __u64 os_bavail; - __u64 os_files; - __u64 os_ffree; - __u8 os_fsid[40]; - __u32 os_bsize; - __u32 os_namelen; - __u64 os_maxbytes; - __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ - __u32 os_fprecreated; /* objs available now to the caller */ - /* used in QoS code to find preferred OSTs */ - __u32 os_spare2; - __u32 os_spare3; - __u32 os_spare4; - __u32 os_spare5; - __u32 os_spare6; - __u32 os_spare7; - __u32 os_spare8; - __u32 os_spare9; -}; - -/** - * File IDentifier. - * - * FID is a cluster-wide unique identifier of a file or an object (stripe). - * FIDs are never reused. - **/ -struct lu_fid { - /** - * FID sequence. Sequence is a unit of migration: all files (objects) - * with FIDs from a given sequence are stored on the same server. - * Lustre should support 2^64 objects, so even if each sequence - * has only a single object we can still enumerate 2^64 objects. - **/ - __u64 f_seq; - /* FID number within sequence. 
*/ - __u32 f_oid; - /** - * FID version, used to distinguish different versions (in the sense - * of snapshots, etc.) of the same file system object. Not currently - * used. - **/ - __u32 f_ver; -}; - -static inline bool fid_is_zero(const struct lu_fid *fid) -{ - return !fid->f_seq && !fid->f_oid; -} - -struct filter_fid { - struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */ -}; - -/* keep this one for compatibility */ -struct filter_fid_old { - struct lu_fid ff_parent; - __u64 ff_objid; - __u64 ff_seq; -}; - -/* Userspace should treat lu_fid as opaque, and only use the following methods - * to print or parse them. Other functions (e.g. compare, swab) could be moved - * here from lustre_idl.h if needed. - */ -struct lu_fid; - -/** - * Following struct for object attributes, that will be kept inode's EA. - * Introduced in 2.0 release (please see b15993, for details) - * Added to all objects since Lustre 2.4 as contains self FID - */ -struct lustre_mdt_attrs { - /** - * Bitfield for supported data in this structure. From enum lma_compat. - * lma_self_fid and lma_flags are always available. - */ - __u32 lma_compat; - /** - * Per-file incompat feature list. Lustre version should support all - * flags set in this field. The supported feature mask is available in - * LMA_INCOMPAT_SUPP. - */ - __u32 lma_incompat; - /** FID of this inode */ - struct lu_fid lma_self_fid; -}; - -/** - * Prior to 2.4, the LMA structure also included SOM attributes which has since - * been moved to a dedicated xattr - * lma_flags was also removed because of lma_compat/incompat fields. - */ -#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) - -/** - * OST object IDentifier. - */ -struct ost_id { - union { - struct { - __u64 oi_id; - __u64 oi_seq; - } oi; - struct lu_fid oi_fid; - }; -}; - -#define DOSTID "%#llx:%llu" -#define POSTID(oi) ostid_seq(oi), ostid_id(oi) - -/* - * The ioctl naming rules: - * LL_* - works on the currently opened filehandle instead of parent dir - * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) - * *_MDC_* - gets/sets data related to MDC - * *_LOV_* - gets/sets data related to OSC/LOV - * *FILE* - called on parent dir and passes in a filename - * *STRIPE* - set/get lov_user_md - * *INFO - set/get lov_user_mds_data - */ -/* lustre_ioctl.h 101-150 */ -#define LL_IOC_GETFLAGS _IOR('f', 151, long) -#define LL_IOC_SETFLAGS _IOW('f', 152, long) -#define LL_IOC_CLRFLAGS _IOW('f', 153, long) -#define LL_IOC_LOV_SETSTRIPE _IOW('f', 154, long) -#define LL_IOC_LOV_GETSTRIPE _IOW('f', 155, long) -#define LL_IOC_LOV_SETEA _IOW('f', 156, long) -/* LL_IOC_RECREATE_OBJ 157 obsolete */ -/* LL_IOC_RECREATE_FID 158 obsolete */ -#define LL_IOC_GROUP_LOCK _IOW('f', 158, long) -#define LL_IOC_GROUP_UNLOCK _IOW('f', 159, long) -/* #define LL_IOC_QUOTACHECK 160 OBD_IOC_QUOTACHECK */ -/* #define LL_IOC_POLL_QUOTACHECK 161 OBD_IOC_POLL_QUOTACHECK */ -/* #define LL_IOC_QUOTACTL 162 OBD_IOC_QUOTACTL */ -#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) -/* IOC_LOV_GETINFO 165 obsolete */ -#define LL_IOC_FLUSHCTX _IOW('f', 166, long) -/* LL_IOC_RMTACL 167 obsolete */ -#define LL_IOC_GETOBDCOUNT _IOR('f', 168, long) -#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) -#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) -#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) -#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) -#define LL_IOC_PATH2FID _IOR('f', 173, long) -#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) -#define 
LL_IOC_GET_MDTIDX _IOR('f', 175, int) - -/* lustre_ioctl.h 177-210 */ -#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) -#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) -#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) -#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) -#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) -#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) -#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) -#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) -#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ - struct lustre_swap_layouts) -#define LL_IOC_HSM_ACTION _IOR('f', 220, \ - struct hsm_current_action) -/* see for ioctl numbers 221-232 */ - -#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) -#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) -#define LL_IOC_SET_LEASE _IOWR('f', 243, long) -#define LL_IOC_GET_LEASE _IO('f', 244) -#define LL_IOC_HSM_IMPORT _IOWR('f', 245, struct hsm_user_import) -#define LL_IOC_LMV_SET_DEFAULT_STRIPE _IOWR('f', 246, struct lmv_user_md) -#define LL_IOC_MIGRATE _IOR('f', 247, int) -#define LL_IOC_FID2MDTIDX _IOWR('f', 248, struct lu_fid) -#define LL_IOC_GETPARENT _IOWR('f', 249, struct getparent) - -/* Lease types for use as arg and return of LL_IOC_{GET,SET}_LEASE ioctl. */ -enum ll_lease_type { - LL_LEASE_RDLCK = 0x1, - LL_LEASE_WRLCK = 0x2, - LL_LEASE_UNLCK = 0x4, -}; - -#define LL_STATFS_LMV 1 -#define LL_STATFS_LOV 2 -#define LL_STATFS_NODELAY 4 - -#define IOC_MDC_TYPE 'i' -#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) -#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) -#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) -#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) - -#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ - -/* Define O_LOV_DELAY_CREATE to be a mask that is not useful for regular - * files, but are unlikely to be used in practice and are not harmful if - * used incorrectly. O_NOCTTY and FASYNC are only meaningful for character - * devices and are safe for use on new files (See LU-812, LU-4209). 
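- * A typical (illustrative) sequence is
- * fd = open(path, O_RDWR | O_CREAT | O_LOV_DELAY_CREATE, 0644);
- * followed by an LL_IOC_LOV_SETSTRIPE ioctl on fd to set the layout
- * before the first write.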
- */ -#define O_LOV_DELAY_CREATE (O_NOCTTY | FASYNC) - -#define LL_FILE_IGNORE_LOCK 0x00000001 -#define LL_FILE_GROUP_LOCKED 0x00000002 -#define LL_FILE_READAHEA 0x00000004 -#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ -#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ -#define LL_FILE_RMTACL 0x00000020 - -#define LOV_USER_MAGIC_V1 0x0BD10BD0 -#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 -#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 -#define LOV_USER_MAGIC_V3 0x0BD30BD0 -/* 0x0BD40BD0 is occupied by LOV_MAGIC_MIGRATE */ -#define LOV_USER_MAGIC_SPECIFIC 0x0BD50BD0 /* for specific OSTs */ - -#define LMV_USER_MAGIC 0x0CD30CD0 /*default lmv magic*/ - -#define LOV_PATTERN_RAID0 0x001 -#define LOV_PATTERN_RAID1 0x002 -#define LOV_PATTERN_FIRST 0x100 -#define LOV_PATTERN_CMOBD 0x200 - -#define LOV_PATTERN_F_MASK 0xffff0000 -#define LOV_PATTERN_F_HOLE 0x40000000 /* there is hole in LOV EA */ -#define LOV_PATTERN_F_RELEASED 0x80000000 /* HSM released file */ - -#define LOV_MAXPOOLNAME 15 -#define LOV_POOLNAMEF "%.15s" - -#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ -#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) -#define LOV_MAX_STRIPE_COUNT_OLD 160 -/* This calculation is crafted so that an input of 4096 will result in 160, - * which in turn is equal to the old maximal stripe count. - * XXX: In fact this is too simplified for now; what it also needs is an - * ea_type argument to clearly know how much space each stripe consumes. - * - * The limit of 12 pages is somewhat arbitrary, but is a reasonably large - * allocation that is sufficient for the current generation of systems. - * - * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) - */ -#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ -#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ - -#define XATTR_LUSTRE_PREFIX "lustre." 
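-/* Illustrative note (not from the original header): a file's layout can
- * also be read back through the "lustre.lov" xattr defined below, e.g.
- * (sketch): getxattr(path, XATTR_LUSTRE_LOV, buf, buflen)
- * which fills buf with a struct lov_user_md followed by per-stripe data.
- */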
-#define XATTR_LUSTRE_LOV "lustre.lov" - -#define lov_user_ost_data lov_user_ost_data_v1 -struct lov_user_ost_data_v1 { /* per-stripe data structure */ - struct ost_id l_ost_oi; /* OST object ID */ - __u32 l_ost_gen; /* generation of this OST index */ - __u32 l_ost_idx; /* OST index in LOV */ -} __packed; - -#define lov_user_md lov_user_md_v1 -struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* LOV object ID */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing - */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading - */ - }; - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __attribute__((packed, __may_alias__)); - -struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ - __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ - __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ - struct ost_id lmm_oi; /* LOV object ID */ - __u32 lmm_stripe_size; /* size of stripe in bytes */ - __u16 lmm_stripe_count; /* num stripes in use for this object */ - union { - __u16 lmm_stripe_offset; /* starting stripe offset in - * lmm_objects, use when writing - */ - __u16 lmm_layout_gen; /* layout generation number - * used when reading - */ - }; - char lmm_pool_name[LOV_MAXPOOLNAME + 1]; /* pool name */ - struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ -} __packed; - -static inline __u32 lov_user_md_size(__u16 stripes, __u32 lmm_magic) -{ - if (lmm_magic == LOV_USER_MAGIC_V1) - return sizeof(struct lov_user_md_v1) + - stripes * sizeof(struct lov_user_ost_data_v1); - return sizeof(struct lov_user_md_v3) + - stripes * sizeof(struct lov_user_ost_data_v1); -} - -/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to - * use this. It is unsafe to #define those values in this header as it - * is possible the application has already #included . 
- */ -#ifdef HAVE_LOV_USER_MDS_DATA -#define lov_user_mds_data lov_user_mds_data_v1 -struct lov_user_mds_data_v1 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ -} __packed; - -struct lov_user_mds_data_v3 { - lstat_t lmd_st; /* MDS stat struct */ - struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ -} __packed; -#endif - -struct lmv_user_mds_data { - struct lu_fid lum_fid; - __u32 lum_padding; - __u32 lum_mds; -}; - -enum lmv_hash_type { - LMV_HASH_TYPE_UNKNOWN = 0, /* 0 is reserved for testing purpose */ - LMV_HASH_TYPE_ALL_CHARS = 1, - LMV_HASH_TYPE_FNV_1A_64 = 2, -}; - -#define LMV_HASH_NAME_ALL_CHARS "all_char" -#define LMV_HASH_NAME_FNV_1A_64 "fnv_1a_64" - -/* - * Derived the same way as LOV_MAX_STRIPE_COUNT above: - * (max buffer size - lmv+rpc header) / sizeof(struct lmv_user_mds_data) - */ -#define LMV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ -#define lmv_user_md lmv_user_md_v1 -struct lmv_user_md_v1 { - __u32 lum_magic; /* must be the first field */ - __u32 lum_stripe_count; /* dirstripe count */ - __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ - __u32 lum_hash_type; /* Dir stripe policy */ - __u32 lum_type; /* LMV type: default or normal */ - __u32 lum_padding1; - __u32 lum_padding2; - __u32 lum_padding3; - char lum_pool_name[LOV_MAXPOOLNAME + 1]; - struct lmv_user_mds_data lum_objects[0]; -} __packed; - -static inline int lmv_user_md_size(int stripes, int lmm_magic) -{ - return sizeof(struct lmv_user_md) + - stripes * sizeof(struct lmv_user_mds_data); -} - -struct ll_recreate_obj { - __u64 lrc_id; - __u32 lrc_ost_idx; -}; - -struct ll_fid { - __u64 id; /* holds object id */ - __u32 generation; /* holds object generation */ - __u32 f_type; /* holds object type or stripe idx when passing it to - * OST for saving into EA. - */ -}; - -#define UUID_MAX 40 -struct obd_uuid { - char uuid[UUID_MAX]; -}; - -static inline bool obd_uuid_equals(const struct obd_uuid *u1, - const struct obd_uuid *u2) -{ - return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; -} - -static inline int obd_uuid_empty(struct obd_uuid *uuid) -{ - return uuid->uuid[0] == '\0'; -} - -static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) -{ - strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); - uuid->uuid[sizeof(*uuid) - 1] = '\0'; -} - -/* For printf's only, make sure uuid is terminated */ -static inline char *obd_uuid2str(const struct obd_uuid *uuid) -{ - if (!uuid) - return NULL; - - if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { - /* Obviously not safe, but for printfs, no real harm done... - * we're always null-terminated, even in a race. - */ - static char temp[sizeof(*uuid)]; - - memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); - temp[sizeof(*uuid) - 1] = '\0'; - return temp; - } - return (char *)(uuid->uuid); -} - -/* Extract fsname from the uuid (or target name) of a target, - * e.g. (myfs-OST0007_UUID -> myfs); - * see also deuuidify. 
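- * For instance (added example): obd_uuid2fsname(fs, "myfs-OST0007_UUID",
- * sizeof(fs)) leaves "myfs" in fs, since everything from the last '-'
- * onward is cut off.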
- */ -static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) -{ - char *p; - - strncpy(buf, uuid, buflen - 1); - buf[buflen - 1] = '\0'; - p = strrchr(buf, '-'); - if (p) - *p = '\0'; -} - -/* printf display format - * * usage: printf("file FID is "DFID"\n", PFID(fid)); - */ -#define FID_NOBRACE_LEN 40 -#define FID_LEN (FID_NOBRACE_LEN + 2) -#define DFID_NOBRACE "%#llx:0x%x:0x%x" -#define DFID "[" DFID_NOBRACE "]" -#define PFID(fid) (unsigned long long)(fid)->f_seq, (fid)->f_oid, (fid)->f_ver - -/* scanf input parse format for fids in DFID_NOBRACE format - * Need to strip '[' from DFID format first or use "["SFID"]" at caller. - * usage: sscanf(fidstr, SFID, RFID(&fid)); - */ -#define SFID "0x%llx:0x%x:0x%x" -#define RFID(fid) &((fid)->f_seq), &((fid)->f_oid), &((fid)->f_ver) - -/********* Quotas **********/ - -#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ -#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ -#define Q_GETOINFO 0x800102 /* get obd quota info */ -#define Q_GETOQUOTA 0x800103 /* get obd quotas */ -#define Q_FINVALIDATE 0x800104 /* deprecated as of 2.4 */ - -/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ -#define LUSTRE_Q_QUOTAON 0x800002 /* deprecated as of 2.4 */ -#define LUSTRE_Q_QUOTAOFF 0x800003 /* deprecated as of 2.4 */ -#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ -#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ -#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ -#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ -/* lustre-specific control commands */ -#define LUSTRE_Q_INVALIDATE 0x80000b /* deprecated as of 2.4 */ -#define LUSTRE_Q_FINVALIDATE 0x80000c /* deprecated as of 2.4 */ - -#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ - -#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 - -/* permission */ -#define N_PERMS_MAX 64 - -struct perm_downcall_data { - __u64 pdd_nid; - __u32 pdd_perm; - __u32 pdd_padding; -}; - -struct identity_downcall_data { - __u32 idd_magic; - __u32 idd_err; - __u32 idd_uid; - __u32 idd_gid; - __u32 idd_nperms; - __u32 idd_ngroups; - struct perm_downcall_data idd_perms[N_PERMS_MAX]; - __u32 idd_groups[0]; -}; - -/* lustre volatile file support - * file name header: .^L^S^T^R:volatile" - */ -#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" -#define LUSTRE_VOLATILE_HDR_LEN 14 -/* hdr + MDT index */ -#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:" - -enum lustre_quota_version { - LUSTRE_QUOTA_V2 = 1 -}; - -/* XXX: same as if_dqinfo struct in kernel */ -struct obd_dqinfo { - __u64 dqi_bgrace; - __u64 dqi_igrace; - __u32 dqi_flags; - __u32 dqi_valid; -}; - -/* XXX: same as if_dqblk struct in kernel, plus one padding */ -struct obd_dqblk { - __u64 dqb_bhardlimit; - __u64 dqb_bsoftlimit; - __u64 dqb_curspace; - __u64 dqb_ihardlimit; - __u64 dqb_isoftlimit; - __u64 dqb_curinodes; - __u64 dqb_btime; - __u64 dqb_itime; - __u32 dqb_valid; - __u32 dqb_padding; -}; - -enum { - QC_GENERAL = 0, - QC_MDTIDX = 1, - QC_OSTIDX = 2, - QC_UUID = 3 -}; - -struct if_quotactl { - __u32 qc_cmd; - __u32 qc_type; - __u32 qc_id; - __u32 qc_stat; - __u32 qc_valid; - __u32 qc_idx; - struct obd_dqinfo qc_dqinfo; - struct obd_dqblk qc_dqblk; - char obd_type[16]; - struct obd_uuid obd_uuid; -}; - -/* swap layout flags */ -#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) -#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) -#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) -#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) -#define SWAP_LAYOUTS_CLOSE (1 << 
4) - -/* Swap XATTR_NAME_HSM as well, only on the MDT so far */ -#define SWAP_LAYOUTS_MDS_HSM (1 << 31) -struct lustre_swap_layouts { - __u64 sl_flags; - __u32 sl_fd; - __u32 sl_gid; - __u64 sl_dv1; - __u64 sl_dv2; -}; - -/********* Changelogs **********/ -/** Changelog record types */ -enum changelog_rec_type { - CL_MARK = 0, - CL_CREATE = 1, /* namespace */ - CL_MKDIR = 2, /* namespace */ - CL_HARDLINK = 3, /* namespace */ - CL_SOFTLINK = 4, /* namespace */ - CL_MKNOD = 5, /* namespace */ - CL_UNLINK = 6, /* namespace */ - CL_RMDIR = 7, /* namespace */ - CL_RENAME = 8, /* namespace */ - CL_EXT = 9, /* namespace extended record (2nd half of rename) */ - CL_OPEN = 10, /* not currently used */ - CL_CLOSE = 11, /* may be written to log only with mtime change */ - CL_LAYOUT = 12, /* file layout/striping modified */ - CL_TRUNC = 13, - CL_SETATTR = 14, - CL_XATTR = 15, - CL_HSM = 16, /* HSM specific events, see flags */ - CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ - CL_CTIME = 18, - CL_ATIME = 19, - CL_LAST -}; - -static inline const char *changelog_type2str(int type) -{ - static const char *changelog_str[] = { - "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", - "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", - "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", - }; - - if (type >= 0 && type < CL_LAST) - return changelog_str[type]; - return NULL; -} - -/* per-record flags */ -#define CLF_FLAGSHIFT 12 -#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) -#define CLF_VERMASK (~CLF_FLAGMASK) -enum changelog_rec_flags { - CLF_VERSION = 0x1000, - CLF_RENAME = 0x2000, - CLF_JOBID = 0x4000, - CLF_SUPPORTED = CLF_VERSION | CLF_RENAME | CLF_JOBID -}; - -/* Anything under the flagmask may be per-type (if desired) */ -/* Flags for unlink */ -#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ -#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ - /* HSM cleaning needed */ -/* Flags for rename */ -#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of - * target - */ -#define CLF_RENAME_LAST_EXISTS 0x0002 /* rename unlink last hardlink of target - * has an archive in backend - */ - -/* Flags for HSM */ -/* 12b used (from high weight to low weight): - * 2b for flags - * 3b for event - * 7b for error code - */ -#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ -#define CLF_HSM_ERR_H 6 -#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ -#define CLF_HSM_EVENT_H 9 -#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ -#define CLF_HSM_FLAG_H 11 -#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ -#define CLF_HSM_SPARE_H 15 -#define CLF_HSM_LAST 15 - -/* Remove bits higher than _h, then extract the value - * between _h and _l by shifting lower weight to bit 0. 
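- * Added worked example: for flags 0x0380, the HSM event field
- * (_h = CLF_HSM_EVENT_H = 9, _l = CLF_HSM_EVENT_L = 7) comes out as
- * ((0x0380 << (15 - 9)) & 0xFFFF) >> (15 - 9 + 7) = 0xE000 >> 13 = 7,
- * i.e. HE_SPARE2.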
- */ -#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ - >> (CLF_HSM_LAST - _h + _l)) - -#define CLF_HSM_SUCCESS 0x00 -#define CLF_HSM_MAXERROR 0x7E -#define CLF_HSM_ERROVERFLOW 0x7F - -#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ - -/* 3 bits field => 8 values allowed */ -enum hsm_event { - HE_ARCHIVE = 0, - HE_RESTORE = 1, - HE_CANCEL = 2, - HE_RELEASE = 3, - HE_REMOVE = 4, - HE_STATE = 5, - HE_SPARE1 = 6, - HE_SPARE2 = 7, -}; - -static inline enum hsm_event hsm_get_cl_event(__u16 flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L); -} - -static inline void hsm_set_cl_event(int *flags, enum hsm_event he) -{ - *flags |= (he << CLF_HSM_EVENT_L); -} - -static inline __u16 hsm_get_cl_flags(int flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); -} - -static inline void hsm_set_cl_flags(int *flags, int bits) -{ - *flags |= (bits << CLF_HSM_FLAG_L); -} - -static inline int hsm_get_cl_error(int flags) -{ - return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); -} - -static inline void hsm_set_cl_error(int *flags, int error) -{ - *flags |= (error << CLF_HSM_ERR_L); -} - -enum changelog_send_flag { - /* Not yet implemented */ - CHANGELOG_FLAG_FOLLOW = 0x01, - /* - * Blocking IO makes sense in case of slow user parsing of the records, - * but it also prevents us from cleaning up if the records are not - * consumed. - */ - CHANGELOG_FLAG_BLOCK = 0x02, - /* Pack jobid into the changelog records if available. */ - CHANGELOG_FLAG_JOBID = 0x04, -}; - -#define CR_MAXSIZE cfs_size_round(2 * NAME_MAX + 2 + \ - changelog_rec_offset(CLF_SUPPORTED)) - -/* 31 usable bytes string + null terminator. */ -#define LUSTRE_JOBID_SIZE 32 - -/* - * This is the minimal changelog record. It can contain extensions - * such as rename fields or process jobid. Its exact content is described - * by the cr_flags. - * - * Extensions are packed in the same order as their corresponding flags. - */ -struct changelog_rec { - __u16 cr_namelen; - __u16 cr_flags; /**< \a changelog_rec_flags */ - __u32 cr_type; /**< \a changelog_rec_type */ - __u64 cr_index; /**< changelog record number */ - __u64 cr_prev; /**< last index for this target fid */ - __u64 cr_time; - union { - struct lu_fid cr_tfid; /**< target fid */ - __u32 cr_markerflags; /**< CL_MARK flags */ - }; - struct lu_fid cr_pfid; /**< parent fid */ -} __packed; - -/* Changelog extension for RENAME. */ -struct changelog_ext_rename { - struct lu_fid cr_sfid; /**< source fid, or zero */ - struct lu_fid cr_spfid; /**< source parent fid, or zero */ -}; - -/* Changelog extension to include JOBID. */ -struct changelog_ext_jobid { - char cr_jobid[LUSTRE_JOBID_SIZE]; /**< zero-terminated string. 
*/ -}; - -static inline size_t changelog_rec_offset(enum changelog_rec_flags crf) -{ - size_t size = sizeof(struct changelog_rec); - - if (crf & CLF_RENAME) - size += sizeof(struct changelog_ext_rename); - - if (crf & CLF_JOBID) - size += sizeof(struct changelog_ext_jobid); - - return size; -} - -static inline size_t changelog_rec_size(struct changelog_rec *rec) -{ - return changelog_rec_offset(rec->cr_flags); -} - -static inline size_t changelog_rec_varsize(struct changelog_rec *rec) -{ - return changelog_rec_size(rec) - sizeof(*rec) + rec->cr_namelen; -} - -static inline -struct changelog_ext_rename *changelog_rec_rename(struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & CLF_VERSION; - - return (struct changelog_ext_rename *)((char *)rec + - changelog_rec_offset(crf)); -} - -/* The jobid follows the rename extension, if present */ -static inline -struct changelog_ext_jobid *changelog_rec_jobid(struct changelog_rec *rec) -{ - enum changelog_rec_flags crf = rec->cr_flags & - (CLF_VERSION | CLF_RENAME); - - return (struct changelog_ext_jobid *)((char *)rec + - changelog_rec_offset(crf)); -} - -/* The name follows the rename and jobid extensions, if present */ -static inline char *changelog_rec_name(struct changelog_rec *rec) -{ - return (char *)rec + changelog_rec_offset(rec->cr_flags & - CLF_SUPPORTED); -} - -static inline size_t changelog_rec_snamelen(struct changelog_rec *rec) -{ - return rec->cr_namelen - strlen(changelog_rec_name(rec)) - 1; -} - -static inline char *changelog_rec_sname(struct changelog_rec *rec) -{ - char *cr_name = changelog_rec_name(rec); - - return cr_name + strlen(cr_name) + 1; -} - -/** - * Remap a record to the desired format as specified by the crf flags. - * The record must be big enough to contain the final remapped version. - * Superfluous extension fields are removed and missing ones are added - * and zeroed. The flags of the record are updated accordingly. - * - * The jobid and rename extensions can be added to a record, to match the - * format an application expects, typically. In this case, the newly added - * fields will be zeroed. - * The Jobid field can be removed, to guarantee compatibility with older - * clients that don't expect this field in the records they process. - * - * The following assumptions are being made: - * - CLF_RENAME will not be removed - * - CLF_JOBID will not be added without CLF_RENAME being added too - * - * @param[in,out] rec The record to remap. - * @param[in] crf_wanted Flags describing the desired extensions. 
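Before the remap helper itself, a sketch of how a consumer typically walks one record with the accessors above (hypothetical userspace code; print_one() and the stdio calls are illustrative):

	#include <stdio.h>

	static void print_one(struct changelog_rec *rec)
	{
		const char *t = changelog_type2str(rec->cr_type);

		/* fixed part first, then the flag-ordered extensions */
		printf("op %s name %.*s", t ? t : "?", rec->cr_namelen,
		       changelog_rec_name(rec));
		if (rec->cr_flags & CLF_JOBID)
			printf(" jobid %s", changelog_rec_jobid(rec)->cr_jobid);
		printf("\n");
	}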
- */ -static inline void changelog_remap_rec(struct changelog_rec *rec, - enum changelog_rec_flags crf_wanted) -{ - char *jid_mov, *rnm_mov; - - crf_wanted &= CLF_SUPPORTED; - - if ((rec->cr_flags & CLF_SUPPORTED) == crf_wanted) - return; - - /* First move the variable-length name field */ - memmove((char *)rec + changelog_rec_offset(crf_wanted), - changelog_rec_name(rec), rec->cr_namelen); - - /* Locations of jobid and rename extensions in the remapped record */ - jid_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~CLF_JOBID); - rnm_mov = (char *)rec + - changelog_rec_offset(crf_wanted & ~(CLF_JOBID | CLF_RENAME)); - - /* Move the extension fields to the desired positions */ - if ((crf_wanted & CLF_JOBID) && (rec->cr_flags & CLF_JOBID)) - memmove(jid_mov, changelog_rec_jobid(rec), - sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && (rec->cr_flags & CLF_RENAME)) - memmove(rnm_mov, changelog_rec_rename(rec), - sizeof(struct changelog_ext_rename)); - - /* Clear newly added fields */ - if ((crf_wanted & CLF_JOBID) && !(rec->cr_flags & CLF_JOBID)) - memset(jid_mov, 0, sizeof(struct changelog_ext_jobid)); - - if ((crf_wanted & CLF_RENAME) && !(rec->cr_flags & CLF_RENAME)) - memset(rnm_mov, 0, sizeof(struct changelog_ext_rename)); - - /* Update the record's flags accordingly */ - rec->cr_flags = (rec->cr_flags & CLF_FLAGMASK) | crf_wanted; -} - -struct ioc_changelog { - __u64 icc_recno; - __u32 icc_mdtindex; - __u32 icc_id; - __u32 icc_flags; -}; - -enum changelog_message_type { - CL_RECORD = 10, /* message is a changelog_rec */ - CL_EOF = 11, /* at end of current changelog */ -}; - -/********* Misc **********/ - -struct ioc_data_version { - __u64 idv_version; - __u64 idv_flags; /* See LL_DV_xxx */ -}; - -#define LL_DV_RD_FLUSH (1 << 0) /* Flush dirty pages from clients */ -#define LL_DV_WR_FLUSH (1 << 1) /* Flush all caching pages from clients */ - -#ifndef offsetof -# define offsetof(typ, memb) ((unsigned long)((char *)&(((typ *)0)->memb))) -#endif - -#define dot_lustre_name ".lustre" - -/********* HSM **********/ - -/** HSM per-file state - * See HSM_FLAGS below. - */ -enum hsm_states { - HS_NONE = 0x00000000, - HS_EXISTS = 0x00000001, - HS_DIRTY = 0x00000002, - HS_RELEASED = 0x00000004, - HS_ARCHIVED = 0x00000008, - HS_NORELEASE = 0x00000010, - HS_NOARCHIVE = 0x00000020, - HS_LOST = 0x00000040, -}; - -/* HSM user-settable flags. */ -#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) - -/* Other HSM flags. */ -#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) - -/* - * All HSM-related possible flags that could be applied to a file. - * This should be kept in sync with hsm_states. - */ -#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) - -/** - * HSM request progress state - */ -enum hsm_progress_states { - HPS_WAITING = 1, - HPS_RUNNING = 2, - HPS_DONE = 3, -}; - -#define HPS_NONE 0 - -static inline char *hsm_progress_state2name(enum hsm_progress_states s) -{ - switch (s) { - case HPS_WAITING: return "waiting"; - case HPS_RUNNING: return "running"; - case HPS_DONE: return "done"; - default: return "unknown"; - } -} - -struct hsm_extent { - __u64 offset; - __u64 length; -} __packed; - -/** - * Current HSM states of a Lustre file. - * - * This structure is mainly meant to be sent to user space. It describes the - * current HSM flags and in-progress action. - */ -struct hsm_user_state { - /** Current HSM states, from enum hsm_states.
*/ - __u32 hus_states; - __u32 hus_archive_id; - /** The current undergoing action, if there is one */ - __u32 hus_in_progress_state; - __u32 hus_in_progress_action; - struct hsm_extent hus_in_progress_location; - char hus_extended_info[]; -}; - -struct hsm_state_set_ioc { - struct lu_fid hssi_fid; - __u64 hssi_setmask; - __u64 hssi_clearmask; -}; - -/* - * This structure describes the current in-progress action for a file. - * It is returned to user space and sent over the wire. - */ -struct hsm_current_action { - /** The current undergoing action, if there is one */ - /* state is one of hsm_progress_states */ - __u32 hca_state; - /* action is one of hsm_user_action */ - __u32 hca_action; - struct hsm_extent hca_location; -}; - -/***** HSM user requests ******/ -/* User-generated (lfs/ioctl) request types */ -enum hsm_user_action { - HUA_NONE = 1, /* no action (noop) */ - HUA_ARCHIVE = 10, /* copy to hsm */ - HUA_RESTORE = 11, /* prestage */ - HUA_RELEASE = 12, /* drop ost objects */ - HUA_REMOVE = 13, /* remove from archive */ - HUA_CANCEL = 14 /* cancel a request */ -}; - -static inline char *hsm_user_action2name(enum hsm_user_action a) -{ - switch (a) { - case HUA_NONE: return "NOOP"; - case HUA_ARCHIVE: return "ARCHIVE"; - case HUA_RESTORE: return "RESTORE"; - case HUA_RELEASE: return "RELEASE"; - case HUA_REMOVE: return "REMOVE"; - case HUA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* - * List of hr_flags (bit field) - */ -#define HSM_FORCE_ACTION 0x0001 -/* used by the copytool (CT), cannot be set by user */ -#define HSM_GHOST_COPY 0x0002 - -/** - * Contains the fixed part of struct hsm_user_request. - * - */ -struct hsm_request { - __u32 hr_action; /* enum hsm_user_action */ - __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ - __u64 hr_flags; /* request flags */ - __u32 hr_itemcount; /* item count in hur_user_item vector */ - __u32 hr_data_len; -}; - -struct hsm_user_item { - struct lu_fid hui_fid; - struct hsm_extent hui_extent; -} __packed; - -struct hsm_user_request { - struct hsm_request hur_request; - struct hsm_user_item hur_user_item[0]; - /* extra data blob at end of struct (after all - * hur_user_items), only use helpers to access it - */ -} __packed; - -/** Return pointer to data field in a hsm user request */ -static inline void *hur_data(struct hsm_user_request *hur) -{ - return &hur->hur_user_item[hur->hur_request.hr_itemcount]; -} - -/** - * Compute the current length of the provided hsm_user_request. This returns -1 - * instead of an errno because ssize_t is defined to be only [ -1, SSIZE_MAX ] - * - * return -1 on bounds check error.
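How the pieces above combine in practice, as a hypothetical userspace sketch (sizing mirrors hur_len() below; treating a length of (__u64)-1 as "to end of file" follows the lfs tooling convention and is stated here as an assumption):

	#include <stdlib.h>

	static struct hsm_user_request *build_archive_req(const struct lu_fid *fid)
	{
		size_t len = sizeof(struct hsm_request) +
			     sizeof(struct hsm_user_item); /* one item, no data */
		struct hsm_user_request *hur = calloc(1, len);

		if (!hur)
			return NULL;
		hur->hur_request.hr_action = HUA_ARCHIVE;
		hur->hur_request.hr_archive_id = 1;	/* illustrative archive id */
		hur->hur_request.hr_itemcount = 1;
		hur->hur_user_item[0].hui_fid = *fid;
		hur->hur_user_item[0].hui_extent.length = (__u64)-1; /* to EOF */
		return hur;
	}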
- */ -static inline ssize_t hur_len(struct hsm_user_request *hur) -{ - __u64 size; - - /* can't overflow a __u64 since hr_itemcount is only __u32 */ - size = offsetof(struct hsm_user_request, hur_user_item[0]) + - (__u64)hur->hur_request.hr_itemcount * - sizeof(hur->hur_user_item[0]) + hur->hur_request.hr_data_len; - - if (size != (ssize_t)size) - return -1; - - return size; -} - -/****** HSM RPCs to copytool *****/ -/* Message types the copytool may receive */ -enum hsm_message_type { - HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ -}; - -/* Actions the copytool may be instructed to take for a given action_item */ -enum hsm_copytool_action { - HSMA_NONE = 10, /* no action */ - HSMA_ARCHIVE = 20, /* arbitrary offset */ - HSMA_RESTORE = 21, - HSMA_REMOVE = 22, - HSMA_CANCEL = 23 -}; - -static inline char *hsm_copytool_action2name(enum hsm_copytool_action a) -{ - switch (a) { - case HSMA_NONE: return "NOOP"; - case HSMA_ARCHIVE: return "ARCHIVE"; - case HSMA_RESTORE: return "RESTORE"; - case HSMA_REMOVE: return "REMOVE"; - case HSMA_CANCEL: return "CANCEL"; - default: return "UNKNOWN"; - } -} - -/* Copytool item action description */ -struct hsm_action_item { - __u32 hai_len; /* valid size of this struct */ - __u32 hai_action; /* hsm_copytool_action, but use known size */ - struct lu_fid hai_fid; /* Lustre FID to operate on */ - struct lu_fid hai_dfid; /* fid used for data access */ - struct hsm_extent hai_extent; /* byte range to operate on */ - __u64 hai_cookie; /* action cookie from coordinator */ - __u64 hai_gid; /* grouplock id */ - char hai_data[0]; /* variable length */ -} __packed; - -/* - * Helper function that prints in hex the first bytes of - * the hai opaque field - * \param hai [IN] record to print - * \param buffer [OUT] output buffer - * \param len [IN] max buffer len - * \retval buffer - */ -static inline char *hai_dump_data_field(struct hsm_action_item *hai, - char *buffer, size_t len) -{ - int i, data_len; - char *ptr; - - ptr = buffer; - data_len = hai->hai_len - sizeof(*hai); - for (i = 0; (i < data_len) && (len > 2); i++) { - snprintf(ptr, 3, "%02X", (unsigned char)hai->hai_data[i]); - ptr += 2; - len -= 2; - } - - *ptr = '\0'; - - return buffer; -} - -/* Copytool action list */ -#define HAL_VERSION 1 -#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ -struct hsm_action_list { - __u32 hal_version; - __u32 hal_count; /* number of hai's to follow */ - __u64 hal_compound_id; /* returned by coordinator */ - __u64 hal_flags; - __u32 hal_archive_id; /* which archive backend */ - __u32 padding1; - char hal_fsname[0]; /* null-terminated */ - /* struct hsm_action_item[hal_count] follows, aligned on 8-byte - * boundaries.
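A sketch of how a copytool might use the dump helper above while logging (illustrative userspace code; a 33-byte buffer shows at most 16 payload bytes, two hex characters each plus the terminating NUL):

	#include <stdio.h>

	static void debug_one_item(struct hsm_action_item *hai)
	{
		char buf[33];

		printf("action %s cookie %#llx data %s\n",
		       hsm_copytool_action2name(hai->hai_action),
		       (unsigned long long)hai->hai_cookie,
		       hai_dump_data_field(hai, buf, sizeof(buf)));
	}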
See hai_first - */ -} __packed; - -#ifndef HAVE_CFS_SIZE_ROUND -static inline int cfs_size_round(int val) -{ - return (val + 7) & (~0x7); -} - -#define HAVE_CFS_SIZE_ROUND -#endif - -/* Return pointer to first hai in action list */ -static inline struct hsm_action_item *hai_first(struct hsm_action_list *hal) -{ - return (struct hsm_action_item *)(hal->hal_fsname + - cfs_size_round(strlen(hal-> \ - hal_fsname) - + 1)); -} - -/* Return pointer to next hai */ -static inline struct hsm_action_item *hai_next(struct hsm_action_item *hai) -{ - return (struct hsm_action_item *)((char *)hai + - cfs_size_round(hai->hai_len)); -} - -/* Return size of an hsm_action_list */ -static inline int hal_size(struct hsm_action_list *hal) -{ - int i, sz; - struct hsm_action_item *hai; - - sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname) + 1); - hai = hai_first(hal); - for (i = 0; i < hal->hal_count; i++, hai = hai_next(hai)) - sz += cfs_size_round(hai->hai_len); - - return sz; -} - -/* HSM file import - * describe the attributes to be set on imported file - */ -struct hsm_user_import { - __u64 hui_size; - __u64 hui_atime; - __u64 hui_mtime; - __u32 hui_atime_ns; - __u32 hui_mtime_ns; - __u32 hui_uid; - __u32 hui_gid; - __u32 hui_mode; - __u32 hui_archive_id; -}; - -/* Copytool progress reporting */ -#define HP_FLAG_COMPLETED 0x01 -#define HP_FLAG_RETRY 0x02 - -struct hsm_progress { - struct lu_fid hp_fid; - __u64 hp_cookie; - struct hsm_extent hp_extent; - __u16 hp_flags; - __u16 hp_errval; /* positive val */ - __u32 padding; -}; - -struct hsm_copy { - __u64 hc_data_version; - __u16 hc_flags; - __u16 hc_errval; /* positive val */ - __u32 padding; - struct hsm_action_item hc_hai; -}; - -/** @} lustreuser */ - -#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ver.h b/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ver.h deleted file mode 100644 index 19c9135e2273dfbf8f0ed074ed6efa9fd64ccfce..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/include/uapi/linux/lustre/lustre_ver.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _LUSTRE_VER_H_ -#define _LUSTRE_VER_H_ - -#define LUSTRE_MAJOR 2 -#define LUSTRE_MINOR 6 -#define LUSTRE_PATCH 99 -#define LUSTRE_FIX 0 -#define LUSTRE_VERSION_STRING "2.6.99" - -#define OBD_OCD_VERSION(major, minor, patch, fix) \ - (((major) << 24) + ((minor) << 16) + ((patch) << 8) + (fix)) - -#define OBD_OCD_VERSION_MAJOR(version) ((int)((version) >> 24) & 255) -#define OBD_OCD_VERSION_MINOR(version) ((int)((version) >> 16) & 255) -#define OBD_OCD_VERSION_PATCH(version) ((int)((version) >> 8) & 255) -#define OBD_OCD_VERSION_FIX(version) ((int)((version) >> 0) & 255) - -#define LUSTRE_VERSION_CODE \ - OBD_OCD_VERSION(LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH, LUSTRE_FIX) - -/* - * If the Lustre version of a client differs from that of the servers it - * connects to by more than this amount, the client issues a warning.
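The packing above is one byte per component; a worked check (a sketch using the C11 static_assert, not anything from this header):

	#include <assert.h>

	/* For 2.6.99.0: (2 << 24) + (6 << 16) + (99 << 8) + 0 == 0x02066300,
	 * and OBD_OCD_VERSION_PATCH(0x02066300) == 0x63 == 99. */
	static_assert(OBD_OCD_VERSION(2, 6, 99, 0) == 0x02066300,
		      "one byte per version component");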
- */ -#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0) - -#endif diff --git a/drivers/staging/lustre/lnet/Kconfig b/drivers/staging/lustre/lnet/Kconfig deleted file mode 100644 index ad049e6f24e47a73b6894340be2742d95b21f692..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/Kconfig +++ /dev/null @@ -1,46 +0,0 @@ -config LNET - tristate "Lustre networking subsystem (LNet)" - depends on INET - help - The Lustre network layer, also known as LNet, is a networking abstraction layer API that was initially created to allow the Lustre filesystem to use very different networks, such as TCP and InfiniBand verbs, in a uniform way. In the case of Lustre routers only the LNet layer is required. Lately, other projects have also been looking into using LNet as their networking API. - -config LNET_MAX_PAYLOAD - int "Lustre lnet max transfer payload (default 1MB)" - depends on LNET - default "1048576" - help - This option defines the maximum payload size in bytes that LNet - can put into its transport. - - If unsure, use the default. - -config LNET_SELFTEST - tristate "Lustre networking self testing" - depends on LNET - help - Choose Y here if you want to do LNet self testing. To compile this - as a module, choose M here: the module will be called lnet_selftest. - - If unsure, say N. - - See also http://wiki.lustre.org/ - -config LNET_XPRT_IB - tristate "LNET InfiniBand support" - depends on LNET && PCI && INFINIBAND && INFINIBAND_ADDR_TRANS - default LNET && INFINIBAND - help - This option allows LNet users to use InfiniBand as an - RDMA-enabled transport. - - To compile this as a kernel module, choose M here and it will be - called ko2iblnd. - - If unsure, say N. diff --git a/drivers/staging/lustre/lnet/Makefile b/drivers/staging/lustre/lnet/Makefile deleted file mode 100644 index 0a380fe88ce862247619daef60e1e18ee273dbe0..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_LNET) += libcfs/ lnet/ klnds/ selftest/ diff --git a/drivers/staging/lustre/lnet/klnds/Makefile b/drivers/staging/lustre/lnet/klnds/Makefile deleted file mode 100644 index c23e4f67f83701c30868ccbe8e1d934025a13536..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_LNET) += o2iblnd/ socklnd/ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile b/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile deleted file mode 100644 index 4affe1d799487f014b65d62ad8ad63860f3e085f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LNET_XPRT_IB) += ko2iblnd.o -ko2iblnd-y := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c deleted file mode 100644 index f0b4eb42bc1dca809d55cc5d125e8ff150e25b7a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c +++ /dev/null @@ -1,2958 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/o2iblnd/o2iblnd.c - * - * Author: Eric Barton - */ - -#include -#include -#include "o2iblnd.h" - -static struct lnet_lnd the_o2iblnd; - -struct kib_data kiblnd_data; - -static __u32 kiblnd_cksum(void *ptr, int nob) -{ - char *c = ptr; - __u32 sum = 0; - - while (nob-- > 0) - sum = ((sum << 1) | (sum >> 31)) + *c++; - - /* ensure I don't return 0 (== no checksum) */ - return !sum ? 1 : sum; -} - -static char *kiblnd_msgtype2str(int type) -{ - switch (type) { - case IBLND_MSG_CONNREQ: - return "CONNREQ"; - - case IBLND_MSG_CONNACK: - return "CONNACK"; - - case IBLND_MSG_NOOP: - return "NOOP"; - - case IBLND_MSG_IMMEDIATE: - return "IMMEDIATE"; - - case IBLND_MSG_PUT_REQ: - return "PUT_REQ"; - - case IBLND_MSG_PUT_NAK: - return "PUT_NAK"; - - case IBLND_MSG_PUT_ACK: - return "PUT_ACK"; - - case IBLND_MSG_PUT_DONE: - return "PUT_DONE"; - - case IBLND_MSG_GET_REQ: - return "GET_REQ"; - - case IBLND_MSG_GET_DONE: - return "GET_DONE"; - - default: - return "???"; - } -} - -static int kiblnd_msgtype2size(int type) -{ - const int hdr_size = offsetof(struct kib_msg, ibm_u); - - switch (type) { - case IBLND_MSG_CONNREQ: - case IBLND_MSG_CONNACK: - return hdr_size + sizeof(struct kib_connparams); - - case IBLND_MSG_NOOP: - return hdr_size; - - case IBLND_MSG_IMMEDIATE: - return offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[0]); - - case IBLND_MSG_PUT_REQ: - return hdr_size + sizeof(struct kib_putreq_msg); - - case IBLND_MSG_PUT_ACK: - return hdr_size + sizeof(struct kib_putack_msg); - - case IBLND_MSG_GET_REQ: - return hdr_size + sizeof(struct kib_get_msg); - - case IBLND_MSG_PUT_NAK: - case IBLND_MSG_PUT_DONE: - case IBLND_MSG_GET_DONE: - return hdr_size + sizeof(struct kib_completion_msg); - default: - return -1; - } -} - -static int kiblnd_unpack_rd(struct kib_msg *msg, int flip) -{ - struct kib_rdma_desc *rd; - int msg_size; - int nob; - int n; - int i; - - LASSERT(msg->ibm_type == IBLND_MSG_GET_REQ || - msg->ibm_type == IBLND_MSG_PUT_ACK); - - rd = msg->ibm_type == IBLND_MSG_GET_REQ ? 
- &msg->ibm_u.get.ibgm_rd : - &msg->ibm_u.putack.ibpam_rd; - - if (flip) { - __swab32s(&rd->rd_key); - __swab32s(&rd->rd_nfrags); - } - - n = rd->rd_nfrags; - - nob = offsetof(struct kib_msg, ibm_u) + - kiblnd_rd_msg_size(rd, msg->ibm_type, n); - - if (msg->ibm_nob < nob) { - CERROR("Short %s: %d(%d)\n", - kiblnd_msgtype2str(msg->ibm_type), msg->ibm_nob, nob); - return 1; - } - - msg_size = kiblnd_rd_size(rd); - if (msg_size <= 0 || msg_size > LNET_MAX_PAYLOAD) { - CERROR("Bad msg_size: %d, should be 0 < n <= %d\n", - msg_size, LNET_MAX_PAYLOAD); - return 1; - } - - if (!flip) - return 0; - - for (i = 0; i < n; i++) { - __swab32s(&rd->rd_frags[i].rf_nob); - __swab64s(&rd->rd_frags[i].rf_addr); - } - - return 0; -} - -void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp) -{ - struct kib_net *net = ni->ni_data; - - /* - * CAVEAT EMPTOR! all message fields not set here should have been - * initialised previously. - */ - msg->ibm_magic = IBLND_MSG_MAGIC; - msg->ibm_version = version; - /* ibm_type */ - msg->ibm_credits = credits; - /* ibm_nob */ - msg->ibm_cksum = 0; - msg->ibm_srcnid = ni->ni_nid; - msg->ibm_srcstamp = net->ibn_incarnation; - msg->ibm_dstnid = dstnid; - msg->ibm_dststamp = dststamp; - - if (*kiblnd_tunables.kib_cksum) { - /* NB ibm_cksum zero while computing cksum */ - msg->ibm_cksum = kiblnd_cksum(msg, msg->ibm_nob); - } -} - -int kiblnd_unpack_msg(struct kib_msg *msg, int nob) -{ - const int hdr_size = offsetof(struct kib_msg, ibm_u); - __u32 msg_cksum; - __u16 version; - int msg_nob; - int flip; - - /* 6 bytes are enough to have received magic + version */ - if (nob < 6) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - if (msg->ibm_magic == IBLND_MSG_MAGIC) { - flip = 0; - } else if (msg->ibm_magic == __swab32(IBLND_MSG_MAGIC)) { - flip = 1; - } else { - CERROR("Bad magic: %08x\n", msg->ibm_magic); - return -EPROTO; - } - - version = flip ? __swab16(msg->ibm_version) : msg->ibm_version; - if (version != IBLND_MSG_VERSION && - version != IBLND_MSG_VERSION_1) { - CERROR("Bad version: %x\n", version); - return -EPROTO; - } - - if (nob < hdr_size) { - CERROR("Short message: %d\n", nob); - return -EPROTO; - } - - msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob; - if (msg_nob > nob) { - CERROR("Short message: got %d, wanted %d\n", nob, msg_nob); - return -EPROTO; - } - - /* - * checksum must be computed with ibm_cksum zero and BEFORE anything - * gets flipped - */ - msg_cksum = flip ? 
__swab32(msg->ibm_cksum) : msg->ibm_cksum; - msg->ibm_cksum = 0; - if (msg_cksum && - msg_cksum != kiblnd_cksum(msg, msg_nob)) { - CERROR("Bad checksum\n"); - return -EPROTO; - } - - msg->ibm_cksum = msg_cksum; - - if (flip) { - /* leave magic unflipped as a clue to peer endianness */ - msg->ibm_version = version; - BUILD_BUG_ON(sizeof(msg->ibm_type) != 1); - BUILD_BUG_ON(sizeof(msg->ibm_credits) != 1); - msg->ibm_nob = msg_nob; - __swab64s(&msg->ibm_srcnid); - __swab64s(&msg->ibm_srcstamp); - __swab64s(&msg->ibm_dstnid); - __swab64s(&msg->ibm_dststamp); - } - - if (msg->ibm_srcnid == LNET_NID_ANY) { - CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); - return -EPROTO; - } - - if (msg_nob < kiblnd_msgtype2size(msg->ibm_type)) { - CERROR("Short %s: %d(%d)\n", kiblnd_msgtype2str(msg->ibm_type), - msg_nob, kiblnd_msgtype2size(msg->ibm_type)); - return -EPROTO; - } - - switch (msg->ibm_type) { - default: - CERROR("Unknown message type %x\n", msg->ibm_type); - return -EPROTO; - - case IBLND_MSG_NOOP: - case IBLND_MSG_IMMEDIATE: - case IBLND_MSG_PUT_REQ: - break; - - case IBLND_MSG_PUT_ACK: - case IBLND_MSG_GET_REQ: - if (kiblnd_unpack_rd(msg, flip)) - return -EPROTO; - break; - - case IBLND_MSG_PUT_NAK: - case IBLND_MSG_PUT_DONE: - case IBLND_MSG_GET_DONE: - if (flip) - __swab32s(&msg->ibm_u.completion.ibcm_status); - break; - - case IBLND_MSG_CONNREQ: - case IBLND_MSG_CONNACK: - if (flip) { - __swab16s(&msg->ibm_u.connparams.ibcp_queue_depth); - __swab16s(&msg->ibm_u.connparams.ibcp_max_frags); - __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size); - } - break; - } - return 0; -} - -int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp, - lnet_nid_t nid) -{ - struct kib_peer *peer; - struct kib_net *net = ni->ni_data; - int cpt = lnet_cpt_of_nid(nid); - unsigned long flags; - - LASSERT(net); - LASSERT(nid != LNET_NID_ANY); - - peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt); - if (!peer) { - CERROR("Cannot allocate peer\n"); - return -ENOMEM; - } - - peer->ibp_ni = ni; - peer->ibp_nid = nid; - peer->ibp_error = 0; - peer->ibp_last_alive = 0; - peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni); - peer->ibp_queue_depth = ni->ni_peertxcredits; - atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */ - - INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */ - INIT_LIST_HEAD(&peer->ibp_conns); - INIT_LIST_HEAD(&peer->ibp_tx_queue); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(!net->ibn_shutdown); - - /* npeers only grows with the global lock held */ - atomic_inc(&net->ibn_npeers); - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - *peerp = peer; - return 0; -} - -void kiblnd_destroy_peer(struct kib_peer *peer) -{ - struct kib_net *net = peer->ibp_ni->ni_data; - - LASSERT(net); - LASSERT(!atomic_read(&peer->ibp_refcount)); - LASSERT(!kiblnd_peer_active(peer)); - LASSERT(kiblnd_peer_idle(peer)); - LASSERT(list_empty(&peer->ibp_tx_queue)); - - kfree(peer); - - /* - * NB a peer's connections keep a reference on their peer until - * they are destroyed, so we can be assured that _all_ state to do - * with this peer has been cleaned up when its refcount drops to - * zero. 
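The lookup below relies on that lifetime rule; a caller-side sketch (not from this file, and assuming the kiblnd_peer_addref() refcount helper from o2iblnd.h; nid, peer and flags are the caller's locals):

	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
	peer = kiblnd_find_peer_locked(nid);
	if (peer)
		/* take a private ref before the table's ref can vanish */
		kiblnd_peer_addref(peer);
	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);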
- */ - atomic_dec(&net->ibn_npeers); -} - -struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid) -{ - /* - * the caller is responsible for accounting the additional reference - * that this creates - */ - struct list_head *peer_list = kiblnd_nid2peerlist(nid); - struct list_head *tmp; - struct kib_peer *peer; - - list_for_each(tmp, peer_list) { - peer = list_entry(tmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_nid != nid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> %s (%d) version: %x\n", - peer, libcfs_nid2str(nid), - atomic_read(&peer->ibp_refcount), - peer->ibp_version); - return peer; - } - return NULL; -} - -void kiblnd_unlink_peer_locked(struct kib_peer *peer) -{ - LASSERT(list_empty(&peer->ibp_conns)); - - LASSERT(kiblnd_peer_active(peer)); - list_del_init(&peer->ibp_list); - /* lose peerlist's ref */ - kiblnd_peer_decref(peer); -} - -static int kiblnd_get_peer_info(struct lnet_ni *ni, int index, - lnet_nid_t *nidp, int *count) -{ - struct kib_peer *peer; - struct list_head *ptmp; - int i; - unsigned long flags; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { - list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - if (index-- > 0) - continue; - - *nidp = peer->ibp_nid; - *count = atomic_read(&peer->ibp_refcount); - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, - flags); - return 0; - } - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - return -ENOENT; -} - -static void kiblnd_del_peer_locked(struct kib_peer *peer) -{ - struct list_head *ctmp; - struct list_head *cnxt; - struct kib_conn *conn; - - if (list_empty(&peer->ibp_conns)) { - kiblnd_unlink_peer_locked(peer); - } else { - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - kiblnd_close_conn_locked(conn, 0); - } - /* NB closing peer's last conn unlinked it. */ - } - /* - * NB peer now unlinked; might even be freed if the peer table had the - * last ref on it. 
- */ -} - -static int kiblnd_del_peer(struct lnet_ni *ni, lnet_nid_t nid) -{ - LIST_HEAD(zombies); - struct list_head *ptmp; - struct list_head *pnxt; - struct kib_peer *peer; - int lo; - int hi; - int i; - unsigned long flags; - int rc = -ENOENT; - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) { - lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - } else { - lo = 0; - hi = kiblnd_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - if (!(nid == LNET_NID_ANY || peer->ibp_nid == nid)) - continue; - - if (!list_empty(&peer->ibp_tx_queue)) { - LASSERT(list_empty(&peer->ibp_conns)); - - list_splice_init(&peer->ibp_tx_queue, - &zombies); - } - - kiblnd_del_peer_locked(peer); - rc = 0; /* matched something */ - } - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_txlist_done(ni, &zombies, -EIO); - - return rc; -} - -static struct kib_conn *kiblnd_get_conn_by_idx(struct lnet_ni *ni, int index) -{ - struct kib_peer *peer; - struct list_head *ptmp; - struct kib_conn *conn; - struct list_head *ctmp; - int i; - unsigned long flags; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) { - list_for_each(ptmp, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - list_for_each(ctmp, &peer->ibp_conns) { - if (index-- > 0) - continue; - - conn = list_entry(ctmp, struct kib_conn, - ibc_list); - kiblnd_conn_addref(conn); - read_unlock_irqrestore( - &kiblnd_data.kib_global_lock, - flags); - return conn; - } - } - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - return NULL; -} - -int kiblnd_translate_mtu(int value) -{ - switch (value) { - default: - return -1; - case 0: - return 0; - case 256: - return IB_MTU_256; - case 512: - return IB_MTU_512; - case 1024: - return IB_MTU_1024; - case 2048: - return IB_MTU_2048; - case 4096: - return IB_MTU_4096; - } -} - -static void kiblnd_setup_mtu_locked(struct rdma_cm_id *cmid) -{ - int mtu; - - /* XXX There is no path record for iWARP, set by netdev->change_mtu? */ - if (!cmid->route.path_rec) - return; - - mtu = kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu); - LASSERT(mtu >= 0); - if (mtu) - cmid->route.path_rec->mtu = mtu; -} - -static int kiblnd_get_completion_vector(struct kib_conn *conn, int cpt) -{ - cpumask_var_t *mask; - int vectors; - int off; - int i; - lnet_nid_t nid = conn->ibc_peer->ibp_nid; - - vectors = conn->ibc_cmid->device->num_comp_vectors; - if (vectors <= 1) - return 0; - - mask = cfs_cpt_cpumask(lnet_cpt_table(), cpt); - if (!mask) - return 0; - - /* hash NID to CPU id in this partition... */ - off = do_div(nid, cpumask_weight(*mask)); - for_each_cpu(i, *mask) { - if (!off--) - return i % vectors; - } - - LBUG(); - return 1; -} - -struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, struct rdma_cm_id *cmid, - int state, int version) -{ - /* - * CAVEAT EMPTOR: - * If the new conn is created successfully it takes over the caller's - * ref on 'peer'. It also "owns" 'cmid' and destroys it when it itself - * is destroyed. On failure, the caller's ref on 'peer' remains and - * she must dispose of 'cmid'. 
(Actually I'd block forever if I tried - * to destroy 'cmid' here since I'm called from the CM which still has - * its ref on 'cmid'). - */ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_net *net = peer->ibp_ni->ni_data; - struct kib_dev *dev; - struct ib_qp_init_attr *init_qp_attr; - struct kib_sched_info *sched; - struct ib_cq_init_attr cq_attr = {}; - struct kib_conn *conn; - struct ib_cq *cq; - unsigned long flags; - int cpt; - int rc; - int i; - - LASSERT(net); - LASSERT(!in_interrupt()); - - dev = net->ibn_dev; - - cpt = lnet_cpt_of_nid(peer->ibp_nid); - sched = kiblnd_data.kib_scheds[cpt]; - - LASSERT(sched->ibs_nthreads > 0); - - init_qp_attr = kzalloc_cpt(sizeof(*init_qp_attr), GFP_NOFS, cpt); - if (!init_qp_attr) { - CERROR("Can't allocate qp_attr for %s\n", - libcfs_nid2str(peer->ibp_nid)); - goto failed_0; - } - - conn = kzalloc_cpt(sizeof(*conn), GFP_NOFS, cpt); - if (!conn) { - CERROR("Can't allocate connection for %s\n", - libcfs_nid2str(peer->ibp_nid)); - goto failed_1; - } - - conn->ibc_state = IBLND_CONN_INIT; - conn->ibc_version = version; - conn->ibc_peer = peer; /* I take the caller's ref */ - cmid->context = conn; /* for future CM callbacks */ - conn->ibc_cmid = cmid; - conn->ibc_max_frags = peer->ibp_max_frags; - conn->ibc_queue_depth = peer->ibp_queue_depth; - - INIT_LIST_HEAD(&conn->ibc_early_rxs); - INIT_LIST_HEAD(&conn->ibc_tx_noops); - INIT_LIST_HEAD(&conn->ibc_tx_queue); - INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); - INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); - INIT_LIST_HEAD(&conn->ibc_active_txs); - spin_lock_init(&conn->ibc_lock); - - conn->ibc_connvars = kzalloc_cpt(sizeof(*conn->ibc_connvars), GFP_NOFS, cpt); - if (!conn->ibc_connvars) { - CERROR("Can't allocate in-progress connection state\n"); - goto failed_2; - } - - write_lock_irqsave(glock, flags); - if (dev->ibd_failover) { - write_unlock_irqrestore(glock, flags); - CERROR("%s: failover in progress\n", dev->ibd_ifname); - goto failed_2; - } - - if (dev->ibd_hdev->ibh_ibdev != cmid->device) { - /* wakeup failover thread and teardown connection */ - if (kiblnd_dev_can_failover(dev)) { - list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - wake_up(&kiblnd_data.kib_failover_waitq); - } - - write_unlock_irqrestore(glock, flags); - CERROR("cmid HCA(%s), kib_dev(%s) need failover\n", - cmid->device->name, dev->ibd_ifname); - goto failed_2; - } - - kiblnd_hdev_addref_locked(dev->ibd_hdev); - conn->ibc_hdev = dev->ibd_hdev; - - kiblnd_setup_mtu_locked(cmid); - - write_unlock_irqrestore(glock, flags); - - conn->ibc_rxs = kzalloc_cpt(IBLND_RX_MSGS(conn) * sizeof(struct kib_rx), - GFP_NOFS, cpt); - if (!conn->ibc_rxs) { - CERROR("Cannot allocate RX buffers\n"); - goto failed_2; - } - - rc = kiblnd_alloc_pages(&conn->ibc_rx_pages, cpt, - IBLND_RX_MSG_PAGES(conn)); - if (rc) - goto failed_2; - - kiblnd_map_rx_descs(conn); - - cq_attr.cqe = IBLND_CQ_ENTRIES(conn); - cq_attr.comp_vector = kiblnd_get_completion_vector(conn, cpt); - cq = ib_create_cq(cmid->device, - kiblnd_cq_completion, kiblnd_cq_event, conn, - &cq_attr); - if (IS_ERR(cq)) { - CERROR("Failed to create CQ with %d CQEs: %ld\n", - IBLND_CQ_ENTRIES(conn), PTR_ERR(cq)); - goto failed_2; - } - - conn->ibc_cq = cq; - - rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - if (rc) { - CERROR("Can't request completion notification: %d\n", rc); - goto failed_2; - } - - init_qp_attr->event_handler = kiblnd_qp_event; - init_qp_attr->qp_context = conn; - init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS(conn); - 
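/* The send/recv WR counts set here bound how many work requests may be outstanding on this QP at once; rdma_create_qp() below will fail if the device cannot support them. */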
init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS(conn); - init_qp_attr->cap.max_send_sge = 1; - init_qp_attr->cap.max_recv_sge = 1; - init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; - init_qp_attr->qp_type = IB_QPT_RC; - init_qp_attr->send_cq = cq; - init_qp_attr->recv_cq = cq; - - conn->ibc_sched = sched; - - rc = rdma_create_qp(cmid, conn->ibc_hdev->ibh_pd, init_qp_attr); - if (rc) { - CERROR("Can't create QP: %d, send_wr: %d, recv_wr: %d\n", - rc, init_qp_attr->cap.max_send_wr, - init_qp_attr->cap.max_recv_wr); - goto failed_2; - } - - kfree(init_qp_attr); - - /* 1 ref for caller and each rxmsg */ - atomic_set(&conn->ibc_refcount, 1 + IBLND_RX_MSGS(conn)); - conn->ibc_nrx = IBLND_RX_MSGS(conn); - - /* post receives */ - for (i = 0; i < IBLND_RX_MSGS(conn); i++) { - rc = kiblnd_post_rx(&conn->ibc_rxs[i], - IBLND_POSTRX_NO_CREDIT); - if (rc) { - CERROR("Can't post rxmsg: %d\n", rc); - - /* Make posted receives complete */ - kiblnd_abort_receives(conn); - - /* - * correct # of posted buffers - * NB locking needed now I'm racing with completion - */ - spin_lock_irqsave(&sched->ibs_lock, flags); - conn->ibc_nrx -= IBLND_RX_MSGS(conn) - i; - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - /* - * cmid will be destroyed by CM(ofed) after cm_callback - * returned, so we can't refer it anymore - * (by kiblnd_connd()->kiblnd_destroy_conn) - */ - rdma_destroy_qp(conn->ibc_cmid); - conn->ibc_cmid = NULL; - - /* Drop my own and unused rxbuffer refcounts */ - while (i++ <= IBLND_RX_MSGS(conn)) - kiblnd_conn_decref(conn); - - return NULL; - } - } - - /* Init successful! */ - LASSERT(state == IBLND_CONN_ACTIVE_CONNECT || - state == IBLND_CONN_PASSIVE_WAIT); - conn->ibc_state = state; - - /* 1 more conn */ - atomic_inc(&net->ibn_nconns); - return conn; - - failed_2: - kiblnd_destroy_conn(conn); - kfree(conn); - failed_1: - kfree(init_qp_attr); - failed_0: - return NULL; -} - -void kiblnd_destroy_conn(struct kib_conn *conn) -{ - struct rdma_cm_id *cmid = conn->ibc_cmid; - struct kib_peer *peer = conn->ibc_peer; - int rc; - - LASSERT(!in_interrupt()); - LASSERT(!atomic_read(&conn->ibc_refcount)); - LASSERT(list_empty(&conn->ibc_early_rxs)); - LASSERT(list_empty(&conn->ibc_tx_noops)); - LASSERT(list_empty(&conn->ibc_tx_queue)); - LASSERT(list_empty(&conn->ibc_tx_queue_rsrvd)); - LASSERT(list_empty(&conn->ibc_tx_queue_nocred)); - LASSERT(list_empty(&conn->ibc_active_txs)); - LASSERT(!conn->ibc_noops_posted); - LASSERT(!conn->ibc_nsends_posted); - - switch (conn->ibc_state) { - default: - /* conn must be completely disengaged from the network */ - LBUG(); - - case IBLND_CONN_DISCONNECTED: - /* connvars should have been freed already */ - LASSERT(!conn->ibc_connvars); - break; - - case IBLND_CONN_INIT: - break; - } - - /* conn->ibc_cmid might be destroyed by CM already */ - if (cmid && cmid->qp) - rdma_destroy_qp(cmid); - - if (conn->ibc_cq) { - rc = ib_destroy_cq(conn->ibc_cq); - if (rc) - CWARN("Error destroying CQ: %d\n", rc); - } - - if (conn->ibc_rx_pages) - kiblnd_unmap_rx_descs(conn); - - kfree(conn->ibc_rxs); - kfree(conn->ibc_connvars); - - if (conn->ibc_hdev) - kiblnd_hdev_decref(conn->ibc_hdev); - - /* See CAVEAT EMPTOR above in kiblnd_create_conn */ - if (conn->ibc_state != IBLND_CONN_INIT) { - struct kib_net *net = peer->ibp_ni->ni_data; - - kiblnd_peer_decref(peer); - rdma_destroy_id(cmid); - atomic_dec(&net->ibn_nconns); - } -} - -int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why) -{ - struct kib_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - 
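/* Like the other *_locked helpers here, this one expects the caller to hold kiblnd_data.kib_global_lock for writing; see kiblnd_close_matching_conns() below. */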
- list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - CDEBUG(D_NET, "Closing conn -> %s, version: %x, reason: %d\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_version, why); - - kiblnd_close_conn_locked(conn, why); - count++; - } - - return count; -} - -int kiblnd_close_stale_conns_locked(struct kib_peer *peer, - int version, __u64 incarnation) -{ - struct kib_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer->ibp_conns) { - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - if (conn->ibc_version == version && - conn->ibc_incarnation == incarnation) - continue; - - CDEBUG(D_NET, - "Closing stale conn -> %s version: %x, incarnation:%#llx(%x, %#llx)\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_version, conn->ibc_incarnation, - version, incarnation); - - kiblnd_close_conn_locked(conn, -ESTALE); - count++; - } - - return count; -} - -static int kiblnd_close_matching_conns(struct lnet_ni *ni, lnet_nid_t nid) -{ - struct kib_peer *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - unsigned long flags; - int count = 0; - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (nid != LNET_NID_ANY) { - lo = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - hi = kiblnd_nid2peerlist(nid) - kiblnd_data.kib_peers; - } else { - lo = 0; - hi = kiblnd_data.kib_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, &kiblnd_data.kib_peers[i]) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - LASSERT(!kiblnd_peer_idle(peer)); - - if (peer->ibp_ni != ni) - continue; - - if (!(nid == LNET_NID_ANY || nid == peer->ibp_nid)) - continue; - - count += kiblnd_close_peer_conns_locked(peer, 0); - } - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - /* wildcards always succeed */ - if (nid == LNET_NID_ANY) - return 0; - - return !count ? 
-ENOENT : 0; -} - -static int kiblnd_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - int rc = -EINVAL; - - switch (cmd) { - case IOC_LIBCFS_GET_PEER: { - lnet_nid_t nid = 0; - int count = 0; - - rc = kiblnd_get_peer_info(ni, data->ioc_count, - &nid, &count); - data->ioc_nid = nid; - data->ioc_count = count; - break; - } - - case IOC_LIBCFS_DEL_PEER: { - rc = kiblnd_del_peer(ni, data->ioc_nid); - break; - } - case IOC_LIBCFS_GET_CONN: { - struct kib_conn *conn; - - rc = 0; - conn = kiblnd_get_conn_by_idx(ni, data->ioc_count); - if (!conn) { - rc = -ENOENT; - break; - } - - LASSERT(conn->ibc_cmid); - data->ioc_nid = conn->ibc_peer->ibp_nid; - if (!conn->ibc_cmid->route.path_rec) - data->ioc_u32[0] = 0; /* iWarp has no path MTU */ - else - data->ioc_u32[0] = - ib_mtu_enum_to_int(conn->ibc_cmid->route.path_rec->mtu); - kiblnd_conn_decref(conn); - break; - } - case IOC_LIBCFS_CLOSE_CONNECTION: { - rc = kiblnd_close_matching_conns(ni, data->ioc_nid); - break; - } - - default: - break; - } - - return rc; -} - -static void kiblnd_query(struct lnet_ni *ni, lnet_nid_t nid, - unsigned long *when) -{ - unsigned long last_alive = 0; - unsigned long now = jiffies; - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer *peer; - unsigned long flags; - - read_lock_irqsave(glock, flags); - - peer = kiblnd_find_peer_locked(nid); - if (peer) - last_alive = peer->ibp_last_alive; - - read_unlock_irqrestore(glock, flags); - - if (last_alive) - *when = last_alive; - - /* - * peer is not persistent in hash, trigger peer creation - * and connection establishment with a NULL tx - */ - if (!peer) - kiblnd_launch_tx(ni, NULL, nid); - - CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago\n", - libcfs_nid2str(nid), peer, - last_alive ? 
(now - last_alive) / HZ : -1); -} - -static void kiblnd_free_pages(struct kib_pages *p) -{ - int npages = p->ibp_npages; - int i; - - for (i = 0; i < npages; i++) { - if (p->ibp_pages[i]) - __free_page(p->ibp_pages[i]); - } - - kfree(p); -} - -int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages) -{ - struct kib_pages *p; - int i; - - p = kzalloc_cpt(offsetof(struct kib_pages, ibp_pages[npages]), - GFP_NOFS, cpt); - if (!p) { - CERROR("Can't allocate descriptor for %d pages\n", npages); - return -ENOMEM; - } - - p->ibp_npages = npages; - - for (i = 0; i < npages; i++) { - p->ibp_pages[i] = alloc_pages_node( - cfs_cpt_spread_node(lnet_cpt_table(), cpt), - GFP_NOFS, 0); - if (!p->ibp_pages[i]) { - CERROR("Can't allocate page %d of %d\n", i, npages); - kiblnd_free_pages(p); - return -ENOMEM; - } - } - - *pp = p; - return 0; -} - -void kiblnd_unmap_rx_descs(struct kib_conn *conn) -{ - struct kib_rx *rx; - int i; - - LASSERT(conn->ibc_rxs); - LASSERT(conn->ibc_hdev); - - for (i = 0; i < IBLND_RX_MSGS(conn); i++) { - rx = &conn->ibc_rxs[i]; - - LASSERT(rx->rx_nob >= 0); /* not posted */ - - kiblnd_dma_unmap_single(conn->ibc_hdev->ibh_ibdev, - KIBLND_UNMAP_ADDR(rx, rx_msgunmap, - rx->rx_msgaddr), - IBLND_MSG_SIZE, DMA_FROM_DEVICE); - } - - kiblnd_free_pages(conn->ibc_rx_pages); - - conn->ibc_rx_pages = NULL; -} - -void kiblnd_map_rx_descs(struct kib_conn *conn) -{ - struct kib_rx *rx; - struct page *pg; - int pg_off; - int ipg; - int i; - - for (pg_off = ipg = i = 0; i < IBLND_RX_MSGS(conn); i++) { - pg = conn->ibc_rx_pages->ibp_pages[ipg]; - rx = &conn->ibc_rxs[i]; - - rx->rx_conn = conn; - rx->rx_msg = (struct kib_msg *)(((char *)page_address(pg)) + pg_off); - - rx->rx_msgaddr = kiblnd_dma_map_single(conn->ibc_hdev->ibh_ibdev, - rx->rx_msg, - IBLND_MSG_SIZE, - DMA_FROM_DEVICE); - LASSERT(!kiblnd_dma_mapping_error(conn->ibc_hdev->ibh_ibdev, - rx->rx_msgaddr)); - KIBLND_UNMAP_ADDR_SET(rx, rx_msgunmap, rx->rx_msgaddr); - - CDEBUG(D_NET, "rx %d: %p %#llx(%#llx)\n", - i, rx->rx_msg, rx->rx_msgaddr, - (__u64)(page_to_phys(pg) + pg_off)); - - pg_off += IBLND_MSG_SIZE; - LASSERT(pg_off <= PAGE_SIZE); - - if (pg_off == PAGE_SIZE) { - pg_off = 0; - ipg++; - LASSERT(ipg <= IBLND_RX_MSG_PAGES(conn)); - } - } -} - -static void kiblnd_unmap_tx_pool(struct kib_tx_pool *tpo) -{ - struct kib_hca_dev *hdev = tpo->tpo_hdev; - struct kib_tx *tx; - int i; - - LASSERT(!tpo->tpo_pool.po_allocated); - - if (!hdev) - return; - - for (i = 0; i < tpo->tpo_pool.po_size; i++) { - tx = &tpo->tpo_tx_descs[i]; - kiblnd_dma_unmap_single(hdev->ibh_ibdev, - KIBLND_UNMAP_ADDR(tx, tx_msgunmap, - tx->tx_msgaddr), - IBLND_MSG_SIZE, DMA_TO_DEVICE); - } - - kiblnd_hdev_decref(hdev); - tpo->tpo_hdev = NULL; -} - -static struct kib_hca_dev *kiblnd_current_hdev(struct kib_dev *dev) -{ - struct kib_hca_dev *hdev; - unsigned long flags; - int i = 0; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - while (dev->ibd_failover) { - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - if (!(i++ % 50)) - CDEBUG(D_NET, "%s: Wait for failover\n", - dev->ibd_ifname); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ / 100); - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - } - - kiblnd_hdev_addref_locked(dev->ibd_hdev); - hdev = dev->ibd_hdev; - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - return hdev; -} - -static void kiblnd_map_tx_pool(struct kib_tx_pool *tpo) -{ - struct kib_pages *txpgs = tpo->tpo_tx_pages; - struct kib_pool *pool = &tpo->tpo_pool; - struct 
kib_net *net = pool->po_owner->ps_net; - struct kib_dev *dev; - struct page *page; - struct kib_tx *tx; - int page_offset; - int ipage; - int i; - - LASSERT(net); - - dev = net->ibn_dev; - - /* pre-mapped messages are not bigger than 1 page */ - BUILD_BUG_ON(IBLND_MSG_SIZE > PAGE_SIZE); - - /* No fancy arithmetic when we do the buffer calculations */ - BUILD_BUG_ON(PAGE_SIZE % IBLND_MSG_SIZE); - - tpo->tpo_hdev = kiblnd_current_hdev(dev); - - for (ipage = page_offset = i = 0; i < pool->po_size; i++) { - page = txpgs->ibp_pages[ipage]; - tx = &tpo->tpo_tx_descs[i]; - - tx->tx_msg = (struct kib_msg *)(((char *)page_address(page)) + - page_offset); - - tx->tx_msgaddr = kiblnd_dma_map_single( - tpo->tpo_hdev->ibh_ibdev, tx->tx_msg, - IBLND_MSG_SIZE, DMA_TO_DEVICE); - LASSERT(!kiblnd_dma_mapping_error(tpo->tpo_hdev->ibh_ibdev, - tx->tx_msgaddr)); - KIBLND_UNMAP_ADDR_SET(tx, tx_msgunmap, tx->tx_msgaddr); - - list_add(&tx->tx_list, &pool->po_free_list); - - page_offset += IBLND_MSG_SIZE; - LASSERT(page_offset <= PAGE_SIZE); - - if (page_offset == PAGE_SIZE) { - page_offset = 0; - ipage++; - LASSERT(ipage <= txpgs->ibp_npages); - } - } -} - -static void kiblnd_destroy_fmr_pool(struct kib_fmr_pool *fpo) -{ - LASSERT(!fpo->fpo_map_count); - - if (fpo->fpo_is_fmr) { - if (fpo->fmr.fpo_fmr_pool) - ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool); - } else { - struct kib_fast_reg_descriptor *frd, *tmp; - int i = 0; - - list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, - frd_list) { - list_del(&frd->frd_list); - ib_dereg_mr(frd->frd_mr); - kfree(frd); - i++; - } - if (i < fpo->fast_reg.fpo_pool_size) - CERROR("FastReg pool still has %d regions registered\n", - fpo->fast_reg.fpo_pool_size - i); - } - - if (fpo->fpo_hdev) - kiblnd_hdev_decref(fpo->fpo_hdev); - - kfree(fpo); -} - -static void kiblnd_destroy_fmr_pool_list(struct list_head *head) -{ - struct kib_fmr_pool *fpo, *tmp; - - list_for_each_entry_safe(fpo, tmp, head, fpo_list) { - list_del(&fpo->fpo_list); - kiblnd_destroy_fmr_pool(fpo); - } -} - -static int -kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables, - int ncpts) -{ - int size = tunables->lnd_fmr_pool_size / ncpts; - - return max(IBLND_FMR_POOL, size); -} - -static int -kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables, - int ncpts) -{ - int size = tunables->lnd_fmr_flush_trigger / ncpts; - - return max(IBLND_FMR_POOL_FLUSH, size); -} - -static int kiblnd_alloc_fmr_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo) -{ - struct ib_fmr_pool_param param = { - .max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE, - .page_shift = PAGE_SHIFT, - .access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE), - .pool_size = fps->fps_pool_size, - .dirty_watermark = fps->fps_flush_trigger, - .flush_function = NULL, - .flush_arg = NULL, - .cache = !!fps->fps_cache }; - int rc = 0; - - fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, - ¶m); - if (IS_ERR(fpo->fmr.fpo_fmr_pool)) { - rc = PTR_ERR(fpo->fmr.fpo_fmr_pool); - if (rc != -ENOSYS) - CERROR("Failed to create FMR pool: %d\n", rc); - else - CERROR("FMRs are not supported\n"); - } - - return rc; -} - -static int kiblnd_alloc_freg_pool(struct kib_fmr_poolset *fps, struct kib_fmr_pool *fpo) -{ - struct kib_fast_reg_descriptor *frd, *tmp; - int i, rc; - - INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list); - fpo->fast_reg.fpo_pool_size = 0; - for (i = 0; i < fps->fps_pool_size; i++) { - frd = kzalloc_cpt(sizeof(*frd), GFP_NOFS, fps->fps_cpt); - if (!frd) { - CERROR("Failed to 
allocate a new fast_reg descriptor\n"); - rc = -ENOMEM; - goto out; - } - - frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd, - IB_MR_TYPE_MEM_REG, - LNET_MAX_PAYLOAD / PAGE_SIZE); - if (IS_ERR(frd->frd_mr)) { - rc = PTR_ERR(frd->frd_mr); - CERROR("Failed to allocate ib_alloc_mr: %d\n", rc); - frd->frd_mr = NULL; - goto out_middle; - } - - frd->frd_valid = true; - - list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); - fpo->fast_reg.fpo_pool_size++; - } - - return 0; - -out_middle: - if (frd->frd_mr) - ib_dereg_mr(frd->frd_mr); - kfree(frd); - -out: - list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list, - frd_list) { - list_del(&frd->frd_list); - ib_dereg_mr(frd->frd_mr); - kfree(frd); - } - - return rc; -} - -static int kiblnd_create_fmr_pool(struct kib_fmr_poolset *fps, - struct kib_fmr_pool **pp_fpo) -{ - struct kib_dev *dev = fps->fps_net->ibn_dev; - struct ib_device_attr *dev_attr; - struct kib_fmr_pool *fpo; - int rc; - - fpo = kzalloc_cpt(sizeof(*fpo), GFP_NOFS, fps->fps_cpt); - if (!fpo) - return -ENOMEM; - - fpo->fpo_hdev = kiblnd_current_hdev(dev); - dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs; - - /* Check for FMR or FastReg support */ - fpo->fpo_is_fmr = 0; - if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr && - fpo->fpo_hdev->ibh_ibdev->dealloc_fmr && - fpo->fpo_hdev->ibh_ibdev->map_phys_fmr && - fpo->fpo_hdev->ibh_ibdev->unmap_fmr) { - LCONSOLE_INFO("Using FMR for registration\n"); - fpo->fpo_is_fmr = 1; - } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - LCONSOLE_INFO("Using FastReg for registration\n"); - } else { - rc = -ENOSYS; - LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n"); - goto out_fpo; - } - - if (fpo->fpo_is_fmr) - rc = kiblnd_alloc_fmr_pool(fps, fpo); - else - rc = kiblnd_alloc_freg_pool(fps, fpo); - if (rc) - goto out_fpo; - - fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - fpo->fpo_owner = fps; - *pp_fpo = fpo; - - return 0; - -out_fpo: - kiblnd_hdev_decref(fpo->fpo_hdev); - kfree(fpo); - return rc; -} - -static void kiblnd_fail_fmr_poolset(struct kib_fmr_poolset *fps, - struct list_head *zombies) -{ - if (!fps->fps_net) /* initialized? */ - return; - - spin_lock(&fps->fps_lock); - - while (!list_empty(&fps->fps_pool_list)) { - struct kib_fmr_pool *fpo = list_entry(fps->fps_pool_list.next, - struct kib_fmr_pool, fpo_list); - fpo->fpo_failed = 1; - list_del(&fpo->fpo_list); - if (!fpo->fpo_map_count) - list_add(&fpo->fpo_list, zombies); - else - list_add(&fpo->fpo_list, &fps->fps_failed_pool_list); - } - - spin_unlock(&fps->fps_lock); -} - -static void kiblnd_fini_fmr_poolset(struct kib_fmr_poolset *fps) -{ - if (fps->fps_net) { /* initialized? 
*/ - kiblnd_destroy_fmr_pool_list(&fps->fps_failed_pool_list); - kiblnd_destroy_fmr_pool_list(&fps->fps_pool_list); - } -} - -static int -kiblnd_init_fmr_poolset(struct kib_fmr_poolset *fps, int cpt, int ncpts, - struct kib_net *net, - struct lnet_ioctl_config_o2iblnd_tunables *tunables) -{ - struct kib_fmr_pool *fpo; - int rc; - - memset(fps, 0, sizeof(*fps)); - - fps->fps_net = net; - fps->fps_cpt = cpt; - - fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts); - fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts); - fps->fps_cache = tunables->lnd_fmr_cache; - - spin_lock_init(&fps->fps_lock); - INIT_LIST_HEAD(&fps->fps_pool_list); - INIT_LIST_HEAD(&fps->fps_failed_pool_list); - - rc = kiblnd_create_fmr_pool(fps, &fpo); - if (!rc) - list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); - - return rc; -} - -static int kiblnd_fmr_pool_is_idle(struct kib_fmr_pool *fpo, unsigned long now) -{ - if (fpo->fpo_map_count) /* still in use */ - return 0; - if (fpo->fpo_failed) - return 1; - return time_after_eq(now, fpo->fpo_deadline); -} - -static int -kiblnd_map_tx_pages(struct kib_tx *tx, struct kib_rdma_desc *rd) -{ - __u64 *pages = tx->tx_pages; - struct kib_hca_dev *hdev; - int npages; - int size; - int i; - - hdev = tx->tx_pool->tpo_hdev; - - for (i = 0, npages = 0; i < rd->rd_nfrags; i++) { - for (size = 0; size < rd->rd_frags[i].rf_nob; - size += hdev->ibh_page_size) { - pages[npages++] = (rd->rd_frags[i].rf_addr & - hdev->ibh_page_mask) + size; - } - } - - return npages; -} - -void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status) -{ - LIST_HEAD(zombies); - struct kib_fmr_pool *fpo = fmr->fmr_pool; - struct kib_fmr_poolset *fps; - unsigned long now = jiffies; - struct kib_fmr_pool *tmp; - int rc; - - if (!fpo) - return; - - fps = fpo->fpo_owner; - if (fpo->fpo_is_fmr) { - if (fmr->fmr_pfmr) { - rc = ib_fmr_pool_unmap(fmr->fmr_pfmr); - LASSERT(!rc); - fmr->fmr_pfmr = NULL; - } - - if (status) { - rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool); - LASSERT(!rc); - } - } else { - struct kib_fast_reg_descriptor *frd = fmr->fmr_frd; - - if (frd) { - frd->frd_valid = false; - spin_lock(&fps->fps_lock); - list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list); - spin_unlock(&fps->fps_lock); - fmr->fmr_frd = NULL; - } - } - fmr->fmr_pool = NULL; - - spin_lock(&fps->fps_lock); - fpo->fpo_map_count--; /* decref the pool */ - - list_for_each_entry_safe(fpo, tmp, &fps->fps_pool_list, fpo_list) { - /* the first pool is persistent */ - if (fps->fps_pool_list.next == &fpo->fpo_list) - continue; - - if (kiblnd_fmr_pool_is_idle(fpo, now)) { - list_move(&fpo->fpo_list, &zombies); - fps->fps_version++; - } - } - spin_unlock(&fps->fps_lock); - - if (!list_empty(&zombies)) - kiblnd_destroy_fmr_pool_list(&zombies); -} - -int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, - struct kib_rdma_desc *rd, __u32 nob, __u64 iov, - struct kib_fmr *fmr) -{ - __u64 *pages = tx->tx_pages; - bool is_rx = (rd != tx->tx_rd); - bool tx_pages_mapped = false; - struct kib_fmr_pool *fpo; - int npages = 0; - __u64 version; - int rc; - - again: - spin_lock(&fps->fps_lock); - version = fps->fps_version; - list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) { - fpo->fpo_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - fpo->fpo_map_count++; - - if (fpo->fpo_is_fmr) { - struct ib_pool_fmr *pfmr; - - spin_unlock(&fps->fps_lock); - - if (!tx_pages_mapped) { - npages = kiblnd_map_tx_pages(tx, rd); - tx_pages_mapped = 1; - } - - pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool, - pages, 
npages, iov); - if (likely(!IS_ERR(pfmr))) { - fmr->fmr_key = is_rx ? pfmr->fmr->rkey : - pfmr->fmr->lkey; - fmr->fmr_frd = NULL; - fmr->fmr_pfmr = pfmr; - fmr->fmr_pool = fpo; - return 0; - } - rc = PTR_ERR(pfmr); - } else { - if (!list_empty(&fpo->fast_reg.fpo_pool_list)) { - struct kib_fast_reg_descriptor *frd; - struct ib_reg_wr *wr; - struct ib_mr *mr; - int n; - - frd = list_first_entry(&fpo->fast_reg.fpo_pool_list, - struct kib_fast_reg_descriptor, - frd_list); - list_del(&frd->frd_list); - spin_unlock(&fps->fps_lock); - - mr = frd->frd_mr; - - if (!frd->frd_valid) { - __u32 key = is_rx ? mr->rkey : mr->lkey; - struct ib_send_wr *inv_wr; - - inv_wr = &frd->frd_inv_wr; - memset(inv_wr, 0, sizeof(*inv_wr)); - inv_wr->opcode = IB_WR_LOCAL_INV; - inv_wr->wr_id = IBLND_WID_MR; - inv_wr->ex.invalidate_rkey = key; - - /* Bump the key */ - key = ib_inc_rkey(key); - ib_update_fast_reg_key(mr, key); - } - - n = ib_map_mr_sg(mr, tx->tx_frags, - tx->tx_nfrags, NULL, PAGE_SIZE); - if (unlikely(n != tx->tx_nfrags)) { - CERROR("Failed to map mr %d/%d elements\n", - n, tx->tx_nfrags); - return n < 0 ? n : -EINVAL; - } - - mr->iova = iov; - - /* Prepare FastReg WR */ - wr = &frd->frd_fastreg_wr; - memset(wr, 0, sizeof(*wr)); - wr->wr.opcode = IB_WR_REG_MR; - wr->wr.wr_id = IBLND_WID_MR; - wr->wr.num_sge = 0; - wr->wr.send_flags = 0; - wr->mr = mr; - wr->key = is_rx ? mr->rkey : mr->lkey; - wr->access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - - fmr->fmr_key = is_rx ? mr->rkey : mr->lkey; - fmr->fmr_frd = frd; - fmr->fmr_pfmr = NULL; - fmr->fmr_pool = fpo; - return 0; - } - spin_unlock(&fps->fps_lock); - rc = -EAGAIN; - } - - spin_lock(&fps->fps_lock); - fpo->fpo_map_count--; - if (rc != -EAGAIN) { - spin_unlock(&fps->fps_lock); - return rc; - } - - /* EAGAIN and ... 
*/ - if (version != fps->fps_version) { - spin_unlock(&fps->fps_lock); - goto again; - } - } - - if (fps->fps_increasing) { - spin_unlock(&fps->fps_lock); - CDEBUG(D_NET, "Another thread is allocating new FMR pool, waiting for her to complete\n"); - schedule(); - goto again; - } - - if (time_before(jiffies, fps->fps_next_retry)) { - /* someone failed recently */ - spin_unlock(&fps->fps_lock); - return -EAGAIN; - } - - fps->fps_increasing = 1; - spin_unlock(&fps->fps_lock); - - CDEBUG(D_NET, "Allocate new FMR pool\n"); - rc = kiblnd_create_fmr_pool(fps, &fpo); - spin_lock(&fps->fps_lock); - fps->fps_increasing = 0; - if (!rc) { - fps->fps_version++; - list_add_tail(&fpo->fpo_list, &fps->fps_pool_list); - } else { - fps->fps_next_retry = jiffies + IBLND_POOL_RETRY * HZ; - } - spin_unlock(&fps->fps_lock); - - goto again; -} - -static void kiblnd_fini_pool(struct kib_pool *pool) -{ - LASSERT(list_empty(&pool->po_free_list)); - LASSERT(!pool->po_allocated); - - CDEBUG(D_NET, "Finalize %s pool\n", pool->po_owner->ps_name); -} - -static void kiblnd_init_pool(struct kib_poolset *ps, struct kib_pool *pool, int size) -{ - CDEBUG(D_NET, "Initialize %s pool\n", ps->ps_name); - - memset(pool, 0, sizeof(*pool)); - INIT_LIST_HEAD(&pool->po_free_list); - pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - pool->po_owner = ps; - pool->po_size = size; -} - -static void kiblnd_destroy_pool_list(struct list_head *head) -{ - struct kib_pool *pool; - - while (!list_empty(head)) { - pool = list_entry(head->next, struct kib_pool, po_list); - list_del(&pool->po_list); - - LASSERT(pool->po_owner); - pool->po_owner->ps_pool_destroy(pool); - } -} - -static void kiblnd_fail_poolset(struct kib_poolset *ps, struct list_head *zombies) -{ - if (!ps->ps_net) /* initialized? */ - return; - - spin_lock(&ps->ps_lock); - while (!list_empty(&ps->ps_pool_list)) { - struct kib_pool *po = list_entry(ps->ps_pool_list.next, - struct kib_pool, po_list); - po->po_failed = 1; - list_del(&po->po_list); - if (!po->po_allocated) - list_add(&po->po_list, zombies); - else - list_add(&po->po_list, &ps->ps_failed_pool_list); - } - spin_unlock(&ps->ps_lock); -} - -static void kiblnd_fini_poolset(struct kib_poolset *ps) -{ - if (ps->ps_net) { /* initialized? 
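The retry loop above lets exactly one thread grow the pool set while everyone else either rescans (the version moved), yields to the running grower, or honors the failure back-off. A standalone model of that decision; the enum and demo_* names are invented for illustration.

#include <stdio.h>

enum grow_action { RESCAN, WAIT_FOR_GROWER, RETRY_LATER, BECOME_GROWER };

struct demo_poolset {
        unsigned long version;          /* bumped when a pool is added */
        int increasing;                 /* a grower is already running */
        unsigned long next_retry;       /* earliest time to try again */
};

/* caller is assumed to hold the pool-set lock, as fps_lock is held above */
static enum grow_action demo_grow_decision(struct demo_poolset *ps,
                                           unsigned long seen_version,
                                           unsigned long now)
{
        if (seen_version != ps->version)
                return RESCAN;          /* a new pool appeared: look again */
        if (ps->increasing)
                return WAIT_FOR_GROWER; /* schedule() and come back */
        if (now < ps->next_retry)
                return RETRY_LATER;     /* recent failure: -EAGAIN */
        ps->increasing = 1;             /* we become the single grower */
        return BECOME_GROWER;
}

int main(void)
{
        struct demo_poolset ps = { .version = 7 };

        printf("%d\n", demo_grow_decision(&ps, 6, 100)); /* 0: RESCAN */
        printf("%d\n", demo_grow_decision(&ps, 7, 100)); /* 3: BECOME_GROWER */
        printf("%d\n", demo_grow_decision(&ps, 7, 100)); /* 1: WAIT_FOR_GROWER */
        return 0;
}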
*/ - kiblnd_destroy_pool_list(&ps->ps_failed_pool_list); - kiblnd_destroy_pool_list(&ps->ps_pool_list); - } -} - -static int kiblnd_init_poolset(struct kib_poolset *ps, int cpt, - struct kib_net *net, char *name, int size, - kib_ps_pool_create_t po_create, - kib_ps_pool_destroy_t po_destroy, - kib_ps_node_init_t nd_init, - kib_ps_node_fini_t nd_fini) -{ - struct kib_pool *pool; - int rc; - - memset(ps, 0, sizeof(*ps)); - - ps->ps_cpt = cpt; - ps->ps_net = net; - ps->ps_pool_create = po_create; - ps->ps_pool_destroy = po_destroy; - ps->ps_node_init = nd_init; - ps->ps_node_fini = nd_fini; - ps->ps_pool_size = size; - if (strlcpy(ps->ps_name, name, sizeof(ps->ps_name)) - >= sizeof(ps->ps_name)) - return -E2BIG; - spin_lock_init(&ps->ps_lock); - INIT_LIST_HEAD(&ps->ps_pool_list); - INIT_LIST_HEAD(&ps->ps_failed_pool_list); - - rc = ps->ps_pool_create(ps, size, &pool); - if (!rc) - list_add(&pool->po_list, &ps->ps_pool_list); - else - CERROR("Failed to create the first pool for %s\n", ps->ps_name); - - return rc; -} - -static int kiblnd_pool_is_idle(struct kib_pool *pool, unsigned long now) -{ - if (pool->po_allocated) /* still in use */ - return 0; - if (pool->po_failed) - return 1; - return time_after_eq(now, pool->po_deadline); -} - -void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node) -{ - LIST_HEAD(zombies); - struct kib_poolset *ps = pool->po_owner; - struct kib_pool *tmp; - unsigned long now = jiffies; - - spin_lock(&ps->ps_lock); - - if (ps->ps_node_fini) - ps->ps_node_fini(pool, node); - - LASSERT(pool->po_allocated > 0); - list_add(node, &pool->po_free_list); - pool->po_allocated--; - - list_for_each_entry_safe(pool, tmp, &ps->ps_pool_list, po_list) { - /* the first pool is persistent */ - if (ps->ps_pool_list.next == &pool->po_list) - continue; - - if (kiblnd_pool_is_idle(pool, now)) - list_move(&pool->po_list, &zombies); - } - spin_unlock(&ps->ps_lock); - - if (!list_empty(&zombies)) - kiblnd_destroy_pool_list(&zombies); -} - -struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps) -{ - struct list_head *node; - struct kib_pool *pool; - unsigned int interval = 1; - unsigned long time_before; - unsigned int trips = 0; - int rc; - - again: - spin_lock(&ps->ps_lock); - list_for_each_entry(pool, &ps->ps_pool_list, po_list) { - if (list_empty(&pool->po_free_list)) - continue; - - pool->po_allocated++; - pool->po_deadline = jiffies + IBLND_POOL_DEADLINE * HZ; - node = pool->po_free_list.next; - list_del(node); - - if (ps->ps_node_init) { - /* still hold the lock */ - ps->ps_node_init(pool, node); - } - spin_unlock(&ps->ps_lock); - return node; - } - - /* no available tx pool and ... */ - if (ps->ps_increasing) { - /* another thread is allocating a new pool */ - spin_unlock(&ps->ps_lock); - trips++; - CDEBUG(D_NET, "Another thread is allocating new %s pool, waiting %d HZs for her to complete. 
trips = %d\n", - ps->ps_name, interval, trips); - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(interval); - if (interval < HZ) - interval *= 2; - - goto again; - } - - if (time_before(jiffies, ps->ps_next_retry)) { - /* someone failed recently */ - spin_unlock(&ps->ps_lock); - return NULL; - } - - ps->ps_increasing = 1; - spin_unlock(&ps->ps_lock); - - CDEBUG(D_NET, "%s pool exhausted, allocate new pool\n", ps->ps_name); - time_before = jiffies; - rc = ps->ps_pool_create(ps, ps->ps_pool_size, &pool); - CDEBUG(D_NET, "ps_pool_create took %lu HZ to complete", - jiffies - time_before); - - spin_lock(&ps->ps_lock); - ps->ps_increasing = 0; - if (!rc) { - list_add_tail(&pool->po_list, &ps->ps_pool_list); - } else { - ps->ps_next_retry = jiffies + IBLND_POOL_RETRY * HZ; - CERROR("Can't allocate new %s pool because out of memory\n", - ps->ps_name); - } - spin_unlock(&ps->ps_lock); - - goto again; -} - -static void kiblnd_destroy_tx_pool(struct kib_pool *pool) -{ - struct kib_tx_pool *tpo = container_of(pool, struct kib_tx_pool, tpo_pool); - int i; - - LASSERT(!pool->po_allocated); - - if (tpo->tpo_tx_pages) { - kiblnd_unmap_tx_pool(tpo); - kiblnd_free_pages(tpo->tpo_tx_pages); - } - - if (!tpo->tpo_tx_descs) - goto out; - - for (i = 0; i < pool->po_size; i++) { - struct kib_tx *tx = &tpo->tpo_tx_descs[i]; - - list_del(&tx->tx_list); - kfree(tx->tx_pages); - kfree(tx->tx_frags); - kfree(tx->tx_wrq); - kfree(tx->tx_sge); - kfree(tx->tx_rd); - } - - kfree(tpo->tpo_tx_descs); -out: - kiblnd_fini_pool(pool); - kfree(tpo); -} - -static int kiblnd_tx_pool_size(int ncpts) -{ - int ntx = *kiblnd_tunables.kib_ntx / ncpts; - - return max(IBLND_TX_POOL, ntx); -} - -static int kiblnd_create_tx_pool(struct kib_poolset *ps, int size, - struct kib_pool **pp_po) -{ - int i; - int npg; - struct kib_pool *pool; - struct kib_tx_pool *tpo; - - tpo = kzalloc_cpt(sizeof(*tpo), GFP_NOFS, ps->ps_cpt); - if (!tpo) { - CERROR("Failed to allocate TX pool\n"); - return -ENOMEM; - } - - pool = &tpo->tpo_pool; - kiblnd_init_pool(ps, pool, size); - tpo->tpo_tx_descs = NULL; - tpo->tpo_tx_pages = NULL; - - npg = DIV_ROUND_UP(size * IBLND_MSG_SIZE, PAGE_SIZE); - if (kiblnd_alloc_pages(&tpo->tpo_tx_pages, ps->ps_cpt, npg)) { - CERROR("Can't allocate tx pages: %d\n", npg); - kfree(tpo); - return -ENOMEM; - } - - tpo->tpo_tx_descs = kzalloc_cpt(size * sizeof(struct kib_tx), - GFP_NOFS, ps->ps_cpt); - if (!tpo->tpo_tx_descs) { - CERROR("Can't allocate %d tx descriptors\n", size); - ps->ps_pool_destroy(pool); - return -ENOMEM; - } - - memset(tpo->tpo_tx_descs, 0, size * sizeof(struct kib_tx)); - - for (i = 0; i < size; i++) { - struct kib_tx *tx = &tpo->tpo_tx_descs[i]; - - tx->tx_pool = tpo; - if (ps->ps_net->ibn_fmr_ps) { - tx->tx_pages = kzalloc_cpt(LNET_MAX_IOV * sizeof(*tx->tx_pages), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_pages) - break; - } - - tx->tx_frags = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_frags), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_frags) - break; - - sg_init_table(tx->tx_frags, IBLND_MAX_RDMA_FRAGS + 1); - - tx->tx_wrq = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_wrq), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_wrq) - break; - - tx->tx_sge = kzalloc_cpt((1 + IBLND_MAX_RDMA_FRAGS) * - sizeof(*tx->tx_sge), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_sge) - break; - - tx->tx_rd = kzalloc_cpt(offsetof(struct kib_rdma_desc, - rd_frags[IBLND_MAX_RDMA_FRAGS]), - GFP_NOFS, ps->ps_cpt); - if (!tx->tx_rd) - break; - } - - if (i == size) { - kiblnd_map_tx_pool(tpo); - *pp_po = pool; - 
return 0; - } - - ps->ps_pool_destroy(pool); - return -ENOMEM; -} - -static void kiblnd_tx_init(struct kib_pool *pool, struct list_head *node) -{ - struct kib_tx_poolset *tps = container_of(pool->po_owner, - struct kib_tx_poolset, - tps_poolset); - struct kib_tx *tx = list_entry(node, struct kib_tx, tx_list); - - tx->tx_cookie = tps->tps_next_tx_cookie++; -} - -static void kiblnd_net_fini_pools(struct kib_net *net) -{ - int i; - - cfs_cpt_for_each(i, lnet_cpt_table()) { - struct kib_tx_poolset *tps; - struct kib_fmr_poolset *fps; - - if (net->ibn_tx_ps) { - tps = net->ibn_tx_ps[i]; - kiblnd_fini_poolset(&tps->tps_poolset); - } - - if (net->ibn_fmr_ps) { - fps = net->ibn_fmr_ps[i]; - kiblnd_fini_fmr_poolset(fps); - } - } - - if (net->ibn_tx_ps) { - cfs_percpt_free(net->ibn_tx_ps); - net->ibn_tx_ps = NULL; - } - - if (net->ibn_fmr_ps) { - cfs_percpt_free(net->ibn_fmr_ps); - net->ibn_fmr_ps = NULL; - } -} - -static int kiblnd_net_init_pools(struct kib_net *net, struct lnet_ni *ni, - __u32 *cpts, int ncpts) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int cpt; - int rc; - int i; - - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - - if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) { - CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n", - tunables->lnd_fmr_pool_size, - *kiblnd_tunables.kib_ntx / 4); - rc = -EINVAL; - goto failed; - } - - /* - * TX pool must be created later than FMR, see LU-2268 - * for details - */ - LASSERT(!net->ibn_tx_ps); - - /* - * premapping can fail if ibd_nmr > 1, so we always create - * FMR pool and map-on-demand if premapping failed - * - * cfs_precpt_alloc is creating an array of struct kib_fmr_poolset - * The number of struct kib_fmr_poolsets create is equal to the - * number of CPTs that exist, i.e net->ibn_fmr_ps[cpt]. - */ - net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct kib_fmr_poolset)); - if (!net->ibn_fmr_ps) { - CERROR("Failed to allocate FMR pool array\n"); - rc = -ENOMEM; - goto failed; - } - - for (i = 0; i < ncpts; i++) { - cpt = !cpts ? i : cpts[i]; - rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts, - net, tunables); - if (rc) { - CERROR("Can't initialize FMR pool for CPT %d: %d\n", - cpt, rc); - goto failed; - } - } - - if (i > 0) - LASSERT(i == ncpts); - - /* - * cfs_precpt_alloc is creating an array of struct kib_tx_poolset - * The number of struct kib_tx_poolsets create is equal to the - * number of CPTs that exist, i.e net->ibn_tx_ps[cpt]. - */ - net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct kib_tx_poolset)); - if (!net->ibn_tx_ps) { - CERROR("Failed to allocate tx pool array\n"); - rc = -ENOMEM; - goto failed; - } - - for (i = 0; i < ncpts; i++) { - cpt = !cpts ? 
i : cpts[i]; - rc = kiblnd_init_poolset(&net->ibn_tx_ps[cpt]->tps_poolset, - cpt, net, "TX", - kiblnd_tx_pool_size(ncpts), - kiblnd_create_tx_pool, - kiblnd_destroy_tx_pool, - kiblnd_tx_init, NULL); - if (rc) { - CERROR("Can't initialize TX pool for CPT %d: %d\n", - cpt, rc); - goto failed; - } - } - - return 0; - failed: - kiblnd_net_fini_pools(net); - LASSERT(rc); - return rc; -} - -static int kiblnd_hdev_get_attr(struct kib_hca_dev *hdev) -{ - /* - * It's safe to assume a HCA can handle a page size - * matching that of the native system - */ - hdev->ibh_page_shift = PAGE_SHIFT; - hdev->ibh_page_size = 1 << PAGE_SHIFT; - hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); - - hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; - if (hdev->ibh_mr_size == ~0ULL) { - hdev->ibh_mr_shift = 64; - return 0; - } - - CERROR("Invalid mr size: %#llx\n", hdev->ibh_mr_size); - return -EINVAL; -} - -void kiblnd_hdev_destroy(struct kib_hca_dev *hdev) -{ - if (hdev->ibh_pd) - ib_dealloc_pd(hdev->ibh_pd); - - if (hdev->ibh_cmid) - rdma_destroy_id(hdev->ibh_cmid); - - kfree(hdev); -} - -/* DUMMY */ -static int kiblnd_dummy_callback(struct rdma_cm_id *cmid, - struct rdma_cm_event *event) -{ - return 0; -} - -static int kiblnd_dev_need_failover(struct kib_dev *dev) -{ - struct rdma_cm_id *cmid; - struct sockaddr_in srcaddr; - struct sockaddr_in dstaddr; - int rc; - - if (!dev->ibd_hdev || /* initializing */ - !dev->ibd_hdev->ibh_cmid || /* listener is dead */ - *kiblnd_tunables.kib_dev_failover > 1) /* debugging */ - return 1; - - /* - * XXX: it's UGLY, but I don't have better way to find - * ib-bonding HCA failover because: - * - * a. no reliable CM event for HCA failover... - * b. no OFED API to get ib_device for current net_device... - * - * We have only two choices at this point: - * - * a. rdma_bind_addr(), it will conflict with listener cmid - * b. rdma_resolve_addr() to zero addr - */ - cmid = kiblnd_rdma_create_id(kiblnd_dummy_callback, dev, RDMA_PS_TCP, - IB_QPT_RC); - if (IS_ERR(cmid)) { - rc = PTR_ERR(cmid); - CERROR("Failed to create cmid for failover: %d\n", rc); - return rc; - } - - memset(&srcaddr, 0, sizeof(srcaddr)); - srcaddr.sin_family = AF_INET; - srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); - - memset(&dstaddr, 0, sizeof(dstaddr)); - dstaddr.sin_family = AF_INET; - rc = rdma_resolve_addr(cmid, (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, 1); - if (rc || !cmid->device) { - CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", - dev->ibd_ifname, &dev->ibd_ifip, - cmid->device, rc); - rdma_destroy_id(cmid); - return rc; - } - - rc = dev->ibd_hdev->ibh_ibdev != cmid->device; /* true for failover */ - rdma_destroy_id(cmid); - - return rc; -} - -int kiblnd_dev_failover(struct kib_dev *dev) -{ - LIST_HEAD(zombie_tpo); - LIST_HEAD(zombie_ppo); - LIST_HEAD(zombie_fpo); - struct rdma_cm_id *cmid = NULL; - struct kib_hca_dev *hdev = NULL; - struct ib_pd *pd; - struct kib_net *net; - struct sockaddr_in addr; - unsigned long flags; - int rc = 0; - int i; - - LASSERT(*kiblnd_tunables.kib_dev_failover > 1 || - dev->ibd_can_failover || !dev->ibd_hdev); - - rc = kiblnd_dev_need_failover(dev); - if (rc <= 0) - goto out; - - if (dev->ibd_hdev && - dev->ibd_hdev->ibh_cmid) { - /* - * XXX it's not good to close old listener at here, - * because we can fail to create new listener. - * But we have to close it now, otherwise rdma_bind_addr - * will return EADDRINUSE... How crap! 
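kiblnd_hdev_get_attr() above derives the HCA page mask as ~(page_size - 1), valid because the page size is a power of two; kiblnd_map_tx_pages() then uses that mask to round fragment addresses down to page bases. A small standalone demonstration (the example address is arbitrary):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const unsigned int page_shift = 12;             /* 4 KiB pages */
        const uint64_t page_size = 1ULL << page_shift;
        const uint64_t page_mask = ~(page_size - 1);    /* ...fffff000 */
        const uint64_t addr = 0x12345abcdULL;

        printf("page base: %#llx\n",
               (unsigned long long)(addr & page_mask));        /* 0x12345a000 */
        printf("offset:    %#llx\n",
               (unsigned long long)(addr & (page_size - 1)));  /* 0xbcd */
        return 0;
}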
- */ - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - cmid = dev->ibd_hdev->ibh_cmid; - /* - * make next schedule of kiblnd_dev_need_failover() - * return 1 for me - */ - dev->ibd_hdev->ibh_cmid = NULL; - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - rdma_destroy_id(cmid); - } - - cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, dev, RDMA_PS_TCP, - IB_QPT_RC); - if (IS_ERR(cmid)) { - rc = PTR_ERR(cmid); - CERROR("Failed to create cmid for failover: %d\n", rc); - goto out; - } - - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_addr.s_addr = htonl(dev->ibd_ifip); - addr.sin_port = htons(*kiblnd_tunables.kib_service); - - /* Bind to failover device or port */ - rc = rdma_bind_addr(cmid, (struct sockaddr *)&addr); - if (rc || !cmid->device) { - CERROR("Failed to bind %s:%pI4h to device(%p): %d\n", - dev->ibd_ifname, &dev->ibd_ifip, - cmid->device, rc); - rdma_destroy_id(cmid); - goto out; - } - - hdev = kzalloc(sizeof(*hdev), GFP_NOFS); - if (!hdev) { - CERROR("Failed to allocate kib_hca_dev\n"); - rdma_destroy_id(cmid); - rc = -ENOMEM; - goto out; - } - - atomic_set(&hdev->ibh_ref, 1); - hdev->ibh_dev = dev; - hdev->ibh_cmid = cmid; - hdev->ibh_ibdev = cmid->device; - - pd = ib_alloc_pd(cmid->device, 0); - if (IS_ERR(pd)) { - rc = PTR_ERR(pd); - CERROR("Can't allocate PD: %d\n", rc); - goto out; - } - - hdev->ibh_pd = pd; - - rc = rdma_listen(cmid, 0); - if (rc) { - CERROR("Can't start new listener: %d\n", rc); - goto out; - } - - rc = kiblnd_hdev_get_attr(hdev); - if (rc) { - CERROR("Can't get device attributes: %d\n", rc); - goto out; - } - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - swap(dev->ibd_hdev, hdev); /* take over the refcount */ - - list_for_each_entry(net, &dev->ibd_nets, ibn_list) { - cfs_cpt_for_each(i, lnet_cpt_table()) { - kiblnd_fail_poolset(&net->ibn_tx_ps[i]->tps_poolset, - &zombie_tpo); - - if (net->ibn_fmr_ps) - kiblnd_fail_fmr_poolset(net->ibn_fmr_ps[i], - &zombie_fpo); - } - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - out: - if (!list_empty(&zombie_tpo)) - kiblnd_destroy_pool_list(&zombie_tpo); - if (!list_empty(&zombie_ppo)) - kiblnd_destroy_pool_list(&zombie_ppo); - if (!list_empty(&zombie_fpo)) - kiblnd_destroy_fmr_pool_list(&zombie_fpo); - if (hdev) - kiblnd_hdev_decref(hdev); - - if (rc) - dev->ibd_failed_failover++; - else - dev->ibd_failed_failover = 0; - - return rc; -} - -void kiblnd_destroy_dev(struct kib_dev *dev) -{ - LASSERT(!dev->ibd_nnets); - LASSERT(list_empty(&dev->ibd_nets)); - - list_del(&dev->ibd_fail_list); - list_del(&dev->ibd_list); - - if (dev->ibd_hdev) - kiblnd_hdev_decref(dev->ibd_hdev); - - kfree(dev); -} - -static struct kib_dev *kiblnd_create_dev(char *ifname) -{ - struct net_device *netdev; - struct kib_dev *dev; - __u32 netmask; - __u32 ip; - int up; - int rc; - - rc = lnet_ipif_query(ifname, &up, &ip, &netmask); - if (rc) { - CERROR("Can't query IPoIB interface %s: %d\n", - ifname, rc); - return NULL; - } - - if (!up) { - CERROR("Can't query IPoIB interface %s: it's down\n", ifname); - return NULL; - } - - dev = kzalloc(sizeof(*dev), GFP_NOFS); - if (!dev) - return NULL; - - netdev = dev_get_by_name(&init_net, ifname); - if (!netdev) { - dev->ibd_can_failover = 0; - } else { - dev->ibd_can_failover = !!(netdev->flags & IFF_MASTER); - dev_put(netdev); - } - - INIT_LIST_HEAD(&dev->ibd_nets); - INIT_LIST_HEAD(&dev->ibd_list); /* not yet in kib_devs */ - INIT_LIST_HEAD(&dev->ibd_fail_list); - dev->ibd_ifip = ip; - 
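kiblnd_create_dev() above marks an interface failover-capable when it is a bonding master, i.e. IFF_MASTER is set on the net device. The same flag can be read from userspace with the SIOCGIFFLAGS ioctl, as sketched below; "ib0" is only an example interface name.

#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "ib0", IFNAMSIZ - 1);

        if (ioctl(fd, SIOCGIFFLAGS, &ifr) == 0)
                printf("ib0 %s a bonding master\n",
                       (ifr.ifr_flags & IFF_MASTER) ? "is" : "is not");
        else
                perror("SIOCGIFFLAGS");

        close(fd);
        return 0;
}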
strcpy(&dev->ibd_ifname[0], ifname); - - /* initialize the device */ - rc = kiblnd_dev_failover(dev); - if (rc) { - CERROR("Can't initialize device: %d\n", rc); - kfree(dev); - return NULL; - } - - list_add_tail(&dev->ibd_list, &kiblnd_data.kib_devs); - return dev; -} - -static void kiblnd_base_shutdown(void) -{ - struct kib_sched_info *sched; - int i; - - LASSERT(list_empty(&kiblnd_data.kib_devs)); - - switch (kiblnd_data.kib_init) { - default: - LBUG(); - - case IBLND_INIT_ALL: - case IBLND_INIT_DATA: - LASSERT(kiblnd_data.kib_peers); - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) - LASSERT(list_empty(&kiblnd_data.kib_peers[i])); - LASSERT(list_empty(&kiblnd_data.kib_connd_zombies)); - LASSERT(list_empty(&kiblnd_data.kib_connd_conns)); - LASSERT(list_empty(&kiblnd_data.kib_reconn_list)); - LASSERT(list_empty(&kiblnd_data.kib_reconn_wait)); - - /* flag threads to terminate; wake and wait for them to die */ - kiblnd_data.kib_shutdown = 1; - - /* - * NB: we really want to stop scheduler threads net by net - * instead of the whole module, this should be improved - * with dynamic configuration LNet - */ - cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) - wake_up_all(&sched->ibs_waitq); - - wake_up_all(&kiblnd_data.kib_connd_waitq); - wake_up_all(&kiblnd_data.kib_failover_waitq); - - i = 2; - while (atomic_read(&kiblnd_data.kib_nthreads)) { - i++; - /* power of 2 ? */ - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, - "Waiting for %d threads to terminate\n", - atomic_read(&kiblnd_data.kib_nthreads)); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - - /* fall through */ - - case IBLND_INIT_NOTHING: - break; - } - - kvfree(kiblnd_data.kib_peers); - - if (kiblnd_data.kib_scheds) - cfs_percpt_free(kiblnd_data.kib_scheds); - - kiblnd_data.kib_init = IBLND_INIT_NOTHING; - module_put(THIS_MODULE); -} - -static void kiblnd_shutdown(struct lnet_ni *ni) -{ - struct kib_net *net = ni->ni_data; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - int i; - unsigned long flags; - - LASSERT(kiblnd_data.kib_init == IBLND_INIT_ALL); - - if (!net) - goto out; - - write_lock_irqsave(g_lock, flags); - net->ibn_shutdown = 1; - write_unlock_irqrestore(g_lock, flags); - - switch (net->ibn_init) { - default: - LBUG(); - - case IBLND_INIT_ALL: - /* nuke all existing peers within this net */ - kiblnd_del_peer(ni, LNET_NID_ANY); - - /* Wait for all peer state to clean up */ - i = 2; - while (atomic_read(&net->ibn_npeers)) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* 2**n? 
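The shutdown loops above (and kiblnd_shutdown() just below) throttle their "waiting" messages with (i & (-i)) == i, which isolates the lowest set bit of i and is therefore true exactly when i is a power of two. A tiny standalone check:

#include <stdio.h>

static int is_pow2(int i)
{
        return (i & (-i)) == i;         /* lowest set bit == whole value? */
}

int main(void)
{
        int i;

        for (i = 1; i <= 16; i++)
                printf("%2d -> %s\n", i, is_pow2(i) ? "warn" : "debug");
        return 0;
}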
*/ - "%s: waiting for %d peers to disconnect\n", - libcfs_nid2str(ni->ni_nid), - atomic_read(&net->ibn_npeers)); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - - kiblnd_net_fini_pools(net); - - write_lock_irqsave(g_lock, flags); - LASSERT(net->ibn_dev->ibd_nnets > 0); - net->ibn_dev->ibd_nnets--; - list_del(&net->ibn_list); - write_unlock_irqrestore(g_lock, flags); - - /* fall through */ - - case IBLND_INIT_NOTHING: - LASSERT(!atomic_read(&net->ibn_nconns)); - - if (net->ibn_dev && !net->ibn_dev->ibd_nnets) - kiblnd_destroy_dev(net->ibn_dev); - - break; - } - - net->ibn_init = IBLND_INIT_NOTHING; - ni->ni_data = NULL; - - kfree(net); - -out: - if (list_empty(&kiblnd_data.kib_devs)) - kiblnd_base_shutdown(); -} - -static int kiblnd_base_startup(void) -{ - struct kib_sched_info *sched; - int rc; - int i; - - LASSERT(kiblnd_data.kib_init == IBLND_INIT_NOTHING); - - try_module_get(THIS_MODULE); - /* zero pointers, flags etc */ - memset(&kiblnd_data, 0, sizeof(kiblnd_data)); - - rwlock_init(&kiblnd_data.kib_global_lock); - - INIT_LIST_HEAD(&kiblnd_data.kib_devs); - INIT_LIST_HEAD(&kiblnd_data.kib_failed_devs); - - kiblnd_data.kib_peer_hash_size = IBLND_PEER_HASH_SIZE; - kiblnd_data.kib_peers = kvmalloc_array(kiblnd_data.kib_peer_hash_size, - sizeof(struct list_head), - GFP_KERNEL); - if (!kiblnd_data.kib_peers) - goto failed; - for (i = 0; i < kiblnd_data.kib_peer_hash_size; i++) - INIT_LIST_HEAD(&kiblnd_data.kib_peers[i]); - - spin_lock_init(&kiblnd_data.kib_connd_lock); - INIT_LIST_HEAD(&kiblnd_data.kib_connd_conns); - INIT_LIST_HEAD(&kiblnd_data.kib_connd_zombies); - INIT_LIST_HEAD(&kiblnd_data.kib_reconn_list); - INIT_LIST_HEAD(&kiblnd_data.kib_reconn_wait); - - init_waitqueue_head(&kiblnd_data.kib_connd_waitq); - init_waitqueue_head(&kiblnd_data.kib_failover_waitq); - - kiblnd_data.kib_scheds = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*sched)); - if (!kiblnd_data.kib_scheds) - goto failed; - - cfs_percpt_for_each(sched, i, kiblnd_data.kib_scheds) { - int nthrs; - - spin_lock_init(&sched->ibs_lock); - INIT_LIST_HEAD(&sched->ibs_conns); - init_waitqueue_head(&sched->ibs_waitq); - - nthrs = cfs_cpt_weight(lnet_cpt_table(), i); - if (*kiblnd_tunables.kib_nscheds > 0) { - nthrs = min(nthrs, *kiblnd_tunables.kib_nscheds); - } else { - /* - * max to half of CPUs, another half is reserved for - * upper layer modules - */ - nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); - } - - sched->ibs_nthreads_max = nthrs; - sched->ibs_cpt = i; - } - - kiblnd_data.kib_error_qpa.qp_state = IB_QPS_ERR; - - /* lists/ptrs/locks initialised */ - kiblnd_data.kib_init = IBLND_INIT_DATA; - /*****************************************************/ - - rc = kiblnd_thread_start(kiblnd_connd, NULL, "kiblnd_connd"); - if (rc) { - CERROR("Can't spawn o2iblnd connd: %d\n", rc); - goto failed; - } - - if (*kiblnd_tunables.kib_dev_failover) - rc = kiblnd_thread_start(kiblnd_failover_thread, NULL, - "kiblnd_failover"); - - if (rc) { - CERROR("Can't spawn o2iblnd failover thread: %d\n", rc); - goto failed; - } - - /* flag everything initialised */ - kiblnd_data.kib_init = IBLND_INIT_ALL; - /*****************************************************/ - - return 0; - - failed: - kiblnd_base_shutdown(); - return -ENETDOWN; -} - -static int kiblnd_start_schedulers(struct kib_sched_info *sched) -{ - int rc = 0; - int nthrs; - int i; - - if (!sched->ibs_nthreads) { - if (*kiblnd_tunables.kib_nscheds > 0) { - nthrs = sched->ibs_nthreads_max; - } else { - nthrs = cfs_cpt_weight(lnet_cpt_table(), - 
sched->ibs_cpt); - nthrs = min(max(IBLND_N_SCHED, nthrs >> 1), nthrs); - nthrs = min(IBLND_N_SCHED_HIGH, nthrs); - } - } else { - LASSERT(sched->ibs_nthreads <= sched->ibs_nthreads_max); - /* increase one thread if there is new interface */ - nthrs = sched->ibs_nthreads < sched->ibs_nthreads_max; - } - - for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - - id = KIB_THREAD_ID(sched->ibs_cpt, sched->ibs_nthreads + i); - snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld", - KIB_THREAD_CPT(id), KIB_THREAD_TID(id)); - rc = kiblnd_thread_start(kiblnd_scheduler, (void *)id, name); - if (!rc) - continue; - - CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - sched->ibs_cpt, sched->ibs_nthreads + i, rc); - break; - } - - sched->ibs_nthreads += i; - return rc; -} - -static int kiblnd_dev_start_threads(struct kib_dev *dev, int newdev, __u32 *cpts, - int ncpts) -{ - int cpt; - int rc; - int i; - - for (i = 0; i < ncpts; i++) { - struct kib_sched_info *sched; - - cpt = !cpts ? i : cpts[i]; - sched = kiblnd_data.kib_scheds[cpt]; - - if (!newdev && sched->ibs_nthreads > 0) - continue; - - rc = kiblnd_start_schedulers(kiblnd_data.kib_scheds[cpt]); - if (rc) { - CERROR("Failed to start scheduler threads for %s\n", - dev->ibd_ifname); - return rc; - } - } - return 0; -} - -static struct kib_dev *kiblnd_dev_search(char *ifname) -{ - struct kib_dev *alias = NULL; - struct kib_dev *dev; - char *colon; - char *colon2; - - colon = strchr(ifname, ':'); - list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) { - if (!strcmp(&dev->ibd_ifname[0], ifname)) - return dev; - - if (alias) - continue; - - colon2 = strchr(dev->ibd_ifname, ':'); - if (colon) - *colon = 0; - if (colon2) - *colon2 = 0; - - if (!strcmp(&dev->ibd_ifname[0], ifname)) - alias = dev; - - if (colon) - *colon = ':'; - if (colon2) - *colon2 = ':'; - } - return alias; -} - -static int kiblnd_startup(struct lnet_ni *ni) -{ - char *ifname; - struct kib_dev *ibdev = NULL; - struct kib_net *net; - struct timespec64 tv; - unsigned long flags; - int rc; - int newdev; - - LASSERT(ni->ni_lnd == &the_o2iblnd); - - if (kiblnd_data.kib_init == IBLND_INIT_NOTHING) { - rc = kiblnd_base_startup(); - if (rc) - return rc; - } - - net = kzalloc(sizeof(*net), GFP_NOFS); - ni->ni_data = net; - if (!net) - goto net_failed; - - ktime_get_real_ts64(&tv); - net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC + - tv.tv_nsec / NSEC_PER_USEC; - - rc = kiblnd_tunables_setup(ni); - if (rc) - goto net_failed; - - if (ni->ni_interfaces[0]) { - /* Use the IPoIB interface specified in 'networks=' */ - - BUILD_BUG_ON(LNET_MAX_INTERFACES <= 1); - if (ni->ni_interfaces[1]) { - CERROR("Multiple interfaces not supported\n"); - goto failed; - } - - ifname = ni->ni_interfaces[0]; - } else { - ifname = *kiblnd_tunables.kib_default_ipif; - } - - if (strlen(ifname) >= sizeof(ibdev->ibd_ifname)) { - CERROR("IPoIB interface name too long: %s\n", ifname); - goto failed; - } - - ibdev = kiblnd_dev_search(ifname); - - newdev = !ibdev; - /* hmm...create kib_dev even for alias */ - if (!ibdev || strcmp(&ibdev->ibd_ifname[0], ifname)) - ibdev = kiblnd_create_dev(ifname); - - if (!ibdev) - goto failed; - - net->ibn_dev = ibdev; - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ibdev->ibd_ifip); - - rc = kiblnd_dev_start_threads(ibdev, newdev, - ni->ni_cpts, ni->ni_ncpts); - if (rc) - goto failed; - - rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts); - if (rc) { - CERROR("Failed to initialize NI pools: %d\n", rc); - goto failed; - } - - 
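A standalone model of the scheduler-sizing heuristic in kiblnd_start_schedulers() above: start from the number of CPUs in the CPT, reserve roughly half for upper-layer modules, but run at least IBLND_N_SCHED threads and never more than IBLND_N_SCHED_HIGH (the DEMO_* values mirror the defines in o2iblnd.h below).

#include <stdio.h>

#define DEMO_N_SCHED      2
#define DEMO_N_SCHED_HIGH 4

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int sched_threads(int cpus_in_cpt)
{
        int nthrs = cpus_in_cpt;

        nthrs = MIN(MAX(DEMO_N_SCHED, nthrs >> 1), nthrs);
        return MIN(DEMO_N_SCHED_HIGH, nthrs);
}

int main(void)
{
        int cpus;

        for (cpus = 1; cpus <= 16; cpus *= 2)
                printf("%2d CPUs -> %d scheduler thread(s)\n",
                       cpus, sched_threads(cpus));      /* 1, 2, 2, 4, 4 */
        return 0;
}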
write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - ibdev->ibd_nnets++; - list_add_tail(&net->ibn_list, &ibdev->ibd_nets); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - net->ibn_init = IBLND_INIT_ALL; - - return 0; - -failed: - if (!net->ibn_dev && ibdev) - kiblnd_destroy_dev(ibdev); - -net_failed: - kiblnd_shutdown(ni); - - CDEBUG(D_NET, "%s failed\n", __func__); - return -ENETDOWN; -} - -static struct lnet_lnd the_o2iblnd = { - .lnd_type = O2IBLND, - .lnd_startup = kiblnd_startup, - .lnd_shutdown = kiblnd_shutdown, - .lnd_ctl = kiblnd_ctl, - .lnd_query = kiblnd_query, - .lnd_send = kiblnd_send, - .lnd_recv = kiblnd_recv, -}; - -static void __exit ko2iblnd_exit(void) -{ - lnet_unregister_lnd(&the_o2iblnd); -} - -static int __init ko2iblnd_init(void) -{ - int rc; - - BUILD_BUG_ON(sizeof(struct kib_msg) > IBLND_MSG_SIZE); - BUILD_BUG_ON(offsetof(struct kib_msg, - ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) - > IBLND_MSG_SIZE); - BUILD_BUG_ON(offsetof(struct kib_msg, - ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS]) - > IBLND_MSG_SIZE); - - kiblnd_tunables_init(); - - rc = libcfs_setup(); - if (rc) - return rc; - - lnet_register_lnd(&the_o2iblnd); - - return 0; -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("OpenIB gen2 LNet Network Driver"); -MODULE_VERSION("2.7.0"); -MODULE_LICENSE("GPL"); - -module_init(ko2iblnd_init); -module_exit(ko2iblnd_exit); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h deleted file mode 100644 index 217503f125bc8a362f44c5fe76dd90d9cde0c01e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h +++ /dev/null @@ -1,1048 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/klnds/o2iblnd/o2iblnd.h - * - * Author: Eric Barton - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LND - -#include - -#define IBLND_PEER_HASH_SIZE 101 /* # peer lists */ -/* # scheduler loops before reschedule */ -#define IBLND_RESCHED 100 - -#define IBLND_N_SCHED 2 -#define IBLND_N_SCHED_HIGH 4 - -struct kib_tunables { - int *kib_dev_failover; /* HCA failover */ - unsigned int *kib_service; /* IB service number */ - int *kib_min_reconnect_interval; /* first failed connection retry... */ - int *kib_max_reconnect_interval; /* exponentially increasing to this */ - int *kib_cksum; /* checksum struct kib_msg? */ - int *kib_timeout; /* comms timeout (seconds) */ - int *kib_keepalive; /* keepalive timeout (seconds) */ - int *kib_ntx; /* # tx descs */ - char **kib_default_ipif; /* default IPoIB interface */ - int *kib_retry_count; - int *kib_rnr_retry_count; - int *kib_ib_mtu; /* IB MTU */ - int *kib_require_priv_port; /* accept only privileged ports */ - int *kib_use_priv_port; /* use privileged port for active connect */ - int *kib_nscheds; /* # threads on each CPT */ -}; - -extern struct kib_tunables kiblnd_tunables; - -#define IBLND_MSG_QUEUE_SIZE_V1 8 /* V1 only : # messages/RDMAs in-flight */ -#define IBLND_CREDIT_HIGHWATER_V1 7 /* V1 only : when eagerly to return credits */ - -#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */ -#define IBLND_CREDITS_MAX ((typeof(((struct kib_msg *)0)->ibm_credits)) - 1) /* Max # of peer credits */ - -/* when eagerly to return credits */ -#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \ - IBLND_CREDIT_HIGHWATER_V1 : \ - t->lnd_peercredits_hiw) - -#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(current->nsproxy->net_ns, \ - cb, dev, \ - ps, qpt) - -/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */ -#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1) -#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0) - -#define IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) /* frag size on wire is in 4K units */ -#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */ -#define IBLND_MAX_RDMA_FRAGS (LNET_MAX_PAYLOAD >> 12)/* max # of fragments supported in 4K size */ - -/************************/ -/* derived constants... 
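Fragment counts on the wire are kept in fixed 4 KiB units, so IBLND_FRAG_SHIFT (PAGE_SHIFT - 12) above converts between host pages and wire fragments regardless of the local page size. A sketch of that arithmetic; the 1 MiB payload is an assumed stand-in for LNET_MAX_PAYLOAD, not its actual value.

#include <stdio.h>

int main(void)
{
        const unsigned long max_payload = 1UL << 20;    /* demo: 1 MiB */
        unsigned int page_shift;

        /* the wire frag count depends only on the fixed 4K unit */
        printf("max wire frags: %lu\n", max_payload >> 12);     /* 256 */

        for (page_shift = 12; page_shift <= 16; page_shift += 2) {
                unsigned int frag_shift = page_shift - 12;

                printf("PAGE_SHIFT=%u: 1 host page = %u wire frag(s)\n",
                       page_shift, 1U << frag_shift);
        }
        return 0;
}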
*/ -/* Pools (shared by connections on each CPT) */ -/* These pools can grow at runtime, so don't need give a very large value */ -#define IBLND_TX_POOL 256 -#define IBLND_FMR_POOL 256 -#define IBLND_FMR_POOL_FLUSH 192 - -#define IBLND_RX_MSGS(c) \ - ((c->ibc_queue_depth) * 2 + IBLND_OOB_MSGS(c->ibc_version)) -#define IBLND_RX_MSG_BYTES(c) (IBLND_RX_MSGS(c) * IBLND_MSG_SIZE) -#define IBLND_RX_MSG_PAGES(c) \ - ((IBLND_RX_MSG_BYTES(c) + PAGE_SIZE - 1) / PAGE_SIZE) - -/* WRs and CQEs (per connection) */ -#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c) -#define IBLND_SEND_WRS(c) \ - (((c->ibc_max_frags + 1) << IBLND_FRAG_SHIFT) * \ - kiblnd_concurrent_sends(c->ibc_version, c->ibc_peer->ibp_ni)) -#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c)) - -struct kib_hca_dev; - -/* o2iblnd can run over aliased interface */ -#ifdef IFALIASZ -#define KIB_IFNAME_SIZE IFALIASZ -#else -#define KIB_IFNAME_SIZE 256 -#endif - -struct kib_dev { - struct list_head ibd_list; /* chain on kib_devs */ - struct list_head ibd_fail_list; /* chain on kib_failed_devs */ - __u32 ibd_ifip; /* IPoIB interface IP */ - - /* IPoIB interface name */ - char ibd_ifname[KIB_IFNAME_SIZE]; - int ibd_nnets; /* # nets extant */ - - unsigned long ibd_next_failover; - int ibd_failed_failover; /* # failover failures */ - unsigned int ibd_failover; /* failover in progress */ - unsigned int ibd_can_failover; /* IPoIB interface is a bonding master */ - struct list_head ibd_nets; - struct kib_hca_dev *ibd_hdev; -}; - -struct kib_hca_dev { - struct rdma_cm_id *ibh_cmid; /* listener cmid */ - struct ib_device *ibh_ibdev; /* IB device */ - int ibh_page_shift; /* page shift of current HCA */ - int ibh_page_size; /* page size of current HCA */ - __u64 ibh_page_mask; /* page mask of current HCA */ - int ibh_mr_shift; /* bits shift of max MR size */ - __u64 ibh_mr_size; /* size of MR */ - struct ib_pd *ibh_pd; /* PD */ - struct kib_dev *ibh_dev; /* owner */ - atomic_t ibh_ref; /* refcount */ -}; - -/** # of seconds to keep pool alive */ -#define IBLND_POOL_DEADLINE 300 -/** # of seconds to retry if allocation failed */ -#define IBLND_POOL_RETRY 1 - -struct kib_pages { - int ibp_npages; /* # pages */ - struct page *ibp_pages[0]; /* page array */ -}; - -struct kib_pool; -struct kib_poolset; - -typedef int (*kib_ps_pool_create_t)(struct kib_poolset *ps, - int inc, struct kib_pool **pp_po); -typedef void (*kib_ps_pool_destroy_t)(struct kib_pool *po); -typedef void (*kib_ps_node_init_t)(struct kib_pool *po, struct list_head *node); -typedef void (*kib_ps_node_fini_t)(struct kib_pool *po, struct list_head *node); - -struct kib_net; - -#define IBLND_POOL_NAME_LEN 32 - -struct kib_poolset { - spinlock_t ps_lock; /* serialize */ - struct kib_net *ps_net; /* network it belongs to */ - char ps_name[IBLND_POOL_NAME_LEN]; /* pool set name */ - struct list_head ps_pool_list; /* list of pools */ - struct list_head ps_failed_pool_list;/* failed pool list */ - unsigned long ps_next_retry; /* time stamp for retry if */ - /* failed to allocate */ - int ps_increasing; /* is allocating new pool */ - int ps_pool_size; /* new pool size */ - int ps_cpt; /* CPT id */ - - kib_ps_pool_create_t ps_pool_create; /* create a new pool */ - kib_ps_pool_destroy_t ps_pool_destroy; /* destroy a pool */ - kib_ps_node_init_t ps_node_init; /* initialize new allocated node */ - kib_ps_node_fini_t ps_node_fini; /* finalize node */ -}; - -struct kib_pool { - struct list_head po_list; /* chain on pool list */ - struct list_head po_free_list; /* pre-allocated node */ - struct 
kib_poolset *po_owner; /* pool_set of this pool */ - unsigned long po_deadline; /* deadline of this pool */ - int po_allocated; /* # of elements in use */ - int po_failed; /* pool is created on failed HCA */ - int po_size; /* # of pre-allocated elements */ -}; - -struct kib_tx_poolset { - struct kib_poolset tps_poolset; /* pool-set */ - __u64 tps_next_tx_cookie; /* cookie of TX */ -}; - -struct kib_tx_pool { - struct kib_pool tpo_pool; /* pool */ - struct kib_hca_dev *tpo_hdev; /* device for this pool */ - struct kib_tx *tpo_tx_descs; /* all the tx descriptors */ - struct kib_pages *tpo_tx_pages; /* premapped tx msg pages */ -}; - -struct kib_fmr_poolset { - spinlock_t fps_lock; /* serialize */ - struct kib_net *fps_net; /* IB network */ - struct list_head fps_pool_list; /* FMR pool list */ - struct list_head fps_failed_pool_list;/* FMR pool list */ - __u64 fps_version; /* validity stamp */ - int fps_cpt; /* CPT id */ - int fps_pool_size; - int fps_flush_trigger; - int fps_cache; - int fps_increasing; /* is allocating new pool */ - unsigned long fps_next_retry; /* time stamp for retry if*/ - /* failed to allocate */ -}; - -struct kib_fast_reg_descriptor { /* For fast registration */ - struct list_head frd_list; - struct ib_send_wr frd_inv_wr; - struct ib_reg_wr frd_fastreg_wr; - struct ib_mr *frd_mr; - bool frd_valid; -}; - -struct kib_fmr_pool { - struct list_head fpo_list; /* chain on pool list */ - struct kib_hca_dev *fpo_hdev; /* device for this pool */ - struct kib_fmr_poolset *fpo_owner; /* owner of this pool */ - union { - struct { - struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */ - } fmr; - struct { /* For fast registration */ - struct list_head fpo_pool_list; - int fpo_pool_size; - } fast_reg; - }; - unsigned long fpo_deadline; /* deadline of this pool */ - int fpo_failed; /* fmr pool is failed */ - int fpo_map_count; /* # of mapped FMR */ - int fpo_is_fmr; -}; - -struct kib_fmr { - struct kib_fmr_pool *fmr_pool; /* pool of FMR */ - struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */ - struct kib_fast_reg_descriptor *fmr_frd; - u32 fmr_key; -}; - -struct kib_net { - struct list_head ibn_list; /* chain on struct kib_dev::ibd_nets */ - __u64 ibn_incarnation;/* my epoch */ - int ibn_init; /* initialisation state */ - int ibn_shutdown; /* shutting down? */ - - atomic_t ibn_npeers; /* # peers extant */ - atomic_t ibn_nconns; /* # connections extant */ - - struct kib_tx_poolset **ibn_tx_ps; /* tx pool-set */ - struct kib_fmr_poolset **ibn_fmr_ps; /* fmr pool-set */ - - struct kib_dev *ibn_dev; /* underlying IB device */ -}; - -#define KIB_THREAD_SHIFT 16 -#define KIB_THREAD_ID(cpt, tid) ((cpt) << KIB_THREAD_SHIFT | (tid)) -#define KIB_THREAD_CPT(id) ((id) >> KIB_THREAD_SHIFT) -#define KIB_THREAD_TID(id) ((id) & ((1UL << KIB_THREAD_SHIFT) - 1)) - -struct kib_sched_info { - spinlock_t ibs_lock; /* serialise */ - wait_queue_head_t ibs_waitq; /* schedulers sleep here */ - struct list_head ibs_conns; /* conns to check for rx completions */ - int ibs_nthreads; /* number of scheduler threads */ - int ibs_nthreads_max; /* max allowed scheduler threads */ - int ibs_cpt; /* CPT id */ -}; - -struct kib_data { - int kib_init; /* initialisation state */ - int kib_shutdown; /* shut down? 
*/ - struct list_head kib_devs; /* IB devices extant */ - struct list_head kib_failed_devs; /* list head of failed devices */ - wait_queue_head_t kib_failover_waitq; /* schedulers sleep here */ - atomic_t kib_nthreads; /* # live threads */ - rwlock_t kib_global_lock; /* stabilize net/dev/peer/conn ops */ - struct list_head *kib_peers; /* hash table of all my known peers */ - int kib_peer_hash_size; /* size of kib_peers */ - void *kib_connd; /* the connd task (serialisation assertions) */ - struct list_head kib_connd_conns; /* connections to setup/teardown */ - struct list_head kib_connd_zombies; /* connections with zero refcount */ - /* connections to reconnect */ - struct list_head kib_reconn_list; - /* peers wait for reconnection */ - struct list_head kib_reconn_wait; - /** - * The second that peers are pulled out from \a kib_reconn_wait - * for reconnection. - */ - time64_t kib_reconn_sec; - - wait_queue_head_t kib_connd_waitq; /* connection daemon sleeps here */ - spinlock_t kib_connd_lock; /* serialise */ - struct ib_qp_attr kib_error_qpa; /* QP->ERROR */ - struct kib_sched_info **kib_scheds; /* percpt data for schedulers */ -}; - -#define IBLND_INIT_NOTHING 0 -#define IBLND_INIT_DATA 1 -#define IBLND_INIT_ALL 2 - -/************************************************************************ - * IB Wire message format. - * These are sent in sender's byte order (i.e. receiver flips). - */ - -struct kib_connparams { - __u16 ibcp_queue_depth; - __u16 ibcp_max_frags; - __u32 ibcp_max_msg_size; -} WIRE_ATTR; - -struct kib_immediate_msg { - struct lnet_hdr ibim_hdr; /* portals header */ - char ibim_payload[0]; /* piggy-backed payload */ -} WIRE_ATTR; - -struct kib_rdma_frag { - __u32 rf_nob; /* # bytes this frag */ - __u64 rf_addr; /* CAVEAT EMPTOR: misaligned!! 
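The "misaligned" warning on rf_addr just above follows from the wire packing: with WIRE_ATTR (__attribute__((packed)) in the Lustre headers) there is no padding after the 32-bit rf_nob, so the 64-bit address lands at offset 4 instead of a natural 8-byte boundary. A compilable illustration using a demo_ copy of the layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_rdma_frag {
        uint32_t rf_nob;        /* # bytes in this fragment */
        uint64_t rf_addr;       /* misaligned on purpose: wire format */
} __attribute__((packed));

int main(void)
{
        printf("sizeof  = %zu\n", sizeof(struct demo_rdma_frag));            /* 12 */
        printf("rf_addr @ %zu\n", offsetof(struct demo_rdma_frag, rf_addr)); /* 4 */
        return 0;
}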
*/ -} WIRE_ATTR; - -struct kib_rdma_desc { - __u32 rd_key; /* local/remote key */ - __u32 rd_nfrags; /* # fragments */ - struct kib_rdma_frag rd_frags[0]; /* buffer frags */ -} WIRE_ATTR; - -struct kib_putreq_msg { - struct lnet_hdr ibprm_hdr; /* portals header */ - __u64 ibprm_cookie; /* opaque completion cookie */ -} WIRE_ATTR; - -struct kib_putack_msg { - __u64 ibpam_src_cookie; /* reflected completion cookie */ - __u64 ibpam_dst_cookie; /* opaque completion cookie */ - struct kib_rdma_desc ibpam_rd; /* sender's sink buffer */ -} WIRE_ATTR; - -struct kib_get_msg { - struct lnet_hdr ibgm_hdr; /* portals header */ - __u64 ibgm_cookie; /* opaque completion cookie */ - struct kib_rdma_desc ibgm_rd; /* rdma descriptor */ -} WIRE_ATTR; - -struct kib_completion_msg { - __u64 ibcm_cookie; /* opaque completion cookie */ - __s32 ibcm_status; /* < 0 failure: >= 0 length */ -} WIRE_ATTR; - -struct kib_msg { - /* First 2 fields fixed FOR ALL TIME */ - __u32 ibm_magic; /* I'm an ibnal message */ - __u16 ibm_version; /* this is my version number */ - - __u8 ibm_type; /* msg type */ - __u8 ibm_credits; /* returned credits */ - __u32 ibm_nob; /* # bytes in whole message */ - __u32 ibm_cksum; /* checksum (0 == no checksum) */ - __u64 ibm_srcnid; /* sender's NID */ - __u64 ibm_srcstamp; /* sender's incarnation */ - __u64 ibm_dstnid; /* destination's NID */ - __u64 ibm_dststamp; /* destination's incarnation */ - - union { - struct kib_connparams connparams; - struct kib_immediate_msg immediate; - struct kib_putreq_msg putreq; - struct kib_putack_msg putack; - struct kib_get_msg get; - struct kib_completion_msg completion; - } WIRE_ATTR ibm_u; -} WIRE_ATTR; - -#define IBLND_MSG_MAGIC LNET_PROTO_IB_MAGIC /* unique magic */ - -#define IBLND_MSG_VERSION_1 0x11 -#define IBLND_MSG_VERSION_2 0x12 -#define IBLND_MSG_VERSION IBLND_MSG_VERSION_2 - -#define IBLND_MSG_CONNREQ 0xc0 /* connection request */ -#define IBLND_MSG_CONNACK 0xc1 /* connection acknowledge */ -#define IBLND_MSG_NOOP 0xd0 /* nothing (just credits) */ -#define IBLND_MSG_IMMEDIATE 0xd1 /* immediate */ -#define IBLND_MSG_PUT_REQ 0xd2 /* putreq (src->sink) */ -#define IBLND_MSG_PUT_NAK 0xd3 /* completion (sink->src) */ -#define IBLND_MSG_PUT_ACK 0xd4 /* putack (sink->src) */ -#define IBLND_MSG_PUT_DONE 0xd5 /* completion (src->sink) */ -#define IBLND_MSG_GET_REQ 0xd6 /* getreq (sink->src) */ -#define IBLND_MSG_GET_DONE 0xd7 /* completion (src->sink: all OK) */ - -struct kib_rej { - __u32 ibr_magic; /* sender's magic */ - __u16 ibr_version; /* sender's version */ - __u8 ibr_why; /* reject reason */ - __u8 ibr_padding; /* padding */ - __u64 ibr_incarnation; /* incarnation of peer */ - struct kib_connparams ibr_cp; /* connection parameters */ -} WIRE_ATTR; - -/* connection rejection reasons */ -#define IBLND_REJECT_CONN_RACE 1 /* You lost connection race */ -#define IBLND_REJECT_NO_RESOURCES 2 /* Out of memory/conns etc */ -#define IBLND_REJECT_FATAL 3 /* Anything else */ -#define IBLND_REJECT_CONN_UNCOMPAT 4 /* incompatible version peer */ -#define IBLND_REJECT_CONN_STALE 5 /* stale peer */ -/* peer's rdma frags doesn't match mine */ -#define IBLND_REJECT_RDMA_FRAGS 6 -/* peer's msg queue size doesn't match mine */ -#define IBLND_REJECT_MSG_QUEUE_SIZE 7 - -/***********************************************************************/ - -struct kib_rx { /* receive message */ - struct list_head rx_list; /* queue for attention */ - struct kib_conn *rx_conn; /* owning conn */ - int rx_nob; /* # bytes received (-1 while posted) */ - enum ib_wc_status rx_status; 
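struct kib_rdma_desc above ends in a zero-length rd_frags[] array, so descriptors are sized with offsetof() over the fragment count rather than sizeof(), the same pattern kiblnd_rd_msg_size() uses later in this header. A minimal userspace sketch of that sizing:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_frag {
        uint32_t rf_nob;
        uint64_t rf_addr;
} __attribute__((packed));

struct demo_rdma_desc {
        uint32_t rd_key;
        uint32_t rd_nfrags;
        struct demo_frag rd_frags[];    /* flexible array member */
};

int main(void)
{
        const int n = 4;
        size_t nob = offsetof(struct demo_rdma_desc, rd_frags) +
                     n * sizeof(struct demo_frag);      /* == rd_frags[n] */
        struct demo_rdma_desc *rd = calloc(1, nob);

        if (!rd)
                return 1;
        rd->rd_nfrags = n;
        printf("descriptor with %d frags needs %zu bytes\n", n, nob); /* 56 */
        free(rd);
        return 0;
}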
/* completion status */ - struct kib_msg *rx_msg; /* message buffer (host vaddr) */ - __u64 rx_msgaddr; /* message buffer (I/O addr) */ - DECLARE_PCI_UNMAP_ADDR(rx_msgunmap); /* for dma_unmap_single() */ - struct ib_recv_wr rx_wrq; /* receive work item... */ - struct ib_sge rx_sge; /* ...and its memory */ -}; - -#define IBLND_POSTRX_DONT_POST 0 /* don't post */ -#define IBLND_POSTRX_NO_CREDIT 1 /* post: no credits */ -#define IBLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ -#define IBLND_POSTRX_RSRVD_CREDIT 3 /* post: give self back 1 reserved credit */ - -struct kib_tx { /* transmit message */ - struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ - struct kib_tx_pool *tx_pool; /* pool I'm from */ - struct kib_conn *tx_conn; /* owning conn */ - short tx_sending; /* # tx callbacks outstanding */ - short tx_queued; /* queued for sending */ - short tx_waiting; /* waiting for peer */ - int tx_status; /* LNET completion status */ - unsigned long tx_deadline; /* completion deadline */ - __u64 tx_cookie; /* completion cookie */ - struct lnet_msg *tx_lntmsg[2]; /* lnet msgs to finalize on completion */ - struct kib_msg *tx_msg; /* message buffer (host vaddr) */ - __u64 tx_msgaddr; /* message buffer (I/O addr) */ - DECLARE_PCI_UNMAP_ADDR(tx_msgunmap); /* for dma_unmap_single() */ - int tx_nwrq; /* # send work items */ - struct ib_rdma_wr *tx_wrq; /* send work items... */ - struct ib_sge *tx_sge; /* ...and their memory */ - struct kib_rdma_desc *tx_rd; /* rdma descriptor */ - int tx_nfrags; /* # entries in... */ - struct scatterlist *tx_frags; /* dma_map_sg descriptor */ - __u64 *tx_pages; /* rdma phys page addrs */ - struct kib_fmr fmr; /* FMR */ - int tx_dmadir; /* dma direction */ -}; - -struct kib_connvars { - struct kib_msg cv_msg; /* connection-in-progress variables */ -}; - -struct kib_conn { - struct kib_sched_info *ibc_sched; /* scheduler information */ - struct kib_peer *ibc_peer; /* owning peer */ - struct kib_hca_dev *ibc_hdev; /* HCA bound on */ - struct list_head ibc_list; /* stash on peer's conn list */ - struct list_head ibc_sched_list; /* schedule for attention */ - __u16 ibc_version; /* version of connection */ - /* reconnect later */ - __u16 ibc_reconnect:1; - __u64 ibc_incarnation; /* which instance of the peer */ - atomic_t ibc_refcount; /* # users */ - int ibc_state; /* what's happening */ - int ibc_nsends_posted; /* # uncompleted sends */ - int ibc_noops_posted; /* # uncompleted NOOPs */ - int ibc_credits; /* # credits I have */ - int ibc_outstanding_credits; /* # credits to return */ - int ibc_reserved_credits; /* # ACK/DONE msg credits */ - int ibc_comms_error; /* set on comms error */ - /* connections queue depth */ - __u16 ibc_queue_depth; - /* connections max frags */ - __u16 ibc_max_frags; - unsigned int ibc_nrx:16; /* receive buffers owned */ - unsigned int ibc_scheduled:1; /* scheduled for attention */ - unsigned int ibc_ready:1; /* CQ callback fired */ - unsigned long ibc_last_send; /* time of last send */ - struct list_head ibc_connd_list; /* link chain for */ - /* kiblnd_check_conns only */ - struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ - struct list_head ibc_tx_noops; /* IBLND_MSG_NOOPs for */ - /* IBLND_MSG_VERSION_1 */ - struct list_head ibc_tx_queue; /* sends that need a credit */ - struct list_head ibc_tx_queue_nocred; /* sends that don't need a */ - /* credit */ - struct list_head ibc_tx_queue_rsrvd; /* sends that need to */ - /* reserve an ACK/DONE msg */ - struct list_head ibc_active_txs; /* active tx 
awaiting completion */ - spinlock_t ibc_lock; /* serialise */ - struct kib_rx *ibc_rxs; /* the rx descs */ - struct kib_pages *ibc_rx_pages; /* premapped rx msg pages */ - - struct rdma_cm_id *ibc_cmid; /* CM id */ - struct ib_cq *ibc_cq; /* completion queue */ - - struct kib_connvars *ibc_connvars; /* in-progress connection state */ -}; - -#define IBLND_CONN_INIT 0 /* being initialised */ -#define IBLND_CONN_ACTIVE_CONNECT 1 /* active sending req */ -#define IBLND_CONN_PASSIVE_WAIT 2 /* passive waiting for rtu */ -#define IBLND_CONN_ESTABLISHED 3 /* connection established */ -#define IBLND_CONN_CLOSING 4 /* being closed */ -#define IBLND_CONN_DISCONNECTED 5 /* disconnected */ - -struct kib_peer { - struct list_head ibp_list; /* stash on global peer list */ - lnet_nid_t ibp_nid; /* who's on the other end(s) */ - struct lnet_ni *ibp_ni; /* LNet interface */ - struct list_head ibp_conns; /* all active connections */ - struct kib_conn *ibp_next_conn; /* next connection to send on for - * round robin */ - struct list_head ibp_tx_queue; /* msgs waiting for a conn */ - __u64 ibp_incarnation; /* incarnation of peer */ - /* when (in jiffies) I was last alive */ - unsigned long ibp_last_alive; - /* # users */ - atomic_t ibp_refcount; - /* version of peer */ - __u16 ibp_version; - /* current passive connection attempts */ - unsigned short ibp_accepting; - /* current active connection attempts */ - unsigned short ibp_connecting; - /* reconnect this peer later */ - unsigned char ibp_reconnecting; - /* counter of how many times we triggered a conn race */ - unsigned char ibp_races; - /* # consecutive reconnection attempts to this peer */ - unsigned int ibp_reconnected; - /* errno on closing this peer */ - int ibp_error; - /* max map_on_demand */ - __u16 ibp_max_frags; - /* max_peer_credits */ - __u16 ibp_queue_depth; -}; - -extern struct kib_data kiblnd_data; - -void kiblnd_hdev_destroy(struct kib_hca_dev *hdev); - -int kiblnd_msg_queue_size(int version, struct lnet_ni *ni); - -/* max # of fragments configured by user */ -static inline int -kiblnd_cfg_rdma_frags(struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int mod; - - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - mod = tunables->lnd_map_on_demand; - return mod ? mod : IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT; -} - -static inline int -kiblnd_rdma_frags(int version, struct lnet_ni *ni) -{ - return version == IBLND_MSG_VERSION_1 ? 
- (IBLND_MAX_RDMA_FRAGS >> IBLND_FRAG_SHIFT) : - kiblnd_cfg_rdma_frags(ni); -} - -static inline int -kiblnd_concurrent_sends(int version, struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - int concurrent_sends; - - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - concurrent_sends = tunables->lnd_concurrent_sends; - - if (version == IBLND_MSG_VERSION_1) { - if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2) - return IBLND_MSG_QUEUE_SIZE_V1 * 2; - - if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2) - return IBLND_MSG_QUEUE_SIZE_V1 / 2; - } - - return concurrent_sends; -} - -static inline void -kiblnd_hdev_addref_locked(struct kib_hca_dev *hdev) -{ - LASSERT(atomic_read(&hdev->ibh_ref) > 0); - atomic_inc(&hdev->ibh_ref); -} - -static inline void -kiblnd_hdev_decref(struct kib_hca_dev *hdev) -{ - LASSERT(atomic_read(&hdev->ibh_ref) > 0); - if (atomic_dec_and_test(&hdev->ibh_ref)) - kiblnd_hdev_destroy(hdev); -} - -static inline int -kiblnd_dev_can_failover(struct kib_dev *dev) -{ - if (!list_empty(&dev->ibd_fail_list)) /* already scheduled */ - return 0; - - if (!*kiblnd_tunables.kib_dev_failover) /* disabled */ - return 0; - - if (*kiblnd_tunables.kib_dev_failover > 1) /* force failover */ - return 1; - - return dev->ibd_can_failover; -} - -#define kiblnd_conn_addref(conn) \ -do { \ - CDEBUG(D_NET, "conn[%p] (%d)++\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - atomic_inc(&(conn)->ibc_refcount); \ -} while (0) - -#define kiblnd_conn_decref(conn) \ -do { \ - unsigned long flags; \ - \ - CDEBUG(D_NET, "conn[%p] (%d)--\n", \ - (conn), atomic_read(&(conn)->ibc_refcount)); \ - LASSERT_ATOMIC_POS(&(conn)->ibc_refcount); \ - if (atomic_dec_and_test(&(conn)->ibc_refcount)) { \ - spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); \ - list_add_tail(&(conn)->ibc_list, \ - &kiblnd_data.kib_connd_zombies); \ - wake_up(&kiblnd_data.kib_connd_waitq); \ - spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags);\ - } \ -} while (0) - -#define kiblnd_peer_addref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)++\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read(&(peer)->ibp_refcount)); \ - atomic_inc(&(peer)->ibp_refcount); \ -} while (0) - -#define kiblnd_peer_decref(peer) \ -do { \ - CDEBUG(D_NET, "peer[%p] -> %s (%d)--\n", \ - (peer), libcfs_nid2str((peer)->ibp_nid), \ - atomic_read(&(peer)->ibp_refcount)); \ - LASSERT_ATOMIC_POS(&(peer)->ibp_refcount); \ - if (atomic_dec_and_test(&(peer)->ibp_refcount)) \ - kiblnd_destroy_peer(peer); \ -} while (0) - -static inline bool -kiblnd_peer_connecting(struct kib_peer *peer) -{ - return peer->ibp_connecting || - peer->ibp_reconnecting || - peer->ibp_accepting; -} - -static inline bool -kiblnd_peer_idle(struct kib_peer *peer) -{ - return !kiblnd_peer_connecting(peer) && list_empty(&peer->ibp_conns); -} - -static inline struct list_head * -kiblnd_nid2peerlist(lnet_nid_t nid) -{ - unsigned int hash = - ((unsigned int)nid) % kiblnd_data.kib_peer_hash_size; - - return &kiblnd_data.kib_peers[hash]; -} - -static inline int -kiblnd_peer_active(struct kib_peer *peer) -{ - /* Am I in the peer hash table? 
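A standalone model of kiblnd_nid2peerlist() above: peers are kept in a chained hash table of IBLND_PEER_HASH_SIZE buckets (101, a prime, per the define earlier in this header), indexed by the low 32 bits of the NID modulo the table size.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PEER_HASH_SIZE 101 /* prime, as IBLND_PEER_HASH_SIZE */

static unsigned int nid2bucket(uint64_t nid)
{
        return (unsigned int)nid % DEMO_PEER_HASH_SIZE;
}

int main(void)
{
        uint64_t nid;

        for (nid = 0x10000001; nid <= 0x10000003; nid++)
                printf("nid %#llx -> bucket %u\n",
                       (unsigned long long)nid, nid2bucket(nid));
        return 0;
}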
*/ - return !list_empty(&peer->ibp_list); -} - -static inline struct kib_conn * -kiblnd_get_conn_locked(struct kib_peer *peer) -{ - struct list_head *next; - - LASSERT(!list_empty(&peer->ibp_conns)); - - /* Advance to next connection, be sure to skip the head node */ - if (!peer->ibp_next_conn || - peer->ibp_next_conn->ibc_list.next == &peer->ibp_conns) - next = peer->ibp_conns.next; - else - next = peer->ibp_next_conn->ibc_list.next; - peer->ibp_next_conn = list_entry(next, struct kib_conn, ibc_list); - - return peer->ibp_next_conn; -} - -static inline int -kiblnd_send_keepalive(struct kib_conn *conn) -{ - return (*kiblnd_tunables.kib_keepalive > 0) && - time_after(jiffies, conn->ibc_last_send + - msecs_to_jiffies(*kiblnd_tunables.kib_keepalive * - MSEC_PER_SEC)); -} - -static inline int -kiblnd_need_noop(struct kib_conn *conn) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - - if (conn->ibc_outstanding_credits < - IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) && - !kiblnd_send_keepalive(conn)) - return 0; /* No need to send NOOP */ - - if (IBLND_OOB_CAPABLE(conn->ibc_version)) { - if (!list_empty(&conn->ibc_tx_queue_nocred)) - return 0; /* NOOP can be piggybacked */ - - /* No tx to piggyback NOOP onto or no credit to send a tx */ - return (list_empty(&conn->ibc_tx_queue) || - !conn->ibc_credits); - } - - if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ - !list_empty(&conn->ibc_tx_queue_nocred) || /* piggyback NOOP */ - !conn->ibc_credits) /* no credit */ - return 0; - - if (conn->ibc_credits == 1 && /* last credit reserved for */ - !conn->ibc_outstanding_credits) /* giving back credits */ - return 0; - - /* No tx to piggyback NOOP onto or no credit to send a tx */ - return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); -} - -static inline void -kiblnd_abort_receives(struct kib_conn *conn) -{ - ib_modify_qp(conn->ibc_cmid->qp, - &kiblnd_data.kib_error_qpa, IB_QP_STATE); -} - -static inline const char * -kiblnd_queue2str(struct kib_conn *conn, struct list_head *q) -{ - if (q == &conn->ibc_tx_queue) - return "tx_queue"; - - if (q == &conn->ibc_tx_queue_rsrvd) - return "tx_queue_rsrvd"; - - if (q == &conn->ibc_tx_queue_nocred) - return "tx_queue_nocred"; - - if (q == &conn->ibc_active_txs) - return "active_txs"; - - LBUG(); - return NULL; -} - -/* CAVEAT EMPTOR: We rely on descriptor alignment to allow us to use the */ -/* lowest bits of the work request id to stash the work item type. 
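A runnable model of the work-request-id tagging this comment describes: descriptors are at least 8-byte aligned, so the low three bits of their addresses are zero and can carry the IBLND_WID_* work-item type, recovered with a mask when the completion fires. The WID_* values below copy the defines that follow.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define WID_MASK 7UL    /* low 3 bits, as IBLND_WID_MASK */
#define WID_RX   2      /* as IBLND_WID_RX */

static uint64_t ptr2wreqid(void *ptr, int type)
{
        uintptr_t lptr = (uintptr_t)ptr;

        assert(!(lptr & WID_MASK));     /* requires >= 8-byte alignment */
        assert(!((uintptr_t)type & ~WID_MASK));
        return (uint64_t)(lptr | (uintptr_t)type);
}

int main(void)
{
        _Alignas(8) long long rx_desc;  /* stand-in descriptor */
        uint64_t wreqid = ptr2wreqid(&rx_desc, WID_RX);

        printf("type = %d, ptr ok = %d\n",
               (int)(wreqid & WID_MASK),
               (void *)(uintptr_t)(wreqid & ~(uint64_t)WID_MASK) ==
               (void *)&rx_desc);
        return 0;
}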
*/ - -#define IBLND_WID_INVAL 0 -#define IBLND_WID_TX 1 -#define IBLND_WID_RX 2 -#define IBLND_WID_RDMA 3 -#define IBLND_WID_MR 4 -#define IBLND_WID_MASK 7UL - -static inline __u64 -kiblnd_ptr2wreqid(void *ptr, int type) -{ - unsigned long lptr = (unsigned long)ptr; - - LASSERT(!(lptr & IBLND_WID_MASK)); - LASSERT(!(type & ~IBLND_WID_MASK)); - return (__u64)(lptr | type); -} - -static inline void * -kiblnd_wreqid2ptr(__u64 wreqid) -{ - return (void *)(((unsigned long)wreqid) & ~IBLND_WID_MASK); -} - -static inline int -kiblnd_wreqid2type(__u64 wreqid) -{ - return wreqid & IBLND_WID_MASK; -} - -static inline void -kiblnd_set_conn_state(struct kib_conn *conn, int state) -{ - conn->ibc_state = state; - mb(); -} - -static inline void -kiblnd_init_msg(struct kib_msg *msg, int type, int body_nob) -{ - msg->ibm_type = type; - msg->ibm_nob = offsetof(struct kib_msg, ibm_u) + body_nob; -} - -static inline int -kiblnd_rd_size(struct kib_rdma_desc *rd) -{ - int i; - int size; - - for (i = size = 0; i < rd->rd_nfrags; i++) - size += rd->rd_frags[i].rf_nob; - - return size; -} - -static inline __u64 -kiblnd_rd_frag_addr(struct kib_rdma_desc *rd, int index) -{ - return rd->rd_frags[index].rf_addr; -} - -static inline __u32 -kiblnd_rd_frag_size(struct kib_rdma_desc *rd, int index) -{ - return rd->rd_frags[index].rf_nob; -} - -static inline __u32 -kiblnd_rd_frag_key(struct kib_rdma_desc *rd, int index) -{ - return rd->rd_key; -} - -static inline int -kiblnd_rd_consume_frag(struct kib_rdma_desc *rd, int index, __u32 nob) -{ - if (nob < rd->rd_frags[index].rf_nob) { - rd->rd_frags[index].rf_addr += nob; - rd->rd_frags[index].rf_nob -= nob; - } else { - index++; - } - - return index; -} - -static inline int -kiblnd_rd_msg_size(struct kib_rdma_desc *rd, int msgtype, int n) -{ - LASSERT(msgtype == IBLND_MSG_GET_REQ || - msgtype == IBLND_MSG_PUT_ACK); - - return msgtype == IBLND_MSG_GET_REQ ? 
- offsetof(struct kib_get_msg, ibgm_rd.rd_frags[n]) : - offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[n]); -} - -static inline __u64 -kiblnd_dma_mapping_error(struct ib_device *dev, u64 dma_addr) -{ - return ib_dma_mapping_error(dev, dma_addr); -} - -static inline __u64 kiblnd_dma_map_single(struct ib_device *dev, - void *msg, size_t size, - enum dma_data_direction direction) -{ - return ib_dma_map_single(dev, msg, size, direction); -} - -static inline void kiblnd_dma_unmap_single(struct ib_device *dev, - __u64 addr, size_t size, - enum dma_data_direction direction) -{ - ib_dma_unmap_single(dev, addr, size, direction); -} - -#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0) -#define KIBLND_UNMAP_ADDR(p, m, a) (a) - -static inline int kiblnd_dma_map_sg(struct ib_device *dev, - struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - return ib_dma_map_sg(dev, sg, nents, direction); -} - -static inline void kiblnd_dma_unmap_sg(struct ib_device *dev, - struct scatterlist *sg, int nents, - enum dma_data_direction direction) -{ - ib_dma_unmap_sg(dev, sg, nents, direction); -} - -static inline __u64 kiblnd_sg_dma_address(struct ib_device *dev, - struct scatterlist *sg) -{ - return ib_sg_dma_address(dev, sg); -} - -static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev, - struct scatterlist *sg) -{ - return ib_sg_dma_len(dev, sg); -} - -/* XXX We use KIBLND_CONN_PARAM(e) as writable buffer, it's not strictly */ -/* right because OFED1.2 defines it as const, to use it we have to add */ -/* (void *) cast to overcome "const" */ - -#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data) -#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len) - -void kiblnd_map_rx_descs(struct kib_conn *conn); -void kiblnd_unmap_rx_descs(struct kib_conn *conn); -void kiblnd_pool_free_node(struct kib_pool *pool, struct list_head *node); -struct list_head *kiblnd_pool_alloc_node(struct kib_poolset *ps); - -int kiblnd_fmr_pool_map(struct kib_fmr_poolset *fps, struct kib_tx *tx, - struct kib_rdma_desc *rd, __u32 nob, __u64 iov, - struct kib_fmr *fmr); -void kiblnd_fmr_pool_unmap(struct kib_fmr *fmr, int status); - -int kiblnd_tunables_setup(struct lnet_ni *ni); -void kiblnd_tunables_init(void); - -int kiblnd_connd(void *arg); -int kiblnd_scheduler(void *arg); -int kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name); -int kiblnd_failover_thread(void *arg); - -int kiblnd_alloc_pages(struct kib_pages **pp, int cpt, int npages); - -int kiblnd_cm_callback(struct rdma_cm_id *cmid, - struct rdma_cm_event *event); -int kiblnd_translate_mtu(int value); - -int kiblnd_dev_failover(struct kib_dev *dev); -int kiblnd_create_peer(struct lnet_ni *ni, struct kib_peer **peerp, - lnet_nid_t nid); -void kiblnd_destroy_peer(struct kib_peer *peer); -bool kiblnd_reconnect_peer(struct kib_peer *peer); -void kiblnd_destroy_dev(struct kib_dev *dev); -void kiblnd_unlink_peer_locked(struct kib_peer *peer); -struct kib_peer *kiblnd_find_peer_locked(lnet_nid_t nid); -int kiblnd_close_stale_conns_locked(struct kib_peer *peer, - int version, __u64 incarnation); -int kiblnd_close_peer_conns_locked(struct kib_peer *peer, int why); - -struct kib_conn *kiblnd_create_conn(struct kib_peer *peer, - struct rdma_cm_id *cmid, - int state, int version); -void kiblnd_destroy_conn(struct kib_conn *conn); -void kiblnd_close_conn(struct kib_conn *conn, int error); -void kiblnd_close_conn_locked(struct kib_conn *conn, int error); - -void kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, 
lnet_nid_t nid); -void kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, - int status); - -void kiblnd_qp_event(struct ib_event *event, void *arg); -void kiblnd_cq_event(struct ib_event *event, void *arg); -void kiblnd_cq_completion(struct ib_cq *cq, void *arg); - -void kiblnd_pack_msg(struct lnet_ni *ni, struct kib_msg *msg, int version, - int credits, lnet_nid_t dstnid, __u64 dststamp); -int kiblnd_unpack_msg(struct kib_msg *msg, int nob); -int kiblnd_post_rx(struct kib_rx *rx, int credit); - -int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); -int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen); diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c deleted file mode 100644 index 65b7a62943ada0e85b37b35b96e778587c4d739e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ /dev/null @@ -1,3763 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/klnds/o2iblnd/o2iblnd_cb.c - * - * Author: Eric Barton - */ - -#include -#include "o2iblnd.h" - -#define MAX_CONN_RACES_BEFORE_ABORT 20 - -static void kiblnd_peer_alive(struct kib_peer *peer); -static void kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error); -static void kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, - int type, int body_nob); -static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, - __u64 dstcookie); -static void kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn); -static void kiblnd_unmap_tx(struct kib_tx *tx); -static void kiblnd_check_sends_locked(struct kib_conn *conn); - -static void -kiblnd_tx_done(struct lnet_ni *ni, struct kib_tx *tx) -{ - struct lnet_msg *lntmsg[2]; - struct kib_net *net = ni->ni_data; - int rc; - int i; - - LASSERT(net); - LASSERT(!in_interrupt()); - LASSERT(!tx->tx_queued); /* mustn't be queued for sending */ - LASSERT(!tx->tx_sending); /* mustn't be awaiting sent callback */ - LASSERT(!tx->tx_waiting); /* mustn't be awaiting peer response */ - LASSERT(tx->tx_pool); - - kiblnd_unmap_tx(tx); - - /* tx may have up to 2 lnet msgs to finalise */ - lntmsg[0] = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL; - lntmsg[1] = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL; - rc = tx->tx_status; - - if (tx->tx_conn) { - LASSERT(ni == tx->tx_conn->ibc_peer->ibp_ni); - - kiblnd_conn_decref(tx->tx_conn); - tx->tx_conn = NULL; - } - - tx->tx_nwrq = 0; - tx->tx_status = 0; - - kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); - - /* delay finalize until my descs have been freed */ - for (i = 0; i < 2; i++) { - if (!lntmsg[i]) - continue; - - lnet_finalize(ni, lntmsg[i], rc); - } -} - -void -kiblnd_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int status) -{ - struct kib_tx *tx; - - while (!list_empty(txlist)) { - tx = list_entry(txlist->next, struct kib_tx, tx_list); - - list_del(&tx->tx_list); - /* complete now */ - tx->tx_waiting = 0; - tx->tx_status = status; - kiblnd_tx_done(ni, tx); - } -} - -static struct kib_tx * -kiblnd_get_idle_tx(struct lnet_ni *ni, lnet_nid_t target) -{ - struct kib_net *net = (struct kib_net *)ni->ni_data; - struct list_head *node; - struct kib_tx *tx; - struct kib_tx_poolset *tps; - - tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)]; - node = kiblnd_pool_alloc_node(&tps->tps_poolset); - if (!node) - return NULL; - tx = list_entry(node, struct kib_tx, tx_list); - - LASSERT(!tx->tx_nwrq); - LASSERT(!tx->tx_queued); - LASSERT(!tx->tx_sending); - LASSERT(!tx->tx_waiting); - LASSERT(!tx->tx_status); - LASSERT(!tx->tx_conn); - LASSERT(!tx->tx_lntmsg[0]); - LASSERT(!tx->tx_lntmsg[1]); - LASSERT(!tx->tx_nfrags); - - return tx; -} - -static void -kiblnd_drop_rx(struct kib_rx *rx) -{ - struct kib_conn *conn = rx->rx_conn; - struct kib_sched_info *sched = conn->ibc_sched; - unsigned long flags; - - spin_lock_irqsave(&sched->ibs_lock, flags); - LASSERT(conn->ibc_nrx > 0); - conn->ibc_nrx--; - spin_unlock_irqrestore(&sched->ibs_lock, flags); - - kiblnd_conn_decref(conn); -} - -int -kiblnd_post_rx(struct kib_rx *rx, int credit) -{ - struct kib_conn *conn = rx->rx_conn; - struct kib_net *net = conn->ibc_peer->ibp_ni->ni_data; - struct ib_recv_wr *bad_wrq = NULL; - int rc; - - LASSERT(net); - LASSERT(!in_interrupt()); - LASSERT(credit == IBLND_POSTRX_NO_CREDIT || - credit == IBLND_POSTRX_PEER_CREDIT || - credit == IBLND_POSTRX_RSRVD_CREDIT); 
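
/*
 * Illustrative sketch (not LND code): the three IBLND_POSTRX_* cases
 * asserted above feed the credit bookkeeping a few lines below; a repost
 * either returns a credit to the peer, returns a reserved credit, or
 * returns nothing. A toy model of that accounting, with invented names:
 */
#include <stdio.h>

enum demo_postrx { NO_CREDIT, PEER_CREDIT, RSRVD_CREDIT };

struct demo_conn {
	int outstanding_credits;	/* owed back to the peer */
	int reserved_credits;		/* held for replies we must send */
};

static void account_repost(struct demo_conn *c, enum demo_postrx credit)
{
	if (credit == PEER_CREDIT)
		c->outstanding_credits++;
	else if (credit == RSRVD_CREDIT)
		c->reserved_credits++;
	/* NO_CREDIT: the buffer is reposted without returning anything */
}

int main(void)
{
	struct demo_conn c = { 0, 0 };

	account_repost(&c, PEER_CREDIT);
	account_repost(&c, RSRVD_CREDIT);
	account_repost(&c, NO_CREDIT);
	printf("outstanding=%d reserved=%d\n",
	       c.outstanding_credits, c.reserved_credits);
	return 0;
}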
- - rx->rx_sge.lkey = conn->ibc_hdev->ibh_pd->local_dma_lkey; - rx->rx_sge.addr = rx->rx_msgaddr; - rx->rx_sge.length = IBLND_MSG_SIZE; - - rx->rx_wrq.next = NULL; - rx->rx_wrq.sg_list = &rx->rx_sge; - rx->rx_wrq.num_sge = 1; - rx->rx_wrq.wr_id = kiblnd_ptr2wreqid(rx, IBLND_WID_RX); - - LASSERT(conn->ibc_state >= IBLND_CONN_INIT); - LASSERT(rx->rx_nob >= 0); /* not posted */ - - if (conn->ibc_state > IBLND_CONN_ESTABLISHED) { - kiblnd_drop_rx(rx); /* No more posts for this rx */ - return 0; - } - - rx->rx_nob = -1; /* flag posted */ - - /* NB: need an extra reference after ib_post_recv because we don't - * own this rx (and rx::rx_conn) anymore, LU-5678. - */ - kiblnd_conn_addref(conn); - rc = ib_post_recv(conn->ibc_cmid->qp, &rx->rx_wrq, &bad_wrq); - if (unlikely(rc)) { - CERROR("Can't post rx for %s: %d, bad_wrq: %p\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc, bad_wrq); - rx->rx_nob = 0; - } - - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) /* Initial post */ - goto out; - - if (unlikely(rc)) { - kiblnd_close_conn(conn, rc); - kiblnd_drop_rx(rx); /* No more posts for this rx */ - goto out; - } - - if (credit == IBLND_POSTRX_NO_CREDIT) - goto out; - - spin_lock(&conn->ibc_lock); - if (credit == IBLND_POSTRX_PEER_CREDIT) - conn->ibc_outstanding_credits++; - else - conn->ibc_reserved_credits++; - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - -out: - kiblnd_conn_decref(conn); - return rc; -} - -static struct kib_tx * -kiblnd_find_waiting_tx_locked(struct kib_conn *conn, int txtype, __u64 cookie) -{ - struct list_head *tmp; - - list_for_each(tmp, &conn->ibc_active_txs) { - struct kib_tx *tx = list_entry(tmp, struct kib_tx, tx_list); - - LASSERT(!tx->tx_queued); - LASSERT(tx->tx_sending || tx->tx_waiting); - - if (tx->tx_cookie != cookie) - continue; - - if (tx->tx_waiting && - tx->tx_msg->ibm_type == txtype) - return tx; - - CWARN("Bad completion: %swaiting, type %x (wanted %x)\n", - tx->tx_waiting ? "" : "NOT ", - tx->tx_msg->ibm_type, txtype); - } - return NULL; -} - -static void -kiblnd_handle_completion(struct kib_conn *conn, int txtype, int status, __u64 cookie) -{ - struct kib_tx *tx; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int idle; - - spin_lock(&conn->ibc_lock); - - tx = kiblnd_find_waiting_tx_locked(conn, txtype, cookie); - if (!tx) { - spin_unlock(&conn->ibc_lock); - - CWARN("Unmatched completion type %x cookie %#llx from %s\n", - txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_close_conn(conn, -EPROTO); - return; - } - - if (!tx->tx_status) { /* success so far */ - if (status < 0) /* failed? 
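*/

/*
 * Illustrative sketch (not LND code): kiblnd_find_waiting_tx_locked() above
 * matches a completion to its transmit by cookie, then checks the waiting
 * flag and message type. A condensed userspace analogue; the demo_tx fields
 * are invented stand-ins:
 */
#include <stdio.h>

struct demo_tx {
	unsigned long long cookie;	/* echoed back by the peer */
	int type;			/* message type the reply must name */
	int waiting;
};

static struct demo_tx *find_waiting(struct demo_tx *txs, int n,
				    int type, unsigned long long cookie)
{
	int i;

	for (i = 0; i < n; i++)
		if (txs[i].cookie == cookie &&
		    txs[i].waiting && txs[i].type == type)
			return &txs[i];
	return NULL;
}

int main(void)
{
	struct demo_tx txs[] = { { 7, 1, 1 }, { 9, 2, 1 } };
	struct demo_tx *tx = find_waiting(txs, 2, 2, 9);

	printf("matched cookie %llu\n", tx ? tx->cookie : 0ULL);
	return 0;
}

/* failed?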
*/ - tx->tx_status = status; - else if (txtype == IBLND_MSG_GET_REQ) - lnet_set_reply_msg_len(ni, tx->tx_lntmsg[1], status); - } - - tx->tx_waiting = 0; - - idle = !tx->tx_queued && !tx->tx_sending; - if (idle) - list_del(&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (idle) - kiblnd_tx_done(ni, tx); -} - -static void -kiblnd_send_completion(struct kib_conn *conn, int type, int status, __u64 cookie) -{ - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_tx *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); - - if (!tx) { - CERROR("Can't get tx for completion %x for %s\n", - type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - tx->tx_msg->ibm_u.completion.ibcm_status = status; - tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie; - kiblnd_init_tx_msg(ni, tx, type, sizeof(struct kib_completion_msg)); - - kiblnd_queue_tx(tx, conn); -} - -static void -kiblnd_handle_rx(struct kib_rx *rx) -{ - struct kib_msg *msg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - int credits = msg->ibm_credits; - struct kib_tx *tx; - int rc = 0; - int rc2; - int post_credit; - - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - CDEBUG(D_NET, "Received %x[%d] from %s\n", - msg->ibm_type, credits, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - - if (credits) { - /* Have I received credits that will let me send? */ - spin_lock(&conn->ibc_lock); - - if (conn->ibc_credits + credits > - conn->ibc_queue_depth) { - rc2 = conn->ibc_credits; - spin_unlock(&conn->ibc_lock); - - CERROR("Bad credits from %s: %d + %d > %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - rc2, credits, conn->ibc_queue_depth); - - kiblnd_close_conn(conn, -EPROTO); - kiblnd_post_rx(rx, IBLND_POSTRX_NO_CREDIT); - return; - } - - conn->ibc_credits += credits; - - /* This ensures the credit taken by NOOP can be returned */ - if (msg->ibm_type == IBLND_MSG_NOOP && - !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */ - conn->ibc_outstanding_credits++; - - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - } - - switch (msg->ibm_type) { - default: - CERROR("Bad IBLND message type %x from %s\n", - msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - post_credit = IBLND_POSTRX_NO_CREDIT; - rc = -EPROTO; - break; - - case IBLND_MSG_NOOP: - if (IBLND_OOB_CAPABLE(conn->ibc_version)) { - post_credit = IBLND_POSTRX_NO_CREDIT; - break; - } - - if (credits) /* credit already posted */ - post_credit = IBLND_POSTRX_NO_CREDIT; - else /* a keepalive NOOP */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_IMMEDIATE: - post_credit = IBLND_POSTRX_DONT_POST; - rc = lnet_parse(ni, &msg->ibm_u.immediate.ibim_hdr, - msg->ibm_srcnid, rx, 0); - if (rc < 0) /* repost on error */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_PUT_REQ: - post_credit = IBLND_POSTRX_DONT_POST; - rc = lnet_parse(ni, &msg->ibm_u.putreq.ibprm_hdr, - msg->ibm_srcnid, rx, 1); - if (rc < 0) /* repost on error */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_PUT_NAK: - CWARN("PUT_NACK from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - post_credit = IBLND_POSTRX_RSRVD_CREDIT; - kiblnd_handle_completion(conn, IBLND_MSG_PUT_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBLND_MSG_PUT_ACK: - post_credit = IBLND_POSTRX_RSRVD_CREDIT; - - spin_lock(&conn->ibc_lock); - tx = kiblnd_find_waiting_tx_locked(conn, IBLND_MSG_PUT_REQ, - msg->ibm_u.putack.ibpam_src_cookie); - if 
(tx) - list_del(&tx->tx_list); - spin_unlock(&conn->ibc_lock); - - if (!tx) { - CERROR("Unmatched PUT_ACK from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - rc = -EPROTO; - break; - } - - LASSERT(tx->tx_waiting); - /* - * CAVEAT EMPTOR: I could be racing with tx_complete, but... - * (a) I can overwrite tx_msg since my peer has received it! - * (b) tx_waiting set tells tx_complete() it's not done. - */ - tx->tx_nwrq = 0; /* overwrite PUT_REQ */ - - rc2 = kiblnd_init_rdma(conn, tx, IBLND_MSG_PUT_DONE, - kiblnd_rd_size(&msg->ibm_u.putack.ibpam_rd), - &msg->ibm_u.putack.ibpam_rd, - msg->ibm_u.putack.ibpam_dst_cookie); - if (rc2 < 0) - CERROR("Can't setup rdma for PUT to %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc2); - - spin_lock(&conn->ibc_lock); - tx->tx_waiting = 0; /* clear waiting and queue atomically */ - kiblnd_queue_tx_locked(tx, conn); - spin_unlock(&conn->ibc_lock); - break; - - case IBLND_MSG_PUT_DONE: - post_credit = IBLND_POSTRX_PEER_CREDIT; - kiblnd_handle_completion(conn, IBLND_MSG_PUT_ACK, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - - case IBLND_MSG_GET_REQ: - post_credit = IBLND_POSTRX_DONT_POST; - rc = lnet_parse(ni, &msg->ibm_u.get.ibgm_hdr, - msg->ibm_srcnid, rx, 1); - if (rc < 0) /* repost on error */ - post_credit = IBLND_POSTRX_PEER_CREDIT; - break; - - case IBLND_MSG_GET_DONE: - post_credit = IBLND_POSTRX_RSRVD_CREDIT; - kiblnd_handle_completion(conn, IBLND_MSG_GET_REQ, - msg->ibm_u.completion.ibcm_status, - msg->ibm_u.completion.ibcm_cookie); - break; - } - - if (rc < 0) /* protocol error */ - kiblnd_close_conn(conn, rc); - - if (post_credit != IBLND_POSTRX_DONT_POST) - kiblnd_post_rx(rx, post_credit); -} - -static void -kiblnd_rx_complete(struct kib_rx *rx, int status, int nob) -{ - struct kib_msg *msg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_net *net = ni->ni_data; - int rc; - int err = -EIO; - - LASSERT(net); - LASSERT(rx->rx_nob < 0); /* was posted */ - rx->rx_nob = 0; /* isn't now */ - - if (conn->ibc_state > IBLND_CONN_ESTABLISHED) - goto ignore; - - if (status != IB_WC_SUCCESS) { - CNETERR("Rx from %s failed: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), status); - goto failed; - } - - LASSERT(nob >= 0); - rx->rx_nob = nob; - - rc = kiblnd_unpack_msg(msg, rx->rx_nob); - if (rc) { - CERROR("Error %d unpacking rx from %s\n", - rc, libcfs_nid2str(conn->ibc_peer->ibp_nid)); - goto failed; - } - - if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid || - msg->ibm_dstnid != ni->ni_nid || - msg->ibm_srcstamp != conn->ibc_incarnation || - msg->ibm_dststamp != net->ibn_incarnation) { - CERROR("Stale rx from %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - err = -ESTALE; - goto failed; - } - - /* set time last known alive */ - kiblnd_peer_alive(conn->ibc_peer); - - /* racing with connection establishment/teardown! */ - - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - unsigned long flags; - - write_lock_irqsave(g_lock, flags); - /* must check holding global lock to eliminate race */ - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - list_add_tail(&rx->rx_list, &conn->ibc_early_rxs); - write_unlock_irqrestore(g_lock, flags); - return; - } - write_unlock_irqrestore(g_lock, flags); - } - kiblnd_handle_rx(rx); - return; - - failed: - CDEBUG(D_NET, "rx %p conn %p\n", rx, conn); - kiblnd_close_conn(conn, err); - ignore: - kiblnd_drop_rx(rx); /* Don't re-post rx. 
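*/

/*
 * Illustrative sketch (not LND code): the four-way identity check in
 * kiblnd_rx_complete() above is what rejects messages from a previous
 * incarnation of either end. A standalone model with invented field names:
 */
#include <stdio.h>

struct demo_stamp {
	unsigned long long srcnid, dstnid;
	unsigned long long srcstamp, dststamp;	/* incarnation stamps */
};

static int rx_is_stale(const struct demo_stamp *msg,
		       const struct demo_stamp *conn)
{
	return msg->srcnid != conn->srcnid ||
	       msg->dstnid != conn->dstnid ||
	       msg->srcstamp != conn->srcstamp ||
	       msg->dststamp != conn->dststamp;
}

int main(void)
{
	struct demo_stamp conn = { 1, 2, 100, 200 };
	struct demo_stamp old  = { 1, 2,  99, 200 };	/* peer rebooted */

	printf("stale: %d\n", rx_is_stale(&old, &conn));
	return 0;
}

/* Don't re-post rx.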
*/ -} - -static struct page * -kiblnd_kvaddr_to_page(unsigned long vaddr) -{ - struct page *page; - - if (is_vmalloc_addr((void *)vaddr)) { - page = vmalloc_to_page((void *)vaddr); - LASSERT(page); - return page; - } -#ifdef CONFIG_HIGHMEM - if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) { - /* No highmem pages only used for bulk (kiov) I/O */ - CERROR("find page for address in highmem\n"); - LBUG(); - } -#endif - page = virt_to_page(vaddr); - LASSERT(page); - return page; -} - -static int -kiblnd_fmr_map_tx(struct kib_net *net, struct kib_tx *tx, struct kib_rdma_desc *rd, __u32 nob) -{ - struct kib_hca_dev *hdev; - struct kib_fmr_poolset *fps; - int cpt; - int rc; - - LASSERT(tx->tx_pool); - LASSERT(tx->tx_pool->tpo_pool.po_owner); - - hdev = tx->tx_pool->tpo_hdev; - cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt; - - fps = net->ibn_fmr_ps[cpt]; - rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr); - if (rc) { - CERROR("Can't map %u bytes: %d\n", nob, rc); - return rc; - } - - /* - * If rd is not tx_rd, it's going to get sent to a peer, who will need - * the rkey - */ - rd->rd_key = tx->fmr.fmr_key; - rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask; - rd->rd_frags[0].rf_nob = nob; - rd->rd_nfrags = 1; - - return 0; -} - -static void kiblnd_unmap_tx(struct kib_tx *tx) -{ - if (tx->fmr.fmr_pfmr || tx->fmr.fmr_frd) - kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status); - - if (tx->tx_nfrags) { - kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev, - tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir); - tx->tx_nfrags = 0; - } -} - -static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, int nfrags) -{ - struct kib_net *net = ni->ni_data; - struct kib_hca_dev *hdev = net->ibn_dev->ibd_hdev; - __u32 nob; - int i; - - /* - * If rd is not tx_rd, it's going to get sent to a peer and I'm the - * RDMA sink - */ - tx->tx_dmadir = (rd != tx->tx_rd) ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; - tx->tx_nfrags = nfrags; - - rd->rd_nfrags = kiblnd_dma_map_sg(hdev->ibh_ibdev, tx->tx_frags, - tx->tx_nfrags, tx->tx_dmadir); - - for (i = 0, nob = 0; i < rd->rd_nfrags; i++) { - rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len( - hdev->ibh_ibdev, &tx->tx_frags[i]); - rd->rd_frags[i].rf_addr = kiblnd_sg_dma_address( - hdev->ibh_ibdev, &tx->tx_frags[i]); - nob += rd->rd_frags[i].rf_nob; - } - - if (net->ibn_fmr_ps) - return kiblnd_fmr_map_tx(net, tx, rd, nob); - - return -EINVAL; -} - -static int -kiblnd_setup_rd_iov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, unsigned int niov, - const struct kvec *iov, int offset, int nob) -{ - struct kib_net *net = ni->ni_data; - struct page *page; - struct scatterlist *sg; - unsigned long vaddr; - int fragnob; - int page_offset; - - LASSERT(nob > 0); - LASSERT(niov > 0); - LASSERT(net); - - while (offset >= iov->iov_len) { - offset -= iov->iov_len; - niov--; - iov++; - LASSERT(niov > 0); - } - - sg = tx->tx_frags; - do { - LASSERT(niov > 0); - - vaddr = ((unsigned long)iov->iov_base) + offset; - page_offset = vaddr & (PAGE_SIZE - 1); - page = kiblnd_kvaddr_to_page(vaddr); - if (!page) { - CERROR("Can't find page\n"); - return -EFAULT; - } - - fragnob = min((int)(iov->iov_len - offset), nob); - fragnob = min(fragnob, (int)PAGE_SIZE - page_offset); - - sg_set_page(sg, page, fragnob, page_offset); - sg = sg_next(sg); - if (!sg) { - CERROR("lacking enough sg entries to map tx\n"); - return -EFAULT; - } - - if (offset + fragnob < iov->iov_len) { - offset += fragnob; - } else { - offset = 0; - iov++; - niov--; - } - nob -= fragnob; - } while (nob > 0); - - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); -} - -static int -kiblnd_setup_rd_kiov(struct lnet_ni *ni, struct kib_tx *tx, - struct kib_rdma_desc *rd, int nkiov, - const struct bio_vec *kiov, int offset, int nob) -{ - struct kib_net *net = ni->ni_data; - struct scatterlist *sg; - int fragnob; - - CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); - - LASSERT(nob > 0); - LASSERT(nkiov > 0); - LASSERT(net); - - while (offset >= kiov->bv_len) { - offset -= kiov->bv_len; - nkiov--; - kiov++; - LASSERT(nkiov > 0); - } - - sg = tx->tx_frags; - do { - LASSERT(nkiov > 0); - - fragnob = min((int)(kiov->bv_len - offset), nob); - - sg_set_page(sg, kiov->bv_page, fragnob, - kiov->bv_offset + offset); - sg = sg_next(sg); - if (!sg) { - CERROR("lacking enough sg entries to map tx\n"); - return -EFAULT; - } - - offset = 0; - kiov++; - nkiov--; - nob -= fragnob; - } while (nob > 0); - - return kiblnd_map_tx(ni, tx, rd, sg - tx->tx_frags); -} - -static int -kiblnd_post_tx_locked(struct kib_conn *conn, struct kib_tx *tx, int credit) - __must_hold(&conn->ibc_lock) -{ - struct kib_msg *msg = tx->tx_msg; - struct kib_peer *peer = conn->ibc_peer; - struct lnet_ni *ni = peer->ibp_ni; - int ver = conn->ibc_version; - int rc; - int done; - - LASSERT(tx->tx_queued); - /* We rely on this for QP sizing */ - LASSERT(tx->tx_nwrq > 0); - - LASSERT(!credit || credit == 1); - LASSERT(conn->ibc_outstanding_credits >= 0); - LASSERT(conn->ibc_outstanding_credits <= conn->ibc_queue_depth); - LASSERT(conn->ibc_credits >= 0); - LASSERT(conn->ibc_credits <= conn->ibc_queue_depth); - - if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) { - /* tx completions outstanding... 
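*/

/*
 * Illustrative sketch (not LND code): kiblnd_setup_rd_iov() above clamps
 * each fragment to both the remaining buffer and the page boundary. The
 * same arithmetic in runnable form; DEMO_PAGE_SIZE and the example
 * addresses are invented.
 */
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096

static int split_into_frags(unsigned long vaddr, int nob)
{
	int nfrags = 0;

	while (nob > 0) {
		int page_offset = vaddr & (DEMO_PAGE_SIZE - 1);
		int fragnob = DEMO_PAGE_SIZE - page_offset;

		if (fragnob > nob)
			fragnob = nob;	/* don't run past the buffer */
		printf("frag %d: page offset %d, %d bytes\n",
		       nfrags, page_offset, fragnob);
		vaddr += fragnob;
		nob -= fragnob;
		nfrags++;
	}
	return nfrags;
}

int main(void)
{
	/* 10000 bytes starting 100 bytes into a page: 3996 + 4096 + 1908 */
	printf("total frags: %d\n", split_into_frags(0x1000064UL, 10000));
	return 0;
}

/* tx completions outstanding...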
*/ - CDEBUG(D_NET, "%s: posted enough\n", - libcfs_nid2str(peer->ibp_nid)); - return -EAGAIN; - } - - if (credit && !conn->ibc_credits) { /* no credits */ - CDEBUG(D_NET, "%s: no credits\n", - libcfs_nid2str(peer->ibp_nid)); - return -EAGAIN; - } - - if (credit && !IBLND_OOB_CAPABLE(ver) && - conn->ibc_credits == 1 && /* last credit reserved */ - msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */ - CDEBUG(D_NET, "%s: not using last credit\n", - libcfs_nid2str(peer->ibp_nid)); - return -EAGAIN; - } - - /* NB don't drop ibc_lock before bumping tx_sending */ - list_del(&tx->tx_list); - tx->tx_queued = 0; - - if (msg->ibm_type == IBLND_MSG_NOOP && - (!kiblnd_need_noop(conn) || /* redundant NOOP */ - (IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */ - conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) { - /* - * OK to drop when posted enough NOOPs, since - * kiblnd_check_sends_locked will queue NOOP again when - * posted NOOPs complete - */ - spin_unlock(&conn->ibc_lock); - kiblnd_tx_done(peer->ibp_ni, tx); - spin_lock(&conn->ibc_lock); - CDEBUG(D_NET, "%s(%d): redundant or enough NOOP\n", - libcfs_nid2str(peer->ibp_nid), - conn->ibc_noops_posted); - return 0; - } - - kiblnd_pack_msg(peer->ibp_ni, msg, ver, conn->ibc_outstanding_credits, - peer->ibp_nid, conn->ibc_incarnation); - - conn->ibc_credits -= credit; - conn->ibc_outstanding_credits = 0; - conn->ibc_nsends_posted++; - if (msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted++; - - /* - * CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA - * PUT. If so, it was first queued here as a PUT_REQ, sent and - * stashed on ibc_active_txs, matched by an incoming PUT_ACK, - * and then re-queued here. It's (just) possible that - * tx_sending is non-zero if we've not done the tx_complete() - * from the first send; hence the ++ rather than = below. - */ - tx->tx_sending++; - list_add(&tx->tx_list, &conn->ibc_active_txs); - - /* I'm still holding ibc_lock! 
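*/

/*
 * Illustrative sketch (not LND code): kiblnd_post_tx_locked() above refuses
 * to post in three situations. A toy version of those gates; it omits the
 * protocol-version check that relaxes the last-credit rule on OOB-capable
 * connections.
 */
#include <errno.h>
#include <stdio.h>

struct demo_conn {
	int nsends_posted, concurrent_sends;
	int credits;
};

static int may_post(const struct demo_conn *c, int needs_credit, int is_noop)
{
	if (c->nsends_posted == c->concurrent_sends)
		return -EAGAIN;		/* completions outstanding */
	if (needs_credit && !c->credits)
		return -EAGAIN;		/* no credits at all */
	if (needs_credit && c->credits == 1 && !is_noop)
		return -EAGAIN;		/* last credit reserved for a NOOP */
	return 0;
}

int main(void)
{
	struct demo_conn c = { 0, 8, 1 };

	printf("payload with last credit: %d\n", may_post(&c, 1, 0));
	printf("noop with last credit:    %d\n", may_post(&c, 1, 1));
	return 0;
}

/* I'm still holding ibc_lock!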
*/ - if (conn->ibc_state != IBLND_CONN_ESTABLISHED) { - rc = -ECONNABORTED; - } else if (tx->tx_pool->tpo_pool.po_failed || - conn->ibc_hdev != tx->tx_pool->tpo_hdev) { - /* close_conn will launch failover */ - rc = -ENETDOWN; - } else { - struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd; - struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr; - struct ib_send_wr *wrq = &tx->tx_wrq[0].wr; - - if (frd) { - if (!frd->frd_valid) { - wrq = &frd->frd_inv_wr; - wrq->next = &frd->frd_fastreg_wr.wr; - } else { - wrq = &frd->frd_fastreg_wr.wr; - } - frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr; - } - - LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX), - "bad wr_id %llx, opc %d, flags %d, peer: %s\n", - bad->wr_id, bad->opcode, bad->send_flags, - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - bad = NULL; - rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad); - } - - conn->ibc_last_send = jiffies; - - if (!rc) - return 0; - - /* - * NB credits are transferred in the actual - * message, which can only be the last work item - */ - conn->ibc_credits += credit; - conn->ibc_outstanding_credits += msg->ibm_credits; - conn->ibc_nsends_posted--; - if (msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted--; - - tx->tx_status = rc; - tx->tx_waiting = 0; - tx->tx_sending--; - - done = !tx->tx_sending; - if (done) - list_del(&tx->tx_list); - - spin_unlock(&conn->ibc_lock); - - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) - CERROR("Error %d posting transmit to %s\n", - rc, libcfs_nid2str(peer->ibp_nid)); - else - CDEBUG(D_NET, "Error %d posting transmit to %s\n", - rc, libcfs_nid2str(peer->ibp_nid)); - - kiblnd_close_conn(conn, rc); - - if (done) - kiblnd_tx_done(peer->ibp_ni, tx); - - spin_lock(&conn->ibc_lock); - - return -EIO; -} - -static void -kiblnd_check_sends_locked(struct kib_conn *conn) -{ - int ver = conn->ibc_version; - struct lnet_ni *ni = conn->ibc_peer->ibp_ni; - struct kib_tx *tx; - - /* Don't send anything until after the connection is established */ - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - CDEBUG(D_NET, "%s too soon\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - return; - } - - LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni)); - LASSERT(!IBLND_OOB_CAPABLE(ver) || - conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver)); - LASSERT(conn->ibc_reserved_credits >= 0); - - while (conn->ibc_reserved_credits > 0 && - !list_empty(&conn->ibc_tx_queue_rsrvd)) { - tx = list_entry(conn->ibc_tx_queue_rsrvd.next, - struct kib_tx, tx_list); - list_del(&tx->tx_list); - list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); - conn->ibc_reserved_credits--; - } - - if (kiblnd_need_noop(conn)) { - spin_unlock(&conn->ibc_lock); - - tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); - if (tx) - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0); - - spin_lock(&conn->ibc_lock); - if (tx) - kiblnd_queue_tx_locked(tx, conn); - } - - for (;;) { - int credit; - - if (!list_empty(&conn->ibc_tx_queue_nocred)) { - credit = 0; - tx = list_entry(conn->ibc_tx_queue_nocred.next, - struct kib_tx, tx_list); - } else if (!list_empty(&conn->ibc_tx_noops)) { - LASSERT(!IBLND_OOB_CAPABLE(ver)); - credit = 1; - tx = list_entry(conn->ibc_tx_noops.next, - struct kib_tx, tx_list); - } else if (!list_empty(&conn->ibc_tx_queue)) { - credit = 1; - tx = list_entry(conn->ibc_tx_queue.next, - struct kib_tx, tx_list); - } else { - break; - } - - if (kiblnd_post_tx_locked(conn, tx, credit)) - break; - } -} - -static void -kiblnd_tx_complete(struct kib_tx *tx, int status) -{ - int failed = (status != 
IB_WC_SUCCESS); - struct kib_conn *conn = tx->tx_conn; - int idle; - - LASSERT(tx->tx_sending > 0); - - if (failed) { - if (conn->ibc_state == IBLND_CONN_ESTABLISHED) - CNETERR("Tx -> %s cookie %#llx sending %d waiting %d: failed %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - tx->tx_cookie, tx->tx_sending, tx->tx_waiting, - status); - - kiblnd_close_conn(conn, -EIO); - } else { - kiblnd_peer_alive(conn->ibc_peer); - } - - spin_lock(&conn->ibc_lock); - - /* - * I could be racing with rdma completion. Whoever makes 'tx' idle - * gets to free it, which also drops its ref on 'conn'. - */ - tx->tx_sending--; - conn->ibc_nsends_posted--; - if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP) - conn->ibc_noops_posted--; - - if (failed) { - tx->tx_waiting = 0; /* don't wait for peer */ - tx->tx_status = -EIO; - } - - idle = !tx->tx_sending && /* This is the final callback */ - !tx->tx_waiting && /* Not waiting for peer */ - !tx->tx_queued; /* Not re-queued (PUT_DONE) */ - if (idle) - list_del(&tx->tx_list); - - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - - if (idle) - kiblnd_tx_done(conn->ibc_peer->ibp_ni, tx); -} - -static void -kiblnd_init_tx_msg(struct lnet_ni *ni, struct kib_tx *tx, int type, - int body_nob) -{ - struct kib_hca_dev *hdev = tx->tx_pool->tpo_hdev; - struct ib_sge *sge = &tx->tx_sge[tx->tx_nwrq]; - struct ib_rdma_wr *wrq = &tx->tx_wrq[tx->tx_nwrq]; - int nob = offsetof(struct kib_msg, ibm_u) + body_nob; - - LASSERT(tx->tx_nwrq >= 0); - LASSERT(tx->tx_nwrq < IBLND_MAX_RDMA_FRAGS + 1); - LASSERT(nob <= IBLND_MSG_SIZE); - - kiblnd_init_msg(tx->tx_msg, type, body_nob); - - sge->lkey = hdev->ibh_pd->local_dma_lkey; - sge->addr = tx->tx_msgaddr; - sge->length = nob; - - memset(wrq, 0, sizeof(*wrq)); - - wrq->wr.next = NULL; - wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_TX); - wrq->wr.sg_list = sge; - wrq->wr.num_sge = 1; - wrq->wr.opcode = IB_WR_SEND; - wrq->wr.send_flags = IB_SEND_SIGNALED; - - tx->tx_nwrq++; -} - -static int -kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type, - int resid, struct kib_rdma_desc *dstrd, __u64 dstcookie) -{ - struct kib_msg *ibmsg = tx->tx_msg; - struct kib_rdma_desc *srcrd = tx->tx_rd; - struct ib_sge *sge = &tx->tx_sge[0]; - struct ib_rdma_wr *wrq, *next; - int rc = resid; - int srcidx = 0; - int dstidx = 0; - int wrknob; - - LASSERT(!in_interrupt()); - LASSERT(!tx->tx_nwrq); - LASSERT(type == IBLND_MSG_GET_DONE || - type == IBLND_MSG_PUT_DONE); - - if (kiblnd_rd_size(srcrd) > conn->ibc_max_frags << PAGE_SHIFT) { - CERROR("RDMA is too large for peer %s (%d), src size: %d dst size: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - conn->ibc_max_frags << PAGE_SHIFT, - kiblnd_rd_size(srcrd), kiblnd_rd_size(dstrd)); - rc = -EMSGSIZE; - goto too_big; - } - - while (resid > 0) { - if (srcidx >= srcrd->rd_nfrags) { - CERROR("Src buffer exhausted: %d frags\n", srcidx); - rc = -EPROTO; - break; - } - - if (dstidx == dstrd->rd_nfrags) { - CERROR("Dst buffer exhausted: %d frags\n", dstidx); - rc = -EPROTO; - break; - } - - if (tx->tx_nwrq >= IBLND_MAX_RDMA_FRAGS) { - CERROR("RDMA has too many fragments for peer %s (%d), src idx/frags: %d/%d dst idx/frags: %d/%d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - IBLND_MAX_RDMA_FRAGS, - srcidx, srcrd->rd_nfrags, - dstidx, dstrd->rd_nfrags); - rc = -EMSGSIZE; - break; - } - - wrknob = min3(kiblnd_rd_frag_size(srcrd, srcidx), - kiblnd_rd_frag_size(dstrd, dstidx), - (__u32)resid); - - sge = &tx->tx_sge[tx->tx_nwrq]; - sge->addr = kiblnd_rd_frag_addr(srcrd, srcidx); - sge->lkey 
= kiblnd_rd_frag_key(srcrd, srcidx); - sge->length = wrknob; - - wrq = &tx->tx_wrq[tx->tx_nwrq]; - next = wrq + 1; - - wrq->wr.next = &next->wr; - wrq->wr.wr_id = kiblnd_ptr2wreqid(tx, IBLND_WID_RDMA); - wrq->wr.sg_list = sge; - wrq->wr.num_sge = 1; - wrq->wr.opcode = IB_WR_RDMA_WRITE; - wrq->wr.send_flags = 0; - - wrq->remote_addr = kiblnd_rd_frag_addr(dstrd, dstidx); - wrq->rkey = kiblnd_rd_frag_key(dstrd, dstidx); - - srcidx = kiblnd_rd_consume_frag(srcrd, srcidx, wrknob); - dstidx = kiblnd_rd_consume_frag(dstrd, dstidx, wrknob); - - resid -= wrknob; - - tx->tx_nwrq++; - wrq++; - sge++; - } -too_big: - if (rc < 0) /* no RDMA if completing with failure */ - tx->tx_nwrq = 0; - - ibmsg->ibm_u.completion.ibcm_status = rc; - ibmsg->ibm_u.completion.ibcm_cookie = dstcookie; - kiblnd_init_tx_msg(conn->ibc_peer->ibp_ni, tx, - type, sizeof(struct kib_completion_msg)); - - return rc; -} - -static void -kiblnd_queue_tx_locked(struct kib_tx *tx, struct kib_conn *conn) -{ - struct list_head *q; - - LASSERT(tx->tx_nwrq > 0); /* work items set up */ - LASSERT(!tx->tx_queued); /* not queued for sending already */ - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - tx->tx_queued = 1; - tx->tx_deadline = jiffies + - msecs_to_jiffies(*kiblnd_tunables.kib_timeout * - MSEC_PER_SEC); - - if (!tx->tx_conn) { - kiblnd_conn_addref(conn); - tx->tx_conn = conn; - LASSERT(tx->tx_msg->ibm_type != IBLND_MSG_PUT_DONE); - } else { - /* PUT_DONE first attached to conn as a PUT_REQ */ - LASSERT(tx->tx_conn == conn); - LASSERT(tx->tx_msg->ibm_type == IBLND_MSG_PUT_DONE); - } - - switch (tx->tx_msg->ibm_type) { - default: - LBUG(); - - case IBLND_MSG_PUT_REQ: - case IBLND_MSG_GET_REQ: - q = &conn->ibc_tx_queue_rsrvd; - break; - - case IBLND_MSG_PUT_NAK: - case IBLND_MSG_PUT_ACK: - case IBLND_MSG_PUT_DONE: - case IBLND_MSG_GET_DONE: - q = &conn->ibc_tx_queue_nocred; - break; - - case IBLND_MSG_NOOP: - if (IBLND_OOB_CAPABLE(conn->ibc_version)) - q = &conn->ibc_tx_queue_nocred; - else - q = &conn->ibc_tx_noops; - break; - - case IBLND_MSG_IMMEDIATE: - q = &conn->ibc_tx_queue; - break; - } - - list_add_tail(&tx->tx_list, q); -} - -static void -kiblnd_queue_tx(struct kib_tx *tx, struct kib_conn *conn) -{ - spin_lock(&conn->ibc_lock); - kiblnd_queue_tx_locked(tx, conn); - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); -} - -static int kiblnd_resolve_addr(struct rdma_cm_id *cmid, - struct sockaddr_in *srcaddr, - struct sockaddr_in *dstaddr, - int timeout_ms) -{ - unsigned short port; - int rc; - - /* allow the port to be reused */ - rc = rdma_set_reuseaddr(cmid, 1); - if (rc) { - CERROR("Unable to set reuse on cmid: %d\n", rc); - return rc; - } - - /* look for a free privileged port */ - for (port = PROT_SOCK - 1; port > 0; port--) { - srcaddr->sin_port = htons(port); - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)srcaddr, - (struct sockaddr *)dstaddr, - timeout_ms); - if (!rc) { - CDEBUG(D_NET, "bound to port %hu\n", port); - return 0; - } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) { - CDEBUG(D_NET, "bind to port %hu failed: %d\n", - port, rc); - } else { - return rc; - } - } - - CERROR("Failed to bind to a free privileged port\n"); - return rc; -} - -static void -kiblnd_connect_peer(struct kib_peer *peer) -{ - struct rdma_cm_id *cmid; - struct kib_dev *dev; - struct kib_net *net = peer->ibp_ni->ni_data; - struct sockaddr_in srcaddr; - struct sockaddr_in dstaddr; - int rc; - - LASSERT(net); - LASSERT(peer->ibp_connecting > 0); - - cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, 
RDMA_PS_TCP, - IB_QPT_RC); - - if (IS_ERR(cmid)) { - CERROR("Can't create CMID for %s: %ld\n", - libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid)); - rc = PTR_ERR(cmid); - goto failed; - } - - dev = net->ibn_dev; - memset(&srcaddr, 0, sizeof(srcaddr)); - srcaddr.sin_family = AF_INET; - srcaddr.sin_addr.s_addr = htonl(dev->ibd_ifip); - - memset(&dstaddr, 0, sizeof(dstaddr)); - dstaddr.sin_family = AF_INET; - dstaddr.sin_port = htons(*kiblnd_tunables.kib_service); - dstaddr.sin_addr.s_addr = htonl(LNET_NIDADDR(peer->ibp_nid)); - - kiblnd_peer_addref(peer); /* cmid's ref */ - - if (*kiblnd_tunables.kib_use_priv_port) { - rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } else { - rc = rdma_resolve_addr(cmid, - (struct sockaddr *)&srcaddr, - (struct sockaddr *)&dstaddr, - *kiblnd_tunables.kib_timeout * 1000); - } - if (rc) { - /* Can't initiate address resolution: */ - CERROR("Can't resolve addr for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - goto failed2; - } - - return; - - failed2: - kiblnd_peer_connect_failed(peer, 1, rc); - kiblnd_peer_decref(peer); /* cmid's ref */ - rdma_destroy_id(cmid); - return; - failed: - kiblnd_peer_connect_failed(peer, 1, rc); -} - -bool -kiblnd_reconnect_peer(struct kib_peer *peer) -{ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - char *reason = NULL; - struct list_head txs; - unsigned long flags; - - INIT_LIST_HEAD(&txs); - - write_lock_irqsave(glock, flags); - if (!peer->ibp_reconnecting) { - if (peer->ibp_accepting) - reason = "accepting"; - else if (peer->ibp_connecting) - reason = "connecting"; - else if (!list_empty(&peer->ibp_conns)) - reason = "connected"; - else /* connected then closed */ - reason = "closed"; - - goto no_reconnect; - } - - LASSERT(!peer->ibp_accepting && !peer->ibp_connecting && - list_empty(&peer->ibp_conns)); - peer->ibp_reconnecting--; - - if (!kiblnd_peer_active(peer)) { - list_splice_init(&peer->ibp_tx_queue, &txs); - reason = "unlinked"; - goto no_reconnect; - } - - peer->ibp_connecting++; - peer->ibp_reconnected++; - write_unlock_irqrestore(glock, flags); - - kiblnd_connect_peer(peer); - return true; - -no_reconnect: - write_unlock_irqrestore(glock, flags); - - CWARN("Abort reconnection of %s: %s\n", - libcfs_nid2str(peer->ibp_nid), reason); - kiblnd_txlist_done(peer->ibp_ni, &txs, -ECONNABORTED); - return false; -} - -void -kiblnd_launch_tx(struct lnet_ni *ni, struct kib_tx *tx, lnet_nid_t nid) -{ - struct kib_peer *peer; - struct kib_peer *peer2; - struct kib_conn *conn; - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - unsigned long flags; - int rc; - int i; - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - - /* - * If I get here, I've committed to send, so I complete the tx with - * failure on any problems - */ - LASSERT(!tx || !tx->tx_conn); /* only set when assigned a conn */ - LASSERT(!tx || tx->tx_nwrq > 0); /* work items have been set up */ - - /* - * First time, just use a read lock since I expect to find my peer - * connected - */ - read_lock_irqsave(g_lock, flags); - - peer = kiblnd_find_peer_locked(nid); - if (peer && !list_empty(&peer->ibp_conns)) { - /* Found a peer with an established connection */ - conn = kiblnd_get_conn_locked(peer); - kiblnd_conn_addref(conn); /* 1 ref for me... 
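*/

/*
 * Illustrative sketch (not LND code): kiblnd_resolve_addr() above walks
 * privileged ports downward until rdma_resolve_addr() accepts one. A loose
 * TCP analogue of that scan using bind(); it needs CAP_NET_BIND_SERVICE to
 * actually land on a port below 1024.
 */
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#define PRIV_PORT_MAX 1023	/* PROT_SOCK - 1 in kernel terms */

static int bind_privileged(int fd)
{
	struct sockaddr_in addr;
	int port;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	for (port = PRIV_PORT_MAX; port > 0; port--) {
		addr.sin_port = htons(port);
		if (!bind(fd, (struct sockaddr *)&addr, sizeof(addr)))
			return port;	/* first free privileged port wins */
	}
	return -1;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int port;

	if (fd < 0)
		return 1;
	port = bind_privileged(fd);
	printf(port > 0 ? "bound to port %d\n"
			: "no free privileged port (%d)\n", port);
	close(fd);
	return 0;
}

/* 1 ref for me...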
*/ - - read_unlock_irqrestore(g_lock, flags); - - if (tx) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - return; - } - - read_unlock(g_lock); - /* Re-try with a write lock */ - write_lock(g_lock); - - peer = kiblnd_find_peer_locked(nid); - if (peer) { - if (list_empty(&peer->ibp_conns)) { - /* found a peer, but it's still connecting... */ - LASSERT(kiblnd_peer_connecting(peer)); - if (tx) - list_add_tail(&tx->tx_list, - &peer->ibp_tx_queue); - write_unlock_irqrestore(g_lock, flags); - } else { - conn = kiblnd_get_conn_locked(peer); - kiblnd_conn_addref(conn); /* 1 ref for me... */ - - write_unlock_irqrestore(g_lock, flags); - - if (tx) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - } - return; - } - - write_unlock_irqrestore(g_lock, flags); - - /* Allocate a peer ready to add to the peer table and retry */ - rc = kiblnd_create_peer(ni, &peer, nid); - if (rc) { - CERROR("Can't create peer %s\n", libcfs_nid2str(nid)); - if (tx) { - tx->tx_status = -EHOSTUNREACH; - tx->tx_waiting = 0; - kiblnd_tx_done(ni, tx); - } - return; - } - - write_lock_irqsave(g_lock, flags); - - peer2 = kiblnd_find_peer_locked(nid); - if (peer2) { - if (list_empty(&peer2->ibp_conns)) { - /* found a peer, but it's still connecting... */ - LASSERT(kiblnd_peer_connecting(peer2)); - if (tx) - list_add_tail(&tx->tx_list, - &peer2->ibp_tx_queue); - write_unlock_irqrestore(g_lock, flags); - } else { - conn = kiblnd_get_conn_locked(peer2); - kiblnd_conn_addref(conn); /* 1 ref for me... */ - - write_unlock_irqrestore(g_lock, flags); - - if (tx) - kiblnd_queue_tx(tx, conn); - kiblnd_conn_decref(conn); /* ...to here */ - } - - kiblnd_peer_decref(peer); - return; - } - - /* Brand new peer */ - LASSERT(!peer->ibp_connecting); - tunables = &peer->ibp_ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - peer->ibp_connecting = tunables->lnd_conns_per_peer; - - /* always called with a ref on ni, which prevents ni being shutdown */ - LASSERT(!((struct kib_net *)ni->ni_data)->ibn_shutdown); - - if (tx) - list_add_tail(&tx->tx_list, &peer->ibp_tx_queue); - - kiblnd_peer_addref(peer); - list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid)); - - write_unlock_irqrestore(g_lock, flags); - - for (i = 0; i < tunables->lnd_conns_per_peer; i++) - kiblnd_connect_peer(peer); - kiblnd_peer_decref(peer); -} - -int -kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) -{ - struct lnet_hdr *hdr = &lntmsg->msg_hdr; - int type = lntmsg->msg_type; - struct lnet_process_id target = lntmsg->msg_target; - int target_is_router = lntmsg->msg_target_is_router; - int routing = lntmsg->msg_routing; - unsigned int payload_niov = lntmsg->msg_niov; - struct kvec *payload_iov = lntmsg->msg_iov; - struct bio_vec *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - struct iov_iter from; - struct kib_msg *ibmsg; - struct kib_rdma_desc *rd; - struct kib_tx *tx; - int nob; - int rc; - - /* NB 'private' is different depending on what we're sending.... 
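*/

/*
 * Illustrative sketch (not LND code): kiblnd_launch_tx() above takes an
 * optimistic read lock, retries under the write lock, and re-checks after
 * allocating a new peer in case another thread won the race. A condensed
 * pthread version of that pattern (one table slot, one re-check):
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t g_lock = PTHREAD_RWLOCK_INITIALIZER;
static int *g_peer;	/* the whole "peer table", for brevity */

static int *find_or_create(void)
{
	int *p;

	pthread_rwlock_rdlock(&g_lock);	/* fast path: expect to find it */
	p = g_peer;
	pthread_rwlock_unlock(&g_lock);
	if (p)
		return p;

	p = malloc(sizeof(*p));		/* allocate outside any lock */
	if (!p)
		return NULL;
	*p = 42;

	pthread_rwlock_wrlock(&g_lock);
	if (g_peer) {			/* somebody else installed one */
		free(p);
		p = g_peer;
	} else {
		g_peer = p;
	}
	pthread_rwlock_unlock(&g_lock);
	return p;
}

int main(void)
{
	int *p = find_or_create();

	printf("peer value: %d\n", p ? *p : -1);
	return 0;
}

/* NB 'private' is different depending on what we're sending....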
*/ - - CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT(!payload_nob || payload_niov > 0); - LASSERT(payload_niov <= LNET_MAX_IOV); - - /* Thread context */ - LASSERT(!in_interrupt()); - /* payload is either all vaddrs or all pages */ - LASSERT(!(payload_kiov && payload_iov)); - - if (payload_kiov) - iov_iter_bvec(&from, ITER_BVEC | WRITE, - payload_kiov, payload_niov, - payload_nob + payload_offset); - else - iov_iter_kvec(&from, ITER_KVEC | WRITE, - payload_iov, payload_niov, - payload_nob + payload_offset); - - iov_iter_advance(&from, payload_offset); - - switch (type) { - default: - LBUG(); - return -EIO; - - case LNET_MSG_ACK: - LASSERT(!payload_nob); - break; - - case LNET_MSG_GET: - if (routing || target_is_router) - break; /* send IMMEDIATE */ - - /* is the REPLY message too small for RDMA? */ - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]); - if (nob <= IBLND_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kiblnd_get_idle_tx(ni, target.nid); - if (!tx) { - CERROR("Can't allocate txd for GET to %s\n", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - rd = &ibmsg->ibm_u.get.ibgm_rd; - if (!(lntmsg->msg_md->md_options & LNET_MD_KIOV)) - rc = kiblnd_setup_rd_iov(ni, tx, rd, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.iov, - 0, lntmsg->msg_md->md_length); - else - rc = kiblnd_setup_rd_kiov(ni, tx, rd, - lntmsg->msg_md->md_niov, - lntmsg->msg_md->md_iov.kiov, - 0, lntmsg->msg_md->md_length); - if (rc) { - CERROR("Can't setup GET sink for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kiblnd_tx_done(ni, tx); - return -EIO; - } - - nob = offsetof(struct kib_get_msg, ibgm_rd.rd_frags[rd->rd_nfrags]); - ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie; - ibmsg->ibm_u.get.ibgm_hdr = *hdr; - - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_GET_REQ, nob); - - tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); - if (!tx->tx_lntmsg[1]) { - CERROR("Can't create reply for GET -> %s\n", - libcfs_nid2str(target.nid)); - kiblnd_tx_done(ni, tx); - return -EIO; - } - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */ - tx->tx_waiting = 1; /* waiting for GET_DONE */ - kiblnd_launch_tx(ni, tx, target.nid); - return 0; - - case LNET_MSG_REPLY: - case LNET_MSG_PUT: - /* Is the payload small enough not to need RDMA? */ - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]); - if (nob <= IBLND_MSG_SIZE) - break; /* send IMMEDIATE */ - - tx = kiblnd_get_idle_tx(ni, target.nid); - if (!tx) { - CERROR("Can't allocate %s txd for %s\n", - type == LNET_MSG_PUT ? 
"PUT" : "REPLY", - libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - if (!payload_kiov) - rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, - payload_niov, payload_iov, - payload_offset, payload_nob); - else - rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, - payload_niov, payload_kiov, - payload_offset, payload_nob); - if (rc) { - CERROR("Can't setup PUT src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - kiblnd_tx_done(ni, tx); - return -EIO; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.putreq.ibprm_hdr = *hdr; - ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie; - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_REQ, sizeof(struct kib_putreq_msg)); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */ - kiblnd_launch_tx(ni, tx, target.nid); - return 0; - } - - /* send IMMEDIATE */ - - LASSERT(offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]) - <= IBLND_MSG_SIZE); - - tx = kiblnd_get_idle_tx(ni, target.nid); - if (!tx) { - CERROR("Can't send %d to %s: tx descs exhausted\n", - type, libcfs_nid2str(target.nid)); - return -ENOMEM; - } - - ibmsg = tx->tx_msg; - ibmsg->ibm_u.immediate.ibim_hdr = *hdr; - - rc = copy_from_iter(&ibmsg->ibm_u.immediate.ibim_payload, payload_nob, - &from); - if (rc != payload_nob) { - kiblnd_pool_free_node(&tx->tx_pool->tpo_pool, &tx->tx_list); - return -EFAULT; - } - - nob = offsetof(struct kib_immediate_msg, ibim_payload[payload_nob]); - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_IMMEDIATE, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - kiblnd_launch_tx(ni, tx, target.nid); - return 0; -} - -static void -kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg) -{ - struct lnet_process_id target = lntmsg->msg_target; - unsigned int niov = lntmsg->msg_niov; - struct kvec *iov = lntmsg->msg_iov; - struct bio_vec *kiov = lntmsg->msg_kiov; - unsigned int offset = lntmsg->msg_offset; - unsigned int nob = lntmsg->msg_len; - struct kib_tx *tx; - int rc; - - tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid); - if (!tx) { - CERROR("Can't get tx for REPLY to %s\n", - libcfs_nid2str(target.nid)); - goto failed_0; - } - - if (!nob) - rc = 0; - else if (!kiov) - rc = kiblnd_setup_rd_iov(ni, tx, tx->tx_rd, - niov, iov, offset, nob); - else - rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd, - niov, kiov, offset, nob); - - if (rc) { - CERROR("Can't setup GET src for %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - rc = kiblnd_init_rdma(rx->rx_conn, tx, - IBLND_MSG_GET_DONE, nob, - &rx->rx_msg->ibm_u.get.ibgm_rd, - rx->rx_msg->ibm_u.get.ibgm_cookie); - if (rc < 0) { - CERROR("Can't setup rdma for GET from %s: %d\n", - libcfs_nid2str(target.nid), rc); - goto failed_1; - } - - if (!nob) { - /* No RDMA: local completion may happen now! 
*/ - lnet_finalize(ni, lntmsg, 0); - } else { - /* RDMA: lnet_finalize(lntmsg) when it completes */ - tx->tx_lntmsg[0] = lntmsg; - } - - kiblnd_queue_tx(tx, rx->rx_conn); - return; - - failed_1: - kiblnd_tx_done(ni, tx); - failed_0: - lnet_finalize(ni, lntmsg, -EIO); -} - -int -kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen) -{ - struct kib_rx *rx = private; - struct kib_msg *rxmsg = rx->rx_msg; - struct kib_conn *conn = rx->rx_conn; - struct kib_tx *tx; - int nob; - int post_credit = IBLND_POSTRX_PEER_CREDIT; - int rc = 0; - - LASSERT(iov_iter_count(to) <= rlen); - LASSERT(!in_interrupt()); - /* Either all pages or all vaddrs */ - - switch (rxmsg->ibm_type) { - default: - LBUG(); - - case IBLND_MSG_IMMEDIATE: - nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[rlen]); - if (nob > rx->rx_nob) { - CERROR("Immediate message from %s too big: %d(%d)\n", - libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid), - nob, rx->rx_nob); - rc = -EPROTO; - break; - } - - rc = copy_to_iter(&rxmsg->ibm_u.immediate.ibim_payload, rlen, - to); - if (rc != rlen) { - rc = -EFAULT; - break; - } - - rc = 0; - lnet_finalize(ni, lntmsg, 0); - break; - - case IBLND_MSG_PUT_REQ: { - struct kib_msg *txmsg; - struct kib_rdma_desc *rd; - - if (!iov_iter_count(to)) { - lnet_finalize(ni, lntmsg, 0); - kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, 0, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid); - if (!tx) { - CERROR("Can't allocate tx for %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - /* Not replying will break the connection */ - rc = -ENOMEM; - break; - } - - txmsg = tx->tx_msg; - rd = &txmsg->ibm_u.putack.ibpam_rd; - if (!(to->type & ITER_BVEC)) - rc = kiblnd_setup_rd_iov(ni, tx, rd, - to->nr_segs, to->kvec, - to->iov_offset, - iov_iter_count(to)); - else - rc = kiblnd_setup_rd_kiov(ni, tx, rd, - to->nr_segs, to->bvec, - to->iov_offset, - iov_iter_count(to)); - if (rc) { - CERROR("Can't setup PUT sink for %s: %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); - kiblnd_tx_done(ni, tx); - /* tell peer it's over */ - kiblnd_send_completion(rx->rx_conn, IBLND_MSG_PUT_NAK, rc, - rxmsg->ibm_u.putreq.ibprm_cookie); - break; - } - - nob = offsetof(struct kib_putack_msg, ibpam_rd.rd_frags[rd->rd_nfrags]); - txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie; - txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie; - - kiblnd_init_tx_msg(ni, tx, IBLND_MSG_PUT_ACK, nob); - - tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */ - tx->tx_waiting = 1; /* waiting for PUT_DONE */ - kiblnd_queue_tx(tx, conn); - - /* reposted buffer reserved for PUT_DONE */ - post_credit = IBLND_POSTRX_NO_CREDIT; - break; - } - - case IBLND_MSG_GET_REQ: - if (lntmsg) { - /* Optimized GET; RDMA lntmsg's payload */ - kiblnd_reply(ni, rx, lntmsg); - } else { - /* GET didn't match anything */ - kiblnd_send_completion(rx->rx_conn, IBLND_MSG_GET_DONE, - -ENODATA, - rxmsg->ibm_u.get.ibgm_cookie); - } - break; - } - - kiblnd_post_rx(rx, post_credit); - return rc; -} - -int -kiblnd_thread_start(int (*fn)(void *arg), void *arg, char *name) -{ - struct task_struct *task = kthread_run(fn, arg, "%s", name); - - if (IS_ERR(task)) - return PTR_ERR(task); - - atomic_inc(&kiblnd_data.kib_nthreads); - return 0; -} - -static void -kiblnd_thread_fini(void) -{ - atomic_dec(&kiblnd_data.kib_nthreads); -} - -static void -kiblnd_peer_alive(struct kib_peer *peer) -{ - /* This is 
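racy, but everyone's only writing jiffies. */

/*
 * Illustrative sketch (not LND code): both kiblnd_send() and kiblnd_recv()
 * above choose between an inline IMMEDIATE message and an RDMA by asking
 * whether the payload still fits the wire message, using offsetof() on a
 * flexible array. A standalone version; the struct layout and sizes here
 * are invented.
 */
#include <stddef.h>
#include <stdio.h>

#define DEMO_MSG_SIZE 4096	/* IBLND_MSG_SIZE stand-in */

struct demo_immediate_msg {
	unsigned long long hdr[9];	/* fixed header, size arbitrary */
	char payload[];			/* as ibim_payload */
};

static int use_immediate(size_t payload_nob)
{
	/* header bytes plus payload must fit one preformatted message */
	return offsetof(struct demo_immediate_msg, payload) + payload_nob
	       <= DEMO_MSG_SIZE;
}

int main(void)
{
	printf("100 bytes inline: %d\n", use_immediate(100));
	printf("1 MiB inline:     %d\n", use_immediate(1 << 20));
	return 0;
}

/* This is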
racy, but everyone's only writing jiffies */ - peer->ibp_last_alive = jiffies; - mb(); -} - -static void -kiblnd_peer_notify(struct kib_peer *peer) -{ - int error = 0; - unsigned long last_alive = 0; - unsigned long flags; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (kiblnd_peer_idle(peer) && peer->ibp_error) { - error = peer->ibp_error; - peer->ibp_error = 0; - - last_alive = peer->ibp_last_alive; - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - if (error) - lnet_notify(peer->ibp_ni, - peer->ibp_nid, 0, last_alive); -} - -void -kiblnd_close_conn_locked(struct kib_conn *conn, int error) -{ - /* - * This just does the immediate housekeeping. 'error' is zero for a - * normal shutdown which can happen only after the connection has been - * established. If the connection is established, schedule the - * connection to be finished off by the connd. Otherwise the connd is - * already dealing with it (either to set it up or tear it down). - * Caller holds kib_global_lock exclusively in irq context - */ - struct kib_peer *peer = conn->ibc_peer; - struct kib_dev *dev; - unsigned long flags; - - LASSERT(error || conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - if (error && !conn->ibc_comms_error) - conn->ibc_comms_error = error; - - if (conn->ibc_state != IBLND_CONN_ESTABLISHED) - return; /* already being handled */ - - if (!error && - list_empty(&conn->ibc_tx_noops) && - list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_rsrvd) && - list_empty(&conn->ibc_tx_queue_nocred) && - list_empty(&conn->ibc_active_txs)) { - CDEBUG(D_NET, "closing conn to %s\n", - libcfs_nid2str(peer->ibp_nid)); - } else { - CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n", - libcfs_nid2str(peer->ibp_nid), error, - list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", - list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", - list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", - list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", - list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)"); - } - - dev = ((struct kib_net *)peer->ibp_ni->ni_data)->ibn_dev; - if (peer->ibp_next_conn == conn) - /* clear next_conn so it won't be used */ - peer->ibp_next_conn = NULL; - list_del(&conn->ibc_list); - /* connd (see below) takes over ibc_list's ref */ - - if (list_empty(&peer->ibp_conns) && /* no more conns */ - kiblnd_peer_active(peer)) { /* still in peer table */ - kiblnd_unlink_peer_locked(peer); - - /* set/clear error on last conn */ - peer->ibp_error = conn->ibc_comms_error; - } - - kiblnd_set_conn_state(conn, IBLND_CONN_CLOSING); - - if (error && - kiblnd_dev_can_failover(dev)) { - list_add_tail(&dev->ibd_fail_list, - &kiblnd_data.kib_failed_devs); - wake_up(&kiblnd_data.kib_failover_waitq); - } - - spin_lock_irqsave(&kiblnd_data.kib_connd_lock, flags); - - list_add_tail(&conn->ibc_list, &kiblnd_data.kib_connd_conns); - wake_up(&kiblnd_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kiblnd_data.kib_connd_lock, flags); -} - -void -kiblnd_close_conn(struct kib_conn *conn, int error) -{ - unsigned long flags; - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - kiblnd_close_conn_locked(conn, error); - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); -} - -static void -kiblnd_handle_early_rxs(struct kib_conn *conn) -{ - unsigned long flags; - struct kib_rx *rx; - - LASSERT(!in_interrupt()); - LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - while (!list_empty(&conn->ibc_early_rxs)) { - rx = list_entry(conn->ibc_early_rxs.next, - struct kib_rx, rx_list); - list_del(&rx->rx_list); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_handle_rx(rx); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - } - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); -} - -static void -kiblnd_abort_txs(struct kib_conn *conn, struct list_head *txs) -{ - LIST_HEAD(zombies); - struct list_head *tmp; - struct list_head *nxt; - struct kib_tx *tx; - - spin_lock(&conn->ibc_lock); - - list_for_each_safe(tmp, nxt, txs) { - tx = list_entry(tmp, struct kib_tx, tx_list); - - if (txs == &conn->ibc_active_txs) { - LASSERT(!tx->tx_queued); - LASSERT(tx->tx_waiting || tx->tx_sending); - } else { - LASSERT(tx->tx_queued); - } - - tx->tx_status = -ECONNABORTED; - tx->tx_waiting = 0; - - if (!tx->tx_sending) { - tx->tx_queued = 0; - list_del(&tx->tx_list); - list_add(&tx->tx_list, &zombies); - } - } - - spin_unlock(&conn->ibc_lock); - - kiblnd_txlist_done(conn->ibc_peer->ibp_ni, &zombies, -ECONNABORTED); -} - -static void -kiblnd_finalise_conn(struct kib_conn *conn) -{ - LASSERT(!in_interrupt()); - LASSERT(conn->ibc_state > IBLND_CONN_INIT); - - kiblnd_set_conn_state(conn, IBLND_CONN_DISCONNECTED); - - /* - * abort_receives moves QP state to IB_QPS_ERR. This is only required - * for connections that didn't get as far as being connected, because - * rdma_disconnect() does this for free. - */ - kiblnd_abort_receives(conn); - - /* - * Complete all tx descs not waiting for sends to complete. 
- * NB we should be safe from RDMA now that the QP has changed state - */ - kiblnd_abort_txs(conn, &conn->ibc_tx_noops); - kiblnd_abort_txs(conn, &conn->ibc_tx_queue); - kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); - kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); - kiblnd_abort_txs(conn, &conn->ibc_active_txs); - - kiblnd_handle_early_rxs(conn); -} - -static void -kiblnd_peer_connect_failed(struct kib_peer *peer, int active, int error) -{ - LIST_HEAD(zombies); - unsigned long flags; - - LASSERT(error); - LASSERT(!in_interrupt()); - - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - if (active) { - LASSERT(peer->ibp_connecting > 0); - peer->ibp_connecting--; - } else { - LASSERT(peer->ibp_accepting > 0); - peer->ibp_accepting--; - } - - if (kiblnd_peer_connecting(peer)) { - /* another connection attempt under way... */ - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, - flags); - return; - } - - peer->ibp_reconnected = 0; - if (list_empty(&peer->ibp_conns)) { - /* Take peer's blocked transmits to complete with error */ - list_add(&zombies, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (kiblnd_peer_active(peer)) - kiblnd_unlink_peer_locked(peer); - - peer->ibp_error = error; - } else { - /* Can't have blocked transmits if there are connections */ - LASSERT(list_empty(&peer->ibp_tx_queue)); - } - - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_peer_notify(peer); - - if (list_empty(&zombies)) - return; - - CNETERR("Deleting messages for %s: connection failed\n", - libcfs_nid2str(peer->ibp_nid)); - - kiblnd_txlist_done(peer->ibp_ni, &zombies, -EHOSTUNREACH); -} - -static void -kiblnd_connreq_done(struct kib_conn *conn, int status) -{ - struct kib_peer *peer = conn->ibc_peer; - struct kib_tx *tx; - struct kib_tx *tmp; - struct list_head txs; - unsigned long flags; - int active; - - active = (conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); - - CDEBUG(D_NET, "%s: active(%d), version(%x), status(%d)\n", - libcfs_nid2str(peer->ibp_nid), active, - conn->ibc_version, status); - - LASSERT(!in_interrupt()); - LASSERT((conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT && - peer->ibp_connecting > 0) || - (conn->ibc_state == IBLND_CONN_PASSIVE_WAIT && - peer->ibp_accepting > 0)); - - kfree(conn->ibc_connvars); - conn->ibc_connvars = NULL; - - if (status) { - /* failed to establish connection */ - kiblnd_peer_connect_failed(peer, active, status); - kiblnd_finalise_conn(conn); - return; - } - - /* connection established */ - write_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - conn->ibc_last_send = jiffies; - kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED); - kiblnd_peer_alive(peer); - - /* - * Add conn to peer's list and nuke any dangling conns from a different - * peer instance... 
- */ - kiblnd_conn_addref(conn); /* +1 ref for ibc_list */ - list_add(&conn->ibc_list, &peer->ibp_conns); - peer->ibp_reconnected = 0; - if (active) - peer->ibp_connecting--; - else - peer->ibp_accepting--; - - if (!peer->ibp_version) { - peer->ibp_version = conn->ibc_version; - peer->ibp_incarnation = conn->ibc_incarnation; - } - - if (peer->ibp_version != conn->ibc_version || - peer->ibp_incarnation != conn->ibc_incarnation) { - kiblnd_close_stale_conns_locked(peer, conn->ibc_version, - conn->ibc_incarnation); - peer->ibp_version = conn->ibc_version; - peer->ibp_incarnation = conn->ibc_incarnation; - } - - /* grab pending txs while I have the lock */ - list_add(&txs, &peer->ibp_tx_queue); - list_del_init(&peer->ibp_tx_queue); - - if (!kiblnd_peer_active(peer) || /* peer has been deleted */ - conn->ibc_comms_error) { /* error has happened already */ - struct lnet_ni *ni = peer->ibp_ni; - - /* start to shut down connection */ - kiblnd_close_conn_locked(conn, -ECONNABORTED); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - kiblnd_txlist_done(ni, &txs, -ECONNABORTED); - - return; - } - - /* - * +1 ref for myself, this connection is visible to other threads - * now, refcount of peer:ibp_conns can be released by connection - * close from either a different thread, or the calling of - * kiblnd_check_sends_locked() below. See bz21911 for details. - */ - kiblnd_conn_addref(conn); - write_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - /* Schedule blocked txs - * Note: if we are running with conns_per_peer > 1, these blocked - * txs will all get scheduled to the first connection which gets - * scheduled. We won't be using round robin on this first batch. - */ - spin_lock(&conn->ibc_lock); - list_for_each_entry_safe(tx, tmp, &txs, tx_list) { - list_del(&tx->tx_list); - - kiblnd_queue_tx_locked(tx, conn); - } - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - - /* schedule blocked rxs */ - kiblnd_handle_early_rxs(conn); - - kiblnd_conn_decref(conn); -} - -static void -kiblnd_reject(struct rdma_cm_id *cmid, struct kib_rej *rej) -{ - int rc; - - rc = rdma_reject(cmid, rej, sizeof(*rej)); - - if (rc) - CWARN("Error %d sending reject\n", rc); -} - -static int -kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob) -{ - rwlock_t *g_lock = &kiblnd_data.kib_global_lock; - struct kib_msg *reqmsg = priv; - struct kib_msg *ackmsg; - struct kib_dev *ibdev; - struct kib_peer *peer; - struct kib_peer *peer2; - struct kib_conn *conn; - struct lnet_ni *ni = NULL; - struct kib_net *net = NULL; - lnet_nid_t nid; - struct rdma_conn_param cp; - struct kib_rej rej; - int version = IBLND_MSG_VERSION; - unsigned long flags; - int max_frags; - int rc; - struct sockaddr_in *peer_addr; - - LASSERT(!in_interrupt()); - - /* cmid inherits 'context' from the corresponding listener id */ - ibdev = (struct kib_dev *)cmid->context; - LASSERT(ibdev); - - memset(&rej, 0, sizeof(rej)); - rej.ibr_magic = IBLND_MSG_MAGIC; - rej.ibr_why = IBLND_REJECT_FATAL; - rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE; - - peer_addr = (struct sockaddr_in *)&cmid->route.addr.dst_addr; - if (*kiblnd_tunables.kib_require_priv_port && - ntohs(peer_addr->sin_port) >= PROT_SOCK) { - __u32 ip = ntohl(peer_addr->sin_addr.s_addr); - - CERROR("Peer's port (%pI4h:%hu) is not privileged\n", - &ip, ntohs(peer_addr->sin_port)); - goto failed; - } - - if (priv_nob < offsetof(struct kib_msg, ibm_type)) { - CERROR("Short connection request\n"); - goto failed; - } - - /* - * Future protocol version 
compatibility support! If the - * o2iblnd-specific protocol changes, or when LNET unifies - * protocols over all LNDs, the initial connection will - * negotiate a protocol version. I trap this here to avoid - * console errors; the reject tells the peer which protocol I - * speak. - */ - if (reqmsg->ibm_magic == LNET_PROTO_MAGIC || - reqmsg->ibm_magic == __swab32(LNET_PROTO_MAGIC)) - goto failed; - if (reqmsg->ibm_magic == IBLND_MSG_MAGIC && - reqmsg->ibm_version != IBLND_MSG_VERSION && - reqmsg->ibm_version != IBLND_MSG_VERSION_1) - goto failed; - if (reqmsg->ibm_magic == __swab32(IBLND_MSG_MAGIC) && - reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION) && - reqmsg->ibm_version != __swab16(IBLND_MSG_VERSION_1)) - goto failed; - - rc = kiblnd_unpack_msg(reqmsg, priv_nob); - if (rc) { - CERROR("Can't parse connection request: %d\n", rc); - goto failed; - } - - nid = reqmsg->ibm_srcnid; - ni = lnet_net2ni(LNET_NIDNET(reqmsg->ibm_dstnid)); - - if (ni) { - net = (struct kib_net *)ni->ni_data; - rej.ibr_incarnation = net->ibn_incarnation; - } - - if (!ni || /* no matching net */ - ni->ni_nid != reqmsg->ibm_dstnid || /* right NET, wrong NID! */ - net->ibn_dev != ibdev) { /* wrong device */ - CERROR("Can't accept conn from %s on %s (%s:%d:%pI4h): bad dst nid %s\n", - libcfs_nid2str(nid), - !ni ? "NA" : libcfs_nid2str(ni->ni_nid), - ibdev->ibd_ifname, ibdev->ibd_nnets, - &ibdev->ibd_ifip, - libcfs_nid2str(reqmsg->ibm_dstnid)); - - goto failed; - } - - /* check time stamp as soon as possible */ - if (reqmsg->ibm_dststamp && - reqmsg->ibm_dststamp != net->ibn_incarnation) { - CWARN("Stale connection request\n"); - rej.ibr_why = IBLND_REJECT_CONN_STALE; - goto failed; - } - - /* I can accept peer's version */ - version = reqmsg->ibm_version; - - if (reqmsg->ibm_type != IBLND_MSG_CONNREQ) { - CERROR("Unexpected connreq msg type: %x from %s\n", - reqmsg->ibm_type, libcfs_nid2str(nid)); - goto failed; - } - - if (reqmsg->ibm_u.connparams.ibcp_queue_depth > - kiblnd_msg_queue_size(version, ni)) { - CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n", - libcfs_nid2str(nid), - reqmsg->ibm_u.connparams.ibcp_queue_depth, - kiblnd_msg_queue_size(version, ni)); - - if (version == IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE; - - goto failed; - } - - max_frags = reqmsg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; - if (max_frags > kiblnd_rdma_frags(version, ni)) { - CWARN("Can't accept conn from %s (version %x): max message size %d is too large (%d wanted)\n", - libcfs_nid2str(nid), version, max_frags, - kiblnd_rdma_frags(version, ni)); - - if (version >= IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; - - goto failed; - } else if (max_frags < kiblnd_rdma_frags(version, ni) && - !net->ibn_fmr_ps) { - CWARN("Can't accept conn from %s (version %x): max message size %d incompatible without FMR pool (%d wanted)\n", - libcfs_nid2str(nid), version, max_frags, - kiblnd_rdma_frags(version, ni)); - - if (version == IBLND_MSG_VERSION) - rej.ibr_why = IBLND_REJECT_RDMA_FRAGS; - - goto failed; - } - - if (reqmsg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) { - CERROR("Can't accept %s: message size %d too big (%d max)\n", - libcfs_nid2str(nid), - reqmsg->ibm_u.connparams.ibcp_max_msg_size, - IBLND_MSG_SIZE); - goto failed; - } - - /* assume 'nid' is a new peer; create */ - rc = kiblnd_create_peer(ni, &peer, nid); - if (rc) { - CERROR("Can't create peer for %s\n", libcfs_nid2str(nid)); - rej.ibr_why = IBLND_REJECT_NO_RESOURCES; - goto failed; - } - - /* 
We have validated the peer's parameters so use those */
-	peer->ibp_max_frags = max_frags;
-	peer->ibp_queue_depth = reqmsg->ibm_u.connparams.ibcp_queue_depth;
-
-	write_lock_irqsave(g_lock, flags);
-
-	peer2 = kiblnd_find_peer_locked(nid);
-	if (peer2) {
-		if (!peer2->ibp_version) {
-			peer2->ibp_version = version;
-			peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
-		}
-
-		/* not the guy I've talked with */
-		if (peer2->ibp_incarnation != reqmsg->ibm_srcstamp ||
-		    peer2->ibp_version != version) {
-			kiblnd_close_peer_conns_locked(peer2, -ESTALE);
-
-			if (kiblnd_peer_active(peer2)) {
-				peer2->ibp_incarnation = reqmsg->ibm_srcstamp;
-				peer2->ibp_version = version;
-			}
-			write_unlock_irqrestore(g_lock, flags);
-
-			CWARN("Conn stale %s version %x/%x incarnation %llu/%llu\n",
-			      libcfs_nid2str(nid), peer2->ibp_version, version,
-			      peer2->ibp_incarnation, reqmsg->ibm_srcstamp);
-
-			kiblnd_peer_decref(peer);
-			rej.ibr_why = IBLND_REJECT_CONN_STALE;
-			goto failed;
-		}
-
-		/*
-		 * Tie-break connection race in favour of the higher NID.
-		 * If we keep running into a race condition multiple times,
-		 * we have to assume that the connection attempt with the
-		 * higher NID is stuck in a connecting state and will never
-		 * recover. As such, we pass through this if-block and let
-		 * the lower NID connection win so we can move forward.
-		 */
-		if (peer2->ibp_connecting &&
-		    nid < ni->ni_nid && peer2->ibp_races <
-		    MAX_CONN_RACES_BEFORE_ABORT) {
-			peer2->ibp_races++;
-			write_unlock_irqrestore(g_lock, flags);
-
-			CDEBUG(D_NET, "Conn race %s\n",
-			       libcfs_nid2str(peer2->ibp_nid));
-
-			kiblnd_peer_decref(peer);
-			rej.ibr_why = IBLND_REJECT_CONN_RACE;
-			goto failed;
-		}
-		if (peer2->ibp_races >= MAX_CONN_RACES_BEFORE_ABORT)
-			CNETERR("Conn race %s: unresolved after %d attempts, letting lower NID win\n",
-				libcfs_nid2str(peer2->ibp_nid),
-				MAX_CONN_RACES_BEFORE_ABORT);
-		/*
-		 * A passive connection is allowed even if this peer is
-		 * waiting for reconnection.
-		 */
-		peer2->ibp_reconnecting = 0;
-		peer2->ibp_races = 0;
-		peer2->ibp_accepting++;
-		kiblnd_peer_addref(peer2);
-
-		/*
-		 * We raced with kiblnd_launch_tx (active connect) to create
-		 * this peer, so copy the validated parameters now that we
-		 * know the peer's limits.
-		 */
-		peer2->ibp_max_frags = peer->ibp_max_frags;
-		peer2->ibp_queue_depth = peer->ibp_queue_depth;
-
-		write_unlock_irqrestore(g_lock, flags);
-		kiblnd_peer_decref(peer);
-		peer = peer2;
-	} else {
-		/* Brand new peer */
-		LASSERT(!peer->ibp_accepting);
-		LASSERT(!peer->ibp_version &&
-			!peer->ibp_incarnation);
-
-		peer->ibp_accepting = 1;
-		peer->ibp_version = version;
-		peer->ibp_incarnation = reqmsg->ibm_srcstamp;
-
-		/* I have a ref on ni that prevents it being shut down */
-		LASSERT(!net->ibn_shutdown);
-
-		kiblnd_peer_addref(peer);
-		list_add_tail(&peer->ibp_list, kiblnd_nid2peerlist(nid));
-
-		write_unlock_irqrestore(g_lock, flags);
-	}
-
-	conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_PASSIVE_WAIT,
-				  version);
-	if (!conn) {
-		kiblnd_peer_connect_failed(peer, 0, -ENOMEM);
-		kiblnd_peer_decref(peer);
-		rej.ibr_why = IBLND_REJECT_NO_RESOURCES;
-		goto failed;
-	}
-
-	/*
-	 * conn now "owns" cmid, so I return success from here on to ensure the
-	 * CM callback doesn't destroy cmid.
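-	 * Any failure from here on is reported through kiblnd_reject() and
-	 * kiblnd_connreq_done() rather than through this function's return
-	 * value.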
- */ - conn->ibc_incarnation = reqmsg->ibm_srcstamp; - conn->ibc_credits = conn->ibc_queue_depth; - conn->ibc_reserved_credits = conn->ibc_queue_depth; - LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + - IBLND_OOB_MSGS(version) <= IBLND_RX_MSGS(conn)); - - ackmsg = &conn->ibc_connvars->cv_msg; - memset(ackmsg, 0, sizeof(*ackmsg)); - - kiblnd_init_msg(ackmsg, IBLND_MSG_CONNACK, - sizeof(ackmsg->ibm_u.connparams)); - ackmsg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - ackmsg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; - ackmsg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - - kiblnd_pack_msg(ni, ackmsg, version, 0, nid, reqmsg->ibm_srcstamp); - - memset(&cp, 0, sizeof(cp)); - cp.private_data = ackmsg; - cp.private_data_len = ackmsg->ibm_nob; - cp.responder_resources = 0; /* No atomic ops or RDMA reads */ - cp.initiator_depth = 0; - cp.flow_control = 1; - cp.retry_count = *kiblnd_tunables.kib_retry_count; - cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; - - CDEBUG(D_NET, "Accept %s\n", libcfs_nid2str(nid)); - - rc = rdma_accept(cmid, &cp); - if (rc) { - CERROR("Can't accept %s: %d\n", libcfs_nid2str(nid), rc); - rej.ibr_version = version; - rej.ibr_why = IBLND_REJECT_FATAL; - - kiblnd_reject(cmid, &rej); - kiblnd_connreq_done(conn, rc); - kiblnd_conn_decref(conn); - } - - lnet_ni_decref(ni); - return 0; - - failed: - if (ni) { - rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni); - rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni); - lnet_ni_decref(ni); - } - - rej.ibr_version = version; - kiblnd_reject(cmid, &rej); - - return -ECONNREFUSED; -} - -static void -kiblnd_check_reconnect(struct kib_conn *conn, int version, - __u64 incarnation, int why, struct kib_connparams *cp) -{ - rwlock_t *glock = &kiblnd_data.kib_global_lock; - struct kib_peer *peer = conn->ibc_peer; - char *reason; - int msg_size = IBLND_MSG_SIZE; - int frag_num = -1; - int queue_dep = -1; - bool reconnect; - unsigned long flags; - - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT); - LASSERT(peer->ibp_connecting > 0); /* 'conn' at least */ - - if (cp) { - msg_size = cp->ibcp_max_msg_size; - frag_num = cp->ibcp_max_frags << IBLND_FRAG_SHIFT; - queue_dep = cp->ibcp_queue_depth; - } - - write_lock_irqsave(glock, flags); - /** - * retry connection if it's still needed and no other connection - * attempts (active or passive) are in progress - * NB: reconnect is still needed even when ibp_tx_queue is - * empty if ibp_version != version because reconnect may be - * initiated by kiblnd_query() - */ - reconnect = (!list_empty(&peer->ibp_tx_queue) || - peer->ibp_version != version) && - peer->ibp_connecting && - !peer->ibp_accepting; - if (!reconnect) { - reason = "no need"; - goto out; - } - - switch (why) { - default: - reason = "Unknown"; - break; - - case IBLND_REJECT_RDMA_FRAGS: { - struct lnet_ioctl_config_lnd_tunables *tunables; - - if (!cp) { - reason = "can't negotiate max frags"; - goto out; - } - tunables = peer->ibp_ni->ni_lnd_tunables; - if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) { - reason = "map_on_demand must be enabled"; - goto out; - } - if (conn->ibc_max_frags <= frag_num) { - reason = "unsupported max frags"; - goto out; - } - - peer->ibp_max_frags = frag_num; - reason = "rdma fragments"; - break; - } - case IBLND_REJECT_MSG_QUEUE_SIZE: - if (!cp) { - reason = "can't negotiate queue depth"; - goto out; - } - if (conn->ibc_queue_depth <= queue_dep) { - reason = "unsupported queue depth"; - goto out; - } 
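-		/*
-		 * The peer asked for a smaller queue depth than we offered,
-		 * so adopt the peer's value for the reconnect attempt.
-		 */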
-
-		peer->ibp_queue_depth = queue_dep;
-		reason = "queue depth";
-		break;
-
-	case IBLND_REJECT_CONN_STALE:
-		reason = "stale";
-		break;
-
-	case IBLND_REJECT_CONN_RACE:
-		reason = "conn race";
-		break;
-
-	case IBLND_REJECT_CONN_UNCOMPAT:
-		reason = "version negotiation";
-		break;
-	}
-
-	conn->ibc_reconnect = 1;
-	peer->ibp_reconnecting++;
-	peer->ibp_version = version;
-	if (incarnation)
-		peer->ibp_incarnation = incarnation;
-out:
-	write_unlock_irqrestore(glock, flags);
-
-	CNETERR("%s: %s (%s), %x, %x, msg_size: %d, queue_depth: %d/%d, max_frags: %d/%d\n",
-		libcfs_nid2str(peer->ibp_nid),
-		reconnect ? "reconnect" : "don't reconnect",
-		reason, IBLND_MSG_VERSION, version, msg_size,
-		conn->ibc_queue_depth, queue_dep,
-		conn->ibc_max_frags, frag_num);
-	/*
-	 * if conn::ibc_reconnect is TRUE, connd will reconnect to the peer
-	 * while destroying the zombie
-	 */
-}
-
-static void
-kiblnd_rejected(struct kib_conn *conn, int reason, void *priv, int priv_nob)
-{
-	struct kib_peer *peer = conn->ibc_peer;
-
-	LASSERT(!in_interrupt());
-	LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT);
-
-	switch (reason) {
-	case IB_CM_REJ_STALE_CONN:
-		kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
-				       IBLND_REJECT_CONN_STALE, NULL);
-		break;
-
-	case IB_CM_REJ_INVALID_SERVICE_ID:
-		CNETERR("%s rejected: no listener at %d\n",
-			libcfs_nid2str(peer->ibp_nid),
-			*kiblnd_tunables.kib_service);
-		break;
-
-	case IB_CM_REJ_CONSUMER_DEFINED:
-		if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
-			struct kib_rej *rej = priv;
-			struct kib_connparams *cp = NULL;
-			int flip = 0;
-			__u64 incarnation = -1;
-
-			/* NB. default incarnation is -1 because:
-			 * a) V1 will ignore dst incarnation in connreq.
-			 * b) V2 will provide incarnation while rejecting me,
-			 *    so -1 will be overwritten.
-			 *
-			 * If I try to connect to a V1 peer with the V2
-			 * protocol, it rejects me and then upgrades to V2.
-			 * I know nothing about the upgrade and try to
-			 * reconnect with V1; in that case the upgraded V2
-			 * peer can tell I'm talking to the old version and
-			 * rejects me with incarnation == -1.
-			 */
-
-			if (rej->ibr_magic == __swab32(IBLND_MSG_MAGIC) ||
-			    rej->ibr_magic == __swab32(LNET_PROTO_MAGIC)) {
-				__swab32s(&rej->ibr_magic);
-				__swab16s(&rej->ibr_version);
-				flip = 1;
-			}
-
-			if (priv_nob >= sizeof(struct kib_rej) &&
-			    rej->ibr_version > IBLND_MSG_VERSION_1) {
-				/*
-				 * priv_nob is always 148 in current version
-				 * of OFED, so we still need to check version.
-				 * (define of IB_CM_REJ_PRIVATE_DATA_SIZE)
-				 */
-				cp = &rej->ibr_cp;
-
-				if (flip) {
-					__swab64s(&rej->ibr_incarnation);
-					__swab16s(&cp->ibcp_queue_depth);
-					__swab16s(&cp->ibcp_max_frags);
-					__swab32s(&cp->ibcp_max_msg_size);
-				}
-
-				incarnation = rej->ibr_incarnation;
-			}
-
-			if (rej->ibr_magic != IBLND_MSG_MAGIC &&
-			    rej->ibr_magic != LNET_PROTO_MAGIC) {
-				CERROR("%s rejected: consumer defined fatal error\n",
-				       libcfs_nid2str(peer->ibp_nid));
-				break;
-			}
-
-			if (rej->ibr_version != IBLND_MSG_VERSION &&
-			    rej->ibr_version != IBLND_MSG_VERSION_1) {
-				CERROR("%s rejected: o2iblnd version %x error\n",
-				       libcfs_nid2str(peer->ibp_nid),
-				       rej->ibr_version);
-				break;
-			}
-
-			if (rej->ibr_why == IBLND_REJECT_FATAL &&
-			    rej->ibr_version == IBLND_MSG_VERSION_1) {
-				CDEBUG(D_NET, "rejected by old version peer %s: %x\n",
-				       libcfs_nid2str(peer->ibp_nid), rej->ibr_version);
-
-				if (conn->ibc_version != IBLND_MSG_VERSION_1)
-					rej->ibr_why = IBLND_REJECT_CONN_UNCOMPAT;
-			}
-
-			switch (rej->ibr_why) {
-			case IBLND_REJECT_CONN_RACE:
-			case IBLND_REJECT_CONN_STALE:
-			case IBLND_REJECT_CONN_UNCOMPAT:
-			case IBLND_REJECT_MSG_QUEUE_SIZE:
-			case IBLND_REJECT_RDMA_FRAGS:
-				kiblnd_check_reconnect(conn, rej->ibr_version,
-						       incarnation,
-						       rej->ibr_why, cp);
-				break;
-
-			case IBLND_REJECT_NO_RESOURCES:
-				CERROR("%s rejected: o2iblnd no resources\n",
-				       libcfs_nid2str(peer->ibp_nid));
-				break;
-
-			case IBLND_REJECT_FATAL:
-				CERROR("%s rejected: o2iblnd fatal error\n",
-				       libcfs_nid2str(peer->ibp_nid));
-				break;
-
-			default:
-				CERROR("%s rejected: o2iblnd reason %d\n",
-				       libcfs_nid2str(peer->ibp_nid),
-				       rej->ibr_why);
-				break;
-			}
-			break;
-		}
-		/* fall through */
-	default:
-		CNETERR("%s rejected: reason %d, size %d\n",
-			libcfs_nid2str(peer->ibp_nid), reason, priv_nob);
-		break;
-	}
-
-	kiblnd_connreq_done(conn, -ECONNREFUSED);
-}
-
-static void
-kiblnd_check_connreply(struct kib_conn *conn, void *priv, int priv_nob)
-{
-	struct kib_peer *peer = conn->ibc_peer;
-	struct lnet_ni *ni = peer->ibp_ni;
-	struct kib_net *net = ni->ni_data;
-	struct kib_msg *msg = priv;
-	int ver = conn->ibc_version;
-	int rc = kiblnd_unpack_msg(msg, priv_nob);
-	unsigned long flags;
-
-	LASSERT(net);
-
-	if (rc) {
-		CERROR("Can't unpack connack from %s: %d\n",
-		       libcfs_nid2str(peer->ibp_nid), rc);
-		goto failed;
-	}
-
-	if (msg->ibm_type != IBLND_MSG_CONNACK) {
-		CERROR("Unexpected message %d from %s\n",
-		       msg->ibm_type, libcfs_nid2str(peer->ibp_nid));
-		rc = -EPROTO;
-		goto failed;
-	}
-
-	if (ver != msg->ibm_version) {
-		CERROR("%s replied version %x, which differs from requested version %x\n",
-		       libcfs_nid2str(peer->ibp_nid), msg->ibm_version, ver);
-		rc = -EPROTO;
-		goto failed;
-	}
-
-	if (msg->ibm_u.connparams.ibcp_queue_depth >
-	    conn->ibc_queue_depth) {
-		CERROR("%s has incompatible queue depth %d (<=%d wanted)\n",
-		       libcfs_nid2str(peer->ibp_nid),
-		       msg->ibm_u.connparams.ibcp_queue_depth,
-		       conn->ibc_queue_depth);
-		rc = -EPROTO;
-		goto failed;
-	}
-
-	if ((msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT) >
-	    conn->ibc_max_frags) {
-		CERROR("%s has incompatible max_frags %d (<=%d wanted)\n",
-		       libcfs_nid2str(peer->ibp_nid),
-		       msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT,
-		       conn->ibc_max_frags);
-		rc = -EPROTO;
-		goto failed;
-	}
-
-	if (msg->ibm_u.connparams.ibcp_max_msg_size > IBLND_MSG_SIZE) {
-		CERROR("%s max message size %d too big (%d max)\n",
-		       libcfs_nid2str(peer->ibp_nid),
-		       msg->ibm_u.connparams.ibcp_max_msg_size,
-		       IBLND_MSG_SIZE);
-		rc = -EPROTO;
-		goto failed;
-	}
-
-
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - if (msg->ibm_dstnid == ni->ni_nid && - msg->ibm_dststamp == net->ibn_incarnation) - rc = 0; - else - rc = -ESTALE; - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - if (rc) { - CERROR("Bad connection reply from %s, rc = %d, version: %x max_frags: %d\n", - libcfs_nid2str(peer->ibp_nid), rc, - msg->ibm_version, msg->ibm_u.connparams.ibcp_max_frags); - goto failed; - } - - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_reserved_credits = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_queue_depth = msg->ibm_u.connparams.ibcp_queue_depth; - conn->ibc_max_frags = msg->ibm_u.connparams.ibcp_max_frags >> IBLND_FRAG_SHIFT; - LASSERT(conn->ibc_credits + conn->ibc_reserved_credits + - IBLND_OOB_MSGS(ver) <= IBLND_RX_MSGS(conn)); - - kiblnd_connreq_done(conn, 0); - return; - - failed: - /* - * NB My QP has already established itself, so I handle anything going - * wrong here by setting ibc_comms_error. - * kiblnd_connreq_done(0) moves the conn state to ESTABLISHED, but then - * immediately tears it down. - */ - LASSERT(rc); - conn->ibc_comms_error = rc; - kiblnd_connreq_done(conn, 0); -} - -static int -kiblnd_active_connect(struct rdma_cm_id *cmid) -{ - struct kib_peer *peer = (struct kib_peer *)cmid->context; - struct kib_conn *conn; - struct kib_msg *msg; - struct rdma_conn_param cp; - int version; - __u64 incarnation; - unsigned long flags; - int rc; - - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - incarnation = peer->ibp_incarnation; - version = !peer->ibp_version ? IBLND_MSG_VERSION : - peer->ibp_version; - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - conn = kiblnd_create_conn(peer, cmid, IBLND_CONN_ACTIVE_CONNECT, - version); - if (!conn) { - kiblnd_peer_connect_failed(peer, 1, -ENOMEM); - kiblnd_peer_decref(peer); /* lose cmid's ref */ - return -ENOMEM; - } - - /* - * conn "owns" cmid now, so I return success from here on to ensure the - * CM callback doesn't destroy cmid. 
conn also takes over cmid's ref - * on peer - */ - msg = &conn->ibc_connvars->cv_msg; - - memset(msg, 0, sizeof(*msg)); - kiblnd_init_msg(msg, IBLND_MSG_CONNREQ, sizeof(msg->ibm_u.connparams)); - msg->ibm_u.connparams.ibcp_queue_depth = conn->ibc_queue_depth; - msg->ibm_u.connparams.ibcp_max_frags = conn->ibc_max_frags << IBLND_FRAG_SHIFT; - msg->ibm_u.connparams.ibcp_max_msg_size = IBLND_MSG_SIZE; - - kiblnd_pack_msg(peer->ibp_ni, msg, version, - 0, peer->ibp_nid, incarnation); - - memset(&cp, 0, sizeof(cp)); - cp.private_data = msg; - cp.private_data_len = msg->ibm_nob; - cp.responder_resources = 0; /* No atomic ops or RDMA reads */ - cp.initiator_depth = 0; - cp.flow_control = 1; - cp.retry_count = *kiblnd_tunables.kib_retry_count; - cp.rnr_retry_count = *kiblnd_tunables.kib_rnr_retry_count; - - LASSERT(cmid->context == (void *)conn); - LASSERT(conn->ibc_cmid == cmid); - - rc = rdma_connect(cmid, &cp); - if (rc) { - CERROR("Can't connect to %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - kiblnd_connreq_done(conn, rc); - kiblnd_conn_decref(conn); - } - - return 0; -} - -int -kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) -{ - struct kib_peer *peer; - struct kib_conn *conn; - int rc; - - switch (event->event) { - default: - CERROR("Unexpected event: %d, status: %d\n", - event->event, event->status); - LBUG(); - - case RDMA_CM_EVENT_CONNECT_REQUEST: - /* destroy cmid on failure */ - rc = kiblnd_passive_connect(cmid, - (void *)KIBLND_CONN_PARAM(event), - KIBLND_CONN_PARAM_LEN(event)); - CDEBUG(D_NET, "connreq: %d\n", rc); - return rc; - - case RDMA_CM_EVENT_ADDR_ERROR: - peer = (struct kib_peer *)cmid->context; - CNETERR("%s: ADDR ERROR %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); - kiblnd_peer_decref(peer); - return -EHOSTUNREACH; /* rc destroys cmid */ - - case RDMA_CM_EVENT_ADDR_RESOLVED: - peer = (struct kib_peer *)cmid->context; - - CDEBUG(D_NET, "%s Addr resolved: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - - if (event->status) { - CNETERR("Can't resolve address for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - rc = event->status; - } else { - rc = rdma_resolve_route( - cmid, *kiblnd_tunables.kib_timeout * 1000); - if (!rc) { - struct kib_net *net = peer->ibp_ni->ni_data; - struct kib_dev *dev = net->ibn_dev; - - CDEBUG(D_NET, "%s: connection bound to "\ - "%s:%pI4h:%s\n", - libcfs_nid2str(peer->ibp_nid), - dev->ibd_ifname, - &dev->ibd_ifip, cmid->device->name); - - return 0; - } - - /* Can't initiate route resolution */ - CERROR("Can't resolve route for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), rc); - } - kiblnd_peer_connect_failed(peer, 1, rc); - kiblnd_peer_decref(peer); - return rc; /* rc destroys cmid */ - - case RDMA_CM_EVENT_ROUTE_ERROR: - peer = (struct kib_peer *)cmid->context; - CNETERR("%s: ROUTE ERROR %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - kiblnd_peer_connect_failed(peer, 1, -EHOSTUNREACH); - kiblnd_peer_decref(peer); - return -EHOSTUNREACH; /* rc destroys cmid */ - - case RDMA_CM_EVENT_ROUTE_RESOLVED: - peer = (struct kib_peer *)cmid->context; - CDEBUG(D_NET, "%s Route resolved: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - - if (!event->status) - return kiblnd_active_connect(cmid); - - CNETERR("Can't resolve route for %s: %d\n", - libcfs_nid2str(peer->ibp_nid), event->status); - kiblnd_peer_connect_failed(peer, 1, event->status); - kiblnd_peer_decref(peer); - return event->status; /* rc destroys cmid */ - - case 
RDMA_CM_EVENT_UNREACHABLE: - conn = (struct kib_conn *)cmid->context; - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || - conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); - CNETERR("%s: UNREACHABLE %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); - kiblnd_connreq_done(conn, -ENETDOWN); - kiblnd_conn_decref(conn); - return 0; - - case RDMA_CM_EVENT_CONNECT_ERROR: - conn = (struct kib_conn *)cmid->context; - LASSERT(conn->ibc_state == IBLND_CONN_ACTIVE_CONNECT || - conn->ibc_state == IBLND_CONN_PASSIVE_WAIT); - CNETERR("%s: CONNECT ERROR %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), event->status); - kiblnd_connreq_done(conn, -ENOTCONN); - kiblnd_conn_decref(conn); - return 0; - - case RDMA_CM_EVENT_REJECTED: - conn = (struct kib_conn *)cmid->context; - switch (conn->ibc_state) { - default: - LBUG(); - - case IBLND_CONN_PASSIVE_WAIT: - CERROR("%s: REJECTED %d\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid), - event->status); - kiblnd_connreq_done(conn, -ECONNRESET); - break; - - case IBLND_CONN_ACTIVE_CONNECT: - kiblnd_rejected(conn, event->status, - (void *)KIBLND_CONN_PARAM(event), - KIBLND_CONN_PARAM_LEN(event)); - break; - } - kiblnd_conn_decref(conn); - return 0; - - case RDMA_CM_EVENT_ESTABLISHED: - conn = (struct kib_conn *)cmid->context; - switch (conn->ibc_state) { - default: - LBUG(); - - case IBLND_CONN_PASSIVE_WAIT: - CDEBUG(D_NET, "ESTABLISHED (passive): %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_connreq_done(conn, 0); - break; - - case IBLND_CONN_ACTIVE_CONNECT: - CDEBUG(D_NET, "ESTABLISHED(active): %s\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_check_connreply(conn, - (void *)KIBLND_CONN_PARAM(event), - KIBLND_CONN_PARAM_LEN(event)); - break; - } - /* net keeps its ref on conn! */ - return 0; - - case RDMA_CM_EVENT_TIMEWAIT_EXIT: - CDEBUG(D_NET, "Ignore TIMEWAIT_EXIT event\n"); - return 0; - case RDMA_CM_EVENT_DISCONNECTED: - conn = (struct kib_conn *)cmid->context; - if (conn->ibc_state < IBLND_CONN_ESTABLISHED) { - CERROR("%s DISCONNECTED\n", - libcfs_nid2str(conn->ibc_peer->ibp_nid)); - kiblnd_connreq_done(conn, -ECONNRESET); - } else { - kiblnd_close_conn(conn, 0); - } - kiblnd_conn_decref(conn); - cmid->context = NULL; - return 0; - - case RDMA_CM_EVENT_DEVICE_REMOVAL: - LCONSOLE_ERROR_MSG(0x131, - "Received notification of device removal\n" - "Please shutdown LNET to allow this to proceed\n"); - /* - * Can't remove network from underneath LNET for now, so I have - * to ignore this - */ - return 0; - - case RDMA_CM_EVENT_ADDR_CHANGE: - LCONSOLE_INFO("Physical link changed (eg hca/port)\n"); - return 0; - } -} - -static int -kiblnd_check_txs_locked(struct kib_conn *conn, struct list_head *txs) -{ - struct kib_tx *tx; - struct list_head *ttmp; - - list_for_each(ttmp, txs) { - tx = list_entry(ttmp, struct kib_tx, tx_list); - - if (txs != &conn->ibc_active_txs) { - LASSERT(tx->tx_queued); - } else { - LASSERT(!tx->tx_queued); - LASSERT(tx->tx_waiting || tx->tx_sending); - } - - if (time_after_eq(jiffies, tx->tx_deadline)) { - CERROR("Timed out tx: %s, %lu seconds\n", - kiblnd_queue2str(conn, txs), - (jiffies - tx->tx_deadline) / HZ); - return 1; - } - } - - return 0; -} - -static int -kiblnd_conn_timed_out_locked(struct kib_conn *conn) -{ - return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) || - kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) || - kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) || - kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) || - kiblnd_check_txs_locked(conn, 
&conn->ibc_active_txs); -} - -static void -kiblnd_check_conns(int idx) -{ - LIST_HEAD(closes); - LIST_HEAD(checksends); - struct list_head *peers = &kiblnd_data.kib_peers[idx]; - struct list_head *ptmp; - struct kib_peer *peer; - struct kib_conn *conn; - struct kib_conn *temp; - struct kib_conn *tmp; - struct list_head *ctmp; - unsigned long flags; - - /* - * NB. We expect to have a look at all the peers and not find any - * RDMAs to time out, so we just use a shared lock while we - * take a look... - */ - read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); - - list_for_each(ptmp, peers) { - peer = list_entry(ptmp, struct kib_peer, ibp_list); - - list_for_each(ctmp, &peer->ibp_conns) { - int timedout; - int sendnoop; - - conn = list_entry(ctmp, struct kib_conn, ibc_list); - - LASSERT(conn->ibc_state == IBLND_CONN_ESTABLISHED); - - spin_lock(&conn->ibc_lock); - - sendnoop = kiblnd_need_noop(conn); - timedout = kiblnd_conn_timed_out_locked(conn); - if (!sendnoop && !timedout) { - spin_unlock(&conn->ibc_lock); - continue; - } - - if (timedout) { - CERROR("Timed out RDMA with %s (%lu): c: %u, oc: %u, rc: %u\n", - libcfs_nid2str(peer->ibp_nid), - (jiffies - peer->ibp_last_alive) / HZ, - conn->ibc_credits, - conn->ibc_outstanding_credits, - conn->ibc_reserved_credits); - list_add(&conn->ibc_connd_list, &closes); - } else { - list_add(&conn->ibc_connd_list, &checksends); - } - /* +ref for 'closes' or 'checksends' */ - kiblnd_conn_addref(conn); - - spin_unlock(&conn->ibc_lock); - } - } - - read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - - /* - * Handle timeout by closing the whole - * connection. We can only be sure RDMA activity - * has ceased once the QP has been modified. - */ - list_for_each_entry_safe(conn, tmp, &closes, ibc_connd_list) { - list_del(&conn->ibc_connd_list); - kiblnd_close_conn(conn, -ETIMEDOUT); - kiblnd_conn_decref(conn); - } - - /* - * In case we have enough credits to return via a - * NOOP, but there were no non-blocking tx descs - * free to do it last time... - */ - list_for_each_entry_safe(conn, temp, &checksends, ibc_connd_list) { - list_del(&conn->ibc_connd_list); - - spin_lock(&conn->ibc_lock); - kiblnd_check_sends_locked(conn); - spin_unlock(&conn->ibc_lock); - - kiblnd_conn_decref(conn); - } -} - -static void -kiblnd_disconnect_conn(struct kib_conn *conn) -{ - LASSERT(!in_interrupt()); - LASSERT(current == kiblnd_data.kib_connd); - LASSERT(conn->ibc_state == IBLND_CONN_CLOSING); - - rdma_disconnect(conn->ibc_cmid); - kiblnd_finalise_conn(conn); - - kiblnd_peer_notify(conn->ibc_peer); -} - -/** - * High-water for reconnection to the same peer, reconnection attempt should - * be delayed after trying more than KIB_RECONN_HIGH_RACE. - */ -#define KIB_RECONN_HIGH_RACE 10 -/** - * Allow connd to take a break and handle other things after consecutive - * reconnection attempts. 
- */ -#define KIB_RECONN_BREAK 100 - -int -kiblnd_connd(void *arg) -{ - spinlock_t *lock = &kiblnd_data.kib_connd_lock; - wait_queue_entry_t wait; - unsigned long flags; - struct kib_conn *conn; - int timeout; - int i; - int dropped_lock; - int peer_index = 0; - unsigned long deadline = jiffies; - - init_waitqueue_entry(&wait, current); - kiblnd_data.kib_connd = current; - - spin_lock_irqsave(lock, flags); - - while (!kiblnd_data.kib_shutdown) { - int reconn = 0; - - dropped_lock = 0; - - if (!list_empty(&kiblnd_data.kib_connd_zombies)) { - struct kib_peer *peer = NULL; - - conn = list_entry(kiblnd_data.kib_connd_zombies.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - if (conn->ibc_reconnect) { - peer = conn->ibc_peer; - kiblnd_peer_addref(peer); - } - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - kiblnd_destroy_conn(conn); - - spin_lock_irqsave(lock, flags); - if (!peer) { - kfree(conn); - continue; - } - - conn->ibc_peer = peer; - if (peer->ibp_reconnected < KIB_RECONN_HIGH_RACE) - list_add_tail(&conn->ibc_list, - &kiblnd_data.kib_reconn_list); - else - list_add_tail(&conn->ibc_list, - &kiblnd_data.kib_reconn_wait); - } - - if (!list_empty(&kiblnd_data.kib_connd_conns)) { - conn = list_entry(kiblnd_data.kib_connd_conns.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - kiblnd_disconnect_conn(conn); - kiblnd_conn_decref(conn); - - spin_lock_irqsave(lock, flags); - } - - while (reconn < KIB_RECONN_BREAK) { - if (kiblnd_data.kib_reconn_sec != - ktime_get_real_seconds()) { - kiblnd_data.kib_reconn_sec = ktime_get_real_seconds(); - list_splice_init(&kiblnd_data.kib_reconn_wait, - &kiblnd_data.kib_reconn_list); - } - - if (list_empty(&kiblnd_data.kib_reconn_list)) - break; - - conn = list_entry(kiblnd_data.kib_reconn_list.next, - struct kib_conn, ibc_list); - list_del(&conn->ibc_list); - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - reconn += kiblnd_reconnect_peer(conn->ibc_peer); - kiblnd_peer_decref(conn->ibc_peer); - kfree(conn); - - spin_lock_irqsave(lock, flags); - } - - /* careful with the jiffy wrap... */ - timeout = (int)(deadline - jiffies); - if (timeout <= 0) { - const int n = 4; - const int p = 1; - int chunk = kiblnd_data.kib_peer_hash_size; - - spin_unlock_irqrestore(lock, flags); - dropped_lock = 1; - - /* - * Time to check for RDMA timeouts on a few more - * peers: I do checks every 'p' seconds on a - * proportion of the peer table and I need to check - * every connection 'n' times within a timeout - * interval, to ensure I detect a timeout on any - * connection within (n+1)/n times the timeout - * interval. 
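-			 * For example, with the default timeout of 50 seconds
-			 * (n = 4, p = 1), chunk becomes hash_size * 4 / 50, so
-			 * the whole peer table is still scanned about four
-			 * times per timeout interval.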
-			 */
-			if (*kiblnd_tunables.kib_timeout > n * p)
-				chunk = (chunk * n * p) /
-					*kiblnd_tunables.kib_timeout;
-			if (!chunk)
-				chunk = 1;
-
-			for (i = 0; i < chunk; i++) {
-				kiblnd_check_conns(peer_index);
-				peer_index = (peer_index + 1) %
-					     kiblnd_data.kib_peer_hash_size;
-			}
-
-			deadline += msecs_to_jiffies(p * MSEC_PER_SEC);
-			spin_lock_irqsave(lock, flags);
-		}
-
-		if (dropped_lock)
-			continue;
-
-		/* Nothing to do for 'timeout' */
-		set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
-		spin_unlock_irqrestore(lock, flags);
-
-		schedule_timeout(timeout);
-
-		remove_wait_queue(&kiblnd_data.kib_connd_waitq, &wait);
-		spin_lock_irqsave(lock, flags);
-	}
-
-	spin_unlock_irqrestore(lock, flags);
-
-	kiblnd_thread_fini();
-	return 0;
-}
-
-void
-kiblnd_qp_event(struct ib_event *event, void *arg)
-{
-	struct kib_conn *conn = arg;
-
-	switch (event->event) {
-	case IB_EVENT_COMM_EST:
-		CDEBUG(D_NET, "%s established\n",
-		       libcfs_nid2str(conn->ibc_peer->ibp_nid));
-		/*
-		 * We received a packet but the connection isn't established;
-		 * the handshake packet was probably lost, so notify the CM
-		 * to force the connection into the established state.
-		 */
-		rdma_notify(conn->ibc_cmid, IB_EVENT_COMM_EST);
-		return;
-
-	default:
-		CERROR("%s: Async QP event type %d\n",
-		       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
-		return;
-	}
-}
-
-static void
-kiblnd_complete(struct ib_wc *wc)
-{
-	switch (kiblnd_wreqid2type(wc->wr_id)) {
-	default:
-		LBUG();
-
-	case IBLND_WID_MR:
-		if (wc->status != IB_WC_SUCCESS &&
-		    wc->status != IB_WC_WR_FLUSH_ERR)
-			CNETERR("FastReg failed: %d\n", wc->status);
-		break;
-
-	case IBLND_WID_RDMA:
-		/*
-		 * We only get RDMA completion notification if it fails. All
-		 * subsequent work items, including the final SEND will fail
-		 * too. However we can't print out any more info about the
-		 * failing RDMA because 'tx' might be back on the idle list or
-		 * even reused already if we didn't manage to post all our work
-		 * items
-		 */
-		CNETERR("RDMA (tx: %p) failed: %d\n",
-			kiblnd_wreqid2ptr(wc->wr_id), wc->status);
-		return;
-
-	case IBLND_WID_TX:
-		kiblnd_tx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status);
-		return;
-
-	case IBLND_WID_RX:
-		kiblnd_rx_complete(kiblnd_wreqid2ptr(wc->wr_id), wc->status,
-				   wc->byte_len);
-		return;
-	}
-}
-
-void
-kiblnd_cq_completion(struct ib_cq *cq, void *arg)
-{
-	/*
-	 * NB I'm not allowed to schedule this conn once its refcount has
-	 * reached 0. Since fundamentally I'm racing with scheduler threads
-	 * consuming my CQ I could be called after all completions have
-	 * occurred. But in this case, !ibc_nrx && !ibc_nsends_posted
-	 * and this CQ is about to be destroyed so I NOOP.
-	 */
-	struct kib_conn *conn = arg;
-	struct kib_sched_info *sched = conn->ibc_sched;
-	unsigned long flags;
-
-	LASSERT(cq == conn->ibc_cq);
-
-	spin_lock_irqsave(&sched->ibs_lock, flags);
-
-	conn->ibc_ready = 1;
-
-	if (!conn->ibc_scheduled &&
-	    (conn->ibc_nrx > 0 ||
-	     conn->ibc_nsends_posted > 0)) {
-		kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
-		conn->ibc_scheduled = 1;
-		list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
-
-		if (waitqueue_active(&sched->ibs_waitq))
-			wake_up(&sched->ibs_waitq);
-	}
-
-	spin_unlock_irqrestore(&sched->ibs_lock, flags);
-}
-
-void
-kiblnd_cq_event(struct ib_event *event, void *arg)
-{
-	struct kib_conn *conn = arg;
-
-	CERROR("%s: async CQ event type %d\n",
-	       libcfs_nid2str(conn->ibc_peer->ibp_nid), event->event);
-}
-
-int
-kiblnd_scheduler(void *arg)
-{
-	long id = (long)arg;
-	struct kib_sched_info *sched;
-	struct kib_conn *conn;
-	wait_queue_entry_t wait;
-	unsigned long flags;
-	struct ib_wc wc;
-	int did_something;
-	int busy_loops = 0;
-	int rc;
-
-	init_waitqueue_entry(&wait, current);
-
-	sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
-
-	rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
-	if (rc) {
-		CWARN("Unable to bind on CPU partition %d, please verify whether all CPUs are healthy and reload modules if necessary, otherwise your system might be at risk of low performance\n",
-		      sched->ibs_cpt);
-	}
-
-	spin_lock_irqsave(&sched->ibs_lock, flags);
-
-	while (!kiblnd_data.kib_shutdown) {
-		if (busy_loops++ >= IBLND_RESCHED) {
-			spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
-			cond_resched();
-			busy_loops = 0;
-
-			spin_lock_irqsave(&sched->ibs_lock, flags);
-		}
-
-		did_something = 0;
-
-		if (!list_empty(&sched->ibs_conns)) {
-			conn = list_entry(sched->ibs_conns.next, struct kib_conn,
-					  ibc_sched_list);
-			/* take over kib_sched_conns' ref on conn... */
-			LASSERT(conn->ibc_scheduled);
-			list_del(&conn->ibc_sched_list);
-			conn->ibc_ready = 0;
-
-			spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
-			wc.wr_id = IBLND_WID_INVAL;
-
-			rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
-			if (!rc) {
-				rc = ib_req_notify_cq(conn->ibc_cq,
-						      IB_CQ_NEXT_COMP);
-				if (rc < 0) {
-					CWARN("%s: ib_req_notify_cq failed: %d, closing connection\n",
-					      libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
-					kiblnd_close_conn(conn, -EIO);
-					kiblnd_conn_decref(conn);
-					spin_lock_irqsave(&sched->ibs_lock,
-							  flags);
-					continue;
-				}
-
-				rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
-			}
-
-			if (unlikely(rc > 0 && wc.wr_id == IBLND_WID_INVAL)) {
-				LCONSOLE_ERROR("ib_poll_cq (rc: %d) returned invalid wr_id, opcode %d, status: %d, vendor_err: %d, conn: %s status: %d\nplease upgrade firmware and OFED or contact vendor.\n",
-					       rc, wc.opcode, wc.status,
-					       wc.vendor_err,
-					       libcfs_nid2str(conn->ibc_peer->ibp_nid),
-					       conn->ibc_state);
-				rc = -EINVAL;
-			}
-
-			if (rc < 0) {
-				CWARN("%s: ib_poll_cq failed: %d, closing connection\n",
-				      libcfs_nid2str(conn->ibc_peer->ibp_nid),
-				      rc);
-				kiblnd_close_conn(conn, -EIO);
-				kiblnd_conn_decref(conn);
-				spin_lock_irqsave(&sched->ibs_lock, flags);
-				continue;
-			}
-
-			spin_lock_irqsave(&sched->ibs_lock, flags);
-
-			if (rc || conn->ibc_ready) {
-				/*
-				 * There may be another completion waiting; get
-				 * another scheduler to check while I handle
-				 * this one...
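-				 * The ibc_scheduled flag guarantees that this
-				 * conn sits on at most one scheduler queue at
-				 * any time.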
-				 */
-				/* +1 ref for sched_conns */
-				kiblnd_conn_addref(conn);
-				list_add_tail(&conn->ibc_sched_list,
-					      &sched->ibs_conns);
-				if (waitqueue_active(&sched->ibs_waitq))
-					wake_up(&sched->ibs_waitq);
-			} else {
-				conn->ibc_scheduled = 0;
-			}
-
-			if (rc) {
-				spin_unlock_irqrestore(&sched->ibs_lock, flags);
-				kiblnd_complete(&wc);
-
-				spin_lock_irqsave(&sched->ibs_lock, flags);
-			}
-
-			kiblnd_conn_decref(conn); /* ...drop my ref from above */
-			did_something = 1;
-		}
-
-		if (did_something)
-			continue;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue_exclusive(&sched->ibs_waitq, &wait);
-		spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
-		schedule();
-		busy_loops = 0;
-
-		remove_wait_queue(&sched->ibs_waitq, &wait);
-		spin_lock_irqsave(&sched->ibs_lock, flags);
-	}
-
-	spin_unlock_irqrestore(&sched->ibs_lock, flags);
-
-	kiblnd_thread_fini();
-	return 0;
-}
-
-int
-kiblnd_failover_thread(void *arg)
-{
-	rwlock_t *glock = &kiblnd_data.kib_global_lock;
-	struct kib_dev *dev;
-	wait_queue_entry_t wait;
-	unsigned long flags;
-	int rc;
-
-	LASSERT(*kiblnd_tunables.kib_dev_failover);
-
-	init_waitqueue_entry(&wait, current);
-	write_lock_irqsave(glock, flags);
-
-	while (!kiblnd_data.kib_shutdown) {
-		int do_failover = 0;
-		int long_sleep;
-
-		list_for_each_entry(dev, &kiblnd_data.kib_failed_devs,
-				    ibd_fail_list) {
-			if (time_before(jiffies,
-					dev->ibd_next_failover))
-				continue;
-			do_failover = 1;
-			break;
-		}
-
-		if (do_failover) {
-			list_del_init(&dev->ibd_fail_list);
-			dev->ibd_failover = 1;
-			write_unlock_irqrestore(glock, flags);
-
-			rc = kiblnd_dev_failover(dev);
-
-			write_lock_irqsave(glock, flags);
-
-			LASSERT(dev->ibd_failover);
-			dev->ibd_failover = 0;
-			if (rc >= 0) { /* Device is OK or failover succeeded */
-				dev->ibd_next_failover = jiffies + 3 * HZ;
-				continue;
-			}
-
-			/* failed to failover, retry later */
-			dev->ibd_next_failover =
-				jiffies + min(dev->ibd_failed_failover, 10) * HZ;
-			if (kiblnd_dev_can_failover(dev)) {
-				list_add_tail(&dev->ibd_fail_list,
-					      &kiblnd_data.kib_failed_devs);
-			}
-
-			continue;
-		}
-
-		/* long sleep if no more pending failover */
-		long_sleep = list_empty(&kiblnd_data.kib_failed_devs);
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
-		write_unlock_irqrestore(glock, flags);
-
-		rc = schedule_timeout(long_sleep ? 10 * HZ :
-				      HZ);
-		remove_wait_queue(&kiblnd_data.kib_failover_waitq, &wait);
-		write_lock_irqsave(glock, flags);
-
-		if (!long_sleep || rc)
-			continue;
-
-		/*
-		 * After a long sleep, routinely check all active devices.
-		 * We need a check like this because, with no active
-		 * connection on a device and no SEND from the local node,
-		 * we may listen on the wrong HCA forever while a bonding
-		 * failover happens.
-		 */
-		list_for_each_entry(dev, &kiblnd_data.kib_devs, ibd_list) {
-			if (kiblnd_dev_can_failover(dev)) {
-				list_add_tail(&dev->ibd_fail_list,
-					      &kiblnd_data.kib_failed_devs);
-			}
-		}
-	}
-
-	write_unlock_irqrestore(glock, flags);
-
-	kiblnd_thread_fini();
-	return 0;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
deleted file mode 100644
index 39d07926d603de92376394baace12c1b8230834e..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/klnds/o2iblnd/o2iblnd_modparams.c - * - * Author: Eric Barton - */ - -#include "o2iblnd.h" - -static int service = 987; -module_param(service, int, 0444); -MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)"); - -static int cksum; -module_param(cksum, int, 0644); -MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums"); - -static int timeout = 50; -module_param(timeout, int, 0644); -MODULE_PARM_DESC(timeout, "timeout (seconds)"); - -/* - * Number of threads in each scheduler pool which is percpt, - * we will estimate reasonable value based on CPUs if it's set to zero. - */ -static int nscheds; -module_param(nscheds, int, 0444); -MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool"); - -static unsigned int conns_per_peer = 1; -module_param(conns_per_peer, uint, 0444); -MODULE_PARM_DESC(conns_per_peer, "number of connections per peer"); - -/* NB: this value is shared by all CPTs, it can grow at runtime */ -static int ntx = 512; -module_param(ntx, int, 0444); -MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool"); - -/* NB: this value is shared by all CPTs */ -static int credits = 256; -module_param(credits, int, 0444); -MODULE_PARM_DESC(credits, "# concurrent sends"); - -static int peer_credits = 8; -module_param(peer_credits, int, 0444); -MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); - -static int peer_credits_hiw; -module_param(peer_credits_hiw, int, 0444); -MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits"); - -static int peer_buffer_credits; -module_param(peer_buffer_credits, int, 0444); -MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); - -static int peer_timeout = 180; -module_param(peer_timeout, int, 0444); -MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); - -static char *ipif_name = "ib0"; -module_param(ipif_name, charp, 0444); -MODULE_PARM_DESC(ipif_name, "IPoIB interface name"); - -static int retry_count = 5; -module_param(retry_count, int, 0644); -MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received"); - -static int rnr_retry_count = 6; -module_param(rnr_retry_count, int, 0644); -MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions"); - -static int keepalive = 100; -module_param(keepalive, int, 0644); -MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive"); - -static int ib_mtu; -module_param(ib_mtu, int, 0444); -MODULE_PARM_DESC(ib_mtu, "IB MTU 
256/512/1024/2048/4096"); - -static int concurrent_sends; -module_param(concurrent_sends, int, 0444); -MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing"); - -#define IBLND_DEFAULT_MAP_ON_DEMAND IBLND_MAX_RDMA_FRAGS -static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; -module_param(map_on_demand, int, 0444); -MODULE_PARM_DESC(map_on_demand, "map on demand"); - -/* NB: this value is shared by all CPTs, it can grow at runtime */ -static int fmr_pool_size = 512; -module_param(fmr_pool_size, int, 0444); -MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)"); - -/* NB: this value is shared by all CPTs, it can grow at runtime */ -static int fmr_flush_trigger = 384; -module_param(fmr_flush_trigger, int, 0444); -MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush"); - -static int fmr_cache = 1; -module_param(fmr_cache, int, 0444); -MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching"); - -/* - * 0: disable failover - * 1: enable failover if necessary - * 2: force to failover (for debug) - */ -static int dev_failover; -module_param(dev_failover, int, 0444); -MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)"); - -static int require_privileged_port; -module_param(require_privileged_port, int, 0644); -MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection"); - -static int use_privileged_port = 1; -module_param(use_privileged_port, int, 0644); -MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection"); - -struct kib_tunables kiblnd_tunables = { - .kib_dev_failover = &dev_failover, - .kib_service = &service, - .kib_cksum = &cksum, - .kib_timeout = &timeout, - .kib_keepalive = &keepalive, - .kib_ntx = &ntx, - .kib_default_ipif = &ipif_name, - .kib_retry_count = &retry_count, - .kib_rnr_retry_count = &rnr_retry_count, - .kib_ib_mtu = &ib_mtu, - .kib_require_priv_port = &require_privileged_port, - .kib_use_priv_port = &use_privileged_port, - .kib_nscheds = &nscheds -}; - -static struct lnet_ioctl_config_o2iblnd_tunables default_tunables; - -/* # messages/RDMAs in-flight */ -int kiblnd_msg_queue_size(int version, struct lnet_ni *ni) -{ - if (version == IBLND_MSG_VERSION_1) - return IBLND_MSG_QUEUE_SIZE_V1; - else if (ni) - return ni->ni_peertxcredits; - else - return peer_credits; -} - -int kiblnd_tunables_setup(struct lnet_ni *ni) -{ - struct lnet_ioctl_config_o2iblnd_tunables *tunables; - - /* - * if there was no tunables specified, setup the tunables to be - * defaulted - */ - if (!ni->ni_lnd_tunables) { - ni->ni_lnd_tunables = kzalloc(sizeof(*ni->ni_lnd_tunables), - GFP_NOFS); - if (!ni->ni_lnd_tunables) - return -ENOMEM; - - memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib, - &default_tunables, sizeof(*tunables)); - } - tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib; - - /* Current API version */ - tunables->lnd_version = 0; - - if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) { - CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n", - *kiblnd_tunables.kib_ib_mtu); - return -EINVAL; - } - - if (!ni->ni_peertimeout) - ni->ni_peertimeout = peer_timeout; - - if (!ni->ni_maxtxcredits) - ni->ni_maxtxcredits = credits; - - if (!ni->ni_peertxcredits) - ni->ni_peertxcredits = peer_credits; - - if (!ni->ni_peerrtrcredits) - ni->ni_peerrtrcredits = peer_buffer_credits; - - if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT) - ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT; - - if (ni->ni_peertxcredits > 
IBLND_CREDITS_MAX) - ni->ni_peertxcredits = IBLND_CREDITS_MAX; - - if (ni->ni_peertxcredits > credits) - ni->ni_peertxcredits = credits; - - if (!tunables->lnd_peercredits_hiw) - tunables->lnd_peercredits_hiw = peer_credits_hiw; - - if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2) - tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2; - - if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits) - tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1; - - if (tunables->lnd_map_on_demand <= 0 || - tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) { - /* Use the default */ - CWARN("Invalid map_on_demand (%d), expects 1 - %d. Using default of %d\n", - tunables->lnd_map_on_demand, - IBLND_MAX_RDMA_FRAGS, IBLND_DEFAULT_MAP_ON_DEMAND); - tunables->lnd_map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND; - } - - if (tunables->lnd_map_on_demand == 1) { - /* don't make sense to create map if only one fragment */ - tunables->lnd_map_on_demand = 2; - } - - if (!tunables->lnd_concurrent_sends) { - if (tunables->lnd_map_on_demand > 0 && - tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) { - tunables->lnd_concurrent_sends = - ni->ni_peertxcredits * 2; - } else { - tunables->lnd_concurrent_sends = ni->ni_peertxcredits; - } - } - - if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2) - tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2; - - if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2) - tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2; - - if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) { - CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n", - tunables->lnd_concurrent_sends, ni->ni_peertxcredits); - } - - if (!tunables->lnd_fmr_pool_size) - tunables->lnd_fmr_pool_size = fmr_pool_size; - if (!tunables->lnd_fmr_flush_trigger) - tunables->lnd_fmr_flush_trigger = fmr_flush_trigger; - if (!tunables->lnd_fmr_cache) - tunables->lnd_fmr_cache = fmr_cache; - if (!tunables->lnd_conns_per_peer) { - tunables->lnd_conns_per_peer = (conns_per_peer) ? 
conns_per_peer : 1;
-	}
-
-	return 0;
-}
-
-void kiblnd_tunables_init(void)
-{
-	default_tunables.lnd_version = 0;
-	default_tunables.lnd_peercredits_hiw = peer_credits_hiw;
-	default_tunables.lnd_map_on_demand = map_on_demand;
-	default_tunables.lnd_concurrent_sends = concurrent_sends;
-	default_tunables.lnd_fmr_pool_size = fmr_pool_size;
-	default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
-	default_tunables.lnd_fmr_cache = fmr_cache;
-	default_tunables.lnd_conns_per_peer = conns_per_peer;
-}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
deleted file mode 100644
index a7da1abfc8042ca6ab8a387a93531778caafdd45..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include
-subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include
-
-obj-$(CONFIG_LNET) += ksocklnd.o
-
-ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib.o
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
deleted file mode 100644
index f01b34ac1a534fab4f168034cdafab0edc4a6c88..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ /dev/null
@@ -1,2921 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/klnds/socklnd/socklnd.c
- *
- * Author: Zach Brown
- * Author: Peter J.
Braam - * Author: Phil Schwan - * Author: Eric Barton - */ - -#include "socklnd.h" - -static struct lnet_lnd the_ksocklnd; -struct ksock_nal_data ksocknal_data; - -static struct ksock_interface * -ksocknal_ip2iface(struct lnet_ni *ni, __u32 ip) -{ - struct ksock_net *net = ni->ni_data; - int i; - struct ksock_interface *iface; - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(i < LNET_MAX_INTERFACES); - iface = &net->ksnn_interfaces[i]; - - if (iface->ksni_ipaddr == ip) - return iface; - } - - return NULL; -} - -static struct ksock_route * -ksocknal_create_route(__u32 ipaddr, int port) -{ - struct ksock_route *route; - - route = kzalloc(sizeof(*route), GFP_NOFS); - if (!route) - return NULL; - - atomic_set(&route->ksnr_refcount, 1); - route->ksnr_peer = NULL; - route->ksnr_retry_interval = 0; /* OK to connect at any time */ - route->ksnr_ipaddr = ipaddr; - route->ksnr_port = port; - route->ksnr_scheduled = 0; - route->ksnr_connecting = 0; - route->ksnr_connected = 0; - route->ksnr_deleted = 0; - route->ksnr_conn_count = 0; - route->ksnr_share_count = 0; - - return route; -} - -void -ksocknal_destroy_route(struct ksock_route *route) -{ - LASSERT(!atomic_read(&route->ksnr_refcount)); - - if (route->ksnr_peer) - ksocknal_peer_decref(route->ksnr_peer); - - kfree(route); -} - -static int -ksocknal_create_peer(struct ksock_peer **peerp, struct lnet_ni *ni, - struct lnet_process_id id) -{ - int cpt = lnet_cpt_of_nid(id.nid); - struct ksock_net *net = ni->ni_data; - struct ksock_peer *peer; - - LASSERT(id.nid != LNET_NID_ANY); - LASSERT(id.pid != LNET_PID_ANY); - LASSERT(!in_interrupt()); - - peer = kzalloc_cpt(sizeof(*peer), GFP_NOFS, cpt); - if (!peer) - return -ENOMEM; - - peer->ksnp_ni = ni; - peer->ksnp_id = id; - atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */ - peer->ksnp_closing = 0; - peer->ksnp_accepting = 0; - peer->ksnp_proto = NULL; - peer->ksnp_last_alive = 0; - peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; - - INIT_LIST_HEAD(&peer->ksnp_conns); - INIT_LIST_HEAD(&peer->ksnp_routes); - INIT_LIST_HEAD(&peer->ksnp_tx_queue); - INIT_LIST_HEAD(&peer->ksnp_zc_req_list); - spin_lock_init(&peer->ksnp_lock); - - spin_lock_bh(&net->ksnn_lock); - - if (net->ksnn_shutdown) { - spin_unlock_bh(&net->ksnn_lock); - - kfree(peer); - CERROR("Can't create peer: network shutdown\n"); - return -ESHUTDOWN; - } - - net->ksnn_npeers++; - - spin_unlock_bh(&net->ksnn_lock); - - *peerp = peer; - return 0; -} - -void -ksocknal_destroy_peer(struct ksock_peer *peer) -{ - struct ksock_net *net = peer->ksnp_ni->ni_data; - - CDEBUG(D_NET, "peer %s %p deleted\n", - libcfs_id2str(peer->ksnp_id), peer); - - LASSERT(!atomic_read(&peer->ksnp_refcount)); - LASSERT(!peer->ksnp_accepting); - LASSERT(list_empty(&peer->ksnp_conns)); - LASSERT(list_empty(&peer->ksnp_routes)); - LASSERT(list_empty(&peer->ksnp_tx_queue)); - LASSERT(list_empty(&peer->ksnp_zc_req_list)); - - kfree(peer); - - /* - * NB a peer's connections and routes keep a reference on their peer - * until they are destroyed, so we can be assured that _all_ state to - * do with this peer has been cleaned up when its refcount drops to - * zero. 
- */ - spin_lock_bh(&net->ksnn_lock); - net->ksnn_npeers--; - spin_unlock_bh(&net->ksnn_lock); -} - -struct ksock_peer * -ksocknal_find_peer_locked(struct lnet_ni *ni, struct lnet_process_id id) -{ - struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); - struct ksock_peer *peer; - - list_for_each_entry(peer, peer_list, ksnp_list) { - LASSERT(!peer->ksnp_closing); - - if (peer->ksnp_ni != ni) - continue; - - if (peer->ksnp_id.nid != id.nid || - peer->ksnp_id.pid != id.pid) - continue; - - CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", - peer, libcfs_id2str(id), - atomic_read(&peer->ksnp_refcount)); - return peer; - } - return NULL; -} - -struct ksock_peer * -ksocknal_find_peer(struct lnet_ni *ni, struct lnet_process_id id) -{ - struct ksock_peer *peer; - - read_lock(&ksocknal_data.ksnd_global_lock); - peer = ksocknal_find_peer_locked(ni, id); - if (peer) /* +1 ref for caller? */ - ksocknal_peer_addref(peer); - read_unlock(&ksocknal_data.ksnd_global_lock); - - return peer; -} - -static void -ksocknal_unlink_peer_locked(struct ksock_peer *peer) -{ - int i; - __u32 ip; - struct ksock_interface *iface; - - for (i = 0; i < peer->ksnp_n_passive_ips; i++) { - LASSERT(i < LNET_MAX_INTERFACES); - ip = peer->ksnp_passive_ips[i]; - - iface = ksocknal_ip2iface(peer->ksnp_ni, ip); - /* - * All IPs in peer->ksnp_passive_ips[] come from the - * interface list, therefore the call must succeed. - */ - LASSERT(iface); - - CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", - peer, iface, iface->ksni_nroutes); - iface->ksni_npeers--; - } - - LASSERT(list_empty(&peer->ksnp_conns)); - LASSERT(list_empty(&peer->ksnp_routes)); - LASSERT(!peer->ksnp_closing); - peer->ksnp_closing = 1; - list_del(&peer->ksnp_list); - /* lose peerlist's ref */ - ksocknal_peer_decref(peer); -} - -static int -ksocknal_get_peer_info(struct lnet_ni *ni, int index, - struct lnet_process_id *id, __u32 *myip, __u32 *peer_ip, - int *port, int *conn_count, int *share_count) -{ - struct ksock_peer *peer; - struct list_head *ptmp; - struct ksock_route *route; - struct list_head *rtmp; - int i; - int j; - int rc = -ENOENT; - - read_lock(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni != ni) - continue; - - if (!peer->ksnp_n_passive_ips && - list_empty(&peer->ksnp_routes)) { - if (index-- > 0) - continue; - - *id = peer->ksnp_id; - *myip = 0; - *peer_ip = 0; - *port = 0; - *conn_count = 0; - *share_count = 0; - rc = 0; - goto out; - } - - for (j = 0; j < peer->ksnp_n_passive_ips; j++) { - if (index-- > 0) - continue; - - *id = peer->ksnp_id; - *myip = peer->ksnp_passive_ips[j]; - *peer_ip = 0; - *port = 0; - *conn_count = 0; - *share_count = 0; - rc = 0; - goto out; - } - - list_for_each(rtmp, &peer->ksnp_routes) { - if (index-- > 0) - continue; - - route = list_entry(rtmp, struct ksock_route, - ksnr_list); - - *id = peer->ksnp_id; - *myip = route->ksnr_myipaddr; - *peer_ip = route->ksnr_ipaddr; - *port = route->ksnr_port; - *conn_count = route->ksnr_conn_count; - *share_count = route->ksnr_share_count; - rc = 0; - goto out; - } - } - } - out: - read_unlock(&ksocknal_data.ksnd_global_lock); - return rc; -} - -static void -ksocknal_associate_route_conn_locked(struct ksock_route *route, - struct ksock_conn *conn) -{ - struct ksock_peer *peer = route->ksnr_peer; - int type = conn->ksnc_type; - struct ksock_interface *iface; - - conn->ksnc_route = route; - 
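/*
 * Editor's note (illustrative, not in the original): the addref below is
 * the conn's own reference on the route.  The lifetime rule used
 * throughout this file is "link then addref, unlink then decref":
 *
 *	conn->ksnc_route = route;
 *	ksocknal_route_addref(route);	(conn's ref, dropped again in
 *					 ksocknal_close_conn_locked)
 *
 * and routes/conns in turn pin their peer, so a peer is only destroyed
 * once every route and conn referencing it is gone.
 */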
ksocknal_route_addref(route); - - if (route->ksnr_myipaddr != conn->ksnc_myipaddr) { - if (!route->ksnr_myipaddr) { - /* route wasn't bound locally yet (the initial route) */ - CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr, - &conn->ksnc_myipaddr); - } else { - CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr, - &route->ksnr_myipaddr, - &conn->ksnc_myipaddr); - - iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, - route->ksnr_myipaddr); - if (iface) - iface->ksni_nroutes--; - } - route->ksnr_myipaddr = conn->ksnc_myipaddr; - iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, - route->ksnr_myipaddr); - if (iface) - iface->ksni_nroutes++; - } - - route->ksnr_connected |= (1 << type); - route->ksnr_conn_count++; - - /* - * Successful connection => further attempts can - * proceed immediately - */ - route->ksnr_retry_interval = 0; -} - -static void -ksocknal_add_route_locked(struct ksock_peer *peer, struct ksock_route *route) -{ - struct list_head *tmp; - struct ksock_conn *conn; - struct ksock_route *route2; - - LASSERT(!peer->ksnp_closing); - LASSERT(!route->ksnr_peer); - LASSERT(!route->ksnr_scheduled); - LASSERT(!route->ksnr_connecting); - LASSERT(!route->ksnr_connected); - - /* LASSERT(unique) */ - list_for_each(tmp, &peer->ksnp_routes) { - route2 = list_entry(tmp, struct ksock_route, ksnr_list); - - if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { - CERROR("Duplicate route %s %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr); - LBUG(); - } - } - - route->ksnr_peer = peer; - ksocknal_peer_addref(peer); - /* peer's routelist takes over my ref on 'route' */ - list_add_tail(&route->ksnr_list, &peer->ksnp_routes); - - list_for_each(tmp, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn->ksnc_ipaddr != route->ksnr_ipaddr) - continue; - - ksocknal_associate_route_conn_locked(route, conn); - /* keep going (typed routes) */ - } -} - -static void -ksocknal_del_route_locked(struct ksock_route *route) -{ - struct ksock_peer *peer = route->ksnr_peer; - struct ksock_interface *iface; - struct ksock_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - - LASSERT(!route->ksnr_deleted); - - /* Close associated conns */ - list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); - - if (conn->ksnc_route != route) - continue; - - ksocknal_close_conn_locked(conn, 0); - } - - if (route->ksnr_myipaddr) { - iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, - route->ksnr_myipaddr); - if (iface) - iface->ksni_nroutes--; - } - - route->ksnr_deleted = 1; - list_del(&route->ksnr_list); - ksocknal_route_decref(route); /* drop peer's ref */ - - if (list_empty(&peer->ksnp_routes) && - list_empty(&peer->ksnp_conns)) { - /* - * I've just removed the last route to a peer with no active - * connections - */ - ksocknal_unlink_peer_locked(peer); - } -} - -int -ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ipaddr, - int port) -{ - struct ksock_peer *peer; - struct ksock_peer *peer2; - struct ksock_route *route; - struct ksock_route *route2; - int rc; - - if (id.nid == LNET_NID_ANY || - id.pid == LNET_PID_ANY) - return -EINVAL; - - /* Have a brand new peer ready... 
*/ - rc = ksocknal_create_peer(&peer, ni, id); - if (rc) - return rc; - - route = ksocknal_create_route(ipaddr, port); - if (!route) { - ksocknal_peer_decref(peer); - return -ENOMEM; - } - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - /* always called with a ref on ni, so shutdown can't have started */ - LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown); - - peer2 = ksocknal_find_peer_locked(ni, id); - if (peer2) { - ksocknal_peer_decref(peer); - peer = peer2; - } else { - /* peer table takes my ref on peer */ - list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(id.nid)); - } - - list_for_each_entry(route2, &peer->ksnp_routes, ksnr_list) { - if (route2->ksnr_ipaddr == ipaddr) { - /* Route already exists, use the old one */ - ksocknal_route_decref(route); - route2->ksnr_share_count++; - goto out; - } - } - /* Route doesn't already exist, add the new one */ - ksocknal_add_route_locked(peer, route); - route->ksnr_share_count++; -out: - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return 0; -} - -static void -ksocknal_del_peer_locked(struct ksock_peer *peer, __u32 ip) -{ - struct ksock_conn *conn; - struct ksock_route *route; - struct list_head *tmp; - struct list_head *nxt; - int nshared; - - LASSERT(!peer->ksnp_closing); - - /* Extra ref prevents peer disappearing until I'm done with it */ - ksocknal_peer_addref(peer); - - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - /* no match */ - if (!(!ip || route->ksnr_ipaddr == ip)) - continue; - - route->ksnr_share_count = 0; - /* This deletes associated conns too */ - ksocknal_del_route_locked(route); - } - - nshared = 0; - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - nshared += route->ksnr_share_count; - } - - if (!nshared) { - /* - * remove everything else if there are no explicit entries - * left - */ - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - /* we should only be removing auto-entries */ - LASSERT(!route->ksnr_share_count); - ksocknal_del_route_locked(route); - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - - ksocknal_close_conn_locked(conn, 0); - } - } - - ksocknal_peer_decref(peer); - /* NB peer unlinks itself when last conn/route is removed */ -} - -static int -ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip) -{ - LIST_HEAD(zombies); - struct list_head *ptmp; - struct list_head *pnxt; - struct ksock_peer *peer; - int lo; - int hi; - int i; - int rc = -ENOENT; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - if (id.nid != LNET_NID_ANY) { - lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - } else { - lo = 0; - hi = ksocknal_data.ksnd_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni != ni) - continue; - - if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && - (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) - continue; - - ksocknal_peer_addref(peer); /* a ref for me... 
*/ - - ksocknal_del_peer_locked(peer, ip); - - if (peer->ksnp_closing && - !list_empty(&peer->ksnp_tx_queue)) { - LASSERT(list_empty(&peer->ksnp_conns)); - LASSERT(list_empty(&peer->ksnp_routes)); - - list_splice_init(&peer->ksnp_tx_queue, - &zombies); - } - - ksocknal_peer_decref(peer); /* ...till here */ - - rc = 0; /* matched! */ - } - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - ksocknal_txlist_done(ni, &zombies, 1); - - return rc; -} - -static struct ksock_conn * -ksocknal_get_conn_by_idx(struct lnet_ni *ni, int index) -{ - struct ksock_peer *peer; - struct list_head *ptmp; - struct ksock_conn *conn; - struct list_head *ctmp; - int i; - - read_lock(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - LASSERT(!peer->ksnp_closing); - - if (peer->ksnp_ni != ni) - continue; - - list_for_each(ctmp, &peer->ksnp_conns) { - if (index-- > 0) - continue; - - conn = list_entry(ctmp, struct ksock_conn, - ksnc_list); - ksocknal_conn_addref(conn); - read_unlock(&ksocknal_data.ksnd_global_lock); - return conn; - } - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - return NULL; -} - -static struct ksock_sched * -ksocknal_choose_scheduler_locked(unsigned int cpt) -{ - struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; - struct ksock_sched *sched; - int i; - - LASSERT(info->ksi_nthreads > 0); - - sched = &info->ksi_scheds[0]; - /* - * NB: it's safe so far, but info->ksi_nthreads could be changed - * at runtime when we have dynamic LNet configuration, then we - * need to take care of this. - */ - for (i = 1; i < info->ksi_nthreads; i++) { - if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) - sched = &info->ksi_scheds[i]; - } - - return sched; -} - -static int -ksocknal_local_ipvec(struct lnet_ni *ni, __u32 *ipaddrs) -{ - struct ksock_net *net = ni->ni_data; - int i; - int nip; - - read_lock(&ksocknal_data.ksnd_global_lock); - - nip = net->ksnn_ninterfaces; - LASSERT(nip <= LNET_MAX_INTERFACES); - - /* - * Only offer interfaces for additional connections if I have - * more than one. - */ - if (nip < 2) { - read_unlock(&ksocknal_data.ksnd_global_lock); - return 0; - } - - for (i = 0; i < nip; i++) { - ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; - LASSERT(ipaddrs[i]); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - return nip; -} - -static int -ksocknal_match_peerip(struct ksock_interface *iface, __u32 *ips, int nips) -{ - int best_netmatch = 0; - int best_xor = 0; - int best = -1; - int this_xor; - int this_netmatch; - int i; - - for (i = 0; i < nips; i++) { - if (!ips[i]) - continue; - - this_xor = ips[i] ^ iface->ksni_ipaddr; - this_netmatch = !(this_xor & iface->ksni_netmask) ? 
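/*
 * Editor's note (illustrative, not in the original): this score prefers a
 * peer IP on the same subnet as the interface (this_netmatch == 1) and,
 * among those, the numerically closest address (smallest XOR).  E.g. with
 * netmask 255.255.255.0, peer 10.0.1.5 against iface 10.0.1.9 gives
 * this_xor = 0x0c and this_netmatch = 1, beating any off-subnet address.
 */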
1 : 0; - - if (!(best < 0 || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_xor > this_xor))) - continue; - - best = i; - best_netmatch = this_netmatch; - best_xor = this_xor; - } - - LASSERT(best >= 0); - return best; -} - -static int -ksocknal_select_ips(struct ksock_peer *peer, __u32 *peerips, int n_peerips) -{ - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct ksock_net *net = peer->ksnp_ni->ni_data; - struct ksock_interface *iface; - struct ksock_interface *best_iface; - int n_ips; - int i; - int j; - int k; - __u32 ip; - __u32 xor; - int this_netmatch; - int best_netmatch; - int best_npeers; - - /* - * CAVEAT EMPTOR: We do all our interface matching with an - * exclusive hold of global lock at IRQ priority. We're only - * expecting to be dealing with small numbers of interfaces, so the - * O(n**3)-ness shouldn't matter - */ - /* - * Also note that I'm not going to return more than n_peerips - * interfaces, even if I have more myself - */ - write_lock_bh(global_lock); - - LASSERT(n_peerips <= LNET_MAX_INTERFACES); - LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); - - /* - * Only match interfaces for additional connections - * if I have > 1 interface - */ - n_ips = (net->ksnn_ninterfaces < 2) ? 0 : - min(n_peerips, net->ksnn_ninterfaces); - - for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { - /* ^ yes really... */ - - /* - * If we have any new interfaces, first tick off all the - * peer IPs that match old interfaces, then choose new - * interfaces to match the remaining peer IPS. - * We don't forget interfaces we've stopped using; we might - * start using them again... - */ - if (i < peer->ksnp_n_passive_ips) { - /* Old interface. */ - ip = peer->ksnp_passive_ips[i]; - best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); - - /* peer passive ips are kept up to date */ - LASSERT(best_iface); - } else { - /* choose a new interface */ - LASSERT(i == peer->ksnp_n_passive_ips); - - best_iface = NULL; - best_netmatch = 0; - best_npeers = 0; - - for (j = 0; j < net->ksnn_ninterfaces; j++) { - iface = &net->ksnn_interfaces[j]; - ip = iface->ksni_ipaddr; - - for (k = 0; k < peer->ksnp_n_passive_ips; k++) - if (peer->ksnp_passive_ips[k] == ip) - break; - - if (k < peer->ksnp_n_passive_ips) /* using it already */ - continue; - - k = ksocknal_match_peerip(iface, peerips, - n_peerips); - xor = ip ^ peerips[k]; - this_netmatch = !(xor & iface->ksni_netmask) ? 
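/*
 * Editor's note (not in the original): the enclosing for() that the
 * "^ yes really..." comment above flags looks odd -- its condition is
 * peer->ksnp_n_passive_ips < n_ips rather than a bound on i -- but each
 * pass either revisits an already-bound passive IP (i < ksnp_n_passive_ips)
 * or binds a new interface and bumps ksnp_n_passive_ips to i + 1, so it
 * terminates after at most n_ips iterations.
 */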
1 : 0; - - if (!(!best_iface || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_npeers > iface->ksni_npeers))) - continue; - - best_iface = iface; - best_netmatch = this_netmatch; - best_npeers = iface->ksni_npeers; - } - - LASSERT(best_iface); - - best_iface->ksni_npeers++; - ip = best_iface->ksni_ipaddr; - peer->ksnp_passive_ips[i] = ip; - peer->ksnp_n_passive_ips = i + 1; - } - - /* mark the best matching peer IP used */ - j = ksocknal_match_peerip(best_iface, peerips, n_peerips); - peerips[j] = 0; - } - - /* Overwrite input peer IP addresses */ - memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); - - write_unlock_bh(global_lock); - - return n_ips; -} - -static void -ksocknal_create_routes(struct ksock_peer *peer, int port, - __u32 *peer_ipaddrs, int npeer_ipaddrs) -{ - struct ksock_route *newroute = NULL; - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - struct lnet_ni *ni = peer->ksnp_ni; - struct ksock_net *net = ni->ni_data; - struct list_head *rtmp; - struct ksock_route *route; - struct ksock_interface *iface; - struct ksock_interface *best_iface; - int best_netmatch; - int this_netmatch; - int best_nroutes; - int i; - int j; - - /* - * CAVEAT EMPTOR: We do all our interface matching with an - * exclusive hold of global lock at IRQ priority. We're only - * expecting to be dealing with small numbers of interfaces, so the - * O(n**3)-ness here shouldn't matter - */ - write_lock_bh(global_lock); - - if (net->ksnn_ninterfaces < 2) { - /* - * Only create additional connections - * if I have > 1 interface - */ - write_unlock_bh(global_lock); - return; - } - - LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES); - - for (i = 0; i < npeer_ipaddrs; i++) { - if (newroute) { - newroute->ksnr_ipaddr = peer_ipaddrs[i]; - } else { - write_unlock_bh(global_lock); - - newroute = ksocknal_create_route(peer_ipaddrs[i], port); - if (!newroute) - return; - - write_lock_bh(global_lock); - } - - if (peer->ksnp_closing) { - /* peer got closed under me */ - break; - } - - /* Already got a route? */ - route = NULL; - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, ksnr_list); - - if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) - break; - - route = NULL; - } - if (route) - continue; - - best_iface = NULL; - best_nroutes = 0; - best_netmatch = 0; - - LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); - - /* Select interface to connect from */ - for (j = 0; j < net->ksnn_ninterfaces; j++) { - iface = &net->ksnn_interfaces[j]; - - /* Using this interface already? */ - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, - ksnr_list); - - if (route->ksnr_myipaddr == iface->ksni_ipaddr) - break; - - route = NULL; - } - if (route) - continue; - - this_netmatch = (!((iface->ksni_ipaddr ^ - newroute->ksnr_ipaddr) & - iface->ksni_netmask)) ? 
1 : 0; - - if (!(!best_iface || - best_netmatch < this_netmatch || - (best_netmatch == this_netmatch && - best_nroutes > iface->ksni_nroutes))) - continue; - - best_iface = iface; - best_netmatch = this_netmatch; - best_nroutes = iface->ksni_nroutes; - } - - if (!best_iface) - continue; - - newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; - best_iface->ksni_nroutes++; - - ksocknal_add_route_locked(peer, newroute); - newroute = NULL; - } - - write_unlock_bh(global_lock); - if (newroute) - ksocknal_route_decref(newroute); -} - -int -ksocknal_accept(struct lnet_ni *ni, struct socket *sock) -{ - struct ksock_connreq *cr; - int rc; - __u32 peer_ip; - int peer_port; - - rc = lnet_sock_getaddr(sock, 1, &peer_ip, &peer_port); - LASSERT(!rc); /* we succeeded before */ - - cr = kzalloc(sizeof(*cr), GFP_NOFS); - if (!cr) { - LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n", - &peer_ip); - return -ENOMEM; - } - - lnet_ni_addref(ni); - cr->ksncr_ni = ni; - cr->ksncr_sock = sock; - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - - list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); - wake_up(&ksocknal_data.ksnd_connd_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - return 0; -} - -static int -ksocknal_connecting(struct ksock_peer *peer, __u32 ipaddr) -{ - struct ksock_route *route; - - list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) { - if (route->ksnr_ipaddr == ipaddr) - return route->ksnr_connecting; - } - return 0; -} - -int -ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, - struct socket *sock, int type) -{ - rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; - LIST_HEAD(zombies); - struct lnet_process_id peerid; - struct list_head *tmp; - __u64 incarnation; - struct ksock_conn *conn; - struct ksock_conn *conn2; - struct ksock_peer *peer = NULL; - struct ksock_peer *peer2; - struct ksock_sched *sched; - struct ksock_hello_msg *hello; - int cpt; - struct ksock_tx *tx; - struct ksock_tx *txtmp; - int rc; - int active; - char *warn = NULL; - - active = !!route; - - LASSERT(active == (type != SOCKLND_CONN_NONE)); - - conn = kzalloc(sizeof(*conn), GFP_NOFS); - if (!conn) { - rc = -ENOMEM; - goto failed_0; - } - - conn->ksnc_peer = NULL; - conn->ksnc_route = NULL; - conn->ksnc_sock = sock; - /* - * 2 ref, 1 for conn, another extra ref prevents socket - * being closed before establishment of connection - */ - atomic_set(&conn->ksnc_sock_refcount, 2); - conn->ksnc_type = type; - ksocknal_lib_save_callback(sock, conn); - atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ - - conn->ksnc_rx_ready = 0; - conn->ksnc_rx_scheduled = 0; - - INIT_LIST_HEAD(&conn->ksnc_tx_queue); - conn->ksnc_tx_ready = 0; - conn->ksnc_tx_scheduled = 0; - conn->ksnc_tx_carrier = NULL; - atomic_set(&conn->ksnc_tx_nob, 0); - - hello = kvzalloc(offsetof(struct ksock_hello_msg, - kshm_ips[LNET_MAX_INTERFACES]), - GFP_KERNEL); - if (!hello) { - rc = -ENOMEM; - goto failed_1; - } - - /* stash conn's local and remote addrs */ - rc = ksocknal_lib_get_conn_addrs(conn); - if (rc) - goto failed_1; - - /* - * Find out/confirm peer's NID and connection type and get the - * vector of interfaces she's willing to let me connect to. 
- * Passive connections use the listener timeout since the peer sends - * eagerly - */ - if (active) { - peer = route->ksnr_peer; - LASSERT(ni == peer->ksnp_ni); - - /* Active connection sends HELLO eagerly */ - hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); - peerid = peer->ksnp_id; - - write_lock_bh(global_lock); - conn->ksnc_proto = peer->ksnp_proto; - write_unlock_bh(global_lock); - - if (!conn->ksnc_proto) { - conn->ksnc_proto = &ksocknal_protocol_v3x; -#if SOCKNAL_VERSION_DEBUG - if (*ksocknal_tunables.ksnd_protocol == 2) - conn->ksnc_proto = &ksocknal_protocol_v2x; - else if (*ksocknal_tunables.ksnd_protocol == 1) - conn->ksnc_proto = &ksocknal_protocol_v1x; -#endif - } - - rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); - if (rc) - goto failed_1; - } else { - peerid.nid = LNET_NID_ANY; - peerid.pid = LNET_PID_ANY; - - /* Passive, get protocol from peer */ - conn->ksnc_proto = NULL; - } - - rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); - if (rc < 0) - goto failed_1; - - LASSERT(!rc || active); - LASSERT(conn->ksnc_proto); - LASSERT(peerid.nid != LNET_NID_ANY); - - cpt = lnet_cpt_of_nid(peerid.nid); - - if (active) { - ksocknal_peer_addref(peer); - write_lock_bh(global_lock); - } else { - rc = ksocknal_create_peer(&peer, ni, peerid); - if (rc) - goto failed_1; - - write_lock_bh(global_lock); - - /* called with a ref on ni, so shutdown can't have started */ - LASSERT(!((struct ksock_net *)ni->ni_data)->ksnn_shutdown); - - peer2 = ksocknal_find_peer_locked(ni, peerid); - if (!peer2) { - /* - * NB this puts an "empty" peer in the peer - * table (which takes my ref) - */ - list_add_tail(&peer->ksnp_list, - ksocknal_nid2peerlist(peerid.nid)); - } else { - ksocknal_peer_decref(peer); - peer = peer2; - } - - /* +1 ref for me */ - ksocknal_peer_addref(peer); - peer->ksnp_accepting++; - - /* - * Am I already connecting to this guy? Resolve in - * favour of higher NID... - */ - if (peerid.nid < ni->ni_nid && - ksocknal_connecting(peer, conn->ksnc_ipaddr)) { - rc = EALREADY; - warn = "connection race resolution"; - goto failed_2; - } - } - - if (peer->ksnp_closing || - (active && route->ksnr_deleted)) { - /* peer/route got closed under me */ - rc = -ESTALE; - warn = "peer/route removed"; - goto failed_2; - } - - if (!peer->ksnp_proto) { - /* - * Never connected before. - * NB recv_hello may have returned EPROTO to signal my peer - * wants a different protocol than the one I asked for. - */ - LASSERT(list_empty(&peer->ksnp_conns)); - - peer->ksnp_proto = conn->ksnc_proto; - peer->ksnp_incarnation = incarnation; - } - - if (peer->ksnp_proto != conn->ksnc_proto || - peer->ksnp_incarnation != incarnation) { - /* Peer rebooted or I've got the wrong protocol version */ - ksocknal_close_peer_conns_locked(peer, 0, 0); - - peer->ksnp_proto = NULL; - rc = ESTALE; - warn = peer->ksnp_incarnation != incarnation ? 
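/*
 * Editor's note (not in the original): the incarnation is stamped from
 * module-load time (see ksocknal_new_incarnation(), which returns
 * ktime_get_ns()), so a peer that reboots or reloads its module presents
 * a new incarnation and every conn from its previous life is closed above.
 */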
- "peer rebooted" : - "wrong proto version"; - goto failed_2; - } - - switch (rc) { - default: - LBUG(); - case 0: - break; - case EALREADY: - warn = "lost conn race"; - goto failed_2; - case EPROTO: - warn = "retry with different protocol version"; - goto failed_2; - } - - /* - * Refuse to duplicate an existing connection, unless this is a - * loopback connection - */ - if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || - conn2->ksnc_myipaddr != conn->ksnc_myipaddr || - conn2->ksnc_type != conn->ksnc_type) - continue; - - /* - * Reply on a passive connection attempt so the peer - * realises we're connected. - */ - LASSERT(!rc); - if (!active) - rc = EALREADY; - - warn = "duplicate"; - goto failed_2; - } - } - - /* - * If the connection created by this route didn't bind to the IP - * address the route connected to, the connection/route matching - * code below probably isn't going to work. - */ - if (active && - route->ksnr_ipaddr != conn->ksnc_ipaddr) { - CERROR("Route %s %pI4h connected to %pI4h\n", - libcfs_id2str(peer->ksnp_id), - &route->ksnr_ipaddr, - &conn->ksnc_ipaddr); - } - - /* - * Search for a route corresponding to the new connection and - * create an association. This allows incoming connections created - * by routes in my peer to match my own route entries so I don't - * continually create duplicate routes. - */ - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - if (route->ksnr_ipaddr != conn->ksnc_ipaddr) - continue; - - ksocknal_associate_route_conn_locked(route, conn); - break; - } - - conn->ksnc_peer = peer; /* conn takes my ref on peer */ - peer->ksnp_last_alive = jiffies; - peer->ksnp_send_keepalive = 0; - peer->ksnp_error = 0; - - sched = ksocknal_choose_scheduler_locked(cpt); - sched->kss_nconns++; - conn->ksnc_scheduler = sched; - - conn->ksnc_tx_last_post = jiffies; - /* Set the deadline for the outgoing HELLO to drain */ - conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; - conn->ksnc_tx_deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - mb(); /* order with adding to peer's conn list */ - - list_add(&conn->ksnc_list, &peer->ksnp_conns); - ksocknal_conn_addref(conn); - - ksocknal_new_packet(conn, 0); - - conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); - - /* Take packets blocking for this connection. */ - list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { - int match = conn->ksnc_proto->pro_match_tx(conn, tx, - tx->tx_nonblk); - - if (match == SOCKNAL_MATCH_NO) - continue; - - list_del(&tx->tx_list); - ksocknal_queue_tx_locked(tx, conn); - } - - write_unlock_bh(global_lock); - - /* - * We've now got a new connection. Any errors from here on are just - * like "normal" comms errors and we close the connection normally. - * NB (a) we still have to send the reply HELLO for passive - * connections, - * (b) normal I/O on the conn is blocked until I setup and call the - * socket callbacks. - */ - CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n", - libcfs_id2str(peerid), conn->ksnc_proto->pro_version, - &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, - conn->ksnc_port, incarnation, cpt, - (int)(sched - &sched->kss_info->ksi_scheds[0])); - - if (active) { - /* additional routes after interface exchange? 
*/ - ksocknal_create_routes(peer, conn->ksnc_port, - hello->kshm_ips, hello->kshm_nips); - } else { - hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, - hello->kshm_nips); - rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); - } - - kvfree(hello); - - /* - * setup the socket AFTER I've received hello (it disables - * SO_LINGER). I might call back to the acceptor who may want - * to send a protocol version response and then close the - * socket; this ensures the socket only tears down after the - * response has been sent. - */ - if (!rc) - rc = ksocknal_lib_setup_sock(sock); - - write_lock_bh(global_lock); - - /* NB my callbacks block while I hold ksnd_global_lock */ - ksocknal_lib_set_callback(sock, conn); - - if (!active) - peer->ksnp_accepting--; - - write_unlock_bh(global_lock); - - if (rc) { - write_lock_bh(global_lock); - if (!conn->ksnc_closing) { - /* could be closed by another thread */ - ksocknal_close_conn_locked(conn, rc); - } - write_unlock_bh(global_lock); - } else if (!ksocknal_connsock_addref(conn)) { - /* Allow I/O to proceed. */ - ksocknal_read_callback(conn); - ksocknal_write_callback(conn); - ksocknal_connsock_decref(conn); - } - - ksocknal_connsock_decref(conn); - ksocknal_conn_decref(conn); - return rc; - - failed_2: - if (!peer->ksnp_closing && - list_empty(&peer->ksnp_conns) && - list_empty(&peer->ksnp_routes)) { - list_add(&zombies, &peer->ksnp_tx_queue); - list_del_init(&peer->ksnp_tx_queue); - ksocknal_unlink_peer_locked(peer); - } - - write_unlock_bh(global_lock); - - if (warn) { - if (rc < 0) - CERROR("Not creating conn %s type %d: %s\n", - libcfs_id2str(peerid), conn->ksnc_type, warn); - else - CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", - libcfs_id2str(peerid), conn->ksnc_type, warn); - } - - if (!active) { - if (rc > 0) { - /* - * Request retry by replying with CONN_NONE - * ksnc_proto has been set already - */ - conn->ksnc_type = SOCKLND_CONN_NONE; - hello->kshm_nips = 0; - ksocknal_send_hello(ni, conn, peerid.nid, hello); - } - - write_lock_bh(global_lock); - peer->ksnp_accepting--; - write_unlock_bh(global_lock); - } - - ksocknal_txlist_done(ni, &zombies, 1); - ksocknal_peer_decref(peer); - -failed_1: - kvfree(hello); - - kfree(conn); - -failed_0: - sock_release(sock); - return rc; -} - -void -ksocknal_close_conn_locked(struct ksock_conn *conn, int error) -{ - /* - * This just does the immmediate housekeeping, and queues the - * connection for the reaper to terminate. - * Caller holds ksnd_global_lock exclusively in irq context - */ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_route *route; - struct ksock_conn *conn2; - struct list_head *tmp; - - LASSERT(!peer->ksnp_error); - LASSERT(!conn->ksnc_closing); - conn->ksnc_closing = 1; - - /* ksnd_deathrow_conns takes over peer's ref */ - list_del(&conn->ksnc_list); - - route = conn->ksnc_route; - if (route) { - /* dissociate conn from route... 
*/ - LASSERT(!route->ksnr_deleted); - LASSERT(route->ksnr_connected & (1 << conn->ksnc_type)); - - conn2 = NULL; - list_for_each(tmp, &peer->ksnp_conns) { - conn2 = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn2->ksnc_route == route && - conn2->ksnc_type == conn->ksnc_type) - break; - - conn2 = NULL; - } - if (!conn2) - route->ksnr_connected &= ~(1 << conn->ksnc_type); - - conn->ksnc_route = NULL; - - ksocknal_route_decref(route); /* drop conn's ref on route */ - } - - if (list_empty(&peer->ksnp_conns)) { - /* No more connections to this peer */ - - if (!list_empty(&peer->ksnp_tx_queue)) { - struct ksock_tx *tx; - - LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); - - /* - * throw them to the last connection..., - * these TXs will be send to /dev/null by scheduler - */ - list_for_each_entry(tx, &peer->ksnp_tx_queue, - tx_list) - ksocknal_tx_prep(conn, tx); - - spin_lock_bh(&conn->ksnc_scheduler->kss_lock); - list_splice_init(&peer->ksnp_tx_queue, - &conn->ksnc_tx_queue); - spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); - } - - peer->ksnp_proto = NULL; /* renegotiate protocol version */ - peer->ksnp_error = error; /* stash last conn close reason */ - - if (list_empty(&peer->ksnp_routes)) { - /* - * I've just closed last conn belonging to a - * peer with no routes to it - */ - ksocknal_unlink_peer_locked(peer); - } - } - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - list_add_tail(&conn->ksnc_list, - &ksocknal_data.ksnd_deathrow_conns); - wake_up(&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); -} - -void -ksocknal_peer_failed(struct ksock_peer *peer) -{ - int notify = 0; - unsigned long last_alive = 0; - - /* - * There has been a connection failure or comms error; but I'll only - * tell LNET I think the peer is dead if it's to another kernel and - * there are no connections or connection attempts in existence. - */ - read_lock(&ksocknal_data.ksnd_global_lock); - - if (!(peer->ksnp_id.pid & LNET_PID_USERFLAG) && - list_empty(&peer->ksnp_conns) && - !peer->ksnp_accepting && - !ksocknal_find_connecting_route_locked(peer)) { - notify = 1; - last_alive = peer->ksnp_last_alive; - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - if (notify) - lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0, - last_alive); -} - -void -ksocknal_finalize_zcreq(struct ksock_conn *conn) -{ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_tx *tx; - struct ksock_tx *temp; - struct ksock_tx *tmp; - LIST_HEAD(zlist); - - /* - * NB safe to finalize TXs because closing of socket will - * abort all buffered data - */ - LASSERT(!conn->ksnc_sock); - - spin_lock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) { - if (tx->tx_conn != conn) - continue; - - LASSERT(tx->tx_msg.ksm_zc_cookies[0]); - - tx->tx_msg.ksm_zc_cookies[0] = 0; - tx->tx_zc_aborted = 1; /* mark it as not-acked */ - list_del(&tx->tx_zc_list); - list_add(&tx->tx_zc_list, &zlist); - } - - spin_unlock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) { - list_del(&tx->tx_zc_list); - ksocknal_tx_decref(tx); - } -} - -void -ksocknal_terminate_conn(struct ksock_conn *conn) -{ - /* - * This gets called by the reaper (guaranteed thread context) to - * disengage the socket from its callbacks and close it. - * ksnc_refcount will eventually hit zero, and then the reaper will - * destroy it. 
- */ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_sched *sched = conn->ksnc_scheduler; - int failed = 0; - - LASSERT(conn->ksnc_closing); - - /* wake up the scheduler to "send" all remaining packets to /dev/null */ - spin_lock_bh(&sched->kss_lock); - - /* a closing conn is always ready to tx */ - conn->ksnc_tx_ready = 1; - - if (!conn->ksnc_tx_scheduled && - !list_empty(&conn->ksnc_tx_queue)) { - list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - /* extra ref for scheduler */ - ksocknal_conn_addref(conn); - - wake_up(&sched->kss_waitq); - } - - spin_unlock_bh(&sched->kss_lock); - - /* serialise with callbacks */ - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - ksocknal_lib_reset_callback(conn->ksnc_sock, conn); - - /* - * OK, so this conn may not be completely disengaged from its - * scheduler yet, but it _has_ committed to terminate... - */ - conn->ksnc_scheduler->kss_nconns--; - - if (peer->ksnp_error) { - /* peer's last conn closed in error */ - LASSERT(list_empty(&peer->ksnp_conns)); - failed = 1; - peer->ksnp_error = 0; /* avoid multiple notifications */ - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - if (failed) - ksocknal_peer_failed(peer); - - /* - * The socket is closed on the final put; either here, or in - * ksocknal_{send,recv}msg(). Since we set up the linger2 option - * when the connection was established, this will close the socket - * immediately, aborting anything buffered in it. Any hung - * zero-copy transmits will therefore complete in finite time. - */ - ksocknal_connsock_decref(conn); -} - -void -ksocknal_queue_zombie_conn(struct ksock_conn *conn) -{ - /* Queue the conn for the reaper to destroy */ - - LASSERT(!atomic_read(&conn->ksnc_conn_refcount)); - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); - wake_up(&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); -} - -void -ksocknal_destroy_conn(struct ksock_conn *conn) -{ - unsigned long last_rcv; - - /* Final coup-de-grace of the reaper */ - CDEBUG(D_NET, "connection %p\n", conn); - - LASSERT(!atomic_read(&conn->ksnc_conn_refcount)); - LASSERT(!atomic_read(&conn->ksnc_sock_refcount)); - LASSERT(!conn->ksnc_sock); - LASSERT(!conn->ksnc_route); - LASSERT(!conn->ksnc_tx_scheduled); - LASSERT(!conn->ksnc_rx_scheduled); - LASSERT(list_empty(&conn->ksnc_tx_queue)); - - /* complete current receive if any */ - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_LNET_PAYLOAD: - last_rcv = conn->ksnc_rx_deadline - - *ksocknal_tunables.ksnd_timeout * HZ; - CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %zd, left: %d, last alive is %ld secs ago\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, - &conn->ksnc_ipaddr, conn->ksnc_port, - iov_iter_count(&conn->ksnc_rx_to), conn->ksnc_rx_nob_left, - (jiffies - last_rcv) / HZ); - lnet_finalize(conn->ksnc_peer->ksnp_ni, - conn->ksnc_cookie, -EIO); - break; - case SOCKNAL_RX_LNET_HEADER: - if (conn->ksnc_rx_started) - CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port, - conn->ksnc_proto->pro_version); - break; - case SOCKNAL_RX_KSM_HEADER: - if (conn->ksnc_rx_started) - CERROR("Incomplete receive of ksock message from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, 
conn->ksnc_port, - conn->ksnc_proto->pro_version); - break; - case SOCKNAL_RX_SLOP: - if (conn->ksnc_rx_started) - CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port); - break; - default: - LBUG(); - break; - } - - ksocknal_peer_decref(conn->ksnc_peer); - - kfree(conn); -} - -int -ksocknal_close_peer_conns_locked(struct ksock_peer *peer, __u32 ipaddr, int why) -{ - struct ksock_conn *conn; - struct list_head *ctmp; - struct list_head *cnxt; - int count = 0; - - list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { - conn = list_entry(ctmp, struct ksock_conn, ksnc_list); - - if (!ipaddr || conn->ksnc_ipaddr == ipaddr) { - count++; - ksocknal_close_conn_locked(conn, why); - } - } - - return count; -} - -int -ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why) -{ - struct ksock_peer *peer = conn->ksnc_peer; - __u32 ipaddr = conn->ksnc_ipaddr; - int count; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - count = ksocknal_close_peer_conns_locked(peer, ipaddr, why); - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return count; -} - -int -ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr) -{ - struct ksock_peer *peer; - struct list_head *ptmp; - struct list_head *pnxt; - int lo; - int hi; - int i; - int count = 0; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - if (id.nid != LNET_NID_ANY) { - lo = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); - } else { - lo = 0; - hi = ksocknal_data.ksnd_peer_hash_size - 1; - } - - for (i = lo; i <= hi; i++) { - list_for_each_safe(ptmp, pnxt, - &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, ksnp_list); - - if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && - (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) - continue; - - count += ksocknal_close_peer_conns_locked(peer, ipaddr, - 0); - } - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - /* wildcards always succeed */ - if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || !ipaddr) - return 0; - - if (!count) - return -ENOENT; - else - return 0; -} - -void -ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive) -{ - /* - * The router is telling me she's been notified of a change in - * gateway state.... - */ - struct lnet_process_id id = {0}; - - id.nid = gw_nid; - id.pid = LNET_PID_ANY; - - CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), - alive ? "up" : "down"); - - if (!alive) { - /* If the gateway crashed, close all open connections... */ - ksocknal_close_matching_conns(id, 0); - return; - } - - /* - * ...otherwise do nothing. We can only establish new connections - * if we have autroutes, and these connect on demand. 
- */ -} - -void -ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when) -{ - int connect = 1; - unsigned long last_alive = 0; - unsigned long now = jiffies; - struct ksock_peer *peer = NULL; - rwlock_t *glock = &ksocknal_data.ksnd_global_lock; - struct lnet_process_id id = { - .nid = nid, - .pid = LNET_PID_LUSTRE, - }; - - read_lock(glock); - - peer = ksocknal_find_peer_locked(ni, id); - if (peer) { - struct ksock_conn *conn; - int bufnob; - - list_for_each_entry(conn, &peer->ksnp_conns, ksnc_list) { - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - - if (bufnob < conn->ksnc_tx_bufnob) { - /* something got ACKed */ - conn->ksnc_tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - peer->ksnp_last_alive = now; - conn->ksnc_tx_bufnob = bufnob; - } - } - - last_alive = peer->ksnp_last_alive; - if (!ksocknal_find_connectable_route_locked(peer)) - connect = 0; - } - - read_unlock(glock); - - if (last_alive) - *when = last_alive; - - CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", - libcfs_nid2str(nid), peer, - last_alive ? (now - last_alive) / HZ : -1, - connect); - - if (!connect) - return; - - ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); - - write_lock_bh(glock); - - peer = ksocknal_find_peer_locked(ni, id); - if (peer) - ksocknal_launch_all_connections_locked(peer); - - write_unlock_bh(glock); -} - -static void -ksocknal_push_peer(struct ksock_peer *peer) -{ - int index; - int i; - struct list_head *tmp; - struct ksock_conn *conn; - - for (index = 0; ; index++) { - read_lock(&ksocknal_data.ksnd_global_lock); - - i = 0; - conn = NULL; - - list_for_each(tmp, &peer->ksnp_conns) { - if (i++ == index) { - conn = list_entry(tmp, struct ksock_conn, - ksnc_list); - ksocknal_conn_addref(conn); - break; - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - if (!conn) - break; - - ksocknal_lib_push_conn(conn); - ksocknal_conn_decref(conn); - } -} - -static int ksocknal_push(struct lnet_ni *ni, struct lnet_process_id id) -{ - struct list_head *start; - struct list_head *end; - struct list_head *tmp; - int rc = -ENOENT; - unsigned int hsize = ksocknal_data.ksnd_peer_hash_size; - - if (id.nid == LNET_NID_ANY) { - start = &ksocknal_data.ksnd_peers[0]; - end = &ksocknal_data.ksnd_peers[hsize - 1]; - } else { - start = ksocknal_nid2peerlist(id.nid); - end = ksocknal_nid2peerlist(id.nid); - } - - for (tmp = start; tmp <= end; tmp++) { - int peer_off; /* searching offset in peer hash table */ - - for (peer_off = 0; ; peer_off++) { - struct ksock_peer *peer; - int i = 0; - - read_lock(&ksocknal_data.ksnd_global_lock); - list_for_each_entry(peer, tmp, ksnp_list) { - if (!((id.nid == LNET_NID_ANY || - id.nid == peer->ksnp_id.nid) && - (id.pid == LNET_PID_ANY || - id.pid == peer->ksnp_id.pid))) - continue; - - if (i++ == peer_off) { - ksocknal_peer_addref(peer); - break; - } - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - if (!i) /* no match */ - break; - - rc = 0; - ksocknal_push_peer(peer); - ksocknal_peer_decref(peer); - } - } - return rc; -} - -static int -ksocknal_add_interface(struct lnet_ni *ni, __u32 ipaddress, __u32 netmask) -{ - struct ksock_net *net = ni->ni_data; - struct ksock_interface *iface; - int rc; - int i; - int j; - struct list_head *ptmp; - struct ksock_peer *peer; - struct list_head *rtmp; - struct ksock_route *route; - - if (!ipaddress || !netmask) - return -EINVAL; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - iface = ksocknal_ip2iface(ni, ipaddress); - if (iface) { - /* silently ignore dups */ - rc 
= 0; - } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { - rc = -ENOSPC; - } else { - iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; - - iface->ksni_ipaddr = ipaddress; - iface->ksni_netmask = netmask; - iface->ksni_nroutes = 0; - iface->ksni_npeers = 0; - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(ptmp, struct ksock_peer, - ksnp_list); - - for (j = 0; j < peer->ksnp_n_passive_ips; j++) - if (peer->ksnp_passive_ips[j] == ipaddress) - iface->ksni_npeers++; - - list_for_each(rtmp, &peer->ksnp_routes) { - route = list_entry(rtmp, struct ksock_route, - ksnr_list); - - if (route->ksnr_myipaddr == ipaddress) - iface->ksni_nroutes++; - } - } - } - - rc = 0; - /* - * NB only new connections will pay attention to the - * new interface! - */ - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return rc; -} - -static void -ksocknal_peer_del_interface_locked(struct ksock_peer *peer, __u32 ipaddr) -{ - struct list_head *tmp; - struct list_head *nxt; - struct ksock_route *route; - struct ksock_conn *conn; - int i; - int j; - - for (i = 0; i < peer->ksnp_n_passive_ips; i++) - if (peer->ksnp_passive_ips[i] == ipaddr) { - for (j = i + 1; j < peer->ksnp_n_passive_ips; j++) - peer->ksnp_passive_ips[j - 1] = - peer->ksnp_passive_ips[j]; - peer->ksnp_n_passive_ips--; - break; - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - if (route->ksnr_myipaddr != ipaddr) - continue; - - if (route->ksnr_share_count) { - /* Manually created; keep, but unbind */ - route->ksnr_myipaddr = 0; - } else { - ksocknal_del_route_locked(route); - } - } - - list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - - if (conn->ksnc_myipaddr == ipaddr) - ksocknal_close_conn_locked(conn, 0); - } -} - -static int -ksocknal_del_interface(struct lnet_ni *ni, __u32 ipaddress) -{ - struct ksock_net *net = ni->ni_data; - int rc = -ENOENT; - struct list_head *tmp; - struct list_head *nxt; - struct ksock_peer *peer; - __u32 this_ip; - int i; - int j; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - this_ip = net->ksnn_interfaces[i].ksni_ipaddr; - - if (!(!ipaddress || ipaddress == this_ip)) - continue; - - rc = 0; - - for (j = i + 1; j < net->ksnn_ninterfaces; j++) - net->ksnn_interfaces[j - 1] = - net->ksnn_interfaces[j]; - - net->ksnn_ninterfaces--; - - for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { - list_for_each_safe(tmp, nxt, - &ksocknal_data.ksnd_peers[j]) { - peer = list_entry(tmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni != ni) - continue; - - ksocknal_peer_del_interface_locked(peer, this_ip); - } - } - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - return rc; -} - -int -ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg) -{ - struct lnet_process_id id = {0}; - struct libcfs_ioctl_data *data = arg; - int rc; - - switch (cmd) { - case IOC_LIBCFS_GET_INTERFACE: { - struct ksock_net *net = ni->ni_data; - struct ksock_interface *iface; - - read_lock(&ksocknal_data.ksnd_global_lock); - - if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { - rc = -ENOENT; - } else { - rc = 0; - iface = &net->ksnn_interfaces[data->ioc_count]; - - data->ioc_u32[0] = iface->ksni_ipaddr; - data->ioc_u32[1] = iface->ksni_netmask; - data->ioc_u32[2] = iface->ksni_npeers; - data->ioc_u32[3] = iface->ksni_nroutes; - } - - 
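/*
 * Editor's note (illustrative, not in the original): ioc_count is used as
 * an enumeration index here -- userspace walks it 0, 1, 2, ... until
 * -ENOENT comes back, and the same convention drives IOC_LIBCFS_GET_PEER
 * and IOC_LIBCFS_GET_CONN below.  Hypothetical caller-side sketch:
 *
 *	for (i = 0; ; i++) {
 *		data.ioc_count = i;
 *		if (do_libcfs_ioctl(IOC_LIBCFS_GET_INTERFACE, &data) < 0)
 *			break;			(-ENOENT: past the end)
 *		use(data.ioc_u32[0]);		(interface IP address)
 *	}
 */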
read_unlock(&ksocknal_data.ksnd_global_lock); - return rc; - } - - case IOC_LIBCFS_ADD_INTERFACE: - return ksocknal_add_interface(ni, - data->ioc_u32[0], /* IP address */ - data->ioc_u32[1]); /* net mask */ - - case IOC_LIBCFS_DEL_INTERFACE: - return ksocknal_del_interface(ni, - data->ioc_u32[0]); /* IP address */ - - case IOC_LIBCFS_GET_PEER: { - __u32 myip = 0; - __u32 ip = 0; - int port = 0; - int conn_count = 0; - int share_count = 0; - - rc = ksocknal_get_peer_info(ni, data->ioc_count, - &id, &myip, &ip, &port, - &conn_count, &share_count); - if (rc) - return rc; - - data->ioc_nid = id.nid; - data->ioc_count = share_count; - data->ioc_u32[0] = ip; - data->ioc_u32[1] = port; - data->ioc_u32[2] = myip; - data->ioc_u32[3] = conn_count; - data->ioc_u32[4] = id.pid; - return 0; - } - - case IOC_LIBCFS_ADD_PEER: - id.nid = data->ioc_nid; - id.pid = LNET_PID_LUSTRE; - return ksocknal_add_peer(ni, id, - data->ioc_u32[0], /* IP */ - data->ioc_u32[1]); /* port */ - - case IOC_LIBCFS_DEL_PEER: - id.nid = data->ioc_nid; - id.pid = LNET_PID_ANY; - return ksocknal_del_peer(ni, id, - data->ioc_u32[0]); /* IP */ - - case IOC_LIBCFS_GET_CONN: { - int txmem; - int rxmem; - int nagle; - struct ksock_conn *conn; - - conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); - if (!conn) - return -ENOENT; - - ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); - - data->ioc_count = txmem; - data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; - data->ioc_flags = nagle; - data->ioc_u32[0] = conn->ksnc_ipaddr; - data->ioc_u32[1] = conn->ksnc_port; - data->ioc_u32[2] = conn->ksnc_myipaddr; - data->ioc_u32[3] = conn->ksnc_type; - data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; - data->ioc_u32[5] = rxmem; - data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; - ksocknal_conn_decref(conn); - return 0; - } - - case IOC_LIBCFS_CLOSE_CONNECTION: - id.nid = data->ioc_nid; - id.pid = LNET_PID_ANY; - return ksocknal_close_matching_conns(id, - data->ioc_u32[0]); - - case IOC_LIBCFS_REGISTER_MYNID: - /* Ignore if this is a noop */ - if (data->ioc_nid == ni->ni_nid) - return 0; - - CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", - libcfs_nid2str(data->ioc_nid), - libcfs_nid2str(ni->ni_nid)); - return -EINVAL; - - case IOC_LIBCFS_PUSH_CONNECTION: - id.nid = data->ioc_nid; - id.pid = LNET_PID_ANY; - return ksocknal_push(ni, id); - - default: - return -EINVAL; - } - /* not reached */ -} - -static void -ksocknal_free_buffers(void) -{ - LASSERT(!atomic_read(&ksocknal_data.ksnd_nactive_txs)); - - if (ksocknal_data.ksnd_sched_info) { - struct ksock_sched_info *info; - int i; - - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) - kfree(info->ksi_scheds); - cfs_percpt_free(ksocknal_data.ksnd_sched_info); - } - - kvfree(ksocknal_data.ksnd_peers); - - spin_lock(&ksocknal_data.ksnd_tx_lock); - - if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - struct list_head zlist; - struct ksock_tx *tx; - struct ksock_tx *temp; - - list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); - list_del_init(&ksocknal_data.ksnd_idle_noop_txs); - spin_unlock(&ksocknal_data.ksnd_tx_lock); - - list_for_each_entry_safe(tx, temp, &zlist, tx_list) { - list_del(&tx->tx_list); - kfree(tx); - } - } else { - spin_unlock(&ksocknal_data.ksnd_tx_lock); - } -} - -static void -ksocknal_base_shutdown(void) -{ - struct ksock_sched_info *info; - struct ksock_sched *sched; - int i; - int j; - - LASSERT(!ksocknal_data.ksnd_nnets); - - switch (ksocknal_data.ksnd_init) { - default: - LASSERT(0); - /* fall through */ - case SOCKNAL_INIT_ALL: - 
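/*
 * Editor's note (not in the original): SOCKNAL_INIT_ALL shares the
 * SOCKNAL_INIT_DATA teardown -- by this point all nets are gone (see the
 * LASSERT(!ksocknal_data.ksnd_nnets) above), so a fully initialised NAL
 * tears down exactly the base state handled in the case below.
 */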
case SOCKNAL_INIT_DATA: - LASSERT(ksocknal_data.ksnd_peers); - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) - LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); - - LASSERT(list_empty(&ksocknal_data.ksnd_nets)); - LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); - LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); - LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); - LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); - - if (ksocknal_data.ksnd_sched_info) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (!info->ksi_scheds) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { - sched = &info->ksi_scheds[j]; - LASSERT(list_empty( - &sched->kss_tx_conns)); - LASSERT(list_empty( - &sched->kss_rx_conns)); - LASSERT(list_empty( - &sched->kss_zombie_noop_txs)); - LASSERT(!sched->kss_nconns); - } - } - } - - /* flag threads to terminate; wake and wait for them to die */ - ksocknal_data.ksnd_shuttingdown = 1; - wake_up_all(&ksocknal_data.ksnd_connd_waitq); - wake_up_all(&ksocknal_data.ksnd_reaper_waitq); - - if (ksocknal_data.ksnd_sched_info) { - cfs_percpt_for_each(info, i, - ksocknal_data.ksnd_sched_info) { - if (!info->ksi_scheds) - continue; - - for (j = 0; j < info->ksi_nthreads_max; j++) { - sched = &info->ksi_scheds[j]; - wake_up_all(&sched->kss_waitq); - } - } - } - - i = 4; - read_lock(&ksocknal_data.ksnd_global_lock); - while (ksocknal_data.ksnd_nthreads) { - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ - "waiting for %d threads to terminate\n", - ksocknal_data.ksnd_nthreads); - read_unlock(&ksocknal_data.ksnd_global_lock); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - read_lock(&ksocknal_data.ksnd_global_lock); - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - ksocknal_free_buffers(); - - ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; - break; - } - - module_put(THIS_MODULE); -} - -static __u64 -ksocknal_new_incarnation(void) -{ - /* The incarnation number is the time this module loaded and it - * identifies this particular instance of the socknal. 
- */ - return ktime_get_ns(); -} - -static int -ksocknal_base_startup(void) -{ - struct ksock_sched_info *info; - int rc; - int i; - - LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); - LASSERT(!ksocknal_data.ksnd_nnets); - - memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ - - ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; - ksocknal_data.ksnd_peers = kvmalloc_array(ksocknal_data.ksnd_peer_hash_size, - sizeof(struct list_head), - GFP_KERNEL); - if (!ksocknal_data.ksnd_peers) - return -ENOMEM; - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) - INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); - - rwlock_init(&ksocknal_data.ksnd_global_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); - - spin_lock_init(&ksocknal_data.ksnd_reaper_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); - INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); - INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); - init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); - - spin_lock_init(&ksocknal_data.ksnd_connd_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); - INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); - init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); - - spin_lock_init(&ksocknal_data.ksnd_tx_lock); - INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); - - /* NB memset above zeros whole of ksocknal_data */ - - /* flag lists/ptrs/locks initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; - try_module_get(THIS_MODULE); - - ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*info)); - if (!ksocknal_data.ksnd_sched_info) - goto failed; - - cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { - struct ksock_sched *sched; - int nthrs; - - nthrs = cfs_cpt_weight(lnet_cpt_table(), i); - if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); - } else { - /* - * max to half of CPUs, assume another half should be - * reserved for upper layer modules - */ - nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); - } - - info->ksi_nthreads_max = nthrs; - info->ksi_cpt = i; - - info->ksi_scheds = kzalloc_cpt(info->ksi_nthreads_max * sizeof(*sched), - GFP_NOFS, i); - if (!info->ksi_scheds) - goto failed; - - for (; nthrs > 0; nthrs--) { - sched = &info->ksi_scheds[nthrs - 1]; - - sched->kss_info = info; - spin_lock_init(&sched->kss_lock); - INIT_LIST_HEAD(&sched->kss_rx_conns); - INIT_LIST_HEAD(&sched->kss_tx_conns); - INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); - init_waitqueue_head(&sched->kss_waitq); - } - } - - ksocknal_data.ksnd_connd_starting = 0; - ksocknal_data.ksnd_connd_failed_stamp = 0; - ksocknal_data.ksnd_connd_starting_stamp = ktime_get_real_seconds(); - /* - * must have at least 2 connds to remain responsive to accepts while - * connecting - */ - if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) - *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; - - if (*ksocknal_tunables.ksnd_nconnds_max < - *ksocknal_tunables.ksnd_nconnds) { - ksocknal_tunables.ksnd_nconnds_max = - ksocknal_tunables.ksnd_nconnds; - } - - for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { - char name[16]; - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - ksocknal_data.ksnd_connd_starting++; - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - - snprintf(name, sizeof(name), "socknal_cd%02d", i); - rc = ksocknal_thread_start(ksocknal_connd, - (void *)((uintptr_t)i), name); - if (rc) { - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - 
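/*
 * Editor's note (not in the original): the failure path below rolls back,
 * under ksnd_connd_lock, the optimistic ksnd_connd_starting++ taken just
 * before the failed ksocknal_thread_start().
 */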
ksocknal_data.ksnd_connd_starting--; - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); - CERROR("Can't spawn socknal connd: %d\n", rc); - goto failed; - } - } - - rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); - if (rc) { - CERROR("Can't spawn socknal reaper: %d\n", rc); - goto failed; - } - - /* flag everything initialised */ - ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - - return 0; - - failed: - ksocknal_base_shutdown(); - return -ENETDOWN; -} - -static void -ksocknal_debug_peerhash(struct lnet_ni *ni) -{ - struct ksock_peer *peer = NULL; - struct list_head *tmp; - int i; - - read_lock(&ksocknal_data.ksnd_global_lock); - - for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { - list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { - peer = list_entry(tmp, struct ksock_peer, ksnp_list); - - if (peer->ksnp_ni == ni) - break; - - peer = NULL; - } - } - - if (peer) { - struct ksock_route *route; - struct ksock_conn *conn; - - CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n", - libcfs_id2str(peer->ksnp_id), - atomic_read(&peer->ksnp_refcount), - peer->ksnp_sharecount, peer->ksnp_closing, - peer->ksnp_accepting, peer->ksnp_error, - peer->ksnp_zc_next_cookie, - !list_empty(&peer->ksnp_tx_queue), - !list_empty(&peer->ksnp_zc_req_list)); - - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n", - atomic_read(&route->ksnr_refcount), - route->ksnr_scheduled, route->ksnr_connecting, - route->ksnr_connected, route->ksnr_deleted); - } - - list_for_each(tmp, &peer->ksnp_conns) { - conn = list_entry(tmp, struct ksock_conn, ksnc_list); - CWARN("Conn: ref %d, sref %d, t %d, c %d\n", - atomic_read(&conn->ksnc_conn_refcount), - atomic_read(&conn->ksnc_sock_refcount), - conn->ksnc_type, conn->ksnc_closing); - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -void -ksocknal_shutdown(struct lnet_ni *ni) -{ - struct ksock_net *net = ni->ni_data; - int i; - struct lnet_process_id anyid = {0}; - - anyid.nid = LNET_NID_ANY; - anyid.pid = LNET_PID_ANY; - - LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); - LASSERT(ksocknal_data.ksnd_nnets > 0); - - spin_lock_bh(&net->ksnn_lock); - net->ksnn_shutdown = 1; /* prevent new peers */ - spin_unlock_bh(&net->ksnn_lock); - - /* Delete all peers */ - ksocknal_del_peer(ni, anyid, 0); - - /* Wait for all peer state to clean up */ - i = 2; - spin_lock_bh(&net->ksnn_lock); - while (net->ksnn_npeers) { - spin_unlock_bh(&net->ksnn_lock); - - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
*/ - "waiting for %d peers to disconnect\n", - net->ksnn_npeers); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - - ksocknal_debug_peerhash(ni); - - spin_lock_bh(&net->ksnn_lock); - } - spin_unlock_bh(&net->ksnn_lock); - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - LASSERT(!net->ksnn_interfaces[i].ksni_npeers); - LASSERT(!net->ksnn_interfaces[i].ksni_nroutes); - } - - list_del(&net->ksnn_list); - kfree(net); - - ksocknal_data.ksnd_nnets--; - if (!ksocknal_data.ksnd_nnets) - ksocknal_base_shutdown(); -} - -static int -ksocknal_enumerate_interfaces(struct ksock_net *net) -{ - char **names; - int i; - int j; - int rc; - int n; - - n = lnet_ipif_enumerate(&names); - if (n <= 0) { - CERROR("Can't enumerate interfaces: %d\n", n); - return n; - } - - for (i = j = 0; i < n; i++) { - int up; - __u32 ip; - __u32 mask; - - if (!strcmp(names[i], "lo")) /* skip the loopback IF */ - continue; - - rc = lnet_ipif_query(names[i], &up, &ip, &mask); - if (rc) { - CWARN("Can't get interface %s info: %d\n", - names[i], rc); - continue; - } - - if (!up) { - CWARN("Ignoring interface %s (down)\n", - names[i]); - continue; - } - - if (j == LNET_MAX_INTERFACES) { - CWARN("Ignoring interface %s (too many interfaces)\n", - names[i]); - continue; - } - - net->ksnn_interfaces[j].ksni_ipaddr = ip; - net->ksnn_interfaces[j].ksni_netmask = mask; - strlcpy(net->ksnn_interfaces[j].ksni_name, - names[i], sizeof(net->ksnn_interfaces[j].ksni_name)); - j++; - } - - lnet_ipif_free_enumeration(names, n); - - if (!j) - CERROR("Can't find any usable interfaces\n"); - - return j; -} - -static int -ksocknal_search_new_ipif(struct ksock_net *net) -{ - int new_ipif = 0; - int i; - - for (i = 0; i < net->ksnn_ninterfaces; i++) { - char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; - char *colon = strchr(ifnam, ':'); - int found = 0; - struct ksock_net *tmp; - int j; - - if (colon) /* ignore alias device */ - *colon = 0; - - list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, ksnn_list) { - for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { - char *ifnam2 = - &tmp->ksnn_interfaces[j].ksni_name[0]; - char *colon2 = strchr(ifnam2, ':'); - - if (colon2) - *colon2 = 0; - - found = !strcmp(ifnam, ifnam2); - if (colon2) - *colon2 = ':'; - } - if (found) - break; - } - - new_ipif += !found; - if (colon) - *colon = ':'; - } - - return new_ipif; -} - -static int -ksocknal_start_schedulers(struct ksock_sched_info *info) -{ - int nthrs; - int rc = 0; - int i; - - if (!info->ksi_nthreads) { - if (*ksocknal_tunables.ksnd_nscheds > 0) { - nthrs = info->ksi_nthreads_max; - } else { - nthrs = cfs_cpt_weight(lnet_cpt_table(), - info->ksi_cpt); - nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); - nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); - } - nthrs = min(nthrs, info->ksi_nthreads_max); - } else { - LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); - /* increase two threads if there is new interface */ - nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); - } - - for (i = 0; i < nthrs; i++) { - long id; - char name[20]; - struct ksock_sched *sched; - - id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; - snprintf(name, sizeof(name), "socknal_sd%02d_%02d", - info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); - - rc = ksocknal_thread_start(ksocknal_scheduler, - (void *)id, name); - if (!rc) - continue; - - CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", - info->ksi_cpt, info->ksi_nthreads + i, rc); - break; - } - - info->ksi_nthreads += 
i; - return rc; -} - -static int -ksocknal_net_start_threads(struct ksock_net *net, __u32 *cpts, int ncpts) -{ - int newif = ksocknal_search_new_ipif(net); - int rc; - int i; - - LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table())); - - for (i = 0; i < ncpts; i++) { - struct ksock_sched_info *info; - int cpt = !cpts ? i : cpts[i]; - - LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); - info = ksocknal_data.ksnd_sched_info[cpt]; - - if (!newif && info->ksi_nthreads > 0) - continue; - - rc = ksocknal_start_schedulers(info); - if (rc) - return rc; - } - return 0; -} - -int -ksocknal_startup(struct lnet_ni *ni) -{ - struct ksock_net *net; - int rc; - int i; - - LASSERT(ni->ni_lnd == &the_ksocklnd); - - if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { - rc = ksocknal_base_startup(); - if (rc) - return rc; - } - - net = kzalloc(sizeof(*net), GFP_NOFS); - if (!net) - goto fail_0; - - spin_lock_init(&net->ksnn_lock); - net->ksnn_incarnation = ksocknal_new_incarnation(); - ni->ni_data = net; - ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; - ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; - ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; - ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; - - if (!ni->ni_interfaces[0]) { - rc = ksocknal_enumerate_interfaces(net); - if (rc <= 0) - goto fail_1; - - net->ksnn_ninterfaces = 1; - } else { - for (i = 0; i < LNET_MAX_INTERFACES; i++) { - int up; - - if (!ni->ni_interfaces[i]) - break; - - rc = lnet_ipif_query(ni->ni_interfaces[i], &up, - &net->ksnn_interfaces[i].ksni_ipaddr, - &net->ksnn_interfaces[i].ksni_netmask); - - if (rc) { - CERROR("Can't get interface %s info: %d\n", - ni->ni_interfaces[i], rc); - goto fail_1; - } - - if (!up) { - CERROR("Interface %s is down\n", - ni->ni_interfaces[i]); - goto fail_1; - } - - strlcpy(net->ksnn_interfaces[i].ksni_name, - ni->ni_interfaces[i], - sizeof(net->ksnn_interfaces[i].ksni_name)); - } - net->ksnn_ninterfaces = i; - } - - /* call it before add it to ksocknal_data.ksnd_nets */ - rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); - if (rc) - goto fail_1; - - ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), - net->ksnn_interfaces[0].ksni_ipaddr); - list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); - - ksocknal_data.ksnd_nnets++; - - return 0; - - fail_1: - kfree(net); - fail_0: - if (!ksocknal_data.ksnd_nnets) - ksocknal_base_shutdown(); - - return -ENETDOWN; -} - -static void __exit ksocklnd_exit(void) -{ - lnet_unregister_lnd(&the_ksocklnd); -} - -static int __init ksocklnd_init(void) -{ - int rc; - - /* check ksnr_connected/connecting field large enough */ - BUILD_BUG_ON(SOCKLND_CONN_NTYPES > 4); - BUILD_BUG_ON(SOCKLND_CONN_ACK != SOCKLND_CONN_BULK_IN); - - /* initialize the_ksocklnd */ - the_ksocklnd.lnd_type = SOCKLND; - the_ksocklnd.lnd_startup = ksocknal_startup; - the_ksocklnd.lnd_shutdown = ksocknal_shutdown; - the_ksocklnd.lnd_ctl = ksocknal_ctl; - the_ksocklnd.lnd_send = ksocknal_send; - the_ksocklnd.lnd_recv = ksocknal_recv; - the_ksocklnd.lnd_notify = ksocknal_notify; - the_ksocklnd.lnd_query = ksocknal_query; - the_ksocklnd.lnd_accept = ksocknal_accept; - - rc = ksocknal_tunables_init(); - if (rc) - return rc; - - rc = libcfs_setup(); - if (rc) - return rc; - - lnet_register_lnd(&the_ksocklnd); - - return 0; -} - -MODULE_AUTHOR("OpenSFS, Inc. 
"); -MODULE_DESCRIPTION("TCP Socket LNet Network Driver"); -MODULE_VERSION("2.7.0"); -MODULE_LICENSE("GPL"); - -module_init(ksocklnd_init); -module_exit(ksocklnd_exit); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h deleted file mode 100644 index 4e5c89a692a3315b7dfc55a24c0752a796ef2869..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h +++ /dev/null @@ -1,704 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton - * - * This file is part of Lustre, http://www.lustre.org - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _SOCKLND_SOCKLND_H_ -#define _SOCKLND_SOCKLND_H_ - -#define DEBUG_PORTAL_ALLOC -#define DEBUG_SUBSYSTEM S_LND - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -/* assume one thread for each connection type */ -#define SOCKNAL_NSCHEDS 3 -#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) - -#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ -#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ -#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ -#define SOCKNAL_ENOMEM_RETRY 1 /* jiffies between retries */ - -#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ -#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ - -#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ - -/* - * risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). - * no risk if we're not running on a CONFIG_HIGHMEM platform. 
- */ -#ifdef CONFIG_HIGHMEM -# define SOCKNAL_RISK_KMAP_DEADLOCK 0 -#else -# define SOCKNAL_RISK_KMAP_DEADLOCK 1 -#endif - -struct ksock_sched_info; - -struct ksock_sched { /* per scheduler state */ - spinlock_t kss_lock; /* serialise */ - struct list_head kss_rx_conns; /* conn waiting to be read */ - struct list_head kss_tx_conns; /* conn waiting to be written */ - struct list_head kss_zombie_noop_txs; /* zombie noop tx list */ - wait_queue_head_t kss_waitq; /* where scheduler sleeps */ - int kss_nconns; /* # connections assigned to - * this scheduler - */ - struct ksock_sched_info *kss_info; /* owner of it */ -}; - -struct ksock_sched_info { - int ksi_nthreads_max; /* max allowed threads */ - int ksi_nthreads; /* number of threads */ - int ksi_cpt; /* CPT id */ - struct ksock_sched *ksi_scheds; /* array of schedulers */ -}; - -#define KSOCK_CPT_SHIFT 16 -#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) -#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) -#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) - -struct ksock_interface { /* in-use interface */ - __u32 ksni_ipaddr; /* interface's IP address */ - __u32 ksni_netmask; /* interface's network mask */ - int ksni_nroutes; /* # routes using (active) */ - int ksni_npeers; /* # peers using (passive) */ - char ksni_name[IFNAMSIZ]; /* interface name */ -}; - -struct ksock_tunables { - int *ksnd_timeout; /* "stuck" socket timeout - * (seconds) - */ - int *ksnd_nscheds; /* # scheduler threads in each - * pool while starting - */ - int *ksnd_nconnds; /* # connection daemons */ - int *ksnd_nconnds_max; /* max # connection daemons */ - int *ksnd_min_reconnectms; /* first connection retry after - * (ms)... - */ - int *ksnd_max_reconnectms; /* ...exponentially increasing to - * this - */ - int *ksnd_eager_ack; /* make TCP ack eagerly? */ - int *ksnd_typed_conns; /* drive sockets by type? */ - int *ksnd_min_bulk; /* smallest "large" message */ - int *ksnd_tx_buffer_size; /* socket tx buffer size */ - int *ksnd_rx_buffer_size; /* socket rx buffer size */ - int *ksnd_nagle; /* enable NAGLE? */ - int *ksnd_round_robin; /* round robin for multiple - * interfaces - */ - int *ksnd_keepalive; /* # secs for sending keepalive - * NOOP - */ - int *ksnd_keepalive_idle; /* # idle secs before 1st probe - */ - int *ksnd_keepalive_count; /* # probes */ - int *ksnd_keepalive_intvl; /* time between probes */ - int *ksnd_credits; /* # concurrent sends */ - int *ksnd_peertxcredits; /* # concurrent sends to 1 peer - */ - int *ksnd_peerrtrcredits; /* # per-peer router buffer - * credits - */ - int *ksnd_peertimeout; /* seconds to consider peer dead - */ - int *ksnd_enable_csum; /* enable check sum */ - int *ksnd_inject_csum_error; /* set non-zero to inject - * checksum error - */ - int *ksnd_nonblk_zcack; /* always send zc-ack on - * non-blocking connection - */ - unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload - * size - */ - int *ksnd_zc_recv; /* enable ZC receive (for - * Chelsio TOE) - */ - int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to - * enable ZC receive - */ -}; - -struct ksock_net { - __u64 ksnn_incarnation; /* my epoch */ - spinlock_t ksnn_lock; /* serialise */ - struct list_head ksnn_list; /* chain on global list */ - int ksnn_npeers; /* # peers */ - int ksnn_shutdown; /* shutting down? 
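
The KSOCK_THREAD_ID/KSOCK_THREAD_CPT/KSOCK_THREAD_SID macros above pack a scheduler thread's CPT into the high bits of a long and its index within the CPT into the low 16 bits, so a single opaque value can be passed to a thread and decoded later. A userspace rendering of the same packing (names shortened, otherwise equivalent):

#include <assert.h>
#include <stdio.h>

#define CPT_SHIFT	16
#define THREAD_ID(cpt, sid)	(((long)(cpt) << CPT_SHIFT) | (sid))
#define THREAD_CPT(id)		((id) >> CPT_SHIFT)
#define THREAD_SID(id)		((id) & ((1L << CPT_SHIFT) - 1))

int main(void)
{
	long id = THREAD_ID(3, 7);	/* CPT 3, scheduler slot 7 */

	assert(THREAD_CPT(id) == 3 && THREAD_SID(id) == 7);
	printf("id=%ld cpt=%ld sid=%ld\n", id, THREAD_CPT(id),
	       THREAD_SID(id));
	return 0;
}
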
*/ - int ksnn_ninterfaces; /* IP interfaces */ - struct ksock_interface ksnn_interfaces[LNET_MAX_INTERFACES]; -}; - -/** connd timeout */ -#define SOCKNAL_CONND_TIMEOUT 120 -/** reserved thread for accepting & creating new connd */ -#define SOCKNAL_CONND_RESV 1 - -struct ksock_nal_data { - int ksnd_init; /* initialisation state - */ - int ksnd_nnets; /* # networks set up */ - struct list_head ksnd_nets; /* list of nets */ - rwlock_t ksnd_global_lock; /* stabilize peer/conn - * ops - */ - struct list_head *ksnd_peers; /* hash table of all my - * known peers - */ - int ksnd_peer_hash_size; /* size of ksnd_peers */ - - int ksnd_nthreads; /* # live threads */ - int ksnd_shuttingdown; /* tell threads to exit - */ - struct ksock_sched_info **ksnd_sched_info; /* schedulers info */ - - atomic_t ksnd_nactive_txs; /* #active txs */ - - struct list_head ksnd_deathrow_conns; /* conns to close: - * reaper_lock - */ - struct list_head ksnd_zombie_conns; /* conns to free: - * reaper_lock - */ - struct list_head ksnd_enomem_conns; /* conns to retry: - * reaper_lock - */ - wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ - unsigned long ksnd_reaper_waketime; /* when reaper will wake - */ - spinlock_t ksnd_reaper_lock; /* serialise */ - - int ksnd_enomem_tx; /* test ENOMEM sender */ - int ksnd_stall_tx; /* test sluggish sender - */ - int ksnd_stall_rx; /* test sluggish - * receiver - */ - struct list_head ksnd_connd_connreqs; /* incoming connection - * requests - */ - struct list_head ksnd_connd_routes; /* routes waiting to be - * connected - */ - wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */ - int ksnd_connd_connecting; /* # connds connecting - */ - time64_t ksnd_connd_failed_stamp;/* time stamp of the - * last failed - * connecting attempt - */ - time64_t ksnd_connd_starting_stamp;/* time stamp of the - * last starting connd - */ - unsigned int ksnd_connd_starting; /* # starting connd */ - unsigned int ksnd_connd_running; /* # running connd */ - spinlock_t ksnd_connd_lock; /* serialise */ - - struct list_head ksnd_idle_noop_txs; /* list head for freed - * noop tx - */ - spinlock_t ksnd_tx_lock; /* serialise, g_lock - * unsafe - */ -}; - -#define SOCKNAL_INIT_NOTHING 0 -#define SOCKNAL_INIT_DATA 1 -#define SOCKNAL_INIT_ALL 2 - -/* - * A packet just assembled for transmission is represented by 1 or more - * struct iovec fragments (the first frag contains the portals header), - * followed by 0 or more struct bio_vec fragments. - * - * On the receive side, initially 1 struct iovec fragment is posted for - * receive (the header). Once the header has been received, the payload is - * received into either struct iovec or struct bio_vec fragments, depending on - * what the header matched or whether the message needs forwarding. 
- */ -struct ksock_conn; /* forward ref */ -struct ksock_peer; /* forward ref */ -struct ksock_route; /* forward ref */ -struct ksock_proto; /* forward ref */ - -struct ksock_tx { /* transmit packet */ - struct list_head tx_list; /* queue on conn for transmission etc - */ - struct list_head tx_zc_list; /* queue on peer for ZC request */ - atomic_t tx_refcount; /* tx reference count */ - int tx_nob; /* # packet bytes */ - int tx_resid; /* residual bytes */ - int tx_niov; /* # packet iovec frags */ - struct kvec *tx_iov; /* packet iovec frags */ - int tx_nkiov; /* # packet page frags */ - unsigned short tx_zc_aborted; /* aborted ZC request */ - unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ - unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? */ - unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ - struct bio_vec *tx_kiov; /* packet page frags */ - struct ksock_conn *tx_conn; /* owning conn */ - struct lnet_msg *tx_lnetmsg; /* lnet message for lnet_finalize() - */ - unsigned long tx_deadline; /* when (in jiffies) tx times out */ - struct ksock_msg tx_msg; /* socklnd message buffer */ - int tx_desc_size; /* size of this descriptor */ - union { - struct { - struct kvec iov; /* virt hdr */ - struct bio_vec kiov[0]; /* paged payload */ - } paged; - struct { - struct kvec iov[1]; /* virt hdr + payload */ - } virt; - } tx_frags; -}; - -#define KSOCK_NOOP_TX_SIZE (offsetof(struct ksock_tx, tx_frags.paged.kiov[0])) - -/* network zero copy callback descriptor embedded in struct ksock_tx */ - -#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ -#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ -#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ -#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ -#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ -#define SOCKNAL_RX_SLOP 6 /* skipping body */ - -struct ksock_conn { - struct ksock_peer *ksnc_peer; /* owning peer */ - struct ksock_route *ksnc_route; /* owning route */ - struct list_head ksnc_list; /* stash on peer's conn list */ - struct socket *ksnc_sock; /* actual socket */ - void *ksnc_saved_data_ready; /* socket's original - * data_ready() callback - */ - void *ksnc_saved_write_space; /* socket's original - * write_space() callback - */ - atomic_t ksnc_conn_refcount;/* conn refcount */ - atomic_t ksnc_sock_refcount;/* sock refcount */ - struct ksock_sched *ksnc_scheduler; /* who schedules this connection - */ - __u32 ksnc_myipaddr; /* my IP */ - __u32 ksnc_ipaddr; /* peer's IP */ - int ksnc_port; /* peer's port */ - signed int ksnc_type:3; /* type of connection, should be - * signed value - */ - unsigned int ksnc_closing:1; /* being shut down */ - unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ - unsigned int ksnc_zc_capable:1; /* enable to ZC */ - struct ksock_proto *ksnc_proto; /* protocol for the connection */ - - /* reader */ - struct list_head ksnc_rx_list; /* where I enq waiting input or a - * forwarding descriptor - */ - unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times - * out - */ - __u8 ksnc_rx_started; /* started receiving a message */ - __u8 ksnc_rx_ready; /* data ready to read */ - __u8 ksnc_rx_scheduled; /* being progressed */ - __u8 ksnc_rx_state; /* what is being read */ - int ksnc_rx_nob_left; /* # bytes to next hdr/body */ - struct iov_iter ksnc_rx_to; /* copy destination */ - struct kvec ksnc_rx_iov_space[LNET_MAX_IOV]; /* space for frag descriptors */ - __u32 ksnc_rx_csum; 
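
KSOCK_NOOP_TX_SIZE above relies on a common kernel trick: the descriptor ends in a variable-length fragment array inside union tx_frags, and each allocation is sized with offsetof() to the element one past the last fragment it needs, so a fragment-free NOOP costs only the header. A minimal sketch, using the standard C flexible array member ([] rather than the older [0] spelling) and hypothetical names:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct frag {
	void	*addr;
	size_t	 len;
};

struct tx_desc {
	int		nfrags;
	struct frag	frags[];	/* flexible array member */
};

/* One allocation, sized for exactly nfrags trailing fragments. */
static struct tx_desc *tx_alloc(int nfrags)
{
	struct tx_desc *tx;

	tx = malloc(offsetof(struct tx_desc, frags[nfrags]));
	if (tx)
		tx->nfrags = nfrags;
	return tx;
}

int main(void)
{
	struct tx_desc *tx = tx_alloc(4);

	/* frags[0] plays the role of KSOCK_NOOP_TX_SIZE here. */
	printf("fragment-free descriptor is %zu bytes\n",
	       offsetof(struct tx_desc, frags[0]));
	free(tx);
	return 0;
}
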
/* partial checksum for incoming - * data - */ - void *ksnc_cookie; /* rx lnet_finalize passthru arg - */ - struct ksock_msg ksnc_msg; /* incoming message buffer: - * V2.x message takes the - * whole struct - * V1.x message is a bare - * struct lnet_hdr, it's stored in - * ksnc_msg.ksm_u.lnetmsg - */ - /* WRITER */ - struct list_head ksnc_tx_list; /* where I enq waiting for output - * space - */ - struct list_head ksnc_tx_queue; /* packets waiting to be sent */ - struct ksock_tx *ksnc_tx_carrier; /* next TX that can carry a LNet - * message or ZC-ACK - */ - unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out - */ - int ksnc_tx_bufnob; /* send buffer marker */ - atomic_t ksnc_tx_nob; /* # bytes queued */ - int ksnc_tx_ready; /* write space */ - int ksnc_tx_scheduled; /* being progressed */ - unsigned long ksnc_tx_last_post; /* time stamp of the last posted - * TX - */ -}; - -struct ksock_route { - struct list_head ksnr_list; /* chain on peer route list */ - struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ - struct ksock_peer *ksnr_peer; /* owning peer */ - atomic_t ksnr_refcount; /* # users */ - unsigned long ksnr_timeout; /* when (in jiffies) reconnection - * can happen next - */ - long ksnr_retry_interval; /* how long between retries */ - __u32 ksnr_myipaddr; /* my IP */ - __u32 ksnr_ipaddr; /* IP address to connect to */ - int ksnr_port; /* port to connect to */ - unsigned int ksnr_scheduled:1; /* scheduled for attention */ - unsigned int ksnr_connecting:1; /* connection establishment in - * progress - */ - unsigned int ksnr_connected:4; /* connections established by - * type - */ - unsigned int ksnr_deleted:1; /* been removed from peer? */ - unsigned int ksnr_share_count; /* created explicitly? */ - int ksnr_conn_count; /* # conns established by this - * route - */ -}; - -#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ - -struct ksock_peer { - struct list_head ksnp_list; /* stash on global peer list */ - unsigned long ksnp_last_alive; /* when (in jiffies) I was last - * alive - */ - struct lnet_process_id ksnp_id; /* who's on the other end(s) */ - atomic_t ksnp_refcount; /* # users */ - int ksnp_sharecount; /* lconf usage counter */ - int ksnp_closing; /* being closed */ - int ksnp_accepting; /* # passive connections pending - */ - int ksnp_error; /* errno on closing last conn */ - __u64 ksnp_zc_next_cookie; /* ZC completion cookie */ - __u64 ksnp_incarnation; /* latest known peer incarnation - */ - struct ksock_proto *ksnp_proto; /* latest known peer protocol */ - struct list_head ksnp_conns; /* all active connections */ - struct list_head ksnp_routes; /* routes */ - struct list_head ksnp_tx_queue; /* waiting packets */ - spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ - struct list_head ksnp_zc_req_list; /* zero copy requests wait for - * ACK - */ - unsigned long ksnp_send_keepalive; /* time to send keepalive */ - struct lnet_ni *ksnp_ni; /* which network */ - int ksnp_n_passive_ips; /* # of... 
*/ - - /* preferred local interfaces */ - __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; -}; - -struct ksock_connreq { - struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ - struct lnet_ni *ksncr_ni; /* chosen NI */ - struct socket *ksncr_sock; /* accepted socket */ -}; - -extern struct ksock_nal_data ksocknal_data; -extern struct ksock_tunables ksocknal_tunables; - -#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ -#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ -#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not - * preferred - */ - -struct ksock_proto { - /* version number of protocol */ - int pro_version; - - /* handshake function */ - int (*pro_send_hello)(struct ksock_conn *, struct ksock_hello_msg *); - - /* handshake function */ - int (*pro_recv_hello)(struct ksock_conn *, struct ksock_hello_msg *, int); - - /* message pack */ - void (*pro_pack)(struct ksock_tx *); - - /* message unpack */ - void (*pro_unpack)(struct ksock_msg *); - - /* queue tx on the connection */ - struct ksock_tx *(*pro_queue_tx_msg)(struct ksock_conn *, struct ksock_tx *); - - /* queue ZC ack on the connection */ - int (*pro_queue_tx_zcack)(struct ksock_conn *, struct ksock_tx *, __u64); - - /* handle ZC request */ - int (*pro_handle_zcreq)(struct ksock_conn *, __u64, int); - - /* handle ZC ACK */ - int (*pro_handle_zcack)(struct ksock_conn *, __u64, __u64); - - /* - * msg type matches the connection type: - * return value: - * return MATCH_NO : no - * return MATCH_YES : matching type - * return MATCH_MAY : can be backup - */ - int (*pro_match_tx)(struct ksock_conn *, struct ksock_tx *, int); -}; - -extern struct ksock_proto ksocknal_protocol_v1x; -extern struct ksock_proto ksocknal_protocol_v2x; -extern struct ksock_proto ksocknal_protocol_v3x; - -#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR -#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR -#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR - -#ifndef CPU_MASK_NONE -#define CPU_MASK_NONE 0UL -#endif - -static inline int -ksocknal_route_mask(void) -{ - if (!*ksocknal_tunables.ksnd_typed_conns) - return (1 << SOCKLND_CONN_ANY); - - return ((1 << SOCKLND_CONN_CONTROL) | - (1 << SOCKLND_CONN_BULK_IN) | - (1 << SOCKLND_CONN_BULK_OUT)); -} - -static inline struct list_head * -ksocknal_nid2peerlist(lnet_nid_t nid) -{ - unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; - - return &ksocknal_data.ksnd_peers[hash]; -} - -static inline void -ksocknal_conn_addref(struct ksock_conn *conn) -{ - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); - atomic_inc(&conn->ksnc_conn_refcount); -} - -void ksocknal_queue_zombie_conn(struct ksock_conn *conn); -void ksocknal_finalize_zcreq(struct ksock_conn *conn); - -static inline void -ksocknal_conn_decref(struct ksock_conn *conn) -{ - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); - if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) - ksocknal_queue_zombie_conn(conn); -} - -static inline int -ksocknal_connsock_addref(struct ksock_conn *conn) -{ - int rc = -ESHUTDOWN; - - read_lock(&ksocknal_data.ksnd_global_lock); - if (!conn->ksnc_closing) { - LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); - atomic_inc(&conn->ksnc_sock_refcount); - rc = 0; - } - read_unlock(&ksocknal_data.ksnd_global_lock); - - return rc; -} - -static inline void -ksocknal_connsock_decref(struct ksock_conn *conn) -{ - LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); - if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { - 
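
The addref/decref helpers above all follow one idiom: taking a reference asserts that at least one reference already exists, and whoever drops the count to zero is responsible for disposal. A compilable C11 reduction of that idiom (illustrative; the driver hands zombie conns to a reaper thread rather than freeing inline):

#include <assert.h>
#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcount;	/* starts at 1 for the creator */
};

static void obj_addref(struct obj *o)
{
	/* You may only take a ref if you already (transitively) hold one. */
	assert(atomic_load(&o->refcount) > 0);
	atomic_fetch_add(&o->refcount, 1);
}

static void obj_decref(struct obj *o)
{
	assert(atomic_load(&o->refcount) > 0);
	/* fetch_sub returns the old value: 1 means we dropped the last ref. */
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));

	atomic_store(&o->refcount, 1);
	obj_addref(o);
	obj_decref(o);
	obj_decref(o);	/* last ref: frees the object */
	return 0;
}
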
LASSERT(conn->ksnc_closing); - sock_release(conn->ksnc_sock); - conn->ksnc_sock = NULL; - ksocknal_finalize_zcreq(conn); - } -} - -static inline void -ksocknal_tx_addref(struct ksock_tx *tx) -{ - LASSERT(atomic_read(&tx->tx_refcount) > 0); - atomic_inc(&tx->tx_refcount); -} - -void ksocknal_tx_prep(struct ksock_conn *, struct ksock_tx *tx); -void ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx); - -static inline void -ksocknal_tx_decref(struct ksock_tx *tx) -{ - LASSERT(atomic_read(&tx->tx_refcount) > 0); - if (atomic_dec_and_test(&tx->tx_refcount)) - ksocknal_tx_done(NULL, tx); -} - -static inline void -ksocknal_route_addref(struct ksock_route *route) -{ - LASSERT(atomic_read(&route->ksnr_refcount) > 0); - atomic_inc(&route->ksnr_refcount); -} - -void ksocknal_destroy_route(struct ksock_route *route); - -static inline void -ksocknal_route_decref(struct ksock_route *route) -{ - LASSERT(atomic_read(&route->ksnr_refcount) > 0); - if (atomic_dec_and_test(&route->ksnr_refcount)) - ksocknal_destroy_route(route); -} - -static inline void -ksocknal_peer_addref(struct ksock_peer *peer) -{ - LASSERT(atomic_read(&peer->ksnp_refcount) > 0); - atomic_inc(&peer->ksnp_refcount); -} - -void ksocknal_destroy_peer(struct ksock_peer *peer); - -static inline void -ksocknal_peer_decref(struct ksock_peer *peer) -{ - LASSERT(atomic_read(&peer->ksnp_refcount) > 0); - if (atomic_dec_and_test(&peer->ksnp_refcount)) - ksocknal_destroy_peer(peer); -} - -int ksocknal_startup(struct lnet_ni *ni); -void ksocknal_shutdown(struct lnet_ni *ni); -int ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg); -int ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg); -int ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen); -int ksocknal_accept(struct lnet_ni *ni, struct socket *sock); - -int ksocknal_add_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip, - int port); -struct ksock_peer *ksocknal_find_peer_locked(struct lnet_ni *ni, - struct lnet_process_id id); -struct ksock_peer *ksocknal_find_peer(struct lnet_ni *ni, - struct lnet_process_id id); -void ksocknal_peer_failed(struct ksock_peer *peer); -int ksocknal_create_conn(struct lnet_ni *ni, struct ksock_route *route, - struct socket *sock, int type); -void ksocknal_close_conn_locked(struct ksock_conn *conn, int why); -void ksocknal_terminate_conn(struct ksock_conn *conn); -void ksocknal_destroy_conn(struct ksock_conn *conn); -int ksocknal_close_peer_conns_locked(struct ksock_peer *peer, - __u32 ipaddr, int why); -int ksocknal_close_conn_and_siblings(struct ksock_conn *conn, int why); -int ksocknal_close_matching_conns(struct lnet_process_id id, __u32 ipaddr); -struct ksock_conn *ksocknal_find_conn_locked(struct ksock_peer *peer, - struct ksock_tx *tx, int nonblk); - -int ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, - struct lnet_process_id id); -struct ksock_tx *ksocknal_alloc_tx(int type, int size); -void ksocknal_free_tx(struct ksock_tx *tx); -struct ksock_tx *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); -void ksocknal_next_tx_carrier(struct ksock_conn *conn); -void ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn); -void ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error); -void ksocknal_notify(struct lnet_ni *ni, lnet_nid_t gw_nid, int alive); -void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); -int ksocknal_thread_start(int (*fn)(void *arg), void 
*arg, char *name); -void ksocknal_thread_fini(void); -void ksocknal_launch_all_connections_locked(struct ksock_peer *peer); -struct ksock_route *ksocknal_find_connectable_route_locked(struct ksock_peer *peer); -struct ksock_route *ksocknal_find_connecting_route_locked(struct ksock_peer *peer); -int ksocknal_new_packet(struct ksock_conn *conn, int skip); -int ksocknal_scheduler(void *arg); -int ksocknal_connd(void *arg); -int ksocknal_reaper(void *arg); -int ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, - lnet_nid_t peer_nid, struct ksock_hello_msg *hello); -int ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, - struct ksock_hello_msg *hello, - struct lnet_process_id *id, - __u64 *incarnation); -void ksocknal_read_callback(struct ksock_conn *conn); -void ksocknal_write_callback(struct ksock_conn *conn); - -int ksocknal_lib_zc_capable(struct ksock_conn *conn); -void ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn); -void ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn); -void ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn); -void ksocknal_lib_push_conn(struct ksock_conn *conn); -int ksocknal_lib_get_conn_addrs(struct ksock_conn *conn); -int ksocknal_lib_setup_sock(struct socket *so); -int ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx); -int ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx); -void ksocknal_lib_eager_ack(struct ksock_conn *conn); -int ksocknal_lib_recv(struct ksock_conn *conn); -int ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, - int *rxmem, int *nagle); - -void ksocknal_read_callback(struct ksock_conn *conn); -void ksocknal_write_callback(struct ksock_conn *conn); - -int ksocknal_tunables_init(void); - -void ksocknal_lib_csum_tx(struct ksock_tx *tx); - -int ksocknal_lib_memory_pressure(struct ksock_conn *conn); -int ksocknal_lib_bind_thread_to_cpu(int id); - -#endif /* _SOCKLND_SOCKLND_H_ */ diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c deleted file mode 100644 index 01b31a6bb588fdb2bf027dda8502332bece7174a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c +++ /dev/null @@ -1,2586 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#include -#include "socklnd.h" - -struct ksock_tx * -ksocknal_alloc_tx(int type, int size) -{ - struct ksock_tx *tx = NULL; - - if (type == KSOCK_MSG_NOOP) { - LASSERT(size == KSOCK_NOOP_TX_SIZE); - - /* searching for a noop tx in free list */ - spin_lock(&ksocknal_data.ksnd_tx_lock); - - if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { - tx = list_entry(ksocknal_data.ksnd_idle_noop_txs.next, - struct ksock_tx, tx_list); - LASSERT(tx->tx_desc_size == size); - list_del(&tx->tx_list); - } - - spin_unlock(&ksocknal_data.ksnd_tx_lock); - } - - if (!tx) - tx = kzalloc(size, GFP_NOFS); - - if (!tx) - return NULL; - - atomic_set(&tx->tx_refcount, 1); - tx->tx_zc_aborted = 0; - tx->tx_zc_capable = 0; - tx->tx_zc_checked = 0; - tx->tx_desc_size = size; - - atomic_inc(&ksocknal_data.ksnd_nactive_txs); - - return tx; -} - -struct ksock_tx * -ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) -{ - struct ksock_tx *tx; - - tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); - if (!tx) { - CERROR("Can't allocate noop tx desc\n"); - return NULL; - } - - tx->tx_conn = NULL; - tx->tx_lnetmsg = NULL; - tx->tx_kiov = NULL; - tx->tx_nkiov = 0; - tx->tx_iov = tx->tx_frags.virt.iov; - tx->tx_niov = 1; - tx->tx_nonblk = nonblk; - - tx->tx_msg.ksm_csum = 0; - tx->tx_msg.ksm_type = KSOCK_MSG_NOOP; - tx->tx_msg.ksm_zc_cookies[0] = 0; - tx->tx_msg.ksm_zc_cookies[1] = cookie; - - return tx; -} - -void -ksocknal_free_tx(struct ksock_tx *tx) -{ - atomic_dec(&ksocknal_data.ksnd_nactive_txs); - - if (!tx->tx_lnetmsg && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { - /* it's a noop tx */ - spin_lock(&ksocknal_data.ksnd_tx_lock); - - list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); - - spin_unlock(&ksocknal_data.ksnd_tx_lock); - } else { - kfree(tx); - } -} - -static int -ksocknal_send_iov(struct ksock_conn *conn, struct ksock_tx *tx) -{ - struct kvec *iov = tx->tx_iov; - int nob; - int rc; - - LASSERT(tx->tx_niov > 0); - - /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ - rc = ksocknal_lib_send_iov(conn, tx); - - if (rc <= 0) /* sent nothing? */ - return rc; - - nob = rc; - LASSERT(nob <= tx->tx_resid); - tx->tx_resid -= nob; - - /* "consume" iov */ - do { - LASSERT(tx->tx_niov > 0); - - if (nob < (int)iov->iov_len) { - iov->iov_base = (void *)((char *)iov->iov_base + nob); - iov->iov_len -= nob; - return rc; - } - - nob -= iov->iov_len; - tx->tx_iov = ++iov; - tx->tx_niov--; - } while (nob); - - return rc; -} - -static int -ksocknal_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx) -{ - struct bio_vec *kiov = tx->tx_kiov; - int nob; - int rc; - - LASSERT(!tx->tx_niov); - LASSERT(tx->tx_nkiov > 0); - - /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ - rc = ksocknal_lib_send_kiov(conn, tx); - - if (rc <= 0) /* sent nothing? 
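
ksocknal_send_iov() above must cope with partial sends: after the stack accepts nob bytes, the iovec cursor is advanced in place so the next attempt resumes mid-fragment. A standalone sketch of that "consume iov" step (hypothetical names):

#include <assert.h>
#include <stdio.h>
#include <sys/uio.h>

/* Advance the iovec cursor past nob already-sent bytes, trimming the
 * partially-sent fragment in place. */
static void iov_consume(struct iovec **piov, int *pniov, size_t nob)
{
	struct iovec *iov = *piov;

	while (nob) {
		assert(*pniov > 0);
		if (nob < iov->iov_len) {
			iov->iov_base = (char *)iov->iov_base + nob;
			iov->iov_len -= nob;
			break;
		}
		nob -= iov->iov_len;
		iov++;
		(*pniov)--;
	}
	*piov = iov;
}

int main(void)
{
	char a[8], b[8];
	struct iovec frags[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};
	struct iovec *iov = frags;
	int niov = 2;

	iov_consume(&iov, &niov, 11);	/* a fully sent, 3 bytes into b */
	printf("%d frag(s) left, next frag has %zu bytes\n",
	       niov, iov->iov_len);
	return 0;
}
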
*/ - return rc; - - nob = rc; - LASSERT(nob <= tx->tx_resid); - tx->tx_resid -= nob; - - /* "consume" kiov */ - do { - LASSERT(tx->tx_nkiov > 0); - - if (nob < (int)kiov->bv_len) { - kiov->bv_offset += nob; - kiov->bv_len -= nob; - return rc; - } - - nob -= (int)kiov->bv_len; - tx->tx_kiov = ++kiov; - tx->tx_nkiov--; - } while (nob); - - return rc; -} - -static int -ksocknal_transmit(struct ksock_conn *conn, struct ksock_tx *tx) -{ - int rc; - int bufnob; - - if (ksocknal_data.ksnd_stall_tx) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ksocknal_data.ksnd_stall_tx * HZ); - } - - LASSERT(tx->tx_resid); - - rc = ksocknal_connsock_addref(conn); - if (rc) { - LASSERT(conn->ksnc_closing); - return -ESHUTDOWN; - } - - do { - if (ksocknal_data.ksnd_enomem_tx > 0) { - /* testing... */ - ksocknal_data.ksnd_enomem_tx--; - rc = -EAGAIN; - } else if (tx->tx_niov) { - rc = ksocknal_send_iov(conn, tx); - } else { - rc = ksocknal_send_kiov(conn, tx); - } - - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - if (rc > 0) /* sent something? */ - conn->ksnc_tx_bufnob += rc; /* account it */ - - if (bufnob < conn->ksnc_tx_bufnob) { - /* - * allocated send buffer bytes < computed; infer - * something got ACKed - */ - conn->ksnc_tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_tx_bufnob = bufnob; - mb(); - } - - if (rc <= 0) { /* Didn't write anything? */ - - if (!rc) /* some stacks return 0 instead of -EAGAIN */ - rc = -EAGAIN; - - /* Check if EAGAIN is due to memory pressure */ - if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) - rc = -ENOMEM; - - break; - } - - /* socket's wmem_queued now includes 'rc' bytes */ - atomic_sub(rc, &conn->ksnc_tx_nob); - rc = 0; - - } while (tx->tx_resid); - - ksocknal_connsock_decref(conn); - return rc; -} - -static int -ksocknal_recv_iter(struct ksock_conn *conn) -{ - int nob; - int rc; - - /* - * Never touch conn->ksnc_rx_to or change connection - * status inside ksocknal_lib_recv - */ - rc = ksocknal_lib_recv(conn); - - if (rc <= 0) - return rc; - - /* received something... */ - nob = rc; - - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_rx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - mb(); /* order with setting rx_started */ - conn->ksnc_rx_started = 1; - - conn->ksnc_rx_nob_left -= nob; - - iov_iter_advance(&conn->ksnc_rx_to, nob); - if (iov_iter_count(&conn->ksnc_rx_to)) - return -EAGAIN; - - return 1; -} - -static int -ksocknal_receive(struct ksock_conn *conn) -{ - /* - * Return 1 on success, 0 on EOF, < 0 on error. - * Caller checks ksnc_rx_to to determine - * progress/completion. - */ - int rc; - - if (ksocknal_data.ksnd_stall_rx) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ksocknal_data.ksnd_stall_rx * HZ); - } - - rc = ksocknal_connsock_addref(conn); - if (rc) { - LASSERT(conn->ksnc_closing); - return -ESHUTDOWN; - } - - for (;;) { - rc = ksocknal_recv_iter(conn); - if (rc <= 0) { - /* error/EOF or partial receive */ - if (rc == -EAGAIN) { - rc = 1; - } else if (!rc && conn->ksnc_rx_started) { - /* EOF in the middle of a message */ - rc = -EPROTO; - } - break; - } - - /* Completed a fragment */ - - if (!iov_iter_count(&conn->ksnc_rx_to)) { - rc = 1; - break; - } - } - - ksocknal_connsock_decref(conn); - return rc; -} - -void -ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx) -{ - struct lnet_msg *lnetmsg = tx->tx_lnetmsg; - int rc = (!tx->tx_resid && !tx->tx_zc_aborted) ? 
0 : -EIO; - - LASSERT(ni || tx->tx_conn); - - if (tx->tx_conn) - ksocknal_conn_decref(tx->tx_conn); - - if (!ni && tx->tx_conn) - ni = tx->tx_conn->ksnc_peer->ksnp_ni; - - ksocknal_free_tx(tx); - if (lnetmsg) /* KSOCK_MSG_NOOP go without lnetmsg */ - lnet_finalize(ni, lnetmsg, rc); -} - -void -ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error) -{ - struct ksock_tx *tx; - - while (!list_empty(txlist)) { - tx = list_entry(txlist->next, struct ksock_tx, tx_list); - - if (error && tx->tx_lnetmsg) { - CNETERR("Deleting packet type %d len %d %s->%s\n", - le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type), - le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length), - libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), - libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); - } else if (error) { - CNETERR("Deleting noop packet\n"); - } - - list_del(&tx->tx_list); - - LASSERT(atomic_read(&tx->tx_refcount) == 1); - ksocknal_tx_done(ni, tx); - } -} - -static void -ksocknal_check_zc_req(struct ksock_tx *tx) -{ - struct ksock_conn *conn = tx->tx_conn; - struct ksock_peer *peer = conn->ksnc_peer; - - /* - * Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx - * to ksnp_zc_req_list if some fragment of this message should be sent - * zero-copy. Our peer will send an ACK containing this cookie when - * she has received this message to tell us we can signal completion. - * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on - * ksnp_zc_req_list. - */ - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - LASSERT(tx->tx_zc_capable); - - tx->tx_zc_checked = 1; - - if (conn->ksnc_proto == &ksocknal_protocol_v1x || - !conn->ksnc_zc_capable) - return; - - /* - * assign cookie and queue tx to pending list, it will be released when - * a matching ack is received. 
See ksocknal_handle_zcack() - */ - ksocknal_tx_addref(tx); - - spin_lock(&peer->ksnp_lock); - - /* ZC_REQ is going to be pinned to the peer */ - tx->tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - - LASSERT(!tx->tx_msg.ksm_zc_cookies[0]); - - tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; - - if (!peer->ksnp_zc_next_cookie) - peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; - - list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); - - spin_unlock(&peer->ksnp_lock); -} - -static void -ksocknal_uncheck_zc_req(struct ksock_tx *tx) -{ - struct ksock_peer *peer = tx->tx_conn->ksnc_peer; - - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - LASSERT(tx->tx_zc_capable); - - tx->tx_zc_checked = 0; - - spin_lock(&peer->ksnp_lock); - - if (!tx->tx_msg.ksm_zc_cookies[0]) { - /* Not waiting for an ACK */ - spin_unlock(&peer->ksnp_lock); - return; - } - - tx->tx_msg.ksm_zc_cookies[0] = 0; - list_del(&tx->tx_zc_list); - - spin_unlock(&peer->ksnp_lock); - - ksocknal_tx_decref(tx); -} - -static int -ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx) -{ - int rc; - - if (tx->tx_zc_capable && !tx->tx_zc_checked) - ksocknal_check_zc_req(tx); - - rc = ksocknal_transmit(conn, tx); - - CDEBUG(D_NET, "send(%d) %d\n", tx->tx_resid, rc); - - if (!tx->tx_resid) { - /* Sent everything OK */ - LASSERT(!rc); - - return 0; - } - - if (rc == -EAGAIN) - return rc; - - if (rc == -ENOMEM) { - static int counter; - - counter++; /* exponential backoff warnings */ - if ((counter & (-counter)) == counter) - CWARN("%u ENOMEM tx %p\n", counter, conn); - - /* Queue on ksnd_enomem_conns for retry after a timeout */ - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - /* enomem list takes over scheduler's ref... */ - LASSERT(conn->ksnc_tx_scheduled); - list_add_tail(&conn->ksnc_tx_list, - &ksocknal_data.ksnd_enomem_conns); - if (!time_after_eq(jiffies + SOCKNAL_ENOMEM_RETRY, - ksocknal_data.ksnd_reaper_waketime)) - wake_up(&ksocknal_data.ksnd_reaper_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - return rc; - } - - /* Actual error */ - LASSERT(rc < 0); - - if (!conn->ksnc_closing) { - switch (rc) { - case -ECONNRESET: - LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n", - &conn->ksnc_ipaddr); - break; - default: - LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n", - &conn->ksnc_ipaddr, rc); - break; - } - CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", - conn, rc, - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - } - - if (tx->tx_zc_checked) - ksocknal_uncheck_zc_req(tx); - - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings(conn, (conn->ksnc_closing) ? 
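
The cookie assignment above keeps three invariants: cookies are non-zero (zero means "no ZC request"), unique per peer while outstanding, and the counter skips the value reserved for keepalive pings when it wraps. A small sketch of the same allocator (KEEPALIVE_PING mirrors SOCKNAL_KEEPALIVE_PING; the rest is illustrative):

#include <stdio.h>

#define KEEPALIVE_PING 1ULL	/* cookie value reserved for pings */

/* Returns a unique non-zero cookie; *next must start above
 * KEEPALIVE_PING. Zero means "no cookie", so the wrap skips both
 * zero and the reserved ping value. */
static unsigned long long zc_cookie(unsigned long long *next)
{
	unsigned long long c = (*next)++;

	if (*next == 0)
		*next = KEEPALIVE_PING + 1;
	return c;
}

int main(void)
{
	unsigned long long next = KEEPALIVE_PING + 1;
	int i;

	for (i = 0; i < 3; i++)
		printf("cookie %llu\n", zc_cookie(&next));
	return 0;
}
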
0 : rc); - - return rc; -} - -static void -ksocknal_launch_connection_locked(struct ksock_route *route) -{ - /* called holding write lock on ksnd_global_lock */ - - LASSERT(!route->ksnr_scheduled); - LASSERT(!route->ksnr_connecting); - LASSERT(ksocknal_route_mask() & ~route->ksnr_connected); - - route->ksnr_scheduled = 1; /* scheduling conn for connd */ - ksocknal_route_addref(route); /* extra ref for connd */ - - spin_lock_bh(&ksocknal_data.ksnd_connd_lock); - - list_add_tail(&route->ksnr_connd_list, - &ksocknal_data.ksnd_connd_routes); - wake_up(&ksocknal_data.ksnd_connd_waitq); - - spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); -} - -void -ksocknal_launch_all_connections_locked(struct ksock_peer *peer) -{ - struct ksock_route *route; - - /* called holding write lock on ksnd_global_lock */ - for (;;) { - /* launch any/all connections that need it */ - route = ksocknal_find_connectable_route_locked(peer); - if (!route) - return; - - ksocknal_launch_connection_locked(route); - } -} - -struct ksock_conn * -ksocknal_find_conn_locked(struct ksock_peer *peer, struct ksock_tx *tx, - int nonblk) -{ - struct list_head *tmp; - struct ksock_conn *conn; - struct ksock_conn *typed = NULL; - struct ksock_conn *fallback = NULL; - int tnob = 0; - int fnob = 0; - - list_for_each(tmp, &peer->ksnp_conns) { - struct ksock_conn *c; - int nob, rc; - - c = list_entry(tmp, struct ksock_conn, ksnc_list); - nob = atomic_read(&c->ksnc_tx_nob) + - c->ksnc_sock->sk->sk_wmem_queued; - - LASSERT(!c->ksnc_closing); - LASSERT(c->ksnc_proto && - c->ksnc_proto->pro_match_tx); - - rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); - - switch (rc) { - default: - LBUG(); - case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ - continue; - - case SOCKNAL_MATCH_YES: /* typed connection */ - if (!typed || tnob > nob || - (tnob == nob && *ksocknal_tunables.ksnd_round_robin && - time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { - typed = c; - tnob = nob; - } - break; - - case SOCKNAL_MATCH_MAY: /* fallback connection */ - if (!fallback || fnob > nob || - (fnob == nob && *ksocknal_tunables.ksnd_round_robin && - time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { - fallback = c; - fnob = nob; - } - break; - } - } - - /* prefer the typed selection */ - conn = (typed) ? typed : fallback; - - if (conn) - conn->ksnc_tx_last_post = jiffies; - - return conn; -} - -void -ksocknal_tx_prep(struct ksock_conn *conn, struct ksock_tx *tx) -{ - conn->ksnc_proto->pro_pack(tx); - - atomic_add(tx->tx_nob, &conn->ksnc_tx_nob); - ksocknal_conn_addref(conn); /* +1 ref for tx */ - tx->tx_conn = conn; -} - -void -ksocknal_queue_tx_locked(struct ksock_tx *tx, struct ksock_conn *conn) -{ - struct ksock_sched *sched = conn->ksnc_scheduler; - struct ksock_msg *msg = &tx->tx_msg; - struct ksock_tx *ztx = NULL; - int bufnob = 0; - - /* - * called holding global lock (read or irq-write) and caller may - * not have dropped this lock between finding conn and calling me, - * so we don't need the {get,put}connsock dance to deref - * ksnc_sock... - */ - LASSERT(!conn->ksnc_closing); - - CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, conn->ksnc_port); - - ksocknal_tx_prep(conn, tx); - - /* - * Ensure the frags we've been given EXACTLY match the number of - * bytes we want to send. Many TCP/IP stacks disregard any total - * size parameters passed to them and just look at the frags. 
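
ksocknal_find_conn_locked() above prefers the typed connection with the smallest send backlog (queued bytes plus unflushed socket buffer) and keeps the best "may match" connection as a fallback. The sketch below shows only that core policy; it omits the round-robin tie-break on ksnc_tx_last_post and the non-blocking special case, and all names are mine:

#include <stdio.h>

enum match { MATCH_NO, MATCH_YES, MATCH_MAY };

struct cand {
	enum match	match;		/* protocol's verdict for this tx */
	int		backlog;	/* queued + unsent socket bytes */
};

/* Return the index of the connection to use, or -1 if none fits. */
static int pick_conn(const struct cand *c, int n)
{
	int typed = -1, fallback = -1, i;

	for (i = 0; i < n; i++) {
		if (c[i].match == MATCH_YES &&
		    (typed < 0 || c[i].backlog < c[typed].backlog))
			typed = i;
		else if (c[i].match == MATCH_MAY &&
			 (fallback < 0 || c[i].backlog < c[fallback].backlog))
			fallback = i;
	}
	return typed >= 0 ? typed : fallback;
}

int main(void)
{
	struct cand conns[] = {
		{ MATCH_MAY, 100 }, { MATCH_YES, 4096 }, { MATCH_YES, 512 },
	};

	printf("picked conn %d\n", pick_conn(conns, 3));	/* prints 2 */
	return 0;
}
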
- * - * We always expect at least 1 mapped fragment containing the - * complete ksocknal message header. - */ - LASSERT(lnet_iov_nob(tx->tx_niov, tx->tx_iov) + - lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) == - (unsigned int)tx->tx_nob); - LASSERT(tx->tx_niov >= 1); - LASSERT(tx->tx_resid == tx->tx_nob); - - CDEBUG(D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n", - tx, (tx->tx_lnetmsg) ? tx->tx_lnetmsg->msg_hdr.type : - KSOCK_MSG_NOOP, - tx->tx_nob, tx->tx_niov, tx->tx_nkiov); - - /* - * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__ - * but they're used inside spinlocks a lot. - */ - bufnob = conn->ksnc_sock->sk->sk_wmem_queued; - spin_lock_bh(&sched->kss_lock); - - if (list_empty(&conn->ksnc_tx_queue) && !bufnob) { - /* First packet starts the timeout */ - conn->ksnc_tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */ - conn->ksnc_peer->ksnp_last_alive = jiffies; - conn->ksnc_tx_bufnob = 0; - mb(); /* order with adding to tx_queue */ - } - - if (msg->ksm_type == KSOCK_MSG_NOOP) { - /* - * The packet is noop ZC ACK, try to piggyback the ack_cookie - * on a normal packet so I don't need to send it - */ - LASSERT(msg->ksm_zc_cookies[1]); - LASSERT(conn->ksnc_proto->pro_queue_tx_zcack); - - /* ZC ACK piggybacked on ztx release tx later */ - if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0)) - ztx = tx; - } else { - /* - * It's a normal packet - can it piggback a noop zc-ack that - * has been queued already? - */ - LASSERT(!msg->ksm_zc_cookies[1]); - LASSERT(conn->ksnc_proto->pro_queue_tx_msg); - - ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx); - /* ztx will be released later */ - } - - if (ztx) { - atomic_sub(ztx->tx_nob, &conn->ksnc_tx_nob); - list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs); - } - - if (conn->ksnc_tx_ready && /* able to send */ - !conn->ksnc_tx_scheduled) { /* not scheduled to send */ - /* +1 ref for scheduler */ - ksocknal_conn_addref(conn); - list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - wake_up(&sched->kss_waitq); - } - - spin_unlock_bh(&sched->kss_lock); -} - -struct ksock_route * -ksocknal_find_connectable_route_locked(struct ksock_peer *peer) -{ - unsigned long now = jiffies; - struct list_head *tmp; - struct ksock_route *route; - - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - LASSERT(!route->ksnr_connecting || route->ksnr_scheduled); - - /* connections being established */ - if (route->ksnr_scheduled) - continue; - - /* all route types connected ? 
*/ - if (!(ksocknal_route_mask() & ~route->ksnr_connected)) - continue; - - if (!(!route->ksnr_retry_interval || /* first attempt */ - time_after_eq(now, route->ksnr_timeout))) { - CDEBUG(D_NET, - "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n", - &route->ksnr_ipaddr, - route->ksnr_connected, - route->ksnr_retry_interval, - (route->ksnr_timeout - now) / HZ); - continue; - } - - return route; - } - - return NULL; -} - -struct ksock_route * -ksocknal_find_connecting_route_locked(struct ksock_peer *peer) -{ - struct list_head *tmp; - struct ksock_route *route; - - list_for_each(tmp, &peer->ksnp_routes) { - route = list_entry(tmp, struct ksock_route, ksnr_list); - - LASSERT(!route->ksnr_connecting || route->ksnr_scheduled); - - if (route->ksnr_scheduled) - return route; - } - - return NULL; -} - -int -ksocknal_launch_packet(struct lnet_ni *ni, struct ksock_tx *tx, - struct lnet_process_id id) -{ - struct ksock_peer *peer; - struct ksock_conn *conn; - rwlock_t *g_lock; - int retry; - int rc; - - LASSERT(!tx->tx_conn); - - g_lock = &ksocknal_data.ksnd_global_lock; - - for (retry = 0;; retry = 1) { - read_lock(g_lock); - peer = ksocknal_find_peer_locked(ni, id); - if (peer) { - if (!ksocknal_find_connectable_route_locked(peer)) { - conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); - if (conn) { - /* - * I've got no routes that need to be - * connecting and I do have an actual - * connection... - */ - ksocknal_queue_tx_locked(tx, conn); - read_unlock(g_lock); - return 0; - } - } - } - - /* I'll need a write lock... */ - read_unlock(g_lock); - - write_lock_bh(g_lock); - - peer = ksocknal_find_peer_locked(ni, id); - if (peer) - break; - - write_unlock_bh(g_lock); - - if (id.pid & LNET_PID_USERFLAG) { - CERROR("Refusing to create a connection to userspace process %s\n", - libcfs_id2str(id)); - return -EHOSTUNREACH; - } - - if (retry) { - CERROR("Can't find peer %s\n", libcfs_id2str(id)); - return -EHOSTUNREACH; - } - - rc = ksocknal_add_peer(ni, id, - LNET_NIDADDR(id.nid), - lnet_acceptor_port()); - if (rc) { - CERROR("Can't add peer %s: %d\n", - libcfs_id2str(id), rc); - return rc; - } - } - - ksocknal_launch_all_connections_locked(peer); - - conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); - if (conn) { - /* Connection exists; queue message on it */ - ksocknal_queue_tx_locked(tx, conn); - write_unlock_bh(g_lock); - return 0; - } - - if (peer->ksnp_accepting > 0 || - ksocknal_find_connecting_route_locked(peer)) { - /* the message is going to be pinned to the peer */ - tx->tx_deadline = - jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - - /* Queue the message until a connection is established */ - list_add_tail(&tx->tx_list, &peer->ksnp_tx_queue); - write_unlock_bh(g_lock); - return 0; - } - - write_unlock_bh(g_lock); - - /* NB Routes may be ignored if connections to them failed recently */ - CNETERR("No usable routes to %s\n", libcfs_id2str(id)); - return -EHOSTUNREACH; -} - -int -ksocknal_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) -{ - unsigned int mpflag = 0; - int type = lntmsg->msg_type; - struct lnet_process_id target = lntmsg->msg_target; - unsigned int payload_niov = lntmsg->msg_niov; - struct kvec *payload_iov = lntmsg->msg_iov; - struct bio_vec *payload_kiov = lntmsg->msg_kiov; - unsigned int payload_offset = lntmsg->msg_offset; - unsigned int payload_nob = lntmsg->msg_len; - struct ksock_tx *tx; - int desc_size; - int rc; - - /* - * NB 'private' is different depending on what we're sending. - * Just ignore it... 
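
ksocknal_launch_packet() above follows a classic read-mostly locking pattern: look the peer up under the read lock on the fast path, and only when it is missing take the write lock, create it, and retry the lookup once. A much-reduced pthreads sketch of that shape (lookup_peer/create_peer are stand-ins, and all the conn/route logic is elided):

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t g_lock = PTHREAD_RWLOCK_INITIALIZER;
static bool peer_exists;	/* stand-in for the peer hash table */

static bool lookup_peer(void) { return peer_exists; }
static void create_peer(void) { peer_exists = true; }

static bool queue_on_peer(void)
{
	int retry;

	for (retry = 0; ; retry++) {
		bool found;

		pthread_rwlock_rdlock(&g_lock);
		found = lookup_peer();
		pthread_rwlock_unlock(&g_lock);
		if (found)
			return true;	/* fast path: readers only */

		if (retry)		/* already created one last pass */
			return false;

		pthread_rwlock_wrlock(&g_lock);
		if (!lookup_peer())	/* re-check under the write lock */
			create_peer();
		pthread_rwlock_unlock(&g_lock);
	}
}

int main(void)
{
	return queue_on_peer() ? 0 : 1;
}
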
- */ - CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", - payload_nob, payload_niov, libcfs_id2str(target)); - - LASSERT(!payload_nob || payload_niov > 0); - LASSERT(payload_niov <= LNET_MAX_IOV); - /* payload is either all vaddrs or all pages */ - LASSERT(!(payload_kiov && payload_iov)); - LASSERT(!in_interrupt()); - - if (payload_iov) - desc_size = offsetof(struct ksock_tx, - tx_frags.virt.iov[1 + payload_niov]); - else - desc_size = offsetof(struct ksock_tx, - tx_frags.paged.kiov[payload_niov]); - - if (lntmsg->msg_vmflush) - mpflag = memalloc_noreclaim_save(); - tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); - if (!tx) { - CERROR("Can't allocate tx desc type %d size %d\n", - type, desc_size); - if (lntmsg->msg_vmflush) - memalloc_noreclaim_restore(mpflag); - return -ENOMEM; - } - - tx->tx_conn = NULL; /* set when assigned a conn */ - tx->tx_lnetmsg = lntmsg; - - if (payload_iov) { - tx->tx_kiov = NULL; - tx->tx_nkiov = 0; - tx->tx_iov = tx->tx_frags.virt.iov; - tx->tx_niov = 1 + - lnet_extract_iov(payload_niov, &tx->tx_iov[1], - payload_niov, payload_iov, - payload_offset, payload_nob); - } else { - tx->tx_niov = 1; - tx->tx_iov = &tx->tx_frags.paged.iov; - tx->tx_kiov = tx->tx_frags.paged.kiov; - tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, - payload_niov, payload_kiov, - payload_offset, payload_nob); - - if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) - tx->tx_zc_capable = 1; - } - - tx->tx_msg.ksm_csum = 0; - tx->tx_msg.ksm_type = KSOCK_MSG_LNET; - tx->tx_msg.ksm_zc_cookies[0] = 0; - tx->tx_msg.ksm_zc_cookies[1] = 0; - - /* The first fragment will be set later in pro_pack */ - rc = ksocknal_launch_packet(ni, tx, target); - if (mpflag) - memalloc_noreclaim_restore(mpflag); - - if (!rc) - return 0; - - ksocknal_free_tx(tx); - return -EIO; -} - -int -ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) -{ - struct task_struct *task = kthread_run(fn, arg, "%s", name); - - if (IS_ERR(task)) - return PTR_ERR(task); - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nthreads++; - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - return 0; -} - -void -ksocknal_thread_fini(void) -{ - write_lock_bh(&ksocknal_data.ksnd_global_lock); - ksocknal_data.ksnd_nthreads--; - write_unlock_bh(&ksocknal_data.ksnd_global_lock); -} - -int -ksocknal_new_packet(struct ksock_conn *conn, int nob_to_skip) -{ - static char ksocknal_slop_buffer[4096]; - struct kvec *kvec = conn->ksnc_rx_iov_space; - - int nob; - unsigned int niov; - int skipped; - - LASSERT(conn->ksnc_proto); - - if (*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) { - /* Remind the socket to ack eagerly... 
*/ - ksocknal_lib_eager_ack(conn); - } - - if (!nob_to_skip) { /* right at next packet boundary now */ - conn->ksnc_rx_started = 0; - mb(); /* racing with timeout thread */ - - switch (conn->ksnc_proto->pro_version) { - case KSOCK_PROTO_V2: - case KSOCK_PROTO_V3: - conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; - kvec->iov_base = &conn->ksnc_msg; - kvec->iov_len = offsetof(struct ksock_msg, ksm_u); - conn->ksnc_rx_nob_left = offsetof(struct ksock_msg, ksm_u); - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, - 1, offsetof(struct ksock_msg, ksm_u)); - break; - - case KSOCK_PROTO_V1: - /* Receiving bare struct lnet_hdr */ - conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; - kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; - kvec->iov_len = sizeof(struct lnet_hdr); - conn->ksnc_rx_nob_left = sizeof(struct lnet_hdr); - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, - 1, sizeof(struct lnet_hdr)); - break; - - default: - LBUG(); - } - conn->ksnc_rx_csum = ~0; - return 1; - } - - /* - * Set up to skip as much as possible now. If there's more left - * (ran out of iov entries) we'll get called again - */ - conn->ksnc_rx_state = SOCKNAL_RX_SLOP; - conn->ksnc_rx_nob_left = nob_to_skip; - skipped = 0; - niov = 0; - - do { - nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); - - kvec[niov].iov_base = ksocknal_slop_buffer; - kvec[niov].iov_len = nob; - niov++; - skipped += nob; - nob_to_skip -= nob; - - } while (nob_to_skip && /* mustn't overflow conn's rx iov */ - niov < sizeof(conn->ksnc_rx_iov_space) / sizeof(struct iovec)); - - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, niov, skipped); - return 0; -} - -static int -ksocknal_process_receive(struct ksock_conn *conn) -{ - struct kvec *kvec = conn->ksnc_rx_iov_space; - struct lnet_hdr *lhdr; - struct lnet_process_id *id; - int rc; - - LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); - - /* NB: sched lock NOT held */ - /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ - LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || - conn->ksnc_rx_state == SOCKNAL_RX_SLOP); - again: - if (iov_iter_count(&conn->ksnc_rx_to)) { - rc = ksocknal_receive(conn); - - if (rc <= 0) { - LASSERT(rc != -EAGAIN); - - if (!rc) - CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n", - conn, - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - else if (!conn->ksnc_closing) - CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n", - conn, rc, - libcfs_id2str(conn->ksnc_peer->ksnp_id), - &conn->ksnc_ipaddr, - conn->ksnc_port); - - /* it's not an error if conn is being closed */ - ksocknal_close_conn_and_siblings(conn, - (conn->ksnc_closing) ? 0 : rc); - return (!rc ? 
-ESHUTDOWN : rc); - } - - if (iov_iter_count(&conn->ksnc_rx_to)) { - /* short read */ - return -EAGAIN; - } - } - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_KSM_HEADER: - if (conn->ksnc_flip) { - __swab32s(&conn->ksnc_msg.ksm_type); - __swab32s(&conn->ksnc_msg.ksm_csum); - __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); - __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); - } - - if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && - conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { - CERROR("%s: Unknown message type: %x\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - conn->ksnc_msg.ksm_type); - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, -EPROTO); - return -EPROTO; - } - - if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && - conn->ksnc_msg.ksm_csum && /* has checksum */ - conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { - /* NOOP Checksum error */ - CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, -EPROTO); - return -EIO; - } - - if (conn->ksnc_msg.ksm_zc_cookies[1]) { - __u64 cookie = 0; - - LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); - - if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) - cookie = conn->ksnc_msg.ksm_zc_cookies[0]; - - rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, - conn->ksnc_msg.ksm_zc_cookies[1]); - - if (rc) { - CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - cookie, conn->ksnc_msg.ksm_zc_cookies[1]); - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, -EPROTO); - return rc; - } - } - - if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { - ksocknal_new_packet(conn, 0); - return 0; /* NOOP is done and just return */ - } - - conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; - conn->ksnc_rx_nob_left = sizeof(struct ksock_lnet_msg); - - kvec->iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; - kvec->iov_len = sizeof(struct ksock_lnet_msg); - - iov_iter_kvec(&conn->ksnc_rx_to, READ|ITER_KVEC, kvec, - 1, sizeof(struct ksock_lnet_msg)); - - goto again; /* read lnet header now */ - - case SOCKNAL_RX_LNET_HEADER: - /* unpack message header */ - conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); - - if (conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) { - /* Userspace peer */ - lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; - id = &conn->ksnc_peer->ksnp_id; - - /* Substitute process ID assigned at connection time */ - lhdr->src_pid = cpu_to_le32(id->pid); - lhdr->src_nid = cpu_to_le64(id->nid); - } - - conn->ksnc_rx_state = SOCKNAL_RX_PARSE; - ksocknal_conn_addref(conn); /* ++ref while parsing */ - - rc = lnet_parse(conn->ksnc_peer->ksnp_ni, - &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, - conn->ksnc_peer->ksnp_id.nid, conn, 0); - if (rc < 0) { - /* I just received garbage: give up on this conn */ - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, rc); - ksocknal_conn_decref(conn); - return -EPROTO; - } - - /* I'm racing with ksocknal_recv() */ - LASSERT(conn->ksnc_rx_state == SOCKNAL_RX_PARSE || - conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); - - if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) - return 0; - - /* ksocknal_recv() got called */ - goto again; - - case SOCKNAL_RX_LNET_PAYLOAD: - /* payload all received */ - rc = 0; - - if (!conn->ksnc_rx_nob_left && /* not truncating */ - conn->ksnc_msg.ksm_csum && /* has checksum */ - conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { - CERROR("%s: Checksum 
error, wire:0x%08X data:0x%08X\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), - conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); - rc = -EIO; - } - - if (!rc && conn->ksnc_msg.ksm_zc_cookies[0]) { - LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); - - lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; - id = &conn->ksnc_peer->ksnp_id; - - rc = conn->ksnc_proto->pro_handle_zcreq(conn, - conn->ksnc_msg.ksm_zc_cookies[0], - *ksocknal_tunables.ksnd_nonblk_zcack || - le64_to_cpu(lhdr->src_nid) != id->nid); - } - - lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); - - if (rc) { - ksocknal_new_packet(conn, 0); - ksocknal_close_conn_and_siblings(conn, rc); - return -EPROTO; - } - /* Fall through */ - - case SOCKNAL_RX_SLOP: - /* starting new packet? */ - if (ksocknal_new_packet(conn, conn->ksnc_rx_nob_left)) - return 0; /* come back later */ - goto again; /* try to finish reading slop now */ - - default: - break; - } - - /* Not Reached */ - LBUG(); - return -EINVAL; /* keep gcc happy */ -} - -int -ksocknal_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, - int delayed, struct iov_iter *to, unsigned int rlen) -{ - struct ksock_conn *conn = private; - struct ksock_sched *sched = conn->ksnc_scheduler; - - LASSERT(iov_iter_count(to) <= rlen); - LASSERT(to->nr_segs <= LNET_MAX_IOV); - - conn->ksnc_cookie = msg; - conn->ksnc_rx_nob_left = rlen; - - conn->ksnc_rx_to = *to; - - LASSERT(conn->ksnc_rx_scheduled); - - spin_lock_bh(&sched->kss_lock); - - switch (conn->ksnc_rx_state) { - case SOCKNAL_RX_PARSE_WAIT: - list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); - wake_up(&sched->kss_waitq); - LASSERT(conn->ksnc_rx_ready); - break; - - case SOCKNAL_RX_PARSE: - /* scheduler hasn't noticed I'm parsing yet */ - break; - } - - conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; - - spin_unlock_bh(&sched->kss_lock); - ksocknal_conn_decref(conn); - return 0; -} - -static inline int -ksocknal_sched_cansleep(struct ksock_sched *sched) -{ - int rc; - - spin_lock_bh(&sched->kss_lock); - - rc = !ksocknal_data.ksnd_shuttingdown && - list_empty(&sched->kss_rx_conns) && - list_empty(&sched->kss_tx_conns); - - spin_unlock_bh(&sched->kss_lock); - return rc; -} - -int ksocknal_scheduler(void *arg) -{ - struct ksock_sched_info *info; - struct ksock_sched *sched; - struct ksock_conn *conn; - struct ksock_tx *tx; - int rc; - int nloops = 0; - long id = (long)arg; - - info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; - sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; - - rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); - if (rc) { - CWARN("Can't set CPU partition affinity to %d: %d\n", - info->ksi_cpt, rc); - } - - spin_lock_bh(&sched->kss_lock); - - while (!ksocknal_data.ksnd_shuttingdown) { - int did_something = 0; - - /* Ensure I progress everything semi-fairly */ - - if (!list_empty(&sched->kss_rx_conns)) { - conn = list_entry(sched->kss_rx_conns.next, - struct ksock_conn, ksnc_rx_list); - list_del(&conn->ksnc_rx_list); - - LASSERT(conn->ksnc_rx_scheduled); - LASSERT(conn->ksnc_rx_ready); - - /* - * clear rx_ready in case receive isn't complete. - * Do it BEFORE we call process_recv, since - * data_ready can set it any time after we release - * kss_lock. - */ - conn->ksnc_rx_ready = 0; - spin_unlock_bh(&sched->kss_lock); - - rc = ksocknal_process_receive(conn); - - spin_lock_bh(&sched->kss_lock); - - /* I'm the only one that can clear this flag */ - LASSERT(conn->ksnc_rx_scheduled); - - /* Did process_receive get everything it wanted? 
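- *
- * (Editorial note: the rx_ready handling around this call is the
- * classic "clear the flag before doing the work" wakeup protocol.
- * In miniature, with C11 atomics and hypothetical names:
- *
- *	atomic_store(&ready, 0);	// clear BEFORE processing
- *	process();			// data_ready may fire here
- *	if (atomic_load(&ready))	// it did: go round again
- *		requeue();
- *
- * Clearing first means a callback that fires mid-processing is
- * never lost; at worst the connection is requeued once more than
- * strictly necessary.)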
*/ - if (!rc) - conn->ksnc_rx_ready = 1; - - if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { - /* - * Conn blocked waiting for ksocknal_recv() - * I change its state (under lock) to signal - * it can be rescheduled - */ - conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; - } else if (conn->ksnc_rx_ready) { - /* reschedule for rx */ - list_add_tail(&conn->ksnc_rx_list, - &sched->kss_rx_conns); - } else { - conn->ksnc_rx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } - - did_something = 1; - } - - if (!list_empty(&sched->kss_tx_conns)) { - LIST_HEAD(zlist); - - if (!list_empty(&sched->kss_zombie_noop_txs)) { - list_add(&zlist, &sched->kss_zombie_noop_txs); - list_del_init(&sched->kss_zombie_noop_txs); - } - - conn = list_entry(sched->kss_tx_conns.next, - struct ksock_conn, ksnc_tx_list); - list_del(&conn->ksnc_tx_list); - - LASSERT(conn->ksnc_tx_scheduled); - LASSERT(conn->ksnc_tx_ready); - LASSERT(!list_empty(&conn->ksnc_tx_queue)); - - tx = list_entry(conn->ksnc_tx_queue.next, - struct ksock_tx, tx_list); - - if (conn->ksnc_tx_carrier == tx) - ksocknal_next_tx_carrier(conn); - - /* dequeue now so empty list => more to send */ - list_del(&tx->tx_list); - - /* - * Clear tx_ready in case send isn't complete. Do - * it BEFORE we call process_transmit, since - * write_space can set it any time after we release - * kss_lock. - */ - conn->ksnc_tx_ready = 0; - spin_unlock_bh(&sched->kss_lock); - - if (!list_empty(&zlist)) { - /* - * free zombie noop txs, it's fast because - * noop txs are just put in freelist - */ - ksocknal_txlist_done(NULL, &zlist, 0); - } - - rc = ksocknal_process_transmit(conn, tx); - - if (rc == -ENOMEM || rc == -EAGAIN) { - /* - * Incomplete send: replace tx on HEAD of - * tx_queue - */ - spin_lock_bh(&sched->kss_lock); - list_add(&tx->tx_list, &conn->ksnc_tx_queue); - } else { - /* Complete send; tx -ref */ - ksocknal_tx_decref(tx); - - spin_lock_bh(&sched->kss_lock); - /* assume space for more */ - conn->ksnc_tx_ready = 1; - } - - if (rc == -ENOMEM) { - /* - * Do nothing; after a short timeout, this - * conn will be reposted on kss_tx_conns. - */ - } else if (conn->ksnc_tx_ready && - !list_empty(&conn->ksnc_tx_queue)) { - /* reschedule for tx */ - list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - } else { - conn->ksnc_tx_scheduled = 0; - /* drop my ref */ - ksocknal_conn_decref(conn); - } - - did_something = 1; - } - if (!did_something || /* nothing to do */ - ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? */ - spin_unlock_bh(&sched->kss_lock); - - nloops = 0; - - if (!did_something) { /* wait for something to do */ - rc = wait_event_interruptible_exclusive( - sched->kss_waitq, - !ksocknal_sched_cansleep(sched)); - LASSERT(!rc); - } else { - cond_resched(); - } - - spin_lock_bh(&sched->kss_lock); - } - } - - spin_unlock_bh(&sched->kss_lock); - ksocknal_thread_fini(); - return 0; -} - -/* - * Add connection to kss_rx_conns of scheduler - * and wakeup the scheduler. - */ -void ksocknal_read_callback(struct ksock_conn *conn) -{ - struct ksock_sched *sched; - - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - - conn->ksnc_rx_ready = 1; - - if (!conn->ksnc_rx_scheduled) { /* not being progressed */ - list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); - conn->ksnc_rx_scheduled = 1; - /* extra ref for scheduler */ - ksocknal_conn_addref(conn); - - wake_up(&sched->kss_waitq); - } - spin_unlock_bh(&sched->kss_lock); -} - -/* - * Add connection to kss_tx_conns of scheduler - * and wakeup the scheduler. 
- */ -void ksocknal_write_callback(struct ksock_conn *conn) -{ - struct ksock_sched *sched; - - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - - conn->ksnc_tx_ready = 1; - - if (!conn->ksnc_tx_scheduled && /* not being progressed */ - !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ - list_add_tail(&conn->ksnc_tx_list, &sched->kss_tx_conns); - conn->ksnc_tx_scheduled = 1; - /* extra ref for scheduler */ - ksocknal_conn_addref(conn); - - wake_up(&sched->kss_waitq); - } - - spin_unlock_bh(&sched->kss_lock); -} - -static struct ksock_proto * -ksocknal_parse_proto_version(struct ksock_hello_msg *hello) -{ - __u32 version = 0; - - if (hello->kshm_magic == LNET_PROTO_MAGIC) - version = hello->kshm_version; - else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) - version = __swab32(hello->kshm_version); - - if (version) { -#if SOCKNAL_VERSION_DEBUG - if (*ksocknal_tunables.ksnd_protocol == 1) - return NULL; - - if (*ksocknal_tunables.ksnd_protocol == 2 && - version == KSOCK_PROTO_V3) - return NULL; -#endif - if (version == KSOCK_PROTO_V2) - return &ksocknal_protocol_v2x; - - if (version == KSOCK_PROTO_V3) - return &ksocknal_protocol_v3x; - - return NULL; - } - - if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { - struct lnet_magicversion *hmv = (struct lnet_magicversion *)hello; - - BUILD_BUG_ON(sizeof(struct lnet_magicversion) != - offsetof(struct ksock_hello_msg, kshm_src_nid)); - - if (hmv->version_major == cpu_to_le16(KSOCK_PROTO_V1_MAJOR) && - hmv->version_minor == cpu_to_le16(KSOCK_PROTO_V1_MINOR)) - return &ksocknal_protocol_v1x; - } - - return NULL; -} - -int -ksocknal_send_hello(struct lnet_ni *ni, struct ksock_conn *conn, - lnet_nid_t peer_nid, struct ksock_hello_msg *hello) -{ - /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ - struct ksock_net *net = (struct ksock_net *)ni->ni_data; - - LASSERT(hello->kshm_nips <= LNET_MAX_INTERFACES); - - /* rely on caller to hold a ref on socket so it wouldn't disappear */ - LASSERT(conn->ksnc_proto); - - hello->kshm_src_nid = ni->ni_nid; - hello->kshm_dst_nid = peer_nid; - hello->kshm_src_pid = the_lnet.ln_pid; - - hello->kshm_src_incarnation = net->ksnn_incarnation; - hello->kshm_ctype = conn->ksnc_type; - - return conn->ksnc_proto->pro_send_hello(conn, hello); -} - -static int -ksocknal_invert_type(int type) -{ - switch (type) { - case SOCKLND_CONN_ANY: - case SOCKLND_CONN_CONTROL: - return type; - case SOCKLND_CONN_BULK_IN: - return SOCKLND_CONN_BULK_OUT; - case SOCKLND_CONN_BULK_OUT: - return SOCKLND_CONN_BULK_IN; - default: - return SOCKLND_CONN_NONE; - } -} - -int -ksocknal_recv_hello(struct lnet_ni *ni, struct ksock_conn *conn, - struct ksock_hello_msg *hello, - struct lnet_process_id *peerid, - __u64 *incarnation) -{ - /* Return < 0 fatal error - * 0 success - * EALREADY lost connection race - * EPROTO protocol version mismatch - */ - struct socket *sock = conn->ksnc_sock; - int active = !!conn->ksnc_proto; - int timeout; - int proto_match; - int rc; - struct ksock_proto *proto; - struct lnet_process_id recv_id; - - /* socket type set on active connections - not set on passive */ - LASSERT(!active == !(conn->ksnc_type != SOCKLND_CONN_NONE)); - - timeout = active ? 
*ksocknal_tunables.ksnd_timeout :
-			 lnet_acceptor_timeout();
-
-	rc = lnet_sock_read(sock, &hello->kshm_magic,
-			    sizeof(hello->kshm_magic), timeout);
-	if (rc) {
-		CERROR("Error %d reading HELLO from %pI4h\n",
-		       rc, &conn->ksnc_ipaddr);
-		LASSERT(rc < 0);
-		return rc;
-	}
-
-	if (hello->kshm_magic != LNET_PROTO_MAGIC &&
-	    hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
-	    hello->kshm_magic != le32_to_cpu(LNET_PROTO_TCP_MAGIC)) {
-		/* Unexpected magic! */
-		CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n",
-		       __cpu_to_le32(hello->kshm_magic),
-		       LNET_PROTO_TCP_MAGIC,
-		       &conn->ksnc_ipaddr);
-		return -EPROTO;
-	}
-
-	rc = lnet_sock_read(sock, &hello->kshm_version,
-			    sizeof(hello->kshm_version), timeout);
-	if (rc) {
-		CERROR("Error %d reading HELLO from %pI4h\n",
-		       rc, &conn->ksnc_ipaddr);
-		LASSERT(rc < 0);
-		return rc;
-	}
-
-	proto = ksocknal_parse_proto_version(hello);
-	if (!proto) {
-		if (!active) {
-			/* unknown protocol from peer, tell peer my protocol */
-			conn->ksnc_proto = &ksocknal_protocol_v3x;
-#if SOCKNAL_VERSION_DEBUG
-			if (*ksocknal_tunables.ksnd_protocol == 2)
-				conn->ksnc_proto = &ksocknal_protocol_v2x;
-			else if (*ksocknal_tunables.ksnd_protocol == 1)
-				conn->ksnc_proto = &ksocknal_protocol_v1x;
-#endif
-			hello->kshm_nips = 0;
-			ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
-		}
-
-		CERROR("Unknown protocol version (%d.x expected) from %pI4h\n",
-		       conn->ksnc_proto->pro_version,
-		       &conn->ksnc_ipaddr);
-
-		return -EPROTO;
-	}
-
-	proto_match = (conn->ksnc_proto == proto);
-	conn->ksnc_proto = proto;
-
-	/* receive the rest of hello message anyway */
-	rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
-	if (rc) {
-		CERROR("Error %d reading or checking hello from %pI4h\n",
-		       rc, &conn->ksnc_ipaddr);
-		LASSERT(rc < 0);
-		return rc;
-	}
-
-	*incarnation = hello->kshm_src_incarnation;
-
-	if (hello->kshm_src_nid == LNET_NID_ANY) {
-		CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n",
-		       &conn->ksnc_ipaddr);
-		return -EPROTO;
-	}
-
-	if (!active &&
-	    conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
-		/* Userspace NAL assigns peer process ID from socket */
-		recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
-		recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid),
-					 conn->ksnc_ipaddr);
-	} else {
-		recv_id.nid = hello->kshm_src_nid;
-		recv_id.pid = hello->kshm_src_pid;
-	}
-
-	if (!active) {
-		*peerid = recv_id;
-
-		/* peer determines type */
-		conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
-		if (conn->ksnc_type == SOCKLND_CONN_NONE) {
-			CERROR("Unexpected type %d from %s ip %pI4h\n",
-			       hello->kshm_ctype, libcfs_id2str(*peerid),
-			       &conn->ksnc_ipaddr);
-			return -EPROTO;
-		}
-
-		return 0;
-	}
-
-	if (peerid->pid != recv_id.pid ||
-	    peerid->nid != recv_id.nid) {
-		LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please check your Lustre configuration.\n",
-				   libcfs_id2str(*peerid),
-				   &conn->ksnc_ipaddr,
-				   libcfs_id2str(recv_id));
-		return -EPROTO;
-	}
-
-	if (hello->kshm_ctype == SOCKLND_CONN_NONE) {
-		/* Possible protocol mismatch or I lost the connection race */
-		return proto_match ?
EALREADY : EPROTO; - } - - if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { - CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", - conn->ksnc_type, libcfs_id2str(*peerid), - &conn->ksnc_ipaddr, hello->kshm_ctype); - return -EPROTO; - } - - return 0; -} - -static int -ksocknal_connect(struct ksock_route *route) -{ - LIST_HEAD(zombies); - struct ksock_peer *peer = route->ksnr_peer; - int type; - int wanted; - struct socket *sock; - unsigned long deadline; - int retry_later = 0; - int rc = 0; - - deadline = jiffies + *ksocknal_tunables.ksnd_timeout * HZ; - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - LASSERT(route->ksnr_scheduled); - LASSERT(!route->ksnr_connecting); - - route->ksnr_connecting = 1; - - for (;;) { - wanted = ksocknal_route_mask() & ~route->ksnr_connected; - - /* - * stop connecting if peer/route got closed under me, or - * route got connected while queued - */ - if (peer->ksnp_closing || route->ksnr_deleted || - !wanted) { - retry_later = 0; - break; - } - - /* reschedule if peer is connecting to me */ - if (peer->ksnp_accepting > 0) { - CDEBUG(D_NET, - "peer %s(%d) already connecting to me, retry later.\n", - libcfs_nid2str(peer->ksnp_id.nid), - peer->ksnp_accepting); - retry_later = 1; - } - - if (retry_later) /* needs reschedule */ - break; - - if (wanted & BIT(SOCKLND_CONN_ANY)) { - type = SOCKLND_CONN_ANY; - } else if (wanted & BIT(SOCKLND_CONN_CONTROL)) { - type = SOCKLND_CONN_CONTROL; - } else if (wanted & BIT(SOCKLND_CONN_BULK_IN)) { - type = SOCKLND_CONN_BULK_IN; - } else { - LASSERT(wanted & BIT(SOCKLND_CONN_BULK_OUT)); - type = SOCKLND_CONN_BULK_OUT; - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - - if (time_after_eq(jiffies, deadline)) { - rc = -ETIMEDOUT; - lnet_connect_console_error(rc, peer->ksnp_id.nid, - route->ksnr_ipaddr, - route->ksnr_port); - goto failed; - } - - rc = lnet_connect(&sock, peer->ksnp_id.nid, - route->ksnr_myipaddr, - route->ksnr_ipaddr, route->ksnr_port); - if (rc) - goto failed; - - rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); - if (rc < 0) { - lnet_connect_console_error(rc, peer->ksnp_id.nid, - route->ksnr_ipaddr, - route->ksnr_port); - goto failed; - } - - /* - * A +ve RC means I have to retry because I lost the connection - * race or I have to renegotiate protocol version - */ - retry_later = (rc); - if (retry_later) - CDEBUG(D_NET, "peer %s: conn race, retry later.\n", - libcfs_nid2str(peer->ksnp_id.nid)); - - write_lock_bh(&ksocknal_data.ksnd_global_lock); - } - - route->ksnr_scheduled = 0; - route->ksnr_connecting = 0; - - if (retry_later) { - /* - * re-queue for attention; this frees me up to handle - * the peer's incoming connection request - */ - if (rc == EALREADY || - (!rc && peer->ksnp_accepting > 0)) { - /* - * We want to introduce a delay before next - * attempt to connect if we lost conn race, - * but the race is resolved quickly usually, - * so min_reconnectms should be good heuristic - */ - route->ksnr_retry_interval = - *ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000; - route->ksnr_timeout = jiffies + route->ksnr_retry_interval; - } - - ksocknal_launch_connection_locked(route); - } - - write_unlock_bh(&ksocknal_data.ksnd_global_lock); - return retry_later; - - failed: - write_lock_bh(&ksocknal_data.ksnd_global_lock); - - route->ksnr_scheduled = 0; - route->ksnr_connecting = 0; - - /* This is a retry rather than a new connection */ - route->ksnr_retry_interval *= 2; - route->ksnr_retry_interval = - max(route->ksnr_retry_interval, - 
(long)*ksocknal_tunables.ksnd_min_reconnectms * HZ / 1000);
-	route->ksnr_retry_interval =
-		min(route->ksnr_retry_interval,
-		    (long)*ksocknal_tunables.ksnd_max_reconnectms * HZ / 1000);
-
-	LASSERT(route->ksnr_retry_interval);
-	route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
-
-	if (!list_empty(&peer->ksnp_tx_queue) &&
-	    !peer->ksnp_accepting &&
-	    !ksocknal_find_connecting_route_locked(peer)) {
-		struct ksock_conn *conn;
-
-		/*
-		 * ksnp_tx_queue is queued on a conn on successful
-		 * connection for V1.x and V2.x
-		 */
-		if (!list_empty(&peer->ksnp_conns)) {
-			conn = list_entry(peer->ksnp_conns.next,
-					  struct ksock_conn, ksnc_list);
-			LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x);
-		}
-
-		/*
-		 * take all the blocked packets while I've got the lock and
-		 * complete below...
-		 */
-		list_splice_init(&peer->ksnp_tx_queue, &zombies);
-	}
-
-	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
-	ksocknal_peer_failed(peer);
-	ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
-	return 0;
-}
-
-/*
- * Check whether we need to create more connds.
- * Try to create a new thread if necessary; @timeout may be updated
- * if thread creation fails, so the caller won't keep retrying while
- * short of resources.
- */
-static int
-ksocknal_connd_check_start(time64_t sec, long *timeout)
-{
-	char name[16];
-	int rc;
-	int total = ksocknal_data.ksnd_connd_starting +
-		    ksocknal_data.ksnd_connd_running;
-
-	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
-		/* still initializing */
-		return 0;
-	}
-
-	if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
-	    total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
-		/*
-		 * can't create more connds, or still have enough
-		 * threads to handle more connecting
-		 */
-		return 0;
-	}
-
-	if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
-		/* no pending connecting request */
-		return 0;
-	}
-
-	if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
-		/* may have run out of resources, retry later */
-		*timeout = HZ;
-		return 0;
-	}
-
-	if (ksocknal_data.ksnd_connd_starting > 0) {
-		/* serialize starting to avoid flood */
-		return 0;
-	}
-
-	ksocknal_data.ksnd_connd_starting_stamp = sec;
-	ksocknal_data.ksnd_connd_starting++;
-	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
-
-	/* NB: total is the next id */
-	snprintf(name, sizeof(name), "socknal_cd%02d", total);
-	rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
-
-	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
-	if (!rc)
-		return 1;
-
-	/* we tried ... */
-	LASSERT(ksocknal_data.ksnd_connd_starting > 0);
-	ksocknal_data.ksnd_connd_starting--;
-	ksocknal_data.ksnd_connd_failed_stamp = ktime_get_real_seconds();
-
-	return 1;
-}
-
-/*
- * Check whether the current thread can exit: return 1 if there are too
- * many threads and no thread has been created in the past 120 seconds.
- * This function may also update @timeout to make the caller come back
- * and recheck these conditions.
- */
-static int
-ksocknal_connd_check_stop(time64_t sec, long *timeout)
-{
-	int val;
-
-	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
-		/* still initializing */
-		return 0;
-	}
-
-	if (ksocknal_data.ksnd_connd_starting > 0) {
-		/* still starting a new thread */
-		return 0;
-	}
-
-	if (ksocknal_data.ksnd_connd_running <=
-	    *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */
-		return 0;
-	}
-
-	/* created a thread in the past 120 seconds? */
-	val = (int)(ksocknal_data.ksnd_connd_starting_stamp +
-		    SOCKNAL_CONND_TIMEOUT - sec);
-
-	*timeout = (val > 0) ?
val * HZ : - SOCKNAL_CONND_TIMEOUT * HZ; - if (val > 0) - return 0; - - /* no creating in past 120 seconds */ - - return ksocknal_data.ksnd_connd_running > - ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; -} - -/* - * Go through connd_routes queue looking for a route that we can process - * right now, @timeout_p can be updated if we need to come back later - */ -static struct ksock_route * -ksocknal_connd_get_route_locked(signed long *timeout_p) -{ - struct ksock_route *route; - unsigned long now; - - now = jiffies; - - /* connd_routes can contain both pending and ordinary routes */ - list_for_each_entry(route, &ksocknal_data.ksnd_connd_routes, - ksnr_connd_list) { - if (!route->ksnr_retry_interval || - time_after_eq(now, route->ksnr_timeout)) - return route; - - if (*timeout_p == MAX_SCHEDULE_TIMEOUT || - (int)*timeout_p > (int)(route->ksnr_timeout - now)) - *timeout_p = (int)(route->ksnr_timeout - now); - } - - return NULL; -} - -int -ksocknal_connd(void *arg) -{ - spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; - struct ksock_connreq *cr; - wait_queue_entry_t wait; - int nloops = 0; - int cons_retry = 0; - - init_waitqueue_entry(&wait, current); - - spin_lock_bh(connd_lock); - - LASSERT(ksocknal_data.ksnd_connd_starting > 0); - ksocknal_data.ksnd_connd_starting--; - ksocknal_data.ksnd_connd_running++; - - while (!ksocknal_data.ksnd_shuttingdown) { - struct ksock_route *route = NULL; - time64_t sec = ktime_get_real_seconds(); - long timeout = MAX_SCHEDULE_TIMEOUT; - int dropped_lock = 0; - - if (ksocknal_connd_check_stop(sec, &timeout)) { - /* wakeup another one to check stop */ - wake_up(&ksocknal_data.ksnd_connd_waitq); - break; - } - - if (ksocknal_connd_check_start(sec, &timeout)) { - /* created new thread */ - dropped_lock = 1; - } - - if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { - /* Connection accepted by the listener */ - cr = list_entry(ksocknal_data.ksnd_connd_connreqs.next, - struct ksock_connreq, ksncr_list); - - list_del(&cr->ksncr_list); - spin_unlock_bh(connd_lock); - dropped_lock = 1; - - ksocknal_create_conn(cr->ksncr_ni, NULL, - cr->ksncr_sock, SOCKLND_CONN_NONE); - lnet_ni_decref(cr->ksncr_ni); - kfree(cr); - - spin_lock_bh(connd_lock); - } - - /* - * Only handle an outgoing connection request if there - * is a thread left to handle incoming connections and - * create new connd - */ - if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV < - ksocknal_data.ksnd_connd_running) { - route = ksocknal_connd_get_route_locked(&timeout); - } - if (route) { - list_del(&route->ksnr_connd_list); - ksocknal_data.ksnd_connd_connecting++; - spin_unlock_bh(connd_lock); - dropped_lock = 1; - - if (ksocknal_connect(route)) { - /* consecutive retry */ - if (cons_retry++ > SOCKNAL_INSANITY_RECONN) { - CWARN("massive consecutive re-connecting to %pI4h\n", - &route->ksnr_ipaddr); - cons_retry = 0; - } - } else { - cons_retry = 0; - } - - ksocknal_route_decref(route); - - spin_lock_bh(connd_lock); - ksocknal_data.ksnd_connd_connecting--; - } - - if (dropped_lock) { - if (++nloops < SOCKNAL_RESCHED) - continue; - spin_unlock_bh(connd_lock); - nloops = 0; - cond_resched(); - spin_lock_bh(connd_lock); - continue; - } - - /* Nothing to do for 'timeout' */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, - &wait); - spin_unlock_bh(connd_lock); - - nloops = 0; - schedule_timeout(timeout); - - remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait); - spin_lock_bh(connd_lock); - } - 
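-	/*
-	 * (Editorial note: the connd pool driven by this loop is
-	 * elastic. Condensing the predicates of the check_start and
-	 * check_stop helpers above, with hypothetical names:
-	 *
-	 *	grow   = total < nconnds_max &&
-	 *		 total <= connecting + SOCKNAL_CONND_RESV &&
-	 *		 routes_pending;
-	 *	shrink = running > nconnds &&
-	 *		 running > connecting + SOCKNAL_CONND_RESV &&
-	 *		 no_thread_started_for_120s;
-	 *
-	 * SOCKNAL_CONND_RESV keeps a few threads in reserve to handle
-	 * incoming connections while the rest are busy connecting out.)
-	 */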
ksocknal_data.ksnd_connd_running--;
-	spin_unlock_bh(connd_lock);
-
-	ksocknal_thread_fini();
-	return 0;
-}
-
-static struct ksock_conn *
-ksocknal_find_timed_out_conn(struct ksock_peer *peer)
-{
-	/* We're called with a shared lock on ksnd_global_lock */
-	struct ksock_conn *conn;
-	struct list_head *ctmp;
-
-	list_for_each(ctmp, &peer->ksnp_conns) {
-		int error;
-
-		conn = list_entry(ctmp, struct ksock_conn, ksnc_list);
-
-		/* Don't need the {get,put}connsock dance to deref ksnc_sock */
-		LASSERT(!conn->ksnc_closing);
-
-		/*
-		 * SOCK_ERROR will reset the error code of the socket
-		 * on some platforms (like Darwin 8.x)
-		 */
-		error = conn->ksnc_sock->sk->sk_err;
-		if (error) {
-			ksocknal_conn_addref(conn);
-
-			switch (error) {
-			case ECONNRESET:
-				CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n",
-					libcfs_id2str(peer->ksnp_id),
-					&conn->ksnc_ipaddr,
-					conn->ksnc_port);
-				break;
-			case ETIMEDOUT:
-				CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n",
-					libcfs_id2str(peer->ksnp_id),
-					&conn->ksnc_ipaddr,
-					conn->ksnc_port);
-				break;
-			default:
-				CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d)\n",
-					error,
-					libcfs_id2str(peer->ksnp_id),
-					&conn->ksnc_ipaddr,
-					conn->ksnc_port);
-				break;
-			}
-
-			return conn;
-		}
-
-		if (conn->ksnc_rx_started &&
-		    time_after_eq(jiffies,
-				  conn->ksnc_rx_deadline)) {
-			/* Timed out incomplete incoming message */
-			ksocknal_conn_addref(conn);
-			CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %zd left %d\n",
-				libcfs_id2str(peer->ksnp_id),
-				&conn->ksnc_ipaddr,
-				conn->ksnc_port,
-				conn->ksnc_rx_state,
-				iov_iter_count(&conn->ksnc_rx_to),
-				conn->ksnc_rx_nob_left);
-			return conn;
-		}
-
-		if ((!list_empty(&conn->ksnc_tx_queue) ||
-		     conn->ksnc_sock->sk->sk_wmem_queued) &&
-		    time_after_eq(jiffies,
-				  conn->ksnc_tx_deadline)) {
-			/*
-			 * Timed out messages queued for sending or
-			 * buffered in the socket's send buffer
-			 */
-			ksocknal_conn_addref(conn);
-			CNETERR("Timeout sending data to %s (%pI4h:%d); the network or that node may be down.\n",
-				libcfs_id2str(peer->ksnp_id),
-				&conn->ksnc_ipaddr,
-				conn->ksnc_port);
-			return conn;
-		}
-	}
-
-	return NULL;
-}
-
-static inline void
-ksocknal_flush_stale_txs(struct ksock_peer *peer)
-{
-	struct ksock_tx *tx;
-	struct ksock_tx *tmp;
-	LIST_HEAD(stale_txs);
-
-	write_lock_bh(&ksocknal_data.ksnd_global_lock);
-
-	list_for_each_entry_safe(tx, tmp, &peer->ksnp_tx_queue, tx_list) {
-		if (!time_after_eq(jiffies,
-				   tx->tx_deadline))
-			break;
-
-		list_del(&tx->tx_list);
-		list_add_tail(&tx->tx_list, &stale_txs);
-	}
-
-	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
-
-	ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
-}
-
-static int
-ksocknal_send_keepalive_locked(struct ksock_peer *peer)
-	__must_hold(&ksocknal_data.ksnd_global_lock)
-{
-	struct ksock_sched *sched;
-	struct ksock_conn *conn;
-	struct ksock_tx *tx;
-
-	/* last_alive will be updated by create_conn */
-	if (list_empty(&peer->ksnp_conns))
-		return 0;
-
-	if (peer->ksnp_proto != &ksocknal_protocol_v3x)
-		return 0;
-
-	if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
-	    time_before(jiffies,
-			peer->ksnp_last_alive + *ksocknal_tunables.ksnd_keepalive * HZ))
-		return 0;
-
-	if (time_before(jiffies, peer->ksnp_send_keepalive))
-		return 0;
-
-	/*
-	 * retry 10 secs later, so we don't put pressure
-	 * on this peer if we failed to send a keepalive this time
-	 */
-	peer->ksnp_send_keepalive = jiffies + 10 * HZ;
-
-	conn = ksocknal_find_conn_locked(peer, NULL, 1);
-	if (conn) {
-		sched = conn->ksnc_scheduler;
-
-		spin_lock_bh(&sched->kss_lock);
-		if (!list_empty(&conn->ksnc_tx_queue)) {
-			spin_unlock_bh(&sched->kss_lock);
-			/* there is a queued ACK, no need for a keepalive */
-			return 0;
-		}
-
-		spin_unlock_bh(&sched->kss_lock);
-	}
-
-	read_unlock(&ksocknal_data.ksnd_global_lock);
-
-	/* cookie = 1 is reserved for keepalive PING */
-	tx = ksocknal_alloc_tx_noop(1, 1);
-	if (!tx) {
-		read_lock(&ksocknal_data.ksnd_global_lock);
-		return -ENOMEM;
-	}
-
-	if (!ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id)) {
-		read_lock(&ksocknal_data.ksnd_global_lock);
-		return 1;
-	}
-
-	ksocknal_free_tx(tx);
-	read_lock(&ksocknal_data.ksnd_global_lock);
-
-	return -EIO;
-}
-
-static void
-ksocknal_check_peer_timeouts(int idx)
-{
-	struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
-	struct ksock_peer *peer;
-	struct ksock_conn *conn;
-	struct ksock_tx *tx;
-
- again:
-	/*
-	 * NB. We expect to have a look at all the peers and not find any
-	 * connections to time out, so we just use a shared lock while we
-	 * take a look...
-	 */
-	read_lock(&ksocknal_data.ksnd_global_lock);
-
-	list_for_each_entry(peer, peers, ksnp_list) {
-		unsigned long deadline = 0;
-		struct ksock_tx *tx_stale;
-		int resid = 0;
-		int n = 0;
-
-		if (ksocknal_send_keepalive_locked(peer)) {
-			read_unlock(&ksocknal_data.ksnd_global_lock);
-			goto again;
-		}
-
-		conn = ksocknal_find_timed_out_conn(peer);
-
-		if (conn) {
-			read_unlock(&ksocknal_data.ksnd_global_lock);
-
-			ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
-
-			/*
-			 * NB we won't find this one again, but we can't
-			 * just proceed with the next peer, since we dropped
-			 * ksnd_global_lock and it might be dead already!
-			 */
-			ksocknal_conn_decref(conn);
-			goto again;
-		}
-
-		/*
-		 * we can't process stale txs right here because we're
-		 * holding only a shared lock
-		 */
-		if (!list_empty(&peer->ksnp_tx_queue)) {
-			tx = list_entry(peer->ksnp_tx_queue.next,
-					struct ksock_tx, tx_list);
-
-			if (time_after_eq(jiffies,
-					  tx->tx_deadline)) {
-				ksocknal_peer_addref(peer);
-				read_unlock(&ksocknal_data.ksnd_global_lock);
-
-				ksocknal_flush_stale_txs(peer);
-
-				ksocknal_peer_decref(peer);
-				goto again;
-			}
-		}
-
-		if (list_empty(&peer->ksnp_zc_req_list))
-			continue;
-
-		tx_stale = NULL;
-		spin_lock(&peer->ksnp_lock);
-		list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) {
-			if (!time_after_eq(jiffies,
-					   tx->tx_deadline))
-				break;
-			/* ignore the TX if connection is being closed */
-			if (tx->tx_conn->ksnc_closing)
-				continue;
-			if (!tx_stale)
-				tx_stale = tx;
-			n++;
-		}
-
-		if (!tx_stale) {
-			spin_unlock(&peer->ksnp_lock);
-			continue;
-		}
-
-		deadline = tx_stale->tx_deadline;
-		resid = tx_stale->tx_resid;
-		conn = tx_stale->tx_conn;
-		ksocknal_conn_addref(conn);
-
-		spin_unlock(&peer->ksnp_lock);
-		read_unlock(&ksocknal_data.ksnd_global_lock);
-
-		CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest (%p) timed out %ld secs ago, resid: %d, wmem: %d\n",
-		       n, libcfs_nid2str(peer->ksnp_id.nid), tx_stale,
-		       (jiffies - deadline) / HZ,
-		       resid, conn->ksnc_sock->sk->sk_wmem_queued);
-
-		ksocknal_close_conn_and_siblings(conn, -ETIMEDOUT);
-		ksocknal_conn_decref(conn);
-		goto again;
-	}
-
-	read_unlock(&ksocknal_data.ksnd_global_lock);
-}
-
-int
-ksocknal_reaper(void *arg)
-{
-	wait_queue_entry_t wait;
-	struct ksock_conn *conn;
-	struct ksock_sched *sched;
-	struct list_head enomem_conns;
-	int nenomem_conns;
-	long timeout;
-	int i;
-	int peer_index = 0;
-	unsigned long deadline = jiffies;
-
-	INIT_LIST_HEAD(&enomem_conns);
-	init_waitqueue_entry(&wait,
current); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - - while (!ksocknal_data.ksnd_shuttingdown) { - if (!list_empty(&ksocknal_data.ksnd_deathrow_conns)) { - conn = list_entry(ksocknal_data.ksnd_deathrow_conns.next, - struct ksock_conn, ksnc_list); - list_del(&conn->ksnc_list); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - ksocknal_terminate_conn(conn); - ksocknal_conn_decref(conn); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - continue; - } - - if (!list_empty(&ksocknal_data.ksnd_zombie_conns)) { - conn = list_entry(ksocknal_data.ksnd_zombie_conns.next, - struct ksock_conn, ksnc_list); - list_del(&conn->ksnc_list); - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - ksocknal_destroy_conn(conn); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - continue; - } - - if (!list_empty(&ksocknal_data.ksnd_enomem_conns)) { - list_add(&enomem_conns, - &ksocknal_data.ksnd_enomem_conns); - list_del_init(&ksocknal_data.ksnd_enomem_conns); - } - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - /* reschedule all the connections that stalled with ENOMEM... */ - nenomem_conns = 0; - while (!list_empty(&enomem_conns)) { - conn = list_entry(enomem_conns.next, struct ksock_conn, - ksnc_tx_list); - list_del(&conn->ksnc_tx_list); - - sched = conn->ksnc_scheduler; - - spin_lock_bh(&sched->kss_lock); - - LASSERT(conn->ksnc_tx_scheduled); - conn->ksnc_tx_ready = 1; - list_add_tail(&conn->ksnc_tx_list, - &sched->kss_tx_conns); - wake_up(&sched->kss_waitq); - - spin_unlock_bh(&sched->kss_lock); - nenomem_conns++; - } - - /* careful with the jiffy wrap... */ - while ((timeout = deadline - jiffies) <= 0) { - const int n = 4; - const int p = 1; - int chunk = ksocknal_data.ksnd_peer_hash_size; - - /* - * Time to check for timeouts on a few more peers: I do - * checks every 'p' seconds on a proportion of the peer - * table and I need to check every connection 'n' times - * within a timeout interval, to ensure I detect a - * timeout on any connection within (n+1)/n times the - * timeout interval. - */ - if (*ksocknal_tunables.ksnd_timeout > n * p) - chunk = (chunk * n * p) / - *ksocknal_tunables.ksnd_timeout; - if (!chunk) - chunk = 1; - - for (i = 0; i < chunk; i++) { - ksocknal_check_peer_timeouts(peer_index); - peer_index = (peer_index + 1) % - ksocknal_data.ksnd_peer_hash_size; - } - - deadline = deadline + p * HZ; - } - - if (nenomem_conns) { - /* - * Reduce my timeout if I rescheduled ENOMEM conns. - * This also prevents me getting woken immediately - * if any go back on my enomem list. 
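- *
- * (Editorial note: to put numbers on the proportional scan above,
- * with the constants n = 4 and p = 1 and the default 50 second
- * sock_timeout, a peer table of, say, 101 hash buckets gives
- *
- *	chunk = 101 * 4 * 1 / 50 = 8
- *
- * buckets per one-second pass, so the whole table is covered about
- * every 13 seconds, i.e. roughly 4 times per timeout interval as
- * intended. The bucket count here is illustrative; only the
- * formula comes from the code above.)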
- */ - timeout = SOCKNAL_ENOMEM_RETRY; - } - ksocknal_data.ksnd_reaper_waketime = jiffies + timeout; - - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); - - if (!ksocknal_data.ksnd_shuttingdown && - list_empty(&ksocknal_data.ksnd_deathrow_conns) && - list_empty(&ksocknal_data.ksnd_zombie_conns)) - schedule_timeout(timeout); - - set_current_state(TASK_RUNNING); - remove_wait_queue(&ksocknal_data.ksnd_reaper_waitq, &wait); - - spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); - } - - spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); - - ksocknal_thread_fini(); - return 0; -} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c deleted file mode 100644 index 93a02cd6b6b5350fb3546e3c43443d675abd0e70..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c +++ /dev/null @@ -1,534 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include -#include "socklnd.h" - -int -ksocknal_lib_get_conn_addrs(struct ksock_conn *conn) -{ - int rc = lnet_sock_getaddr(conn->ksnc_sock, 1, &conn->ksnc_ipaddr, - &conn->ksnc_port); - - /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/
-	LASSERT(!conn->ksnc_closing);
-
-	if (rc) {
-		CERROR("Error %d getting sock peer IP\n", rc);
-		return rc;
-	}
-
-	rc = lnet_sock_getaddr(conn->ksnc_sock, 0, &conn->ksnc_myipaddr, NULL);
-	if (rc) {
-		CERROR("Error %d getting sock local IP\n", rc);
-		return rc;
-	}
-
-	return 0;
-}
-
-int
-ksocknal_lib_zc_capable(struct ksock_conn *conn)
-{
-	int caps = conn->ksnc_sock->sk->sk_route_caps;
-
-	if (conn->ksnc_proto == &ksocknal_protocol_v1x)
-		return 0;
-
-	/*
-	 * ZC if the socket supports scatter/gather and doesn't need software
-	 * checksums
-	 */
-	return ((caps & NETIF_F_SG) && (caps & NETIF_F_CSUM_MASK));
-}
-
-int
-ksocknal_lib_send_iov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
-	struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
-	struct socket *sock = conn->ksnc_sock;
-	int nob, i;
-
-	if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */
-	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
-	    tx->tx_nob == tx->tx_resid && /* first sending */
-	    !tx->tx_msg.ksm_csum) /* not checksummed */
-		ksocknal_lib_csum_tx(tx);
-
-	for (nob = i = 0; i < tx->tx_niov; i++)
-		nob += tx->tx_iov[i].iov_len;
-
-	if (!list_empty(&conn->ksnc_tx_queue) ||
-	    nob < tx->tx_resid)
-		msg.msg_flags |= MSG_MORE;
-
-	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC,
-		      tx->tx_iov, tx->tx_niov, nob);
-	return sock_sendmsg(sock, &msg);
-}
-
-int
-ksocknal_lib_send_kiov(struct ksock_conn *conn, struct ksock_tx *tx)
-{
-	struct socket *sock = conn->ksnc_sock;
-	struct bio_vec *kiov = tx->tx_kiov;
-	int rc;
-	int nob;
-
-	/* Not NOOP message */
-	LASSERT(tx->tx_lnetmsg);
-
-	if (tx->tx_msg.ksm_zc_cookies[0]) {
-		/* Zero copy is enabled */
-		struct sock *sk = sock->sk;
-		struct page *page = kiov->bv_page;
-		int offset = kiov->bv_offset;
-		int fragsize = kiov->bv_len;
-		int msgflg = MSG_DONTWAIT;
-
-		CDEBUG(D_NET, "page %p + offset %x for %d\n",
-		       page, offset, kiov->bv_len);
-
-		if (!list_empty(&conn->ksnc_tx_queue) ||
-		    fragsize < tx->tx_resid)
-			msgflg |= MSG_MORE;
-
-		if (sk->sk_prot->sendpage) {
-			rc = sk->sk_prot->sendpage(sk, page,
-						   offset, fragsize, msgflg);
-		} else {
-			rc = tcp_sendpage(sk, page, offset, fragsize, msgflg);
-		}
-	} else {
-		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
-		int i;
-
-		for (nob = i = 0; i < tx->tx_nkiov; i++)
-			nob += kiov[i].bv_len;
-
-		if (!list_empty(&conn->ksnc_tx_queue) ||
-		    nob < tx->tx_resid)
-			msg.msg_flags |= MSG_MORE;
-
-		iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC,
-			      kiov, tx->tx_nkiov, nob);
-		rc = sock_sendmsg(sock, &msg);
-	}
-	return rc;
-}
-
-void
-ksocknal_lib_eager_ack(struct ksock_conn *conn)
-{
-	int opt = 1;
-	struct socket *sock = conn->ksnc_sock;
-
-	/*
-	 * Remind the socket to ACK eagerly. If I don't, the socket might
-	 * think I'm about to send something it could piggy-back the ACK
-	 * on, introducing delay in completing zero-copy sends in my
-	 * peer.
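- *
- * (Editorial note: per tcp(7), TCP_QUICKACK is deliberately not
- * permanent; the stack may fall back to delayed ACKs on its own,
- * which is why the option is re-armed here before every receive
- * rather than set once at connection setup. The userspace
- * equivalent is a plain setsockopt:
- *
- *	int one = 1;
- *	setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one));
- *
- * with fd being the connected TCP socket.)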
- */ - kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char *)&opt, - sizeof(opt)); -} - -static int lustre_csum(struct kvec *v, void *context) -{ - struct ksock_conn *conn = context; - conn->ksnc_rx_csum = crc32_le(conn->ksnc_rx_csum, - v->iov_base, v->iov_len); - return 0; -} - -int -ksocknal_lib_recv(struct ksock_conn *conn) -{ - struct msghdr msg = { .msg_iter = conn->ksnc_rx_to }; - __u32 saved_csum; - int rc; - - rc = sock_recvmsg(conn->ksnc_sock, &msg, MSG_DONTWAIT); - if (rc <= 0) - return rc; - - saved_csum = conn->ksnc_msg.ksm_csum; - if (!saved_csum) - return rc; - - /* header is included only in V2 - V3 checksums only the bulk data */ - if (!(conn->ksnc_rx_to.type & ITER_BVEC) && - conn->ksnc_proto != &ksocknal_protocol_v2x) - return rc; - - /* accumulate checksum */ - conn->ksnc_msg.ksm_csum = 0; - iov_iter_for_each_range(&conn->ksnc_rx_to, rc, lustre_csum, conn); - conn->ksnc_msg.ksm_csum = saved_csum; - - return rc; -} - -void -ksocknal_lib_csum_tx(struct ksock_tx *tx) -{ - int i; - __u32 csum; - void *base; - - LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg); - LASSERT(tx->tx_conn); - LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); - - tx->tx_msg.ksm_csum = 0; - - csum = crc32_le(~0, tx->tx_iov[0].iov_base, - tx->tx_iov[0].iov_len); - - if (tx->tx_kiov) { - for (i = 0; i < tx->tx_nkiov; i++) { - base = kmap(tx->tx_kiov[i].bv_page) + - tx->tx_kiov[i].bv_offset; - - csum = crc32_le(csum, base, tx->tx_kiov[i].bv_len); - - kunmap(tx->tx_kiov[i].bv_page); - } - } else { - for (i = 1; i < tx->tx_niov; i++) - csum = crc32_le(csum, tx->tx_iov[i].iov_base, - tx->tx_iov[i].iov_len); - } - - if (*ksocknal_tunables.ksnd_inject_csum_error) { - csum++; - *ksocknal_tunables.ksnd_inject_csum_error = 0; - } - - tx->tx_msg.ksm_csum = csum; -} - -int -ksocknal_lib_get_conn_tunables(struct ksock_conn *conn, int *txmem, - int *rxmem, int *nagle) -{ - struct socket *sock = conn->ksnc_sock; - int len; - int rc; - - rc = ksocknal_connsock_addref(conn); - if (rc) { - LASSERT(conn->ksnc_closing); - *txmem = *rxmem = *nagle = 0; - return -ESHUTDOWN; - } - - rc = lnet_sock_getbuf(sock, txmem, rxmem); - if (!rc) { - len = sizeof(*nagle); - rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)nagle, &len); - } - - ksocknal_connsock_decref(conn); - - if (!rc) - *nagle = !*nagle; - else - *txmem = *rxmem = *nagle = 0; - - return rc; -} - -int -ksocknal_lib_setup_sock(struct socket *sock) -{ - int rc; - int option; - int keep_idle; - int keep_intvl; - int keep_count; - int do_keepalive; - struct linger linger; - - sock->sk->sk_allocation = GFP_NOFS; - - /* - * Ensure this socket aborts active sends immediately when we close - * it. 
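- *
- * (Editorial aside: strictly speaking, l_onoff = 0 as set below
- * selects the default close() behaviour, in which the stack keeps
- * trying to deliver queued data in the background; the hard abort
- * (RST on close) would be l_onoff = 1 with l_linger = 0. The
- * setsockopt below therefore clears any lingering-close setting
- * rather than enabling one.)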
- */ - linger.l_onoff = 0; - linger.l_linger = 0; - - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, (char *)&linger, - sizeof(linger)); - if (rc) { - CERROR("Can't set SO_LINGER: %d\n", rc); - return rc; - } - - option = -1; - rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2, (char *)&option, - sizeof(option)); - if (rc) { - CERROR("Can't set SO_LINGER2: %d\n", rc); - return rc; - } - - if (!*ksocknal_tunables.ksnd_nagle) { - option = 1; - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&option, sizeof(option)); - if (rc) { - CERROR("Can't disable nagle: %d\n", rc); - return rc; - } - } - - rc = lnet_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size, - *ksocknal_tunables.ksnd_rx_buffer_size); - if (rc) { - CERROR("Can't set buffer tx %d, rx %d buffers: %d\n", - *ksocknal_tunables.ksnd_tx_buffer_size, - *ksocknal_tunables.ksnd_rx_buffer_size, rc); - return rc; - } - -/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ - - /* snapshot tunables */ - keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; - keep_count = *ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; - - do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); - - option = (do_keepalive ? 1 : 0); - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, (char *)&option, - sizeof(option)); - if (rc) { - CERROR("Can't set SO_KEEPALIVE: %d\n", rc); - return rc; - } - - if (!do_keepalive) - return 0; - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keep_idle, - sizeof(keep_idle)); - if (rc) { - CERROR("Can't set TCP_KEEPIDLE: %d\n", rc); - return rc; - } - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keep_intvl, sizeof(keep_intvl)); - if (rc) { - CERROR("Can't set TCP_KEEPINTVL: %d\n", rc); - return rc; - } - - rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, (char *)&keep_count, - sizeof(keep_count)); - if (rc) { - CERROR("Can't set TCP_KEEPCNT: %d\n", rc); - return rc; - } - - return 0; -} - -void -ksocknal_lib_push_conn(struct ksock_conn *conn) -{ - struct sock *sk; - struct tcp_sock *tp; - int nonagle; - int val = 1; - int rc; - - rc = ksocknal_connsock_addref(conn); - if (rc) /* being shut down */ - return; - - sk = conn->ksnc_sock->sk; - tp = tcp_sk(sk); - - lock_sock(sk); - nonagle = tp->nonagle; - tp->nonagle = 1; - release_sock(sk); - - rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY, - (char *)&val, sizeof(val)); - LASSERT(!rc); - - lock_sock(sk); - tp->nonagle = nonagle; - release_sock(sk); - - ksocknal_connsock_decref(conn); -} - -/* - * socket call back in Linux - */ -static void -ksocknal_data_ready(struct sock *sk) -{ - struct ksock_conn *conn; - - /* interleave correctly with closing sockets... */ - LASSERT(!in_irq()); - read_lock(&ksocknal_data.ksnd_global_lock); - - conn = sk->sk_user_data; - if (!conn) { /* raced with ksocknal_terminate_conn */ - LASSERT(sk->sk_data_ready != &ksocknal_data_ready); - sk->sk_data_ready(sk); - } else { - ksocknal_read_callback(conn); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -static void -ksocknal_write_space(struct sock *sk) -{ - struct ksock_conn *conn; - int wspace; - int min_wpace; - - /* interleave correctly with closing sockets... */ - LASSERT(!in_irq()); - read_lock(&ksocknal_data.ksnd_global_lock); - - conn = sk->sk_user_data; - wspace = sk_stream_wspace(sk); - min_wpace = sk_stream_min_wspace(sk); - - CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", - sk, wspace, min_wpace, conn, - !conn ? 
"" : (conn->ksnc_tx_ready ? - " ready" : " blocked"), - !conn ? "" : (conn->ksnc_tx_scheduled ? - " scheduled" : " idle"), - !conn ? "" : (list_empty(&conn->ksnc_tx_queue) ? - " empty" : " queued")); - - if (!conn) { /* raced with ksocknal_terminate_conn */ - LASSERT(sk->sk_write_space != &ksocknal_write_space); - sk->sk_write_space(sk); - - read_unlock(&ksocknal_data.ksnd_global_lock); - return; - } - - if (wspace >= min_wpace) { /* got enough space */ - ksocknal_write_callback(conn); - - /* - * Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the - * ENOMEM check in ksocknal_transmit is race-free (think about - * it). - */ - clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - } - - read_unlock(&ksocknal_data.ksnd_global_lock); -} - -void -ksocknal_lib_save_callback(struct socket *sock, struct ksock_conn *conn) -{ - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; -} - -void -ksocknal_lib_set_callback(struct socket *sock, struct ksock_conn *conn) -{ - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = ksocknal_data_ready; - sock->sk->sk_write_space = ksocknal_write_space; -} - -void -ksocknal_lib_reset_callback(struct socket *sock, struct ksock_conn *conn) -{ - /* - * Remove conn's network callbacks. - * NB I _have_ to restore the callback, rather than storing a noop, - * since the socket could survive past this module being unloaded!! - */ - sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - sock->sk->sk_write_space = conn->ksnc_saved_write_space; - - /* - * A callback could be in progress already; they hold a read lock - * on ksnd_global_lock (to serialise with me) and NOOP if - * sk_user_data is NULL. - */ - sock->sk->sk_user_data = NULL; -} - -int -ksocknal_lib_memory_pressure(struct ksock_conn *conn) -{ - int rc = 0; - struct ksock_sched *sched; - - sched = conn->ksnc_scheduler; - spin_lock_bh(&sched->kss_lock); - - if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) && - !conn->ksnc_tx_ready) { - /* - * SOCK_NOSPACE is set when the socket fills - * and cleared in the write_space callback - * (which also sets ksnc_tx_ready). If - * SOCK_NOSPACE and ksnc_tx_ready are BOTH - * zero, I didn't fill the socket and - * write_space won't reschedule me, so I - * return -ENOMEM to get my caller to retry - * after a timeout - */ - rc = -ENOMEM; - } - - spin_unlock_bh(&sched->kss_lock); - - return rc; -} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c deleted file mode 100644 index 5663a4ca94d4fe3bc1ee57064f6bc6ed49515314..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * Author: Eric Barton - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#include "socklnd.h" - -static int sock_timeout = 50; -module_param(sock_timeout, int, 0644); -MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); - -static int credits = 256; -module_param(credits, int, 0444); -MODULE_PARM_DESC(credits, "# concurrent sends"); - -static int peer_credits = 8; -module_param(peer_credits, int, 0444); -MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); - -static int peer_buffer_credits; -module_param(peer_buffer_credits, int, 0444); -MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); - -static int peer_timeout = 180; -module_param(peer_timeout, int, 0444); -MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); - -/* - * Number of daemons in each thread pool which is percpt, - * we will estimate reasonable value based on CPUs if it's not set. - */ -static unsigned int nscheds; -module_param(nscheds, int, 0444); -MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); - -static int nconnds = 4; -module_param(nconnds, int, 0444); -MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); - -static int nconnds_max = 64; -module_param(nconnds_max, int, 0444); -MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); - -static int min_reconnectms = 1000; -module_param(min_reconnectms, int, 0644); -MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); - -static int max_reconnectms = 60000; -module_param(max_reconnectms, int, 0644); -MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); - -# define DEFAULT_EAGER_ACK 0 -static int eager_ack = DEFAULT_EAGER_ACK; -module_param(eager_ack, int, 0644); -MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); - -static int typed_conns = 1; -module_param(typed_conns, int, 0444); -MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); - -static int min_bulk = 1 << 10; -module_param(min_bulk, int, 0644); -MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); - -# define DEFAULT_BUFFER_SIZE 0 -static int tx_buffer_size = DEFAULT_BUFFER_SIZE; -module_param(tx_buffer_size, int, 0644); -MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); - -static int rx_buffer_size = DEFAULT_BUFFER_SIZE; -module_param(rx_buffer_size, int, 0644); -MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); - -static int nagle; -module_param(nagle, int, 0644); -MODULE_PARM_DESC(nagle, "enable NAGLE?"); - -static int round_robin = 1; -module_param(round_robin, int, 0644); -MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); - -static int keepalive = 30; -module_param(keepalive, int, 0644); -MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); - -static int keepalive_idle = 30; -module_param(keepalive_idle, int, 0644); -MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); - -#define DEFAULT_KEEPALIVE_COUNT 5 -static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; -module_param(keepalive_count, int, 0644); -MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); - -static int keepalive_intvl = 5; -module_param(keepalive_intvl, int, 0644); -MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); - -static int enable_csum; -module_param(enable_csum, int, 0644); -MODULE_PARM_DESC(enable_csum, "enable check sum"); - -static int inject_csum_error; -module_param(inject_csum_error, int, 0644); -MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a 
checksum error"); - -static int nonblk_zcack = 1; -module_param(nonblk_zcack, int, 0644); -MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); - -static unsigned int zc_min_payload = 16 << 10; -module_param(zc_min_payload, int, 0644); -MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); - -static unsigned int zc_recv; -module_param(zc_recv, int, 0644); -MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); - -static unsigned int zc_recv_min_nfrags = 16; -module_param(zc_recv_min_nfrags, int, 0644); -MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); - -#if SOCKNAL_VERSION_DEBUG -static int protocol = 3; -module_param(protocol, int, 0644); -MODULE_PARM_DESC(protocol, "protocol version"); -#endif - -struct ksock_tunables ksocknal_tunables; - -int ksocknal_tunables_init(void) -{ - /* initialize ksocknal_tunables structure */ - ksocknal_tunables.ksnd_timeout = &sock_timeout; - ksocknal_tunables.ksnd_nscheds = &nscheds; - ksocknal_tunables.ksnd_nconnds = &nconnds; - ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; - ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; - ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; - ksocknal_tunables.ksnd_eager_ack = &eager_ack; - ksocknal_tunables.ksnd_typed_conns = &typed_conns; - ksocknal_tunables.ksnd_min_bulk = &min_bulk; - ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; - ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; - ksocknal_tunables.ksnd_nagle = &nagle; - ksocknal_tunables.ksnd_round_robin = &round_robin; - ksocknal_tunables.ksnd_keepalive = &keepalive; - ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; - ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; - ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; - ksocknal_tunables.ksnd_credits = &credits; - ksocknal_tunables.ksnd_peertxcredits = &peer_credits; - ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; - ksocknal_tunables.ksnd_peertimeout = &peer_timeout; - ksocknal_tunables.ksnd_enable_csum = &enable_csum; - ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; - ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; - ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; - ksocknal_tunables.ksnd_zc_recv = &zc_recv; - ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; - -#if SOCKNAL_VERSION_DEBUG - ksocknal_tunables.ksnd_protocol = &protocol; -#endif - - if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) - *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10; - - return 0; -}; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c deleted file mode 100644 index 05982dac781c0cf7146c8f4d640c219d806bd3e6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c +++ /dev/null @@ -1,810 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2012, Intel Corporation. - * - * Author: Zach Brown - * Author: Peter J. Braam - * Author: Phil Schwan - * Author: Eric Barton - * - * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. 
- * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#include "socklnd.h" - -/* - * Protocol entries : - * pro_send_hello : send hello message - * pro_recv_hello : receive hello message - * pro_pack : pack message header - * pro_unpack : unpack message header - * pro_queue_tx_zcack() : Called holding BH lock: kss_lock - * return 1 if ACK is piggybacked, otherwise return 0 - * pro_queue_tx_msg() : Called holding BH lock: kss_lock - * return the ACK that piggybacked by my message, or NULL - * pro_handle_zcreq() : handler of incoming ZC-REQ - * pro_handle_zcack() : handler of incoming ZC-ACK - * pro_match_tx() : Called holding glock - */ - -static struct ksock_tx * -ksocknal_queue_tx_msg_v1(struct ksock_conn *conn, struct ksock_tx *tx_msg) -{ - /* V1.x, just enqueue it */ - list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); - return NULL; -} - -void -ksocknal_next_tx_carrier(struct ksock_conn *conn) -{ - struct ksock_tx *tx = conn->ksnc_tx_carrier; - - /* Called holding BH lock: conn->ksnc_scheduler->kss_lock */ - LASSERT(!list_empty(&conn->ksnc_tx_queue)); - LASSERT(tx); - - /* Next TX that can carry ZC-ACK or LNet message */ - if (tx->tx_list.next == &conn->ksnc_tx_queue) { - /* no more packets queued */ - conn->ksnc_tx_carrier = NULL; - } else { - conn->ksnc_tx_carrier = list_next_entry(tx, tx_list); - LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type); - } -} - -static int -ksocknal_queue_tx_zcack_v2(struct ksock_conn *conn, - struct ksock_tx *tx_ack, __u64 cookie) -{ - struct ksock_tx *tx = conn->ksnc_tx_carrier; - - LASSERT(!tx_ack || - tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - /* - * Enqueue or piggyback tx_ack / cookie - * . no tx can piggyback cookie of tx_ack (or cookie), just - * enqueue the tx_ack (if tx_ack != NUL) and return NULL. - * . There is tx can piggyback cookie of tx_ack (or cookie), - * piggyback the cookie and return the tx. - */ - if (!tx) { - if (tx_ack) { - list_add_tail(&tx_ack->tx_list, - &conn->ksnc_tx_queue); - conn->ksnc_tx_carrier = tx_ack; - } - return 0; - } - - if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) { - /* tx is noop zc-ack, can't piggyback zc-ack cookie */ - if (tx_ack) - list_add_tail(&tx_ack->tx_list, - &conn->ksnc_tx_queue); - return 0; - } - - LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET); - LASSERT(!tx->tx_msg.ksm_zc_cookies[1]); - - if (tx_ack) - cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; - - /* piggyback the zc-ack cookie */ - tx->tx_msg.ksm_zc_cookies[1] = cookie; - /* move on to the next TX which can carry cookie */ - ksocknal_next_tx_carrier(conn); - - return 1; -} - -static struct ksock_tx * -ksocknal_queue_tx_msg_v2(struct ksock_conn *conn, struct ksock_tx *tx_msg) -{ - struct ksock_tx *tx = conn->ksnc_tx_carrier; - - /* - * Enqueue tx_msg: - * . If there is no NOOP on the connection, just enqueue - * tx_msg and return NULL - * . If there is NOOP on the connection, piggyback the cookie - * and replace the NOOP tx, and return the NOOP tx. 
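
The queueing helpers above implement V2 piggybacking: a connection tracks a "carrier" tx, and an incoming ZC-ACK cookie rides in the spare ksm_zc_cookies[1] slot of a queued LNet message instead of costing its own NOOP packet on the wire. A compilable userspace sketch of just that decision (types and names hypothetical; the real code also handles carrier advancement and list surgery):

/* Userspace sketch of the enqueue-or-piggyback decision in
 * ksocknal_queue_tx_zcack_v2(). Hypothetical, simplified types. */
#include <stdint.h>
#include <stdio.h>

enum msg_type { MSG_NOOP, MSG_LNET };

struct tx {
        enum msg_type type;
        uint64_t zc_cookies[2];
};

/* Returns 1 if the cookie was piggybacked on @carrier, 0 if the
 * caller must enqueue a standalone NOOP carrying it. */
static int queue_zcack(struct tx *carrier, uint64_t cookie)
{
        if (!carrier || carrier->type == MSG_NOOP)
                return 0;       /* nothing suitable to carry it */

        /* LNet message with a free second cookie slot: piggyback */
        carrier->zc_cookies[1] = cookie;
        return 1;
}

int main(void)
{
        struct tx lnet_msg = { .type = MSG_LNET };

        printf("piggybacked: %d (cookie %llu)\n",
               queue_zcack(&lnet_msg, 42),
               (unsigned long long)lnet_msg.zc_cookies[1]);
        return 0;
}

The payoff is that on a busy connection pure ZC-ACKs almost never hit the wire by themselves; they hitch a ride on traffic that was going out anyway.
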
- */ - if (!tx) { /* nothing on queue */ - list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); - conn->ksnc_tx_carrier = tx_msg; - return NULL; - } - - if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ - list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); - return NULL; - } - - LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - /* There is a noop zc-ack can be piggybacked */ - tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; - ksocknal_next_tx_carrier(conn); - - /* use new_tx to replace the noop zc-ack packet */ - list_add(&tx_msg->tx_list, &tx->tx_list); - list_del(&tx->tx_list); - - return tx; -} - -static int -ksocknal_queue_tx_zcack_v3(struct ksock_conn *conn, - struct ksock_tx *tx_ack, __u64 cookie) -{ - struct ksock_tx *tx; - - if (conn->ksnc_type != SOCKLND_CONN_ACK) - return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); - - /* non-blocking ZC-ACK (to router) */ - LASSERT(!tx_ack || - tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - tx = conn->ksnc_tx_carrier; - if (!tx) { - if (tx_ack) { - list_add_tail(&tx_ack->tx_list, - &conn->ksnc_tx_queue); - conn->ksnc_tx_carrier = tx_ack; - } - return 0; - } - - /* conn->ksnc_tx_carrier */ - - if (tx_ack) - cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; - - if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ - return 1; - - if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { - /* replace the keepalive PING with a real ACK */ - LASSERT(!tx->tx_msg.ksm_zc_cookies[0]); - tx->tx_msg.ksm_zc_cookies[1] = cookie; - return 1; - } - - if (cookie == tx->tx_msg.ksm_zc_cookies[0] || - cookie == tx->tx_msg.ksm_zc_cookies[1]) { - CWARN("%s: duplicated ZC cookie: %llu\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); - return 1; /* XXX return error in the future */ - } - - if (!tx->tx_msg.ksm_zc_cookies[0]) { - /* - * NOOP tx has only one ZC-ACK cookie, - * can carry at least one more - */ - if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { - tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; - tx->tx_msg.ksm_zc_cookies[1] = cookie; - } else { - tx->tx_msg.ksm_zc_cookies[0] = cookie; - } - - if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { - /* - * not likely to carry more ACKs, skip it - * to simplify logic - */ - ksocknal_next_tx_carrier(conn); - } - - return 1; - } - - /* takes two or more cookies already */ - - if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { - __u64 tmp = 0; - - /* two separated cookies: (a+2, a) or (a+1, a) */ - LASSERT(tx->tx_msg.ksm_zc_cookies[0] - - tx->tx_msg.ksm_zc_cookies[1] <= 2); - - if (tx->tx_msg.ksm_zc_cookies[0] - - tx->tx_msg.ksm_zc_cookies[1] == 2) { - if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) - tmp = cookie; - } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { - tmp = tx->tx_msg.ksm_zc_cookies[1]; - } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { - tmp = tx->tx_msg.ksm_zc_cookies[0]; - } - - if (tmp) { - /* range of cookies */ - tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; - tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; - return 1; - } - - } else { - /* - * ksm_zc_cookies[0] < ksm_zc_cookies[1], - * it is range of cookies - */ - if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && - cookie <= tx->tx_msg.ksm_zc_cookies[1]) { - CWARN("%s: duplicated ZC cookie: %llu\n", - libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); - return 1; /* XXX: return error in the future */ - } - - if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { - tx->tx_msg.ksm_zc_cookies[1] = cookie; - return 1; - } - - if (cookie == 
tx->tx_msg.ksm_zc_cookies[0] - 1) { - tx->tx_msg.ksm_zc_cookies[0] = cookie; - return 1; - } - } - - /* failed to piggyback ZC-ACK */ - if (tx_ack) { - list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); - /* the next tx can piggyback at least 1 ACK */ - ksocknal_next_tx_carrier(conn); - } - - return 0; -} - -static int -ksocknal_match_tx(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) -{ - int nob; - -#if SOCKNAL_VERSION_DEBUG - if (!*ksocknal_tunables.ksnd_typed_conns) - return SOCKNAL_MATCH_YES; -#endif - - if (!tx || !tx->tx_lnetmsg) { - /* noop packet */ - nob = offsetof(struct ksock_msg, ksm_u); - } else { - nob = tx->tx_lnetmsg->msg_len + - ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? - sizeof(struct lnet_hdr) : sizeof(struct ksock_msg)); - } - - /* default checking for typed connection */ - switch (conn->ksnc_type) { - default: - CERROR("ksnc_type bad: %u\n", conn->ksnc_type); - LBUG(); - case SOCKLND_CONN_ANY: - return SOCKNAL_MATCH_YES; - - case SOCKLND_CONN_BULK_IN: - return SOCKNAL_MATCH_MAY; - - case SOCKLND_CONN_BULK_OUT: - if (nob < *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - - case SOCKLND_CONN_CONTROL: - if (nob >= *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - } -} - -static int -ksocknal_match_tx_v3(struct ksock_conn *conn, struct ksock_tx *tx, int nonblk) -{ - int nob; - - if (!tx || !tx->tx_lnetmsg) - nob = offsetof(struct ksock_msg, ksm_u); - else - nob = tx->tx_lnetmsg->msg_len + sizeof(struct ksock_msg); - - switch (conn->ksnc_type) { - default: - CERROR("ksnc_type bad: %u\n", conn->ksnc_type); - LBUG(); - case SOCKLND_CONN_ANY: - return SOCKNAL_MATCH_NO; - - case SOCKLND_CONN_ACK: - if (nonblk) - return SOCKNAL_MATCH_YES; - else if (!tx || !tx->tx_lnetmsg) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_NO; - - case SOCKLND_CONN_BULK_OUT: - if (nonblk) - return SOCKNAL_MATCH_NO; - else if (nob < *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - - case SOCKLND_CONN_CONTROL: - if (nonblk) - return SOCKNAL_MATCH_NO; - else if (nob >= *ksocknal_tunables.ksnd_min_bulk) - return SOCKNAL_MATCH_MAY; - else - return SOCKNAL_MATCH_YES; - } -} - -/* (Sink) handle incoming ZC request from sender */ -static int -ksocknal_handle_zcreq(struct ksock_conn *c, __u64 cookie, int remote) -{ - struct ksock_peer *peer = c->ksnc_peer; - struct ksock_conn *conn; - struct ksock_tx *tx; - int rc; - - read_lock(&ksocknal_data.ksnd_global_lock); - - conn = ksocknal_find_conn_locked(peer, NULL, !!remote); - if (conn) { - struct ksock_sched *sched = conn->ksnc_scheduler; - - LASSERT(conn->ksnc_proto->pro_queue_tx_zcack); - - spin_lock_bh(&sched->kss_lock); - - rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); - - spin_unlock_bh(&sched->kss_lock); - - if (rc) { /* piggybacked */ - read_unlock(&ksocknal_data.ksnd_global_lock); - return 0; - } - } - - read_unlock(&ksocknal_data.ksnd_global_lock); - - /* ACK connection is not ready, or can't piggyback the ACK */ - tx = ksocknal_alloc_tx_noop(cookie, !!remote); - if (!tx) - return -ENOMEM; - - rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id); - if (!rc) - return 0; - - ksocknal_free_tx(tx); - return rc; -} - -/* (Sender) handle ZC_ACK from sink */ -static int -ksocknal_handle_zcack(struct ksock_conn *conn, __u64 cookie1, __u64 cookie2) -{ - struct ksock_peer *peer = conn->ksnc_peer; - struct ksock_tx *tx; - struct ksock_tx *temp; - struct 
ksock_tx *tmp; - LIST_HEAD(zlist); - int count; - - if (!cookie1) - cookie1 = cookie2; - - count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); - - if (cookie2 == SOCKNAL_KEEPALIVE_PING && - conn->ksnc_proto == &ksocknal_protocol_v3x) { - /* keepalive PING for V3.x, just ignore it */ - return count == 1 ? 0 : -EPROTO; - } - - spin_lock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, - tx_zc_list) { - __u64 c = tx->tx_msg.ksm_zc_cookies[0]; - - if (c == cookie1 || c == cookie2 || - (cookie1 < c && c < cookie2)) { - tx->tx_msg.ksm_zc_cookies[0] = 0; - list_del(&tx->tx_zc_list); - list_add(&tx->tx_zc_list, &zlist); - - if (!--count) - break; - } - } - - spin_unlock(&peer->ksnp_lock); - - list_for_each_entry_safe(tx, temp, &zlist, tx_zc_list) { - list_del(&tx->tx_zc_list); - ksocknal_tx_decref(tx); - } - - return !count ? 0 : -EPROTO; -} - -static int -ksocknal_send_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello) -{ - struct socket *sock = conn->ksnc_sock; - struct lnet_hdr *hdr; - struct lnet_magicversion *hmv; - int rc; - int i; - - BUILD_BUG_ON(sizeof(struct lnet_magicversion) != offsetof(struct lnet_hdr, src_nid)); - - hdr = kzalloc(sizeof(*hdr), GFP_NOFS); - if (!hdr) { - CERROR("Can't allocate struct lnet_hdr\n"); - return -ENOMEM; - } - - hmv = (struct lnet_magicversion *)&hdr->dest_nid; - - /* - * Re-organize V2.x message header to V1.x (struct lnet_hdr) - * header and send out - */ - hmv->magic = cpu_to_le32(LNET_PROTO_TCP_MAGIC); - hmv->version_major = cpu_to_le16(KSOCK_PROTO_V1_MAJOR); - hmv->version_minor = cpu_to_le16(KSOCK_PROTO_V1_MINOR); - - if (the_lnet.ln_testprotocompat) { - /* single-shot proto check */ - LNET_LOCK(); - if (the_lnet.ln_testprotocompat & 1) { - hmv->version_major++; /* just different! */ - the_lnet.ln_testprotocompat &= ~1; - } - if (the_lnet.ln_testprotocompat & 2) { - hmv->magic = LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~2; - } - LNET_UNLOCK(); - } - - hdr->src_nid = cpu_to_le64(hello->kshm_src_nid); - hdr->src_pid = cpu_to_le32(hello->kshm_src_pid); - hdr->type = cpu_to_le32(LNET_MSG_HELLO); - hdr->payload_length = cpu_to_le32(hello->kshm_nips * sizeof(__u32)); - hdr->msg.hello.type = cpu_to_le32(hello->kshm_ctype); - hdr->msg.hello.incarnation = cpu_to_le64(hello->kshm_src_incarnation); - - rc = lnet_sock_write(sock, hdr, sizeof(*hdr), lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", - rc, &conn->ksnc_ipaddr, conn->ksnc_port); - goto out; - } - - if (!hello->kshm_nips) - goto out; - - for (i = 0; i < (int)hello->kshm_nips; i++) - hello->kshm_ips[i] = __cpu_to_le32(hello->kshm_ips[i]); - - rc = lnet_sock_write(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), - lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", - rc, hello->kshm_nips, - &conn->ksnc_ipaddr, conn->ksnc_port); - } -out: - kfree(hdr); - - return rc; -} - -static int -ksocknal_send_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello) -{ - struct socket *sock = conn->ksnc_sock; - int rc; - - hello->kshm_magic = LNET_PROTO_MAGIC; - hello->kshm_version = conn->ksnc_proto->pro_version; - - if (the_lnet.ln_testprotocompat) { - /* single-shot proto check */ - LNET_LOCK(); - if (the_lnet.ln_testprotocompat & 1) { - hello->kshm_version++; /* just different! 
*/ - the_lnet.ln_testprotocompat &= ~1; - } - LNET_UNLOCK(); - } - - rc = lnet_sock_write(sock, hello, offsetof(struct ksock_hello_msg, kshm_ips), - lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", - rc, &conn->ksnc_ipaddr, conn->ksnc_port); - return rc; - } - - if (!hello->kshm_nips) - return 0; - - rc = lnet_sock_write(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), - lnet_acceptor_timeout()); - if (rc) { - CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", - rc, hello->kshm_nips, - &conn->ksnc_ipaddr, conn->ksnc_port); - } - - return rc; -} - -static int -ksocknal_recv_hello_v1(struct ksock_conn *conn, struct ksock_hello_msg *hello, - int timeout) -{ - struct socket *sock = conn->ksnc_sock; - struct lnet_hdr *hdr; - int rc; - int i; - - hdr = kzalloc(sizeof(*hdr), GFP_NOFS); - if (!hdr) { - CERROR("Can't allocate struct lnet_hdr\n"); - return -ENOMEM; - } - - rc = lnet_sock_read(sock, &hdr->src_nid, - sizeof(*hdr) - offsetof(struct lnet_hdr, src_nid), - timeout); - if (rc) { - CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - goto out; - } - - /* ...and check we got what we expected */ - if (hdr->type != cpu_to_le32(LNET_MSG_HELLO)) { - CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n", - le32_to_cpu(hdr->type), - &conn->ksnc_ipaddr); - rc = -EPROTO; - goto out; - } - - hello->kshm_src_nid = le64_to_cpu(hdr->src_nid); - hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); - hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); - hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); - hello->kshm_nips = le32_to_cpu(hdr->payload_length) / - sizeof(__u32); - - if (hello->kshm_nips > LNET_MAX_INTERFACES) { - CERROR("Bad nips %d from ip %pI4h\n", - hello->kshm_nips, &conn->ksnc_ipaddr); - rc = -EPROTO; - goto out; - } - - if (!hello->kshm_nips) - goto out; - - rc = lnet_sock_read(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), timeout); - if (rc) { - CERROR("Error %d reading IPs from ip %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - goto out; - } - - for (i = 0; i < (int)hello->kshm_nips; i++) { - hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); - - if (!hello->kshm_ips[i]) { - CERROR("Zero IP[%d] from ip %pI4h\n", - i, &conn->ksnc_ipaddr); - rc = -EPROTO; - break; - } - } -out: - kfree(hdr); - - return rc; -} - -static int -ksocknal_recv_hello_v2(struct ksock_conn *conn, struct ksock_hello_msg *hello, - int timeout) -{ - struct socket *sock = conn->ksnc_sock; - int rc; - int i; - - if (hello->kshm_magic == LNET_PROTO_MAGIC) - conn->ksnc_flip = 0; - else - conn->ksnc_flip = 1; - - rc = lnet_sock_read(sock, &hello->kshm_src_nid, - offsetof(struct ksock_hello_msg, kshm_ips) - - offsetof(struct ksock_hello_msg, kshm_src_nid), - timeout); - if (rc) { - CERROR("Error %d reading HELLO from %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - return rc; - } - - if (conn->ksnc_flip) { - __swab32s(&hello->kshm_src_pid); - __swab64s(&hello->kshm_src_nid); - __swab32s(&hello->kshm_dst_pid); - __swab64s(&hello->kshm_dst_nid); - __swab64s(&hello->kshm_src_incarnation); - __swab64s(&hello->kshm_dst_incarnation); - __swab32s(&hello->kshm_ctype); - __swab32s(&hello->kshm_nips); - } - - if (hello->kshm_nips > LNET_MAX_INTERFACES) { - CERROR("Bad nips %d from ip %pI4h\n", - hello->kshm_nips, &conn->ksnc_ipaddr); - return -EPROTO; - } - - if (!hello->kshm_nips) - return 0; - - rc 
= lnet_sock_read(sock, hello->kshm_ips, - hello->kshm_nips * sizeof(__u32), timeout); - if (rc) { - CERROR("Error %d reading IPs from ip %pI4h\n", - rc, &conn->ksnc_ipaddr); - LASSERT(rc < 0 && rc != -EALREADY); - return rc; - } - - for (i = 0; i < (int)hello->kshm_nips; i++) { - if (conn->ksnc_flip) - __swab32s(&hello->kshm_ips[i]); - - if (!hello->kshm_ips[i]) { - CERROR("Zero IP[%d] from ip %pI4h\n", - i, &conn->ksnc_ipaddr); - return -EPROTO; - } - } - - return 0; -} - -static void -ksocknal_pack_msg_v1(struct ksock_tx *tx) -{ - /* V1.x has no KSOCK_MSG_NOOP */ - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - LASSERT(tx->tx_lnetmsg); - - tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr; - tx->tx_iov[0].iov_len = sizeof(struct lnet_hdr); - - tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr); - tx->tx_resid = tx->tx_lnetmsg->msg_len + sizeof(struct lnet_hdr); -} - -static void -ksocknal_pack_msg_v2(struct ksock_tx *tx) -{ - tx->tx_iov[0].iov_base = &tx->tx_msg; - - if (tx->tx_lnetmsg) { - LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); - - tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; - tx->tx_iov[0].iov_len = sizeof(struct ksock_msg); - tx->tx_nob = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len; - tx->tx_resid = sizeof(struct ksock_msg) + tx->tx_lnetmsg->msg_len; - } else { - LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); - - tx->tx_iov[0].iov_len = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); - tx->tx_nob = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); - tx->tx_resid = offsetof(struct ksock_msg, ksm_u.lnetmsg.ksnm_hdr); - } - /* - * Don't checksum before start sending, because packet can be - * piggybacked with ACK - */ -} - -static void -ksocknal_unpack_msg_v1(struct ksock_msg *msg) -{ - msg->ksm_csum = 0; - msg->ksm_type = KSOCK_MSG_LNET; - msg->ksm_zc_cookies[0] = 0; - msg->ksm_zc_cookies[1] = 0; -} - -static void -ksocknal_unpack_msg_v2(struct ksock_msg *msg) -{ - return; /* Do nothing */ -} - -struct ksock_proto ksocknal_protocol_v1x = { - .pro_version = KSOCK_PROTO_V1, - .pro_send_hello = ksocknal_send_hello_v1, - .pro_recv_hello = ksocknal_recv_hello_v1, - .pro_pack = ksocknal_pack_msg_v1, - .pro_unpack = ksocknal_unpack_msg_v1, - .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, - .pro_handle_zcreq = NULL, - .pro_handle_zcack = NULL, - .pro_queue_tx_zcack = NULL, - .pro_match_tx = ksocknal_match_tx -}; - -struct ksock_proto ksocknal_protocol_v2x = { - .pro_version = KSOCK_PROTO_V2, - .pro_send_hello = ksocknal_send_hello_v2, - .pro_recv_hello = ksocknal_recv_hello_v2, - .pro_pack = ksocknal_pack_msg_v2, - .pro_unpack = ksocknal_unpack_msg_v2, - .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, - .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, - .pro_handle_zcreq = ksocknal_handle_zcreq, - .pro_handle_zcack = ksocknal_handle_zcack, - .pro_match_tx = ksocknal_match_tx -}; - -struct ksock_proto ksocknal_protocol_v3x = { - .pro_version = KSOCK_PROTO_V3, - .pro_send_hello = ksocknal_send_hello_v2, - .pro_recv_hello = ksocknal_recv_hello_v2, - .pro_pack = ksocknal_pack_msg_v2, - .pro_unpack = ksocknal_unpack_msg_v2, - .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, - .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, - .pro_handle_zcreq = ksocknal_handle_zcreq, - .pro_handle_zcack = ksocknal_handle_zcack, - .pro_match_tx = ksocknal_match_tx_v3 -}; diff --git a/drivers/staging/lustre/lnet/libcfs/Makefile b/drivers/staging/lustre/lnet/libcfs/Makefile deleted file mode 100644 index 
6a1b232da495f8953dbdcb138df3411c35da9019..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LNET) += libcfs.o - -libcfs-obj-y += linux-tracefile.o linux-debug.o -libcfs-obj-y += linux-crypto.o -libcfs-obj-y += linux-crypto-adler.o - -libcfs-obj-y += debug.o fail.o module.o tracefile.o -libcfs-obj-y += libcfs_string.o hash.o -libcfs-obj-$(CONFIG_SMP) += libcfs_cpu.o -libcfs-obj-y += libcfs_mem.o libcfs_lock.o - -libcfs-objs := $(libcfs-obj-y) diff --git a/drivers/staging/lustre/lnet/libcfs/debug.c b/drivers/staging/lustre/lnet/libcfs/debug.c deleted file mode 100644 index 06f694f6a28f21abfe4afedacbdc763382360667..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/debug.c +++ /dev/null @@ -1,461 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * libcfs/libcfs/debug.c - * - * Author: Phil Schwan - * - */ - -# define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include -#include -#include "tracefile.h" - -static char debug_file_name[1024]; - -unsigned int libcfs_subsystem_debug = ~0; -EXPORT_SYMBOL(libcfs_subsystem_debug); -module_param(libcfs_subsystem_debug, int, 0644); -MODULE_PARM_DESC(libcfs_subsystem_debug, "Lustre kernel debug subsystem mask"); - -unsigned int libcfs_debug = (D_CANTMASK | - D_NETERROR | D_HA | D_CONFIG | D_IOCTL); -EXPORT_SYMBOL(libcfs_debug); -module_param(libcfs_debug, int, 0644); -MODULE_PARM_DESC(libcfs_debug, "Lustre kernel debug mask"); - -static int libcfs_param_debug_mb_set(const char *val, - const struct kernel_param *kp) -{ - int rc; - unsigned int num; - - rc = kstrtouint(val, 0, &num); - if (rc < 0) - return rc; - - if (!*((unsigned int *)kp->arg)) { - *((unsigned int *)kp->arg) = num; - return 0; - } - - rc = cfs_trace_set_debug_mb(num); - - if (!rc) - *((unsigned int *)kp->arg) = cfs_trace_get_debug_mb(); - - return rc; -} - -/* While debug_mb setting look like unsigned int, in fact - * it needs quite a bunch of extra processing, so we define special - * debugmb parameter type with corresponding methods to handle this case - */ -static const struct kernel_param_ops param_ops_debugmb = { - .set = libcfs_param_debug_mb_set, - .get = param_get_uint, -}; - -#define param_check_debugmb(name, p) \ - __param_check(name, p, unsigned int) - -static unsigned int libcfs_debug_mb; -module_param(libcfs_debug_mb, debugmb, 0644); -MODULE_PARM_DESC(libcfs_debug_mb, "Total debug buffer size."); - -unsigned int libcfs_printk = D_CANTMASK; -module_param(libcfs_printk, uint, 0644); -MODULE_PARM_DESC(libcfs_printk, "Lustre kernel debug console mask"); - -unsigned int libcfs_console_ratelimit = 1; -module_param(libcfs_console_ratelimit, uint, 0644); -MODULE_PARM_DESC(libcfs_console_ratelimit, "Lustre kernel debug console ratelimit (0 to disable)"); - -static int param_set_delay_minmax(const char *val, - const struct kernel_param *kp, - long min, long max) -{ - long d; - int sec; - int rc; - - rc = kstrtoint(val, 0, &sec); - if (rc) - return -EINVAL; - - d = sec * HZ / 100; - if (d < min || d > max) - return -EINVAL; - - *((unsigned int *)kp->arg) = d; - - return 0; -} - -static int param_get_delay(char *buffer, const struct kernel_param *kp) -{ - unsigned int d = *(unsigned int *)kp->arg; - - return sprintf(buffer, "%u", (unsigned int)(d * 100) / HZ); -} - -unsigned int libcfs_console_max_delay; -unsigned int libcfs_console_min_delay; - -static int param_set_console_max_delay(const char *val, - const struct kernel_param *kp) -{ - return param_set_delay_minmax(val, kp, - libcfs_console_min_delay, INT_MAX); -} - -static const struct kernel_param_ops param_ops_console_max_delay = { - .set = param_set_console_max_delay, - .get = param_get_delay, -}; - -#define param_check_console_max_delay(name, p) \ - __param_check(name, p, unsigned int) - -module_param(libcfs_console_max_delay, console_max_delay, 0644); -MODULE_PARM_DESC(libcfs_console_max_delay, "Lustre kernel debug console max delay (jiffies)"); - -static int param_set_console_min_delay(const char *val, - const struct kernel_param *kp) -{ - return param_set_delay_minmax(val, kp, - 1, libcfs_console_max_delay); -} - -static const struct kernel_param_ops param_ops_console_min_delay = { - .set = param_set_console_min_delay, - .get = param_get_delay, -}; - -#define param_check_console_min_delay(name, p) \ - __param_check(name, p, unsigned int) - 
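
Each oddball parameter above follows the same recipe: a custom setter that validates or converts, the stock param_get_uint getter, a struct kernel_param_ops tying them together, and a param_check_<type>() macro so module_param() keeps its type checking. A minimal sketch of the recipe with a hypothetical range-checked parameter ("retry_max" and its bounds are invented for illustration):

/* Sketch only: the custom-parameter-type recipe used above, with
 * hypothetical names and bounds. */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static int param_set_retry(const char *val, const struct kernel_param *kp)
{
        unsigned int num;
        int rc = kstrtouint(val, 0, &num);

        if (rc)
                return rc;
        if (num < 1 || num > 64)        /* reject out-of-range writes */
                return -EINVAL;
        *(unsigned int *)kp->arg = num;
        return 0;
}

static const struct kernel_param_ops param_ops_retry = {
        .set = param_set_retry,
        .get = param_get_uint,
};

#define param_check_retry(name, p) __param_check(name, p, unsigned int)

static unsigned int retry_max = 8;
module_param(retry_max, retry, 0644);
MODULE_PARM_DESC(retry_max, "max retries (1-64)");

The extra macro is the non-obvious part: module_param() expands param_check_<type>(), so a custom type name compiles only if you supply both the ops table and the check macro, which is exactly what debug.c does for debugmb and the console delays.
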
-module_param(libcfs_console_min_delay, console_min_delay, 0644); -MODULE_PARM_DESC(libcfs_console_min_delay, "Lustre kernel debug console min delay (jiffies)"); - -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret < 0 || num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - -static int param_set_uintpos(const char *val, const struct kernel_param *kp) -{ - return param_set_uint_minmax(val, kp, 1, -1); -} - -static const struct kernel_param_ops param_ops_uintpos = { - .set = param_set_uintpos, - .get = param_get_uint, -}; - -#define param_check_uintpos(name, p) \ - __param_check(name, p, unsigned int) - -unsigned int libcfs_console_backoff = CDEBUG_DEFAULT_BACKOFF; -module_param(libcfs_console_backoff, uintpos, 0644); -MODULE_PARM_DESC(libcfs_console_backoff, "Lustre kernel debug console backoff factor"); - -unsigned int libcfs_debug_binary = 1; - -unsigned int libcfs_stack = 3 * THREAD_SIZE / 4; -EXPORT_SYMBOL(libcfs_stack); - -unsigned int libcfs_catastrophe; -EXPORT_SYMBOL(libcfs_catastrophe); - -unsigned int libcfs_panic_on_lbug = 1; -module_param(libcfs_panic_on_lbug, uint, 0644); -MODULE_PARM_DESC(libcfs_panic_on_lbug, "Lustre kernel panic on LBUG"); - -static wait_queue_head_t debug_ctlwq; - -char libcfs_debug_file_path_arr[PATH_MAX] = LIBCFS_DEBUG_FILE_PATH_DEFAULT; - -/* We need to pass a pointer here, but elsewhere this must be a const */ -static char *libcfs_debug_file_path; -module_param(libcfs_debug_file_path, charp, 0644); -MODULE_PARM_DESC(libcfs_debug_file_path, - "Path for dumping debug logs, set 'NONE' to prevent log dumping"); - -int libcfs_panic_in_progress; - -/* libcfs_debug_token2mask() expects the returned string in lower-case */ -static const char * -libcfs_debug_subsys2str(int subsys) -{ - static const char * const libcfs_debug_subsystems[] = - LIBCFS_DEBUG_SUBSYS_NAMES; - - if (subsys >= ARRAY_SIZE(libcfs_debug_subsystems)) - return NULL; - - return libcfs_debug_subsystems[subsys]; -} - -/* libcfs_debug_token2mask() expects the returned string in lower-case */ -static const char * -libcfs_debug_dbg2str(int debug) -{ - static const char * const libcfs_debug_masks[] = - LIBCFS_DEBUG_MASKS_NAMES; - - if (debug >= ARRAY_SIZE(libcfs_debug_masks)) - return NULL; - - return libcfs_debug_masks[debug]; -} - -int -libcfs_debug_mask2str(char *str, int size, int mask, int is_subsys) -{ - const char *(*fn)(int bit) = is_subsys ? libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int len = 0; - const char *token; - int i; - - if (!mask) { /* "0" */ - if (size > 0) - str[0] = '0'; - len = 1; - } else { /* space-separated tokens */ - for (i = 0; i < 32; i++) { - if (!(mask & (1 << i))) - continue; - - token = fn(i); - if (!token) /* unused bit */ - continue; - - if (len > 0) { /* separator? */ - if (len < size) - str[len] = ' '; - len++; - } - - while (*token) { - if (len < size) - str[len] = *token; - token++; - len++; - } - } - } - - /* terminate 'str' */ - if (len < size) - str[len] = 0; - else - str[size - 1] = 0; - - return len; -} - -int -libcfs_debug_str2mask(int *mask, const char *str, int is_subsys) -{ - const char *(*fn)(int bit) = is_subsys ? 
libcfs_debug_subsys2str : - libcfs_debug_dbg2str; - int m = 0; - int matched; - int n; - int t; - - /* Allow a number for backwards compatibility */ - - for (n = strlen(str); n > 0; n--) - if (!isspace(str[n - 1])) - break; - matched = n; - t = sscanf(str, "%i%n", &m, &matched); - if (t >= 1 && matched == n) { - /* don't print warning for lctl set_param debug=0 or -1 */ - if (m && m != -1) - CWARN("You are trying to use a numerical value for the mask - this will be deprecated in a future release.\n"); - *mask = m; - return 0; - } - - return cfs_str2mask(str, fn, mask, is_subsys ? 0 : D_CANTMASK, - 0xffffffff); -} - -/** - * Dump Lustre log to ::debug_file_path by calling tracefile_dump_all_pages() - */ -void libcfs_debug_dumplog_internal(void *arg) -{ - static time64_t last_dump_time; - time64_t current_time; - void *journal_info; - - journal_info = current->journal_info; - current->journal_info = NULL; - current_time = ktime_get_real_seconds(); - - if (strncmp(libcfs_debug_file_path_arr, "NONE", 4) && - current_time > last_dump_time) { - last_dump_time = current_time; - snprintf(debug_file_name, sizeof(debug_file_name) - 1, - "%s.%lld.%ld", libcfs_debug_file_path_arr, - (s64)current_time, (long)arg); - pr_alert("LustreError: dumping log to %s\n", debug_file_name); - cfs_tracefile_dump_all_pages(debug_file_name); - libcfs_run_debug_log_upcall(debug_file_name); - } - - current->journal_info = journal_info; -} - -static int libcfs_debug_dumplog_thread(void *arg) -{ - libcfs_debug_dumplog_internal(arg); - wake_up(&debug_ctlwq); - return 0; -} - -void libcfs_debug_dumplog(void) -{ - wait_queue_entry_t wait; - struct task_struct *dumper; - - /* we're being careful to ensure that the kernel thread is - * able to set our state to running as it exits before we - * get to schedule() - */ - init_waitqueue_entry(&wait, current); - add_wait_queue(&debug_ctlwq, &wait); - - dumper = kthread_run(libcfs_debug_dumplog_thread, - (void *)(long)current->pid, - "libcfs_debug_dumper"); - set_current_state(TASK_INTERRUPTIBLE); - if (IS_ERR(dumper)) - pr_err("LustreError: cannot start log dump thread: %ld\n", - PTR_ERR(dumper)); - else - schedule(); - - /* be sure to teardown if cfs_create_thread() failed */ - remove_wait_queue(&debug_ctlwq, &wait); - set_current_state(TASK_RUNNING); -} -EXPORT_SYMBOL(libcfs_debug_dumplog); - -int libcfs_debug_init(unsigned long bufsize) -{ - unsigned int max = libcfs_debug_mb; - int rc = 0; - - init_waitqueue_head(&debug_ctlwq); - - if (libcfs_console_max_delay <= 0 || /* not set by user or */ - libcfs_console_min_delay <= 0 || /* set to invalid values */ - libcfs_console_min_delay >= libcfs_console_max_delay) { - libcfs_console_max_delay = CDEBUG_DEFAULT_MAX_DELAY; - libcfs_console_min_delay = CDEBUG_DEFAULT_MIN_DELAY; - } - - if (libcfs_debug_file_path) { - strlcpy(libcfs_debug_file_path_arr, - libcfs_debug_file_path, - sizeof(libcfs_debug_file_path_arr)); - } - - /* If libcfs_debug_mb is set to an invalid value or uninitialized - * then just make the total buffers smp_num_cpus * TCD_MAX_PAGES - */ - if (max > cfs_trace_max_debug_mb() || max < num_possible_cpus()) { - max = TCD_MAX_PAGES; - } else { - max = max / num_possible_cpus(); - max <<= (20 - PAGE_SHIFT); - } - - rc = cfs_tracefile_init(max); - if (!rc) { - libcfs_register_panic_notifier(); - libcfs_debug_mb = cfs_trace_get_debug_mb(); - } - - return rc; -} - -int libcfs_debug_cleanup(void) -{ - libcfs_unregister_panic_notifier(); - cfs_tracefile_exit(); - return 0; -} - -int libcfs_debug_clear_buffer(void) -{ - 
cfs_trace_flush_pages(); - return 0; -} - -/* Debug markers, although printed by S_LNET should not be marked as such. */ -#undef DEBUG_SUBSYSTEM -#define DEBUG_SUBSYSTEM S_UNDEFINED -int libcfs_debug_mark_buffer(const char *text) -{ - CDEBUG(D_TRACE, - "***************************************************\n"); - LCONSOLE(D_WARNING, "DEBUG MARKER: %s\n", text); - CDEBUG(D_TRACE, - "***************************************************\n"); - - return 0; -} - -#undef DEBUG_SUBSYSTEM -#define DEBUG_SUBSYSTEM S_LNET diff --git a/drivers/staging/lustre/lnet/libcfs/fail.c b/drivers/staging/lustre/lnet/libcfs/fail.c deleted file mode 100644 index bd86b3b5bc346ac25bfa9bdd22a258f8a2df26fd..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/fail.c +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Oracle Corporation, Inc. - */ - -#include -#include -#include -#include -#include - -unsigned long cfs_fail_loc; -EXPORT_SYMBOL(cfs_fail_loc); - -unsigned int cfs_fail_val; -EXPORT_SYMBOL(cfs_fail_val); - -int cfs_fail_err; -EXPORT_SYMBOL(cfs_fail_err); - -DECLARE_WAIT_QUEUE_HEAD(cfs_race_waitq); -EXPORT_SYMBOL(cfs_race_waitq); - -int cfs_race_state; -EXPORT_SYMBOL(cfs_race_state); - -int __cfs_fail_check_set(u32 id, u32 value, int set) -{ - static atomic_t cfs_fail_count = ATOMIC_INIT(0); - - LASSERT(!(id & CFS_FAIL_ONCE)); - - if ((cfs_fail_loc & (CFS_FAILED | CFS_FAIL_ONCE)) == - (CFS_FAILED | CFS_FAIL_ONCE)) { - atomic_set(&cfs_fail_count, 0); /* paranoia */ - return 0; - } - - /* Fail 1/cfs_fail_val times */ - if (cfs_fail_loc & CFS_FAIL_RAND) { - if (cfs_fail_val < 2 || prandom_u32_max(cfs_fail_val) > 0) - return 0; - } - - /* Skip the first cfs_fail_val, then fail */ - if (cfs_fail_loc & CFS_FAIL_SKIP) { - if (atomic_inc_return(&cfs_fail_count) <= cfs_fail_val) - return 0; - } - - /* check cfs_fail_val... 
*/ - if (set == CFS_FAIL_LOC_VALUE) { - if (cfs_fail_val != -1 && cfs_fail_val != value) - return 0; - } - - /* Fail cfs_fail_val times, overridden by FAIL_ONCE */ - if (cfs_fail_loc & CFS_FAIL_SOME && - (!(cfs_fail_loc & CFS_FAIL_ONCE) || cfs_fail_val <= 1)) { - int count = atomic_inc_return(&cfs_fail_count); - - if (count >= cfs_fail_val) { - set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); - atomic_set(&cfs_fail_count, 0); - /* we are lost race to increase */ - if (count > cfs_fail_val) - return 0; - } - } - - /* Take into account the current call for FAIL_ONCE for ORSET only, - * as RESET is a new fail_loc, it does not change the current call - */ - if ((set == CFS_FAIL_LOC_ORSET) && (value & CFS_FAIL_ONCE)) - set_bit(CFS_FAIL_ONCE_BIT, &cfs_fail_loc); - /* Lost race to set CFS_FAILED_BIT. */ - if (test_and_set_bit(CFS_FAILED_BIT, &cfs_fail_loc)) { - /* If CFS_FAIL_ONCE is valid, only one process can fail, - * otherwise multi-process can fail at the same time. - */ - if (cfs_fail_loc & CFS_FAIL_ONCE) - return 0; - } - - switch (set) { - case CFS_FAIL_LOC_NOSET: - case CFS_FAIL_LOC_VALUE: - break; - case CFS_FAIL_LOC_ORSET: - cfs_fail_loc |= value & ~(CFS_FAILED | CFS_FAIL_ONCE); - break; - case CFS_FAIL_LOC_RESET: - cfs_fail_loc = value; - atomic_set(&cfs_fail_count, 0); - break; - default: - LASSERTF(0, "called with bad set %u\n", set); - break; - } - - return 1; -} -EXPORT_SYMBOL(__cfs_fail_check_set); - -int __cfs_fail_timeout_set(u32 id, u32 value, int ms, int set) -{ - int ret; - - ret = __cfs_fail_check_set(id, value, set); - if (ret && likely(ms > 0)) { - CERROR("cfs_fail_timeout id %x sleeping for %dms\n", - id, ms); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(ms * HZ / 1000); - CERROR("cfs_fail_timeout id %x awake\n", id); - } - return ret; -} -EXPORT_SYMBOL(__cfs_fail_timeout_set); diff --git a/drivers/staging/lustre/lnet/libcfs/hash.c b/drivers/staging/lustre/lnet/libcfs/hash.c deleted file mode 100644 index 48be66f0d654c8abff6272ce1b2bdc67ef6b4f3e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/hash.c +++ /dev/null @@ -1,2065 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/libcfs/hash.c - * - * Implement a hash class for hash process in lustre system. 
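
fail.c above is a single global fail point: cfs_fail_loc names the site, and its mode bits pick when it fires, with CFS_FAIL_RAND failing roughly one call in cfs_fail_val and CFS_FAIL_SKIP passing the first cfs_fail_val calls and failing afterwards. A compilable userspace sketch of those two modes (hypothetical, with the atomics and the ONCE/ORSET bookkeeping stripped out):

/* Userspace sketch of CFS_FAIL_RAND ("fail 1 in N calls") and
 * CFS_FAIL_SKIP ("skip the first N calls, then fail"). */
#include <stdio.h>
#include <stdlib.h>

#define FAIL_RAND 0x1
#define FAIL_SKIP 0x2

static int fail_check(unsigned int mode, unsigned int val)
{
        static unsigned int count;      /* NB: not thread-safe here */

        if (mode & FAIL_RAND)
                return val >= 2 && (rand() % val) == 0;

        if (mode & FAIL_SKIP)
                return ++count > val;

        return 0;
}

int main(void)
{
        int i, hits = 0;

        for (i = 0; i < 10; i++)
                hits += fail_check(FAIL_SKIP, 7);       /* calls 8..10 */
        printf("SKIP mode fired %d times\n", hits);     /* prints 3 */
        return 0;
}
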
- * - * Author: YuZhangyong - * - * 2008-08-15: Brian Behlendorf - * - Simplified API and improved documentation - * - Added per-hash feature flags: - * * CFS_HASH_DEBUG additional validation - * * CFS_HASH_REHASH dynamic rehashing - * - Added per-hash statistics - * - General performance enhancements - * - * 2009-07-31: Liang Zhen - * - move all stuff to libcfs - * - don't allow cur_bits != max_bits without setting of CFS_HASH_REHASH - * - ignore hs_rwlock if without CFS_HASH_REHASH setting - * - buckets are allocated one by one(instead of contiguous memory), - * to avoid unnecessary cacheline conflict - * - * 2010-03-01: Liang Zhen - * - "bucket" is a group of hlist_head now, user can specify bucket size - * by bkt_bits of cfs_hash_create(), all hlist_heads in a bucket share - * one lock for reducing memory overhead. - * - * - support lockless hash, caller will take care of locks: - * avoid lock overhead for hash tables that are already protected - * by locking in the caller for another reason - * - * - support both spin_lock/rwlock for bucket: - * overhead of spinlock contention is lower than read/write - * contention of rwlock, so using spinlock to serialize operations on - * bucket is more reasonable for those frequently changed hash tables - * - * - support one-single lock mode: - * one lock to protect all hash operations to avoid overhead of - * multiple locks if hash table is always small - * - * - removed a lot of unnecessary addref & decref on hash element: - * addref & decref are atomic operations in many use-cases which - * are expensive. - * - * - support non-blocking cfs_hash_add() and cfs_hash_findadd(): - * some lustre use-cases require these functions to be strictly - * non-blocking, we need to schedule required rehash on a different - * thread on those cases. - * - * - safer rehash on large hash table - * In old implementation, rehash function will exclusively lock the - * hash table and finish rehash in one batch, it's dangerous on SMP - * system because rehash millions of elements could take long time. - * New implemented rehash can release lock and relax CPU in middle - * of rehash, it's safe for another thread to search/change on the - * hash table even it's in rehasing. - * - * - support two different refcount modes - * . hash table has refcount on element - * . hash table doesn't change refcount on adding/removing element - * - * - support long name hash table (for param-tree) - * - * - fix a bug for cfs_hash_rehash_key: - * in old implementation, cfs_hash_rehash_key could screw up the - * hash-table because @key is overwritten without any protection. - * Now we need user to define hs_keycpy for those rehash enabled - * hash tables, cfs_hash_rehash_key will overwrite hash-key - * inside lock by calling hs_keycpy. - * - * - better hash iteration: - * Now we support both locked iteration & lockless iteration of hash - * table. Also, user can break the iteration by return 1 in callback. 
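
The layout decision described in this changelog, a "bucket" being a group of hlist_heads that share one lock and are allocated piecemeal rather than as one contiguous array, is what lets the table scale without a lock per chain or a rehash-blocking global lock. A compilable userspace sketch of the bucket/offset split (sizes and names hypothetical; pthread mutexes stand in for kernel spinlocks):

/* Userspace sketch: 2^BKT_BITS list heads share each lock; a hash
 * value splits into a bucket index (low bits) and an offset within
 * the bucket (high bits), as in cfs_hash_bd_from_key(). */
#include <pthread.h>
#include <stdio.h>

#define CUR_BITS 10                     /* 1024 chains in total */
#define BKT_BITS 3                      /* 8 chains per lock */
#define NBKT (1U << (CUR_BITS - BKT_BITS))

struct node { struct node *next; };

struct bucket {
        pthread_mutex_t lock;                   /* one lock ... */
        struct node *head[1U << BKT_BITS];      /* ... for 8 chains */
};

static struct bucket bkts[NBKT];

static void hash_add(unsigned int hash, struct node *n)
{
        struct bucket *b = &bkts[hash & (NBKT - 1)];
        unsigned int off = (hash >> (CUR_BITS - BKT_BITS)) &
                           ((1U << BKT_BITS) - 1);

        pthread_mutex_lock(&b->lock);
        n->next = b->head[off];
        b->head[off] = n;
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        static struct node n;
        unsigned int i;

        for (i = 0; i < NBKT; i++)
                pthread_mutex_init(&bkts[i].lock, NULL);

        hash_add(0x2a5u, &n);
        printf("added to bucket %u\n", 0x2a5u & (NBKT - 1));
        return 0;
}
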
- */ -#include -#include -#include -#include -#include - -#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 -static unsigned int warn_on_depth = 8; -module_param(warn_on_depth, uint, 0644); -MODULE_PARM_DESC(warn_on_depth, "warning when hash depth is high."); -#endif - -struct workqueue_struct *cfs_rehash_wq; - -static inline void -cfs_hash_nl_lock(union cfs_hash_lock *lock, int exclusive) {} - -static inline void -cfs_hash_nl_unlock(union cfs_hash_lock *lock, int exclusive) {} - -static inline void -cfs_hash_spin_lock(union cfs_hash_lock *lock, int exclusive) - __acquires(&lock->spin) -{ - spin_lock(&lock->spin); -} - -static inline void -cfs_hash_spin_unlock(union cfs_hash_lock *lock, int exclusive) - __releases(&lock->spin) -{ - spin_unlock(&lock->spin); -} - -static inline void -cfs_hash_rw_lock(union cfs_hash_lock *lock, int exclusive) - __acquires(&lock->rw) -{ - if (!exclusive) - read_lock(&lock->rw); - else - write_lock(&lock->rw); -} - -static inline void -cfs_hash_rw_unlock(union cfs_hash_lock *lock, int exclusive) - __releases(&lock->rw) -{ - if (!exclusive) - read_unlock(&lock->rw); - else - write_unlock(&lock->rw); -} - -/** No lock hash */ -static struct cfs_hash_lock_ops cfs_hash_nl_lops = { - .hs_lock = cfs_hash_nl_lock, - .hs_unlock = cfs_hash_nl_unlock, - .hs_bkt_lock = cfs_hash_nl_lock, - .hs_bkt_unlock = cfs_hash_nl_unlock, -}; - -/** no bucket lock, one spinlock to protect everything */ -static struct cfs_hash_lock_ops cfs_hash_nbl_lops = { - .hs_lock = cfs_hash_spin_lock, - .hs_unlock = cfs_hash_spin_unlock, - .hs_bkt_lock = cfs_hash_nl_lock, - .hs_bkt_unlock = cfs_hash_nl_unlock, -}; - -/** spin bucket lock, rehash is enabled */ -static struct cfs_hash_lock_ops cfs_hash_bkt_spin_lops = { - .hs_lock = cfs_hash_rw_lock, - .hs_unlock = cfs_hash_rw_unlock, - .hs_bkt_lock = cfs_hash_spin_lock, - .hs_bkt_unlock = cfs_hash_spin_unlock, -}; - -/** rw bucket lock, rehash is enabled */ -static struct cfs_hash_lock_ops cfs_hash_bkt_rw_lops = { - .hs_lock = cfs_hash_rw_lock, - .hs_unlock = cfs_hash_rw_unlock, - .hs_bkt_lock = cfs_hash_rw_lock, - .hs_bkt_unlock = cfs_hash_rw_unlock, -}; - -/** spin bucket lock, rehash is disabled */ -static struct cfs_hash_lock_ops cfs_hash_nr_bkt_spin_lops = { - .hs_lock = cfs_hash_nl_lock, - .hs_unlock = cfs_hash_nl_unlock, - .hs_bkt_lock = cfs_hash_spin_lock, - .hs_bkt_unlock = cfs_hash_spin_unlock, -}; - -/** rw bucket lock, rehash is disabled */ -static struct cfs_hash_lock_ops cfs_hash_nr_bkt_rw_lops = { - .hs_lock = cfs_hash_nl_lock, - .hs_unlock = cfs_hash_nl_unlock, - .hs_bkt_lock = cfs_hash_rw_lock, - .hs_bkt_unlock = cfs_hash_rw_unlock, -}; - -static void -cfs_hash_lock_setup(struct cfs_hash *hs) -{ - if (cfs_hash_with_no_lock(hs)) { - hs->hs_lops = &cfs_hash_nl_lops; - - } else if (cfs_hash_with_no_bktlock(hs)) { - hs->hs_lops = &cfs_hash_nbl_lops; - spin_lock_init(&hs->hs_lock.spin); - - } else if (cfs_hash_with_rehash(hs)) { - rwlock_init(&hs->hs_lock.rw); - - if (cfs_hash_with_rw_bktlock(hs)) - hs->hs_lops = &cfs_hash_bkt_rw_lops; - else if (cfs_hash_with_spin_bktlock(hs)) - hs->hs_lops = &cfs_hash_bkt_spin_lops; - else - LBUG(); - } else { - if (cfs_hash_with_rw_bktlock(hs)) - hs->hs_lops = &cfs_hash_nr_bkt_rw_lops; - else if (cfs_hash_with_spin_bktlock(hs)) - hs->hs_lops = &cfs_hash_nr_bkt_spin_lops; - else - LBUG(); - } -} - -/** - * Simple hash head without depth tracking - * new element is always added to head of hlist - */ -struct cfs_hash_head { - struct hlist_head hh_head; /**< entries list */ -}; - -static int 
-cfs_hash_hh_hhead_size(struct cfs_hash *hs) -{ - return sizeof(struct cfs_hash_head); -} - -static struct hlist_head * -cfs_hash_hh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - struct cfs_hash_head *head; - - head = (struct cfs_hash_head *)&bd->bd_bucket->hsb_head[0]; - return &head[bd->bd_offset].hh_head; -} - -static int -cfs_hash_hh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - hlist_add_head(hnode, cfs_hash_hh_hhead(hs, bd)); - return -1; /* unknown depth */ -} - -static int -cfs_hash_hh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - hlist_del_init(hnode); - return -1; /* unknown depth */ -} - -/** - * Simple hash head with depth tracking - * new element is always added to head of hlist - */ -struct cfs_hash_head_dep { - struct hlist_head hd_head; /**< entries list */ - unsigned int hd_depth; /**< list length */ -}; - -static int -cfs_hash_hd_hhead_size(struct cfs_hash *hs) -{ - return sizeof(struct cfs_hash_head_dep); -} - -static struct hlist_head * -cfs_hash_hd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - struct cfs_hash_head_dep *head; - - head = (struct cfs_hash_head_dep *)&bd->bd_bucket->hsb_head[0]; - return &head[bd->bd_offset].hd_head; -} - -static int -cfs_hash_hd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - struct cfs_hash_head_dep *hh; - - hh = container_of(cfs_hash_hd_hhead(hs, bd), - struct cfs_hash_head_dep, hd_head); - hlist_add_head(hnode, &hh->hd_head); - return ++hh->hd_depth; -} - -static int -cfs_hash_hd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - struct cfs_hash_head_dep *hh; - - hh = container_of(cfs_hash_hd_hhead(hs, bd), - struct cfs_hash_head_dep, hd_head); - hlist_del_init(hnode); - return --hh->hd_depth; -} - -/** - * double links hash head without depth tracking - * new element is always added to tail of hlist - */ -struct cfs_hash_dhead { - struct hlist_head dh_head; /**< entries list */ - struct hlist_node *dh_tail; /**< the last entry */ -}; - -static int -cfs_hash_dh_hhead_size(struct cfs_hash *hs) -{ - return sizeof(struct cfs_hash_dhead); -} - -static struct hlist_head * -cfs_hash_dh_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - struct cfs_hash_dhead *head; - - head = (struct cfs_hash_dhead *)&bd->bd_bucket->hsb_head[0]; - return &head[bd->bd_offset].dh_head; -} - -static int -cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - struct cfs_hash_dhead *dh; - - dh = container_of(cfs_hash_dh_hhead(hs, bd), - struct cfs_hash_dhead, dh_head); - if (dh->dh_tail) /* not empty */ - hlist_add_behind(hnode, dh->dh_tail); - else /* empty list */ - hlist_add_head(hnode, &dh->dh_head); - dh->dh_tail = hnode; - return -1; /* unknown depth */ -} - -static int -cfs_hash_dh_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnd) -{ - struct cfs_hash_dhead *dh; - - dh = container_of(cfs_hash_dh_hhead(hs, bd), - struct cfs_hash_dhead, dh_head); - if (!hnd->next) { /* it's the tail */ - dh->dh_tail = (hnd->pprev == &dh->dh_head.first) ? 
NULL : - container_of(hnd->pprev, struct hlist_node, next); - } - hlist_del_init(hnd); - return -1; /* unknown depth */ -} - -/** - * double links hash head with depth tracking - * new element is always added to tail of hlist - */ -struct cfs_hash_dhead_dep { - struct hlist_head dd_head; /**< entries list */ - struct hlist_node *dd_tail; /**< the last entry */ - unsigned int dd_depth; /**< list length */ -}; - -static int -cfs_hash_dd_hhead_size(struct cfs_hash *hs) -{ - return sizeof(struct cfs_hash_dhead_dep); -} - -static struct hlist_head * -cfs_hash_dd_hhead(struct cfs_hash *hs, struct cfs_hash_bd *bd) -{ - struct cfs_hash_dhead_dep *head; - - head = (struct cfs_hash_dhead_dep *)&bd->bd_bucket->hsb_head[0]; - return &head[bd->bd_offset].dd_head; -} - -static int -cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - struct cfs_hash_dhead_dep *dh; - - dh = container_of(cfs_hash_dd_hhead(hs, bd), - struct cfs_hash_dhead_dep, dd_head); - if (dh->dd_tail) /* not empty */ - hlist_add_behind(hnode, dh->dd_tail); - else /* empty list */ - hlist_add_head(hnode, &dh->dd_head); - dh->dd_tail = hnode; - return ++dh->dd_depth; -} - -static int -cfs_hash_dd_hnode_del(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnd) -{ - struct cfs_hash_dhead_dep *dh; - - dh = container_of(cfs_hash_dd_hhead(hs, bd), - struct cfs_hash_dhead_dep, dd_head); - if (!hnd->next) { /* it's the tail */ - dh->dd_tail = (hnd->pprev == &dh->dd_head.first) ? NULL : - container_of(hnd->pprev, struct hlist_node, next); - } - hlist_del_init(hnd); - return --dh->dd_depth; -} - -static struct cfs_hash_hlist_ops cfs_hash_hh_hops = { - .hop_hhead = cfs_hash_hh_hhead, - .hop_hhead_size = cfs_hash_hh_hhead_size, - .hop_hnode_add = cfs_hash_hh_hnode_add, - .hop_hnode_del = cfs_hash_hh_hnode_del, -}; - -static struct cfs_hash_hlist_ops cfs_hash_hd_hops = { - .hop_hhead = cfs_hash_hd_hhead, - .hop_hhead_size = cfs_hash_hd_hhead_size, - .hop_hnode_add = cfs_hash_hd_hnode_add, - .hop_hnode_del = cfs_hash_hd_hnode_del, -}; - -static struct cfs_hash_hlist_ops cfs_hash_dh_hops = { - .hop_hhead = cfs_hash_dh_hhead, - .hop_hhead_size = cfs_hash_dh_hhead_size, - .hop_hnode_add = cfs_hash_dh_hnode_add, - .hop_hnode_del = cfs_hash_dh_hnode_del, -}; - -static struct cfs_hash_hlist_ops cfs_hash_dd_hops = { - .hop_hhead = cfs_hash_dd_hhead, - .hop_hhead_size = cfs_hash_dd_hhead_size, - .hop_hnode_add = cfs_hash_dd_hnode_add, - .hop_hnode_del = cfs_hash_dd_hnode_del, -}; - -static void -cfs_hash_hlist_setup(struct cfs_hash *hs) -{ - if (cfs_hash_with_add_tail(hs)) { - hs->hs_hops = cfs_hash_with_depth(hs) ? - &cfs_hash_dd_hops : &cfs_hash_dh_hops; - } else { - hs->hs_hops = cfs_hash_with_depth(hs) ? 
- &cfs_hash_hd_hops : &cfs_hash_hh_hops; - } -} - -static void -cfs_hash_bd_from_key(struct cfs_hash *hs, struct cfs_hash_bucket **bkts, - unsigned int bits, const void *key, struct cfs_hash_bd *bd) -{ - unsigned int index = cfs_hash_id(hs, key, (1U << bits) - 1); - - LASSERT(bits == hs->hs_cur_bits || bits == hs->hs_rehash_bits); - - bd->bd_bucket = bkts[index & ((1U << (bits - hs->hs_bkt_bits)) - 1)]; - bd->bd_offset = index >> (bits - hs->hs_bkt_bits); -} - -void -cfs_hash_bd_get(struct cfs_hash *hs, const void *key, struct cfs_hash_bd *bd) -{ - /* NB: caller should hold hs->hs_rwlock if REHASH is set */ - if (likely(!hs->hs_rehash_buckets)) { - cfs_hash_bd_from_key(hs, hs->hs_buckets, - hs->hs_cur_bits, key, bd); - } else { - LASSERT(hs->hs_rehash_bits); - cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, - hs->hs_rehash_bits, key, bd); - } -} -EXPORT_SYMBOL(cfs_hash_bd_get); - -static inline void -cfs_hash_bd_dep_record(struct cfs_hash *hs, struct cfs_hash_bd *bd, int dep_cur) -{ - if (likely(dep_cur <= bd->bd_bucket->hsb_depmax)) - return; - - bd->bd_bucket->hsb_depmax = dep_cur; -# if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 - if (likely(!warn_on_depth || - max(warn_on_depth, hs->hs_dep_max) >= dep_cur)) - return; - - spin_lock(&hs->hs_dep_lock); - hs->hs_dep_max = dep_cur; - hs->hs_dep_bkt = bd->bd_bucket->hsb_index; - hs->hs_dep_off = bd->bd_offset; - hs->hs_dep_bits = hs->hs_cur_bits; - spin_unlock(&hs->hs_dep_lock); - - queue_work(cfs_rehash_wq, &hs->hs_dep_work); -# endif -} - -void -cfs_hash_bd_add_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - int rc; - - rc = hs->hs_hops->hop_hnode_add(hs, bd, hnode); - cfs_hash_bd_dep_record(hs, bd, rc); - bd->bd_bucket->hsb_version++; - if (unlikely(!bd->bd_bucket->hsb_version)) - bd->bd_bucket->hsb_version++; - bd->bd_bucket->hsb_count++; - - if (cfs_hash_with_counter(hs)) - atomic_inc(&hs->hs_count); - if (!cfs_hash_with_no_itemref(hs)) - cfs_hash_get(hs, hnode); -} -EXPORT_SYMBOL(cfs_hash_bd_add_locked); - -void -cfs_hash_bd_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode) -{ - hs->hs_hops->hop_hnode_del(hs, bd, hnode); - - LASSERT(bd->bd_bucket->hsb_count > 0); - bd->bd_bucket->hsb_count--; - bd->bd_bucket->hsb_version++; - if (unlikely(!bd->bd_bucket->hsb_version)) - bd->bd_bucket->hsb_version++; - - if (cfs_hash_with_counter(hs)) { - LASSERT(atomic_read(&hs->hs_count) > 0); - atomic_dec(&hs->hs_count); - } - if (!cfs_hash_with_no_itemref(hs)) - cfs_hash_put_locked(hs, hnode); -} -EXPORT_SYMBOL(cfs_hash_bd_del_locked); - -void -cfs_hash_bd_move_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd_old, - struct cfs_hash_bd *bd_new, struct hlist_node *hnode) -{ - struct cfs_hash_bucket *obkt = bd_old->bd_bucket; - struct cfs_hash_bucket *nbkt = bd_new->bd_bucket; - int rc; - - if (!cfs_hash_bd_compare(bd_old, bd_new)) - return; - - /* use cfs_hash_bd_hnode_add/del, to avoid atomic & refcount ops - * in cfs_hash_bd_del/add_locked - */ - hs->hs_hops->hop_hnode_del(hs, bd_old, hnode); - rc = hs->hs_hops->hop_hnode_add(hs, bd_new, hnode); - cfs_hash_bd_dep_record(hs, bd_new, rc); - - LASSERT(obkt->hsb_count > 0); - obkt->hsb_count--; - obkt->hsb_version++; - if (unlikely(!obkt->hsb_version)) - obkt->hsb_version++; - nbkt->hsb_count++; - nbkt->hsb_version++; - if (unlikely(!nbkt->hsb_version)) - nbkt->hsb_version++; -} - -enum { - /** always set, for sanity (avoid ZERO intent) */ - CFS_HS_LOOKUP_MASK_FIND = BIT(0), - /** return entry with a ref */ - CFS_HS_LOOKUP_MASK_REF = 
BIT(1), - /** add entry if not existing */ - CFS_HS_LOOKUP_MASK_ADD = BIT(2), - /** delete entry, ignore other masks */ - CFS_HS_LOOKUP_MASK_DEL = BIT(3), -}; - -enum cfs_hash_lookup_intent { - /** return item w/o refcount */ - CFS_HS_LOOKUP_IT_PEEK = CFS_HS_LOOKUP_MASK_FIND, - /** return item with refcount */ - CFS_HS_LOOKUP_IT_FIND = (CFS_HS_LOOKUP_MASK_FIND | - CFS_HS_LOOKUP_MASK_REF), - /** return item w/o refcount if existed, otherwise add */ - CFS_HS_LOOKUP_IT_ADD = (CFS_HS_LOOKUP_MASK_FIND | - CFS_HS_LOOKUP_MASK_ADD), - /** return item with refcount if existed, otherwise add */ - CFS_HS_LOOKUP_IT_FINDADD = (CFS_HS_LOOKUP_IT_FIND | - CFS_HS_LOOKUP_MASK_ADD), - /** delete if existed */ - CFS_HS_LOOKUP_IT_FINDDEL = (CFS_HS_LOOKUP_MASK_FIND | - CFS_HS_LOOKUP_MASK_DEL) -}; - -static struct hlist_node * -cfs_hash_bd_lookup_intent(struct cfs_hash *hs, struct cfs_hash_bd *bd, - const void *key, struct hlist_node *hnode, - enum cfs_hash_lookup_intent intent) - -{ - struct hlist_head *hhead = cfs_hash_bd_hhead(hs, bd); - struct hlist_node *ehnode; - struct hlist_node *match; - int intent_add = intent & CFS_HS_LOOKUP_MASK_ADD; - - /* with this function, we can avoid a lot of useless refcount ops, - * which are expensive atomic operations most time. - */ - match = intent_add ? NULL : hnode; - hlist_for_each(ehnode, hhead) { - if (!cfs_hash_keycmp(hs, key, ehnode)) - continue; - - if (match && match != ehnode) /* can't match */ - continue; - - /* match and ... */ - if (intent & CFS_HS_LOOKUP_MASK_DEL) { - cfs_hash_bd_del_locked(hs, bd, ehnode); - return ehnode; - } - - /* caller wants refcount? */ - if (intent & CFS_HS_LOOKUP_MASK_REF) - cfs_hash_get(hs, ehnode); - return ehnode; - } - /* no match item */ - if (!intent_add) - return NULL; - - LASSERT(hnode); - cfs_hash_bd_add_locked(hs, bd, hnode); - return hnode; -} - -struct hlist_node * -cfs_hash_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - const void *key) -{ - return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, - CFS_HS_LOOKUP_IT_FIND); -} -EXPORT_SYMBOL(cfs_hash_bd_lookup_locked); - -struct hlist_node * -cfs_hash_bd_peek_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd, - const void *key) -{ - return cfs_hash_bd_lookup_intent(hs, bd, key, NULL, - CFS_HS_LOOKUP_IT_PEEK); -} -EXPORT_SYMBOL(cfs_hash_bd_peek_locked); - -static void -cfs_hash_multi_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, - unsigned int n, int excl) -{ - struct cfs_hash_bucket *prev = NULL; - int i; - - /** - * bds must be ascendantly ordered by bd->bd_bucket->hsb_index. - * NB: it's possible that several bds point to the same bucket but - * have different bd::bd_offset, so need take care of deadlock. 
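The ordering rule in the comment above is the classic defence against ABBA deadlock. A minimal standalone sketch of the same discipline, using plain pthreads and an invented 'bucket' type (not Lustre code):

#include <pthread.h>

struct bucket {
	unsigned int index;
	pthread_mutex_t lock;
};

/* 'bds' must already be sorted by ->index, mirroring the precondition of
 * cfs_hash_multi_bd_lock(); adjacent duplicates are skipped just like the
 * 'prev' check there, so every lock is taken once and always in ascending
 * order -- two threads locking overlapping bucket sets cannot deadlock. */
static void buckets_lock_ordered(struct bucket **bds, unsigned int n)
{
	struct bucket *prev = NULL;
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (bds[i] == prev)	/* same bucket listed twice */
			continue;
		pthread_mutex_lock(&bds[i]->lock);
		prev = bds[i];
	}
}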
- */ - cfs_hash_for_each_bd(bds, n, i) { - if (prev == bds[i].bd_bucket) - continue; - - LASSERT(!prev || prev->hsb_index < bds[i].bd_bucket->hsb_index); - cfs_hash_bd_lock(hs, &bds[i], excl); - prev = bds[i].bd_bucket; - } -} - -static void -cfs_hash_multi_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, - unsigned int n, int excl) -{ - struct cfs_hash_bucket *prev = NULL; - int i; - - cfs_hash_for_each_bd(bds, n, i) { - if (prev != bds[i].bd_bucket) { - cfs_hash_bd_unlock(hs, &bds[i], excl); - prev = bds[i].bd_bucket; - } - } -} - -static struct hlist_node * -cfs_hash_multi_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - unsigned int n, const void *key) -{ - struct hlist_node *ehnode; - unsigned int i; - - cfs_hash_for_each_bd(bds, n, i) { - ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, NULL, - CFS_HS_LOOKUP_IT_FIND); - if (ehnode) - return ehnode; - } - return NULL; -} - -static struct hlist_node * -cfs_hash_multi_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - unsigned int n, const void *key, - struct hlist_node *hnode, int noref) -{ - struct hlist_node *ehnode; - int intent; - unsigned int i; - - LASSERT(hnode); - intent = (!noref * CFS_HS_LOOKUP_MASK_REF) | CFS_HS_LOOKUP_IT_PEEK; - - cfs_hash_for_each_bd(bds, n, i) { - ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, - NULL, intent); - if (ehnode) - return ehnode; - } - - if (i == 1) { /* only one bucket */ - cfs_hash_bd_add_locked(hs, &bds[0], hnode); - } else { - struct cfs_hash_bd mybd; - - cfs_hash_bd_get(hs, key, &mybd); - cfs_hash_bd_add_locked(hs, &mybd, hnode); - } - - return hnode; -} - -static struct hlist_node * -cfs_hash_multi_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - unsigned int n, const void *key, - struct hlist_node *hnode) -{ - struct hlist_node *ehnode; - unsigned int i; - - cfs_hash_for_each_bd(bds, n, i) { - ehnode = cfs_hash_bd_lookup_intent(hs, &bds[i], key, hnode, - CFS_HS_LOOKUP_IT_FINDDEL); - if (ehnode) - return ehnode; - } - return NULL; -} - -static void -cfs_hash_bd_order(struct cfs_hash_bd *bd1, struct cfs_hash_bd *bd2) -{ - int rc; - - if (!bd2->bd_bucket) - return; - - if (!bd1->bd_bucket) { - *bd1 = *bd2; - bd2->bd_bucket = NULL; - return; - } - - rc = cfs_hash_bd_compare(bd1, bd2); - if (!rc) - bd2->bd_bucket = NULL; - else if (rc > 0) - swap(*bd1, *bd2); /* swap bd1 and bd2 */ -} - -void -cfs_hash_dual_bd_get(struct cfs_hash *hs, const void *key, - struct cfs_hash_bd *bds) -{ - /* NB: caller should hold hs_lock.rw if REHASH is set */ - cfs_hash_bd_from_key(hs, hs->hs_buckets, - hs->hs_cur_bits, key, &bds[0]); - if (likely(!hs->hs_rehash_buckets)) { - /* no rehash or not rehashing */ - bds[1].bd_bucket = NULL; - return; - } - - LASSERT(hs->hs_rehash_bits); - cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, - hs->hs_rehash_bits, key, &bds[1]); - - cfs_hash_bd_order(&bds[0], &bds[1]); -} - -void -cfs_hash_dual_bd_lock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) -{ - cfs_hash_multi_bd_lock(hs, bds, 2, excl); -} - -void -cfs_hash_dual_bd_unlock(struct cfs_hash *hs, struct cfs_hash_bd *bds, int excl) -{ - cfs_hash_multi_bd_unlock(hs, bds, 2, excl); -} - -struct hlist_node * -cfs_hash_dual_bd_lookup_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - const void *key) -{ - return cfs_hash_multi_bd_lookup_locked(hs, bds, 2, key); -} - -struct hlist_node * -cfs_hash_dual_bd_findadd_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds, - const void *key, struct hlist_node *hnode, - int noref) -{ - return 
cfs_hash_multi_bd_findadd_locked(hs, bds, 2, key,
-						hnode, noref);
-}
-
-struct hlist_node *
-cfs_hash_dual_bd_finddel_locked(struct cfs_hash *hs, struct cfs_hash_bd *bds,
-				const void *key, struct hlist_node *hnode)
-{
-	return cfs_hash_multi_bd_finddel_locked(hs, bds, 2, key, hnode);
-}
-
-static void
-cfs_hash_buckets_free(struct cfs_hash_bucket **buckets,
-		      int bkt_size, int prev_size, int size)
-{
-	int i;
-
-	for (i = prev_size; i < size; i++)
-		kfree(buckets[i]);
-
-	kvfree(buckets);
-}
-
-/*
- * Create or grow bucket memory. Return old_buckets if no allocation was
- * needed, the newly allocated buckets if allocation was needed and
- * successful, and NULL on error.
- */
-static struct cfs_hash_bucket **
-cfs_hash_buckets_realloc(struct cfs_hash *hs, struct cfs_hash_bucket **old_bkts,
-			 unsigned int old_size, unsigned int new_size)
-{
-	struct cfs_hash_bucket **new_bkts;
-	int i;
-
-	LASSERT(!old_size || old_bkts);
-
-	if (old_bkts && old_size == new_size)
-		return old_bkts;
-
-	new_bkts = kvmalloc_array(new_size, sizeof(new_bkts[0]), GFP_KERNEL);
-	if (!new_bkts)
-		return NULL;
-
-	if (old_bkts) {
-		memcpy(new_bkts, old_bkts,
-		       min(old_size, new_size) * sizeof(*old_bkts));
-	}
-
-	for (i = old_size; i < new_size; i++) {
-		struct hlist_head *hhead;
-		struct cfs_hash_bd bd;
-
-		new_bkts[i] = kzalloc(cfs_hash_bkt_size(hs), GFP_KERNEL);
-		if (!new_bkts[i]) {
-			cfs_hash_buckets_free(new_bkts, cfs_hash_bkt_size(hs),
-					      old_size, new_size);
-			return NULL;
-		}
-
-		new_bkts[i]->hsb_index = i;
-		new_bkts[i]->hsb_version = 1;	/* shouldn't be zero */
-		new_bkts[i]->hsb_depmax = -1;	/* unknown */
-		bd.bd_bucket = new_bkts[i];
-		cfs_hash_bd_for_each_hlist(hs, &bd, hhead)
-			INIT_HLIST_HEAD(hhead);
-
-		if (cfs_hash_with_no_lock(hs) ||
-		    cfs_hash_with_no_bktlock(hs))
-			continue;
-
-		if (cfs_hash_with_rw_bktlock(hs))
-			rwlock_init(&new_bkts[i]->hsb_lock.rw);
-		else if (cfs_hash_with_spin_bktlock(hs))
-			spin_lock_init(&new_bkts[i]->hsb_lock.spin);
-		else
-			LBUG();	/* invalid use-case */
-	}
-	return new_bkts;
-}
-
-/**
- * Initialize new libcfs hash, where:
- * @name     - Descriptive hash name
- * @cur_bits - Initial hash table size, in bits
- * @max_bits - Maximum allowed hash table resize, in bits
- * @ops      - Registered hash table operations
- * @flags    - CFS_HASH_REHASH enable dynamic hash resizing
- *           - CFS_HASH_SORT enable chained hash sort
- */
-static void cfs_hash_rehash_worker(struct work_struct *work);
-
-#if CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1
-static void cfs_hash_dep_print(struct work_struct *work)
-{
-	struct cfs_hash *hs = container_of(work, struct cfs_hash, hs_dep_work);
-	int dep;
-	int bkt;
-	int off;
-	int bits;
-
-	spin_lock(&hs->hs_dep_lock);
-	dep = hs->hs_dep_max;
-	bkt = hs->hs_dep_bkt;
-	off = hs->hs_dep_off;
-	bits = hs->hs_dep_bits;
-	spin_unlock(&hs->hs_dep_lock);
-
-	LCONSOLE_WARN("#### HASH %s (bits: %d): max depth %d at bucket %d/%d\n",
-		      hs->hs_name, bits, dep, bkt, off);
-	spin_lock(&hs->hs_dep_lock);
-	hs->hs_dep_bits = 0; /* mark as workitem done */
-	spin_unlock(&hs->hs_dep_lock);
-}
-
-static void cfs_hash_depth_wi_init(struct cfs_hash *hs)
-{
-	spin_lock_init(&hs->hs_dep_lock);
-	INIT_WORK(&hs->hs_dep_work, cfs_hash_dep_print);
-}
-
-static void cfs_hash_depth_wi_cancel(struct cfs_hash *hs)
-{
-	cancel_work_sync(&hs->hs_dep_work);
-}
-
-#else /* CFS_HASH_DEBUG_LEVEL < CFS_HASH_DEBUG_1 */
-
-static inline void cfs_hash_depth_wi_init(struct cfs_hash *hs) {}
-static inline void cfs_hash_depth_wi_cancel(struct cfs_hash *hs) {}
-
-#endif /* 
CFS_HASH_DEBUG_LEVEL >= CFS_HASH_DEBUG_1 */ - -struct cfs_hash * -cfs_hash_create(char *name, unsigned int cur_bits, unsigned int max_bits, - unsigned int bkt_bits, unsigned int extra_bytes, - unsigned int min_theta, unsigned int max_theta, - struct cfs_hash_ops *ops, unsigned int flags) -{ - struct cfs_hash *hs; - int len; - - BUILD_BUG_ON(CFS_HASH_THETA_BITS >= 15); - - LASSERT(name); - LASSERT(ops->hs_key); - LASSERT(ops->hs_hash); - LASSERT(ops->hs_object); - LASSERT(ops->hs_keycmp); - LASSERT(ops->hs_get); - LASSERT(ops->hs_put || ops->hs_put_locked); - - if (flags & CFS_HASH_REHASH) - flags |= CFS_HASH_COUNTER; /* must have counter */ - - LASSERT(cur_bits > 0); - LASSERT(cur_bits >= bkt_bits); - LASSERT(max_bits >= cur_bits && max_bits < 31); - LASSERT(ergo(!(flags & CFS_HASH_REHASH), cur_bits == max_bits)); - LASSERT(ergo(flags & CFS_HASH_REHASH, !(flags & CFS_HASH_NO_LOCK))); - LASSERT(ergo(flags & CFS_HASH_REHASH_KEY, ops->hs_keycpy)); - - len = !(flags & CFS_HASH_BIGNAME) ? - CFS_HASH_NAME_LEN : CFS_HASH_BIGNAME_LEN; - hs = kzalloc(offsetof(struct cfs_hash, hs_name[len]), GFP_KERNEL); - if (!hs) - return NULL; - - strlcpy(hs->hs_name, name, len); - hs->hs_flags = flags; - - atomic_set(&hs->hs_refcount, 1); - atomic_set(&hs->hs_count, 0); - - cfs_hash_lock_setup(hs); - cfs_hash_hlist_setup(hs); - - hs->hs_cur_bits = (u8)cur_bits; - hs->hs_min_bits = (u8)cur_bits; - hs->hs_max_bits = (u8)max_bits; - hs->hs_bkt_bits = (u8)bkt_bits; - - hs->hs_ops = ops; - hs->hs_extra_bytes = extra_bytes; - hs->hs_rehash_bits = 0; - INIT_WORK(&hs->hs_rehash_work, cfs_hash_rehash_worker); - cfs_hash_depth_wi_init(hs); - - if (cfs_hash_with_rehash(hs)) - __cfs_hash_set_theta(hs, min_theta, max_theta); - - hs->hs_buckets = cfs_hash_buckets_realloc(hs, NULL, 0, - CFS_HASH_NBKT(hs)); - if (hs->hs_buckets) - return hs; - - kfree(hs); - return NULL; -} -EXPORT_SYMBOL(cfs_hash_create); - -/** - * Cleanup libcfs hash @hs. - */ -static void -cfs_hash_destroy(struct cfs_hash *hs) -{ - struct hlist_node *hnode; - struct hlist_node *pos; - struct cfs_hash_bd bd; - int i; - - LASSERT(hs); - LASSERT(!cfs_hash_is_exiting(hs) && - !cfs_hash_is_iterating(hs)); - - /** - * prohibit further rehashes, don't need any lock because - * I'm the only (last) one can change it. - */ - hs->hs_exiting = 1; - if (cfs_hash_with_rehash(hs)) - cfs_hash_rehash_cancel(hs); - - cfs_hash_depth_wi_cancel(hs); - /* rehash should be done/canceled */ - LASSERT(hs->hs_buckets && !hs->hs_rehash_buckets); - - cfs_hash_for_each_bucket(hs, &bd, i) { - struct hlist_head *hhead; - - LASSERT(bd.bd_bucket); - /* no need to take this lock, just for consistent code */ - cfs_hash_bd_lock(hs, &bd, 1); - - cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { - hlist_for_each_safe(hnode, pos, hhead) { - LASSERTF(!cfs_hash_with_assert_empty(hs), - "hash %s bucket %u(%u) is not empty: %u items left\n", - hs->hs_name, bd.bd_bucket->hsb_index, - bd.bd_offset, bd.bd_bucket->hsb_count); - /* can't assert key valicate, because we - * can interrupt rehash - */ - cfs_hash_bd_del_locked(hs, &bd, hnode); - cfs_hash_exit(hs, hnode); - } - } - LASSERT(!bd.bd_bucket->hsb_count); - cfs_hash_bd_unlock(hs, &bd, 1); - cond_resched(); - } - - LASSERT(!atomic_read(&hs->hs_count)); - - cfs_hash_buckets_free(hs->hs_buckets, cfs_hash_bkt_size(hs), - 0, CFS_HASH_NBKT(hs)); - i = cfs_hash_with_bigname(hs) ? 
- CFS_HASH_BIGNAME_LEN : CFS_HASH_NAME_LEN; - kfree(hs); -} - -struct cfs_hash *cfs_hash_getref(struct cfs_hash *hs) -{ - if (atomic_inc_not_zero(&hs->hs_refcount)) - return hs; - return NULL; -} -EXPORT_SYMBOL(cfs_hash_getref); - -void cfs_hash_putref(struct cfs_hash *hs) -{ - if (atomic_dec_and_test(&hs->hs_refcount)) - cfs_hash_destroy(hs); -} -EXPORT_SYMBOL(cfs_hash_putref); - -static inline int -cfs_hash_rehash_bits(struct cfs_hash *hs) -{ - if (cfs_hash_with_no_lock(hs) || - !cfs_hash_with_rehash(hs)) - return -EOPNOTSUPP; - - if (unlikely(cfs_hash_is_exiting(hs))) - return -ESRCH; - - if (unlikely(cfs_hash_is_rehashing(hs))) - return -EALREADY; - - if (unlikely(cfs_hash_is_iterating(hs))) - return -EAGAIN; - - /* XXX: need to handle case with max_theta != 2.0 - * and the case with min_theta != 0.5 - */ - if ((hs->hs_cur_bits < hs->hs_max_bits) && - (__cfs_hash_theta(hs) > hs->hs_max_theta)) - return hs->hs_cur_bits + 1; - - if (!cfs_hash_with_shrink(hs)) - return 0; - - if ((hs->hs_cur_bits > hs->hs_min_bits) && - (__cfs_hash_theta(hs) < hs->hs_min_theta)) - return hs->hs_cur_bits - 1; - - return 0; -} - -/** - * don't allow inline rehash if: - * - user wants non-blocking change (add/del) on hash table - * - too many elements - */ -static inline int -cfs_hash_rehash_inline(struct cfs_hash *hs) -{ - return !cfs_hash_with_nblk_change(hs) && - atomic_read(&hs->hs_count) < CFS_HASH_LOOP_HOG; -} - -/** - * Add item @hnode to libcfs hash @hs using @key. The registered - * ops->hs_get function will be called when the item is added. - */ -void -cfs_hash_add(struct cfs_hash *hs, const void *key, struct hlist_node *hnode) -{ - struct cfs_hash_bd bd; - int bits; - - LASSERT(hlist_unhashed(hnode)); - - cfs_hash_lock(hs, 0); - cfs_hash_bd_get_and_lock(hs, key, &bd, 1); - - cfs_hash_key_validate(hs, key, hnode); - cfs_hash_bd_add_locked(hs, &bd, hnode); - - cfs_hash_bd_unlock(hs, &bd, 1); - - bits = cfs_hash_rehash_bits(hs); - cfs_hash_unlock(hs, 0); - if (bits > 0) - cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); -} -EXPORT_SYMBOL(cfs_hash_add); - -static struct hlist_node * -cfs_hash_find_or_add(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode, int noref) -{ - struct hlist_node *ehnode; - struct cfs_hash_bd bds[2]; - int bits = 0; - - LASSERTF(hlist_unhashed(hnode), "hnode = %p\n", hnode); - - cfs_hash_lock(hs, 0); - cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1); - - cfs_hash_key_validate(hs, key, hnode); - ehnode = cfs_hash_dual_bd_findadd_locked(hs, bds, key, - hnode, noref); - cfs_hash_dual_bd_unlock(hs, bds, 1); - - if (ehnode == hnode) /* new item added */ - bits = cfs_hash_rehash_bits(hs); - cfs_hash_unlock(hs, 0); - if (bits > 0) - cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs)); - - return ehnode; -} - -/** - * Add item @hnode to libcfs hash @hs using @key. The registered - * ops->hs_get function will be called if the item was added. - * Returns 0 on success or -EALREADY on key collisions. - */ -int -cfs_hash_add_unique(struct cfs_hash *hs, const void *key, - struct hlist_node *hnode) -{ - return cfs_hash_find_or_add(hs, key, hnode, 1) != hnode ? - -EALREADY : 0; -} -EXPORT_SYMBOL(cfs_hash_add_unique); - -/** - * Add item @hnode to libcfs hash @hs using @key. If this @key - * already exists in the hash then ops->hs_get will be called on the - * conflicting entry and that entry will be returned to the caller. - * Otherwise ops->hs_get is called on the item which was added. 
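As a usage illustration of the add-unique semantics described above, a hedged caller-side sketch; 'my_obj' and its fields are invented, and only the cfs_hash_add_unique() call and its -EALREADY convention come from this file:

struct my_obj {
	u64			mo_key;
	struct hlist_node	mo_hnode;
};

static int my_obj_insert(struct cfs_hash *hs, struct my_obj *obj)
{
	int rc;

	rc = cfs_hash_add_unique(hs, &obj->mo_key, &obj->mo_hnode);
	if (rc == -EALREADY)	/* key already present; nothing was added */
		return rc;

	/* ops->hs_get() has been called on the new item by the hash */
	return 0;
}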
- */
-void *
-cfs_hash_findadd_unique(struct cfs_hash *hs, const void *key,
-			struct hlist_node *hnode)
-{
-	hnode = cfs_hash_find_or_add(hs, key, hnode, 0);
-
-	return cfs_hash_object(hs, hnode);
-}
-EXPORT_SYMBOL(cfs_hash_findadd_unique);
-
-/**
- * Delete item @hnode from the libcfs hash @hs using @key. The @key
- * is required to ensure the correct hash bucket is locked since there
- * is no direct linkage from the item to the bucket. The object
- * removed from the hash will be returned and ops->hs_put is called
- * on the removed object.
- */
-void *
-cfs_hash_del(struct cfs_hash *hs, const void *key, struct hlist_node *hnode)
-{
-	void *obj = NULL;
-	int bits = 0;
-	struct cfs_hash_bd bds[2];
-
-	cfs_hash_lock(hs, 0);
-	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 1);
-
-	/* NB: do nothing if @hnode is not in hash table */
-	if (!hnode || !hlist_unhashed(hnode)) {
-		if (!bds[1].bd_bucket && hnode) {
-			cfs_hash_bd_del_locked(hs, &bds[0], hnode);
-		} else {
-			hnode = cfs_hash_dual_bd_finddel_locked(hs, bds,
-								key, hnode);
-		}
-	}
-
-	if (hnode) {
-		obj = cfs_hash_object(hs, hnode);
-		bits = cfs_hash_rehash_bits(hs);
-	}
-
-	cfs_hash_dual_bd_unlock(hs, bds, 1);
-	cfs_hash_unlock(hs, 0);
-	if (bits > 0)
-		cfs_hash_rehash(hs, cfs_hash_rehash_inline(hs));
-
-	return obj;
-}
-EXPORT_SYMBOL(cfs_hash_del);
-
-/**
- * Delete item given @key in libcfs hash @hs. The first @key found in
- * the hash will be removed; if the key exists multiple times in the hash
- * @hs this function must be called once per key. The removed object
- * will be returned and ops->hs_put is called on the removed object.
- */
-void *
-cfs_hash_del_key(struct cfs_hash *hs, const void *key)
-{
-	return cfs_hash_del(hs, key, NULL);
-}
-EXPORT_SYMBOL(cfs_hash_del_key);
-
-/**
- * Lookup an item using @key in the libcfs hash @hs and return it.
- * If the @key is found in the hash hs->hs_get() is called and the
- * matching object is returned. It is the caller's responsibility
- * to call the counterpart ops->hs_put using the cfs_hash_put() macro
- * when finished with the object. If the @key was not found
- * in the hash @hs NULL is returned.
- */
-void *
-cfs_hash_lookup(struct cfs_hash *hs, const void *key)
-{
-	void *obj = NULL;
-	struct hlist_node *hnode;
-	struct cfs_hash_bd bds[2];
-
-	cfs_hash_lock(hs, 0);
-	cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0);
-
-	hnode = cfs_hash_dual_bd_lookup_locked(hs, bds, key);
-	if (hnode)
-		obj = cfs_hash_object(hs, hnode);
-
-	cfs_hash_dual_bd_unlock(hs, bds, 0);
-	cfs_hash_unlock(hs, 0);
-
-	return obj;
-}
-EXPORT_SYMBOL(cfs_hash_lookup);
-
-static void
-cfs_hash_for_each_enter(struct cfs_hash *hs)
-{
-	LASSERT(!cfs_hash_is_exiting(hs));
-
-	if (!cfs_hash_with_rehash(hs))
-		return;
-	/*
-	 * NB: there is a race on cfs_hash::hs_iterating, but it doesn't
-	 * matter because it's just an unreliable signal to the rehash
-	 * thread, which will try to finish the rehash ASAP when seeing it.
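The lookup/put discipline spelled out in the cfs_hash_lookup() comment above, as a short hedged sketch; 'my_obj' is the invented type from the earlier example, and cfs_hash_put() is the macro the comment refers to:

static void my_obj_use(struct cfs_hash *hs, u64 key)
{
	struct my_obj *obj;

	obj = cfs_hash_lookup(hs, &key);
	if (!obj)
		return;			/* key not in the hash */

	/* ... safe to use obj here: the lookup took a reference ... */

	cfs_hash_put(hs, &obj->mo_hnode);	/* counterpart of hs_get */
}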
- */
-	hs->hs_iterating = 1;
-
-	cfs_hash_lock(hs, 1);
-	hs->hs_iterators++;
-	cfs_hash_unlock(hs, 1);
-
-	/* NB: iteration is mostly called by service thread,
-	 * we tend to cancel pending rehash-request, instead of
-	 * blocking service thread, we will relaunch rehash request
-	 * after iteration
-	 */
-	if (cfs_hash_is_rehashing(hs))
-		cfs_hash_rehash_cancel(hs);
-}
-
-static void
-cfs_hash_for_each_exit(struct cfs_hash *hs)
-{
-	int remained;
-	int bits;
-
-	if (!cfs_hash_with_rehash(hs))
-		return;
-	cfs_hash_lock(hs, 1);
-	remained = --hs->hs_iterators;
-	bits = cfs_hash_rehash_bits(hs);
-	cfs_hash_unlock(hs, 1);
-	/* NB: there is a race on cfs_hash::hs_iterating, see above */
-	if (!remained)
-		hs->hs_iterating = 0;
-	if (bits > 0) {
-		cfs_hash_rehash(hs, atomic_read(&hs->hs_count) <
-				    CFS_HASH_LOOP_HOG);
-	}
-}
-
-/**
- * For each item in the libcfs hash @hs call the passed callback @func
- * and pass to it as an argument each hash item and the private @data.
- *
- * a) the function may sleep!
- * b) during the callback:
- *    . the bucket lock is held so the callback must never sleep.
- *    . if @remove_safe is true, the user can remove the current item
- *      via cfs_hash_bd_del_locked
- */
-static u64
-cfs_hash_for_each_tight(struct cfs_hash *hs, cfs_hash_for_each_cb_t func,
-			void *data, int remove_safe)
-{
-	struct hlist_node *hnode;
-	struct hlist_node *pos;
-	struct cfs_hash_bd bd;
-	u64 count = 0;
-	int excl = !!remove_safe;
-	int loop = 0;
-	int i;
-
-	cfs_hash_for_each_enter(hs);
-
-	cfs_hash_lock(hs, 0);
-	LASSERT(!cfs_hash_is_rehashing(hs));
-
-	cfs_hash_for_each_bucket(hs, &bd, i) {
-		struct hlist_head *hhead;
-
-		cfs_hash_bd_lock(hs, &bd, excl);
-		if (!func) {	/* only glimpse size */
-			count += bd.bd_bucket->hsb_count;
-			cfs_hash_bd_unlock(hs, &bd, excl);
-			continue;
-		}
-
-		cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
-			hlist_for_each_safe(hnode, pos, hhead) {
-				cfs_hash_bucket_validate(hs, &bd, hnode);
-				count++;
-				loop++;
-				if (func(hs, &bd, hnode, data)) {
-					cfs_hash_bd_unlock(hs, &bd, excl);
-					goto out;
-				}
-			}
-		}
-		cfs_hash_bd_unlock(hs, &bd, excl);
-		if (loop < CFS_HASH_LOOP_HOG)
-			continue;
-		loop = 0;
-		cfs_hash_unlock(hs, 0);
-		cond_resched();
-		cfs_hash_lock(hs, 0);
-	}
- out:
-	cfs_hash_unlock(hs, 0);
-
-	cfs_hash_for_each_exit(hs);
-	return count;
-}
-
-struct cfs_hash_cond_arg {
-	cfs_hash_cond_opt_cb_t func;
-	void *arg;
-};
-
-static int
-cfs_hash_cond_del_locked(struct cfs_hash *hs, struct cfs_hash_bd *bd,
-			 struct hlist_node *hnode, void *data)
-{
-	struct cfs_hash_cond_arg *cond = data;
-
-	if (cond->func(cfs_hash_object(hs, hnode), cond->arg))
-		cfs_hash_bd_del_locked(hs, bd, hnode);
-	return 0;
-}
-
-/**
- * Delete items from the libcfs hash @hs for which @func returns true.
- * The write lock is held on each bucket during the loop, so no new
- * reference can be taken on an object while it is being removed. 
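A hedged sketch of a cfs_hash_cond_del() predicate: the callback receives the object (not the hlist node) plus the private argument, and a nonzero return unlinks the item under the bucket write lock. The 'mo_expires' field is invented:

static int my_obj_is_stale(void *obj, void *data)
{
	struct my_obj *mo = obj;
	u64 now = *(u64 *)data;

	return mo->mo_expires < now;	/* nonzero => delete this item */
}

static void my_table_expire(struct cfs_hash *hs, u64 now)
{
	cfs_hash_cond_del(hs, my_obj_is_stale, &now);
}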
- */ -void -cfs_hash_cond_del(struct cfs_hash *hs, cfs_hash_cond_opt_cb_t func, void *data) -{ - struct cfs_hash_cond_arg arg = { - .func = func, - .arg = data, - }; - - cfs_hash_for_each_tight(hs, cfs_hash_cond_del_locked, &arg, 1); -} -EXPORT_SYMBOL(cfs_hash_cond_del); - -void -cfs_hash_for_each(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data) -{ - cfs_hash_for_each_tight(hs, func, data, 0); -} -EXPORT_SYMBOL(cfs_hash_for_each); - -void -cfs_hash_for_each_safe(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data) -{ - cfs_hash_for_each_tight(hs, func, data, 1); -} -EXPORT_SYMBOL(cfs_hash_for_each_safe); - -static int -cfs_hash_peek(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *data) -{ - *(int *)data = 0; - return 1; /* return 1 to break the loop */ -} - -int -cfs_hash_is_empty(struct cfs_hash *hs) -{ - int empty = 1; - - cfs_hash_for_each_tight(hs, cfs_hash_peek, &empty, 0); - return empty; -} -EXPORT_SYMBOL(cfs_hash_is_empty); - -u64 -cfs_hash_size_get(struct cfs_hash *hs) -{ - return cfs_hash_with_counter(hs) ? - atomic_read(&hs->hs_count) : - cfs_hash_for_each_tight(hs, NULL, NULL, 0); -} -EXPORT_SYMBOL(cfs_hash_size_get); - -/* - * cfs_hash_for_each_relax: - * Iterate the hash table and call @func on each item without - * any lock. This function can't guarantee to finish iteration - * if these features are enabled: - * - * a. if rehash_key is enabled, an item can be moved from - * one bucket to another bucket - * b. user can remove non-zero-ref item from hash-table, - * so the item can be removed from hash-table, even worse, - * it's possible that user changed key and insert to another - * hash bucket. - * there's no way for us to finish iteration correctly on previous - * two cases, so iteration has to be stopped on change. 
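The "stop on change" rule above boils down to version validation: sample the bucket version under the lock, drop the lock for the callback, and re-check afterwards. A standalone sketch with pthreads and an invented 'vbucket' type, simplified to one item per visit:

#include <pthread.h>

struct vbucket {
	pthread_mutex_t	lock;
	unsigned int	version;	/* bumped on every add/del */
};

/* returns 0 if the callback ran with the bucket unchanged, -1 if a
 * concurrent add/del bumped the version while the lock was dropped
 * (the condition cfs_hash_for_each_relax() below maps to -EINTR) */
static int visit_unlocked(struct vbucket *b, void (*cb)(void))
{
	unsigned int seen;

	pthread_mutex_lock(&b->lock);
	seen = b->version;
	pthread_mutex_unlock(&b->lock);

	cb();				/* may sleep; no lock held */

	pthread_mutex_lock(&b->lock);
	if (b->version != seen) {
		pthread_mutex_unlock(&b->lock);
		return -1;
	}
	pthread_mutex_unlock(&b->lock);
	return 0;
}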
- */ -static int -cfs_hash_for_each_relax(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data, int start) -{ - struct hlist_node *next = NULL; - struct hlist_node *hnode; - struct cfs_hash_bd bd; - u32 version; - int count = 0; - int stop_on_change; - int has_put_locked; - int end = -1; - int rc = 0; - int i; - - stop_on_change = cfs_hash_with_rehash_key(hs) || - !cfs_hash_with_no_itemref(hs); - has_put_locked = hs->hs_ops->hs_put_locked != NULL; - cfs_hash_lock(hs, 0); -again: - LASSERT(!cfs_hash_is_rehashing(hs)); - - cfs_hash_for_each_bucket(hs, &bd, i) { - struct hlist_head *hhead; - - if (i < start) - continue; - else if (end > 0 && i >= end) - break; - - cfs_hash_bd_lock(hs, &bd, 0); - version = cfs_hash_bd_version_get(&bd); - - cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { - hnode = hhead->first; - if (!hnode) - continue; - cfs_hash_get(hs, hnode); - - for (; hnode; hnode = next) { - cfs_hash_bucket_validate(hs, &bd, hnode); - next = hnode->next; - if (next) - cfs_hash_get(hs, next); - cfs_hash_bd_unlock(hs, &bd, 0); - cfs_hash_unlock(hs, 0); - - rc = func(hs, &bd, hnode, data); - if (stop_on_change || !has_put_locked) - cfs_hash_put(hs, hnode); - cond_resched(); - count++; - - cfs_hash_lock(hs, 0); - cfs_hash_bd_lock(hs, &bd, 0); - if (stop_on_change) { - if (version != - cfs_hash_bd_version_get(&bd)) - rc = -EINTR; - } else if (has_put_locked) { - cfs_hash_put_locked(hs, hnode); - } - if (rc) /* callback wants to break iteration */ - break; - } - if (next) { - if (has_put_locked) { - cfs_hash_put_locked(hs, next); - next = NULL; - } - break; - } else if (rc) { - break; - } - } - cfs_hash_bd_unlock(hs, &bd, 0); - if (next && !has_put_locked) { - cfs_hash_put(hs, next); - next = NULL; - } - if (rc) /* callback wants to break iteration */ - break; - } - if (start > 0 && !rc) { - end = start; - start = 0; - goto again; - } - - cfs_hash_unlock(hs, 0); - return count; -} - -int -cfs_hash_for_each_nolock(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data, int start) -{ - if (cfs_hash_with_no_lock(hs) || - cfs_hash_with_rehash_key(hs) || - !cfs_hash_with_no_itemref(hs)) - return -EOPNOTSUPP; - - if (!hs->hs_ops->hs_get || - (!hs->hs_ops->hs_put && !hs->hs_ops->hs_put_locked)) - return -EOPNOTSUPP; - - cfs_hash_for_each_enter(hs); - cfs_hash_for_each_relax(hs, func, data, start); - cfs_hash_for_each_exit(hs); - - return 0; -} -EXPORT_SYMBOL(cfs_hash_for_each_nolock); - -/** - * For each hash bucket in the libcfs hash @hs call the passed callback - * @func until all the hash buckets are empty. The passed callback @func - * or the previously registered callback hs->hs_put must remove the item - * from the hash. You may either use the cfs_hash_del() or hlist_del() - * functions. No rwlocks will be held during the callback @func it is - * safe to sleep if needed. This function will not terminate until the - * hash is empty. Note it is still possible to concurrently add new - * items in to the hash. It is the callers responsibility to ensure - * the required locking is in place to prevent concurrent insertions. 
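A hedged sketch of the drain pattern cfs_hash_for_each_empty() (below) expects: the callback must really remove each item or the loop never terminates. 'my_obj' is the invented type from earlier; removal here goes through cfs_hash_del(), whose ops->hs_put drops the hash's reference:

static int my_obj_drop(struct cfs_hash *hs, struct cfs_hash_bd *bd,
		       struct hlist_node *hnode, void *data)
{
	struct my_obj *obj = cfs_hash_object(hs, hnode);

	cfs_hash_del(hs, &obj->mo_key, hnode);	/* really remove the item */
	return 0;				/* keep iterating */
}

static void my_table_drain(struct cfs_hash *hs)
{
	cfs_hash_for_each_empty(hs, my_obj_drop, NULL);
}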
- */ -int -cfs_hash_for_each_empty(struct cfs_hash *hs, cfs_hash_for_each_cb_t func, - void *data) -{ - unsigned int i = 0; - - if (cfs_hash_with_no_lock(hs)) - return -EOPNOTSUPP; - - if (!hs->hs_ops->hs_get || - (!hs->hs_ops->hs_put && !hs->hs_ops->hs_put_locked)) - return -EOPNOTSUPP; - - cfs_hash_for_each_enter(hs); - while (cfs_hash_for_each_relax(hs, func, data, 0)) { - CDEBUG(D_INFO, "Try to empty hash: %s, loop: %u\n", - hs->hs_name, i++); - } - cfs_hash_for_each_exit(hs); - return 0; -} -EXPORT_SYMBOL(cfs_hash_for_each_empty); - -void -cfs_hash_hlist_for_each(struct cfs_hash *hs, unsigned int hindex, - cfs_hash_for_each_cb_t func, void *data) -{ - struct hlist_head *hhead; - struct hlist_node *hnode; - struct cfs_hash_bd bd; - - cfs_hash_for_each_enter(hs); - cfs_hash_lock(hs, 0); - if (hindex >= CFS_HASH_NHLIST(hs)) - goto out; - - cfs_hash_bd_index_set(hs, hindex, &bd); - - cfs_hash_bd_lock(hs, &bd, 0); - hhead = cfs_hash_bd_hhead(hs, &bd); - hlist_for_each(hnode, hhead) { - if (func(hs, &bd, hnode, data)) - break; - } - cfs_hash_bd_unlock(hs, &bd, 0); -out: - cfs_hash_unlock(hs, 0); - cfs_hash_for_each_exit(hs); -} -EXPORT_SYMBOL(cfs_hash_hlist_for_each); - -/* - * For each item in the libcfs hash @hs which matches the @key call - * the passed callback @func and pass to it as an argument each hash - * item and the private @data. During the callback the bucket lock - * is held so the callback must never sleep. - */ -void -cfs_hash_for_each_key(struct cfs_hash *hs, const void *key, - cfs_hash_for_each_cb_t func, void *data) -{ - struct hlist_node *hnode; - struct cfs_hash_bd bds[2]; - unsigned int i; - - cfs_hash_lock(hs, 0); - - cfs_hash_dual_bd_get_and_lock(hs, key, bds, 0); - - cfs_hash_for_each_bd(bds, 2, i) { - struct hlist_head *hlist = cfs_hash_bd_hhead(hs, &bds[i]); - - hlist_for_each(hnode, hlist) { - cfs_hash_bucket_validate(hs, &bds[i], hnode); - - if (cfs_hash_keycmp(hs, key, hnode)) { - if (func(hs, &bds[i], hnode, data)) - break; - } - } - } - - cfs_hash_dual_bd_unlock(hs, bds, 0); - cfs_hash_unlock(hs, 0); -} -EXPORT_SYMBOL(cfs_hash_for_each_key); - -/** - * Rehash the libcfs hash @hs to the given @bits. This can be used - * to grow the hash size when excessive chaining is detected, or to - * shrink the hash when it is larger than needed. When the CFS_HASH_REHASH - * flag is set in @hs the libcfs hash may be dynamically rehashed - * during addition or removal if the hash's theta value exceeds - * either the hs->hs_min_theta or hs->max_theta values. By default - * these values are tuned to keep the chained hash depth small, and - * this approach assumes a reasonably uniform hashing function. The - * theta thresholds for @hs are tunable via cfs_hash_set_theta(). 
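For reference, the "theta" the comment above keeps referring to is just the fill factor (items per hash slot), held in fixed point with CFS_HASH_THETA_BITS fractional bits. A hedged sketch of the grow test, not the literal __cfs_hash_theta() helper:

static int should_grow(unsigned int count, unsigned int cur_bits,
		       unsigned int max_bits, int max_theta)
{
	/* theta = count / (1 << cur_bits), scaled by 2^CFS_HASH_THETA_BITS */
	int theta = (count << CFS_HASH_THETA_BITS) >> cur_bits;

	return cur_bits < max_bits && theta > max_theta;
}

For example, with CFS_HASH_THETA_BITS = 10, an average chain depth of 2.0 would be stored as 2048.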
- */ -void -cfs_hash_rehash_cancel(struct cfs_hash *hs) -{ - LASSERT(cfs_hash_with_rehash(hs)); - cancel_work_sync(&hs->hs_rehash_work); -} - -void -cfs_hash_rehash(struct cfs_hash *hs, int do_rehash) -{ - int rc; - - LASSERT(cfs_hash_with_rehash(hs) && !cfs_hash_with_no_lock(hs)); - - cfs_hash_lock(hs, 1); - - rc = cfs_hash_rehash_bits(hs); - if (rc <= 0) { - cfs_hash_unlock(hs, 1); - return; - } - - hs->hs_rehash_bits = rc; - if (!do_rehash) { - /* launch and return */ - queue_work(cfs_rehash_wq, &hs->hs_rehash_work); - cfs_hash_unlock(hs, 1); - return; - } - - /* rehash right now */ - cfs_hash_unlock(hs, 1); - - cfs_hash_rehash_worker(&hs->hs_rehash_work); -} - -static int -cfs_hash_rehash_bd(struct cfs_hash *hs, struct cfs_hash_bd *old) -{ - struct cfs_hash_bd new; - struct hlist_head *hhead; - struct hlist_node *hnode; - struct hlist_node *pos; - void *key; - int c = 0; - - /* hold cfs_hash_lock(hs, 1), so don't need any bucket lock */ - cfs_hash_bd_for_each_hlist(hs, old, hhead) { - hlist_for_each_safe(hnode, pos, hhead) { - key = cfs_hash_key(hs, hnode); - LASSERT(key); - /* Validate hnode is in the correct bucket. */ - cfs_hash_bucket_validate(hs, old, hnode); - /* - * Delete from old hash bucket; move to new bucket. - * ops->hs_key must be defined. - */ - cfs_hash_bd_from_key(hs, hs->hs_rehash_buckets, - hs->hs_rehash_bits, key, &new); - cfs_hash_bd_move_locked(hs, old, &new, hnode); - c++; - } - } - - return c; -} - -static void -cfs_hash_rehash_worker(struct work_struct *work) -{ - struct cfs_hash *hs = container_of(work, struct cfs_hash, hs_rehash_work); - struct cfs_hash_bucket **bkts; - struct cfs_hash_bd bd; - unsigned int old_size; - unsigned int new_size; - int bsize; - int count = 0; - int rc = 0; - int i; - - LASSERT(hs && cfs_hash_with_rehash(hs)); - - cfs_hash_lock(hs, 0); - LASSERT(cfs_hash_is_rehashing(hs)); - - old_size = CFS_HASH_NBKT(hs); - new_size = CFS_HASH_RH_NBKT(hs); - - cfs_hash_unlock(hs, 0); - - /* - * don't need hs::hs_rwlock for hs::hs_buckets, - * because nobody can change bkt-table except me. 
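The rehash worker continues below; note that it paces itself the same way cfs_hash_for_each_tight() does, moving at most CFS_HASH_LOOP_HOG items per lock hold before yielding. A standalone sketch of that cadence with an invented 'table' type and a hypothetical move_bucket() helper:

#include <linux/sched.h>
#include <linux/spinlock.h>

#define LOOP_HOG	1024		/* stands in for CFS_HASH_LOOP_HOG */

struct table {
	spinlock_t	lock;
	int		nr_buckets;
};

static int move_bucket(struct table *t, int i);	/* hypothetical helper */

static void move_all(struct table *t)
{
	int moved = 0;
	int i;

	spin_lock(&t->lock);
	for (i = 0; i < t->nr_buckets; i++) {
		moved += move_bucket(t, i);
		if (moved < LOOP_HOG)
			continue;
		moved = 0;
		spin_unlock(&t->lock);	/* let waiters and the scheduler in */
		cond_resched();
		spin_lock(&t->lock);
	}
	spin_unlock(&t->lock);
}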
- */ - bkts = cfs_hash_buckets_realloc(hs, hs->hs_buckets, - old_size, new_size); - cfs_hash_lock(hs, 1); - if (!bkts) { - rc = -ENOMEM; - goto out; - } - - if (bkts == hs->hs_buckets) { - bkts = NULL; /* do nothing */ - goto out; - } - - rc = __cfs_hash_theta(hs); - if ((rc >= hs->hs_min_theta) && (rc <= hs->hs_max_theta)) { - /* free the new allocated bkt-table */ - old_size = new_size; - new_size = CFS_HASH_NBKT(hs); - rc = -EALREADY; - goto out; - } - - LASSERT(!hs->hs_rehash_buckets); - hs->hs_rehash_buckets = bkts; - - rc = 0; - cfs_hash_for_each_bucket(hs, &bd, i) { - if (cfs_hash_is_exiting(hs)) { - rc = -ESRCH; - /* someone wants to destroy the hash, abort now */ - if (old_size < new_size) /* OK to free old bkt-table */ - break; - /* it's shrinking, need free new bkt-table */ - hs->hs_rehash_buckets = NULL; - old_size = new_size; - new_size = CFS_HASH_NBKT(hs); - goto out; - } - - count += cfs_hash_rehash_bd(hs, &bd); - if (count < CFS_HASH_LOOP_HOG || - cfs_hash_is_iterating(hs)) { /* need to finish ASAP */ - continue; - } - - count = 0; - cfs_hash_unlock(hs, 1); - cond_resched(); - cfs_hash_lock(hs, 1); - } - - hs->hs_rehash_count++; - - bkts = hs->hs_buckets; - hs->hs_buckets = hs->hs_rehash_buckets; - hs->hs_rehash_buckets = NULL; - - hs->hs_cur_bits = hs->hs_rehash_bits; -out: - hs->hs_rehash_bits = 0; - bsize = cfs_hash_bkt_size(hs); - cfs_hash_unlock(hs, 1); - /* can't refer to @hs anymore because it could be destroyed */ - if (bkts) - cfs_hash_buckets_free(bkts, bsize, new_size, old_size); - if (rc) - CDEBUG(D_INFO, "early quit of rehashing: %d\n", rc); -} - -/** - * Rehash the object referenced by @hnode in the libcfs hash @hs. The - * @old_key must be provided to locate the objects previous location - * in the hash, and the @new_key will be used to reinsert the object. - * Use this function instead of a cfs_hash_add() + cfs_hash_del() - * combo when it is critical that there is no window in time where the - * object is missing from the hash. When an object is being rehashed - * the registered cfs_hash_get() and cfs_hash_put() functions will - * not be called. 
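A caller-side sketch of cfs_hash_rehash_key() (documented above, body below), assuming the hash was created with CFS_HASH_REHASH_KEY so that ops->hs_keycpy is registered, and reusing the invented 'my_obj' type:

static void my_obj_change_key(struct cfs_hash *hs, struct my_obj *obj,
			      u64 new_key)
{
	u64 old_key = obj->mo_key;

	/* the object is never absent from the hash during the move;
	 * hs_keycpy() rewrites obj->mo_key while the bucket locks are held */
	cfs_hash_rehash_key(hs, &old_key, &new_key, &obj->mo_hnode);
}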
- */
-void cfs_hash_rehash_key(struct cfs_hash *hs, const void *old_key,
-			 void *new_key, struct hlist_node *hnode)
-{
-	struct cfs_hash_bd bds[3];
-	struct cfs_hash_bd old_bds[2];
-	struct cfs_hash_bd new_bd;
-
-	LASSERT(!hlist_unhashed(hnode));
-
-	cfs_hash_lock(hs, 0);
-
-	cfs_hash_dual_bd_get(hs, old_key, old_bds);
-	cfs_hash_bd_get(hs, new_key, &new_bd);
-
-	bds[0] = old_bds[0];
-	bds[1] = old_bds[1];
-	bds[2] = new_bd;
-
-	/* NB: bds[0] and bds[1] are ordered already */
-	cfs_hash_bd_order(&bds[1], &bds[2]);
-	cfs_hash_bd_order(&bds[0], &bds[1]);
-
-	cfs_hash_multi_bd_lock(hs, bds, 3, 1);
-	if (likely(!old_bds[1].bd_bucket)) {
-		cfs_hash_bd_move_locked(hs, &old_bds[0], &new_bd, hnode);
-	} else {
-		cfs_hash_dual_bd_finddel_locked(hs, old_bds, old_key, hnode);
-		cfs_hash_bd_add_locked(hs, &new_bd, hnode);
-	}
-	/* overwrite key inside locks, otherwise it may screw up
-	 * other operations, e.g. rehash
-	 */
-	cfs_hash_keycpy(hs, hnode, new_key);
-
-	cfs_hash_multi_bd_unlock(hs, bds, 3, 1);
-	cfs_hash_unlock(hs, 0);
-}
-EXPORT_SYMBOL(cfs_hash_rehash_key);
-
-void cfs_hash_debug_header(struct seq_file *m)
-{
-	seq_printf(m, "%-*s cur min max theta t-min t-max flags rehash count maxdep maxdepb distribution\n",
-		   CFS_HASH_BIGNAME_LEN, "name");
-}
-EXPORT_SYMBOL(cfs_hash_debug_header);
-
-static struct cfs_hash_bucket **
-cfs_hash_full_bkts(struct cfs_hash *hs)
-{
-	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
-	if (!hs->hs_rehash_buckets)
-		return hs->hs_buckets;
-
-	LASSERT(hs->hs_rehash_bits);
-	return hs->hs_rehash_bits > hs->hs_cur_bits ?
-	       hs->hs_rehash_buckets : hs->hs_buckets;
-}
-
-static unsigned int
-cfs_hash_full_nbkt(struct cfs_hash *hs)
-{
-	/* NB: caller should hold hs->hs_rwlock if REHASH is set */
-	if (!hs->hs_rehash_buckets)
-		return CFS_HASH_NBKT(hs);
-
-	LASSERT(hs->hs_rehash_bits);
-	return hs->hs_rehash_bits > hs->hs_cur_bits ?
-	       CFS_HASH_RH_NBKT(hs) : CFS_HASH_NBKT(hs);
-}
-
-void cfs_hash_debug_str(struct cfs_hash *hs, struct seq_file *m)
-{
-	int dist[8] = { 0, };
-	int maxdep = -1;
-	int maxdepb = -1;
-	int total = 0;
-	int theta;
-	int i;
-
-	cfs_hash_lock(hs, 0);
-	theta = __cfs_hash_theta(hs);
-
-	seq_printf(m, "%-*s %5d %5d %5d %d.%03d %d.%03d %d.%03d 0x%02x %6d ",
-		   CFS_HASH_BIGNAME_LEN, hs->hs_name,
-		   1 << hs->hs_cur_bits, 1 << hs->hs_min_bits,
-		   1 << hs->hs_max_bits,
-		   __cfs_hash_theta_int(theta), __cfs_hash_theta_frac(theta),
-		   __cfs_hash_theta_int(hs->hs_min_theta),
-		   __cfs_hash_theta_frac(hs->hs_min_theta),
-		   __cfs_hash_theta_int(hs->hs_max_theta),
-		   __cfs_hash_theta_frac(hs->hs_max_theta),
-		   hs->hs_flags, hs->hs_rehash_count);
-
-	/*
-	 * The distribution is a summary of the chained hash depth in
-	 * each of the libcfs hash buckets. Each bucket's hsb_count is
-	 * divided by the hash theta value and used to generate a
-	 * histogram of the hash distribution. A uniform hash will
-	 * result in all hash buckets being close to the average, thus
-	 * only the first few entries in the histogram will be non-zero.
-	 * If your hash function results in a non-uniform hash, this will
-	 * be observable as outlier buckets in the distribution histogram. 
- * - * Uniform hash distribution: 128/128/0/0/0/0/0/0 - * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 - */ - for (i = 0; i < cfs_hash_full_nbkt(hs); i++) { - struct cfs_hash_bd bd; - - bd.bd_bucket = cfs_hash_full_bkts(hs)[i]; - cfs_hash_bd_lock(hs, &bd, 0); - if (maxdep < bd.bd_bucket->hsb_depmax) { - maxdep = bd.bd_bucket->hsb_depmax; - maxdepb = ffz(~maxdep); - } - total += bd.bd_bucket->hsb_count; - dist[min(fls(bd.bd_bucket->hsb_count / max(theta, 1)), 7)]++; - cfs_hash_bd_unlock(hs, &bd, 0); - } - - seq_printf(m, "%7d %7d %7d ", total, maxdep, maxdepb); - for (i = 0; i < 8; i++) - seq_printf(m, "%d%c", dist[i], (i == 7) ? '\n' : '/'); - - cfs_hash_unlock(hs, 0); -} -EXPORT_SYMBOL(cfs_hash_debug_str); diff --git a/drivers/staging/lustre/lnet/libcfs/libcfs_cpu.c b/drivers/staging/lustre/lnet/libcfs/libcfs_cpu.c deleted file mode 100644 index 3d1cf457b28649ffd9a48fe649068c2c7d970d5f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/libcfs_cpu.c +++ /dev/null @@ -1,1086 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Please see comments in libcfs/include/libcfs/libcfs_cpu.h for introduction - * - * Author: liang@whamcloud.com - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include -#include -#include - -#include -#include -#include - -/** Global CPU partition table */ -struct cfs_cpt_table *cfs_cpt_tab __read_mostly; -EXPORT_SYMBOL(cfs_cpt_tab); - -/** - * modparam for setting number of partitions - * - * 0 : estimate best value based on cores or NUMA nodes - * 1 : disable multiple partitions - * >1 : specify number of partitions - */ -static int cpu_npartitions; -module_param(cpu_npartitions, int, 0444); -MODULE_PARM_DESC(cpu_npartitions, "# of CPU partitions"); - -/** - * modparam for setting CPU partitions patterns: - * - * i.e: "0[0,1,2,3] 1[4,5,6,7]", number before bracket is CPU partition ID, - * number in bracket is processor ID (core or HT) - * - * i.e: "N 0[0,1] 1[2,3]" the first character 'N' means numbers in bracket - * are NUMA node ID, number before bracket is CPU partition ID. 
- * - * i.e: "N", shortcut expression to create CPT from NUMA & CPU topology - * - * NB: If user specified cpu_pattern, cpu_npartitions will be ignored - */ -static char *cpu_pattern = "N"; -module_param(cpu_pattern, charp, 0444); -MODULE_PARM_DESC(cpu_pattern, "CPU partitions pattern"); - -static struct cfs_cpt_data { - /* serialize hotplug etc */ - spinlock_t cpt_lock; - /* reserved for hotplug */ - unsigned long cpt_version; - /* mutex to protect cpt_cpumask */ - struct mutex cpt_mutex; - /* scratch buffer for set/unset_node */ - cpumask_var_t cpt_cpumask; -} cpt_data; - -#define CFS_CPU_VERSION_MAGIC 0xbabecafe - -struct cfs_cpt_table * -cfs_cpt_table_alloc(unsigned int ncpt) -{ - struct cfs_cpt_table *cptab; - int i; - - cptab = kzalloc(sizeof(*cptab), GFP_NOFS); - if (!cptab) - return NULL; - - cptab->ctb_nparts = ncpt; - - cptab->ctb_nodemask = kzalloc(sizeof(*cptab->ctb_nodemask), - GFP_NOFS); - if (!zalloc_cpumask_var(&cptab->ctb_cpumask, GFP_NOFS) || - !cptab->ctb_nodemask) - goto failed; - - cptab->ctb_cpu2cpt = kvmalloc_array(num_possible_cpus(), - sizeof(cptab->ctb_cpu2cpt[0]), - GFP_KERNEL); - if (!cptab->ctb_cpu2cpt) - goto failed; - - memset(cptab->ctb_cpu2cpt, -1, - num_possible_cpus() * sizeof(cptab->ctb_cpu2cpt[0])); - - cptab->ctb_parts = kvmalloc_array(ncpt, sizeof(cptab->ctb_parts[0]), - GFP_KERNEL); - if (!cptab->ctb_parts) - goto failed; - - for (i = 0; i < ncpt; i++) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - part->cpt_nodemask = kzalloc(sizeof(*part->cpt_nodemask), - GFP_NOFS); - if (!zalloc_cpumask_var(&part->cpt_cpumask, GFP_NOFS) || - !part->cpt_nodemask) - goto failed; - } - - spin_lock(&cpt_data.cpt_lock); - /* Reserved for hotplug */ - cptab->ctb_version = cpt_data.cpt_version; - spin_unlock(&cpt_data.cpt_lock); - - return cptab; - - failed: - cfs_cpt_table_free(cptab); - return NULL; -} -EXPORT_SYMBOL(cfs_cpt_table_alloc); - -void -cfs_cpt_table_free(struct cfs_cpt_table *cptab) -{ - int i; - - kvfree(cptab->ctb_cpu2cpt); - - for (i = 0; cptab->ctb_parts && i < cptab->ctb_nparts; i++) { - struct cfs_cpu_partition *part = &cptab->ctb_parts[i]; - - kfree(part->cpt_nodemask); - free_cpumask_var(part->cpt_cpumask); - } - - kvfree(cptab->ctb_parts); - - kfree(cptab->ctb_nodemask); - free_cpumask_var(cptab->ctb_cpumask); - - kfree(cptab); -} -EXPORT_SYMBOL(cfs_cpt_table_free); - -int -cfs_cpt_table_print(struct cfs_cpt_table *cptab, char *buf, int len) -{ - char *tmp = buf; - int rc = 0; - int i; - int j; - - for (i = 0; i < cptab->ctb_nparts; i++) { - if (len > 0) { - rc = snprintf(tmp, len, "%d\t: ", i); - len -= rc; - } - - if (len <= 0) { - rc = -EFBIG; - goto out; - } - - tmp += rc; - for_each_cpu(j, cptab->ctb_parts[i].cpt_cpumask) { - rc = snprintf(tmp, len, "%d ", j); - len -= rc; - if (len <= 0) { - rc = -EFBIG; - goto out; - } - tmp += rc; - } - - *tmp = '\n'; - tmp++; - len--; - } - - out: - if (rc < 0) - return rc; - - return tmp - buf; -} -EXPORT_SYMBOL(cfs_cpt_table_print); - -static void -cfs_node_to_cpumask(int node, cpumask_t *mask) -{ - const cpumask_t *tmp = cpumask_of_node(node); - - if (tmp) - cpumask_copy(mask, tmp); - else - cpumask_clear(mask); -} - -int -cfs_cpt_number(struct cfs_cpt_table *cptab) -{ - return cptab->ctb_nparts; -} -EXPORT_SYMBOL(cfs_cpt_number); - -int -cfs_cpt_weight(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? 
- cpumask_weight(cptab->ctb_cpumask) : - cpumask_weight(cptab->ctb_parts[cpt].cpt_cpumask); -} -EXPORT_SYMBOL(cfs_cpt_weight); - -int -cfs_cpt_online(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cpumask_any_and(cptab->ctb_cpumask, - cpu_online_mask) < nr_cpu_ids : - cpumask_any_and(cptab->ctb_parts[cpt].cpt_cpumask, - cpu_online_mask) < nr_cpu_ids; -} -EXPORT_SYMBOL(cfs_cpt_online); - -cpumask_var_t * -cfs_cpt_cpumask(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - &cptab->ctb_cpumask : &cptab->ctb_parts[cpt].cpt_cpumask; -} -EXPORT_SYMBOL(cfs_cpt_cpumask); - -nodemask_t * -cfs_cpt_nodemask(struct cfs_cpt_table *cptab, int cpt) -{ - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - return cpt == CFS_CPT_ANY ? - cptab->ctb_nodemask : cptab->ctb_parts[cpt].cpt_nodemask; -} -EXPORT_SYMBOL(cfs_cpt_nodemask); - -int -cfs_cpt_set_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - int node; - - LASSERT(cpt >= 0 && cpt < cptab->ctb_nparts); - - if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_online(cpu)) { - CDEBUG(D_INFO, "CPU %d is invalid or it's offline\n", cpu); - return 0; - } - - if (cptab->ctb_cpu2cpt[cpu] != -1) { - CDEBUG(D_INFO, "CPU %d is already in partition %d\n", - cpu, cptab->ctb_cpu2cpt[cpu]); - return 0; - } - - cptab->ctb_cpu2cpt[cpu] = cpt; - - LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_cpumask)); - LASSERT(!cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); - - cpumask_set_cpu(cpu, cptab->ctb_cpumask); - cpumask_set_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); - - node = cpu_to_node(cpu); - - /* first CPU of @node in this CPT table */ - if (!node_isset(node, *cptab->ctb_nodemask)) - node_set(node, *cptab->ctb_nodemask); - - /* first CPU of @node in this partition */ - if (!node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)) - node_set(node, *cptab->ctb_parts[cpt].cpt_nodemask); - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_cpu); - -void -cfs_cpt_unset_cpu(struct cfs_cpt_table *cptab, int cpt, int cpu) -{ - int node; - int i; - - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - if (cpu < 0 || cpu >= nr_cpu_ids) { - CDEBUG(D_INFO, "Invalid CPU id %d\n", cpu); - return; - } - - if (cpt == CFS_CPT_ANY) { - /* caller doesn't know the partition ID */ - cpt = cptab->ctb_cpu2cpt[cpu]; - if (cpt < 0) { /* not set in this CPT-table */ - CDEBUG(D_INFO, "Try to unset cpu %d which is not in CPT-table %p\n", - cpt, cptab); - return; - } - - } else if (cpt != cptab->ctb_cpu2cpt[cpu]) { - CDEBUG(D_INFO, - "CPU %d is not in cpu-partition %d\n", cpu, cpt); - return; - } - - LASSERT(cpumask_test_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask)); - LASSERT(cpumask_test_cpu(cpu, cptab->ctb_cpumask)); - - cpumask_clear_cpu(cpu, cptab->ctb_parts[cpt].cpt_cpumask); - cpumask_clear_cpu(cpu, cptab->ctb_cpumask); - cptab->ctb_cpu2cpt[cpu] = -1; - - node = cpu_to_node(cpu); - - LASSERT(node_isset(node, *cptab->ctb_parts[cpt].cpt_nodemask)); - LASSERT(node_isset(node, *cptab->ctb_nodemask)); - - for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) { - /* this CPT has other CPU belonging to this node? */ - if (cpu_to_node(i) == node) - break; - } - - if (i >= nr_cpu_ids) - node_clear(node, *cptab->ctb_parts[cpt].cpt_nodemask); - - for_each_cpu(i, cptab->ctb_cpumask) { - /* this CPT-table has other CPU belonging to this node? 
*/ - if (cpu_to_node(i) == node) - break; - } - - if (i >= nr_cpu_ids) - node_clear(node, *cptab->ctb_nodemask); -} -EXPORT_SYMBOL(cfs_cpt_unset_cpu); - -int -cfs_cpt_set_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) -{ - int i; - - if (!cpumask_weight(mask) || - cpumask_any_and(mask, cpu_online_mask) >= nr_cpu_ids) { - CDEBUG(D_INFO, "No online CPU is found in the CPU mask for CPU partition %d\n", - cpt); - return 0; - } - - for_each_cpu(i, mask) { - if (!cfs_cpt_set_cpu(cptab, cpt, i)) - return 0; - } - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_cpumask); - -void -cfs_cpt_unset_cpumask(struct cfs_cpt_table *cptab, int cpt, cpumask_t *mask) -{ - int i; - - for_each_cpu(i, mask) - cfs_cpt_unset_cpu(cptab, cpt, i); -} -EXPORT_SYMBOL(cfs_cpt_unset_cpumask); - -int -cfs_cpt_set_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - int rc; - - if (node < 0 || node >= MAX_NUMNODES) { - CDEBUG(D_INFO, - "Invalid NUMA id %d for CPU partition %d\n", node, cpt); - return 0; - } - - mutex_lock(&cpt_data.cpt_mutex); - - cfs_node_to_cpumask(node, cpt_data.cpt_cpumask); - - rc = cfs_cpt_set_cpumask(cptab, cpt, cpt_data.cpt_cpumask); - - mutex_unlock(&cpt_data.cpt_mutex); - - return rc; -} -EXPORT_SYMBOL(cfs_cpt_set_node); - -void -cfs_cpt_unset_node(struct cfs_cpt_table *cptab, int cpt, int node) -{ - if (node < 0 || node >= MAX_NUMNODES) { - CDEBUG(D_INFO, - "Invalid NUMA id %d for CPU partition %d\n", node, cpt); - return; - } - - mutex_lock(&cpt_data.cpt_mutex); - - cfs_node_to_cpumask(node, cpt_data.cpt_cpumask); - - cfs_cpt_unset_cpumask(cptab, cpt, cpt_data.cpt_cpumask); - - mutex_unlock(&cpt_data.cpt_mutex); -} -EXPORT_SYMBOL(cfs_cpt_unset_node); - -int -cfs_cpt_set_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) -{ - int i; - - for_each_node_mask(i, *mask) { - if (!cfs_cpt_set_node(cptab, cpt, i)) - return 0; - } - - return 1; -} -EXPORT_SYMBOL(cfs_cpt_set_nodemask); - -void -cfs_cpt_unset_nodemask(struct cfs_cpt_table *cptab, int cpt, nodemask_t *mask) -{ - int i; - - for_each_node_mask(i, *mask) - cfs_cpt_unset_node(cptab, cpt, i); -} -EXPORT_SYMBOL(cfs_cpt_unset_nodemask); - -void -cfs_cpt_clear(struct cfs_cpt_table *cptab, int cpt) -{ - int last; - int i; - - if (cpt == CFS_CPT_ANY) { - last = cptab->ctb_nparts - 1; - cpt = 0; - } else { - last = cpt; - } - - for (; cpt <= last; cpt++) { - for_each_cpu(i, cptab->ctb_parts[cpt].cpt_cpumask) - cfs_cpt_unset_cpu(cptab, cpt, i); - } -} -EXPORT_SYMBOL(cfs_cpt_clear); - -int -cfs_cpt_spread_node(struct cfs_cpt_table *cptab, int cpt) -{ - nodemask_t *mask; - int weight; - int rotor; - int node; - - /* convert CPU partition ID to HW node id */ - - if (cpt < 0 || cpt >= cptab->ctb_nparts) { - mask = cptab->ctb_nodemask; - rotor = cptab->ctb_spread_rotor++; - } else { - mask = cptab->ctb_parts[cpt].cpt_nodemask; - rotor = cptab->ctb_parts[cpt].cpt_spread_rotor++; - } - - weight = nodes_weight(*mask); - LASSERT(weight > 0); - - rotor %= weight; - - for_each_node_mask(node, *mask) { - if (!rotor--) - return node; - } - - LBUG(); - return 0; -} -EXPORT_SYMBOL(cfs_cpt_spread_node); - -int -cfs_cpt_current(struct cfs_cpt_table *cptab, int remap) -{ - int cpu; - int cpt; - - preempt_disable(); - cpu = smp_processor_id(); - cpt = cptab->ctb_cpu2cpt[cpu]; - - if (cpt < 0 && remap) { - /* don't return negative value for safety of upper layer, - * instead we shadow the unknown cpu to a valid partition ID - */ - cpt = cpu % cptab->ctb_nparts; - } - preempt_enable(); - return cpt; -} -EXPORT_SYMBOL(cfs_cpt_current); - -int 
-cfs_cpt_of_cpu(struct cfs_cpt_table *cptab, int cpu) -{ - LASSERT(cpu >= 0 && cpu < nr_cpu_ids); - - return cptab->ctb_cpu2cpt[cpu]; -} -EXPORT_SYMBOL(cfs_cpt_of_cpu); - -int -cfs_cpt_bind(struct cfs_cpt_table *cptab, int cpt) -{ - cpumask_var_t *cpumask; - nodemask_t *nodemask; - int rc; - int i; - - LASSERT(cpt == CFS_CPT_ANY || (cpt >= 0 && cpt < cptab->ctb_nparts)); - - if (cpt == CFS_CPT_ANY) { - cpumask = &cptab->ctb_cpumask; - nodemask = cptab->ctb_nodemask; - } else { - cpumask = &cptab->ctb_parts[cpt].cpt_cpumask; - nodemask = cptab->ctb_parts[cpt].cpt_nodemask; - } - - if (cpumask_any_and(*cpumask, cpu_online_mask) >= nr_cpu_ids) { - CERROR("No online CPU found in CPU partition %d, did someone do CPU hotplug on system? You might need to reload Lustre modules to keep system working well.\n", - cpt); - return -EINVAL; - } - - for_each_online_cpu(i) { - if (cpumask_test_cpu(i, *cpumask)) - continue; - - rc = set_cpus_allowed_ptr(current, *cpumask); - set_mems_allowed(*nodemask); - if (!rc) - schedule(); /* switch to allowed CPU */ - - return rc; - } - - /* don't need to set affinity because all online CPUs are covered */ - return 0; -} -EXPORT_SYMBOL(cfs_cpt_bind); - -/** - * Choose max to \a number CPUs from \a node and set them in \a cpt. - * We always prefer to choose CPU in the same core/socket. - */ -static int -cfs_cpt_choose_ncpus(struct cfs_cpt_table *cptab, int cpt, - cpumask_t *node, int number) -{ - cpumask_var_t socket; - cpumask_var_t core; - int rc = 0; - int cpu; - - LASSERT(number > 0); - - if (number >= cpumask_weight(node)) { - while (!cpumask_empty(node)) { - cpu = cpumask_first(node); - - rc = cfs_cpt_set_cpu(cptab, cpt, cpu); - if (!rc) - return -EINVAL; - cpumask_clear_cpu(cpu, node); - } - return 0; - } - - /* - * Allocate scratch buffers - * As we cannot initialize a cpumask_var_t, we need - * to alloc both before we can risk trying to free either - */ - if (!zalloc_cpumask_var(&socket, GFP_NOFS)) - rc = -ENOMEM; - if (!zalloc_cpumask_var(&core, GFP_NOFS)) - rc = -ENOMEM; - if (rc) - goto out; - - while (!cpumask_empty(node)) { - cpu = cpumask_first(node); - - /* get cpumask for cores in the same socket */ - cpumask_copy(socket, topology_core_cpumask(cpu)); - cpumask_and(socket, socket, node); - - LASSERT(!cpumask_empty(socket)); - - while (!cpumask_empty(socket)) { - int i; - - /* get cpumask for hts in the same core */ - cpumask_copy(core, topology_sibling_cpumask(cpu)); - cpumask_and(core, core, node); - - LASSERT(!cpumask_empty(core)); - - for_each_cpu(i, core) { - cpumask_clear_cpu(i, socket); - cpumask_clear_cpu(i, node); - - rc = cfs_cpt_set_cpu(cptab, cpt, i); - if (!rc) { - rc = -EINVAL; - goto out; - } - - if (!--number) - goto out; - } - cpu = cpumask_first(socket); - } - } - -out: - free_cpumask_var(socket); - free_cpumask_var(core); - return rc; -} - -#define CPT_WEIGHT_MIN 4u - -static unsigned int -cfs_cpt_num_estimate(void) -{ - unsigned int nnode = num_online_nodes(); - unsigned int ncpu = num_online_cpus(); - unsigned int ncpt; - - if (ncpu <= CPT_WEIGHT_MIN) { - ncpt = 1; - goto out; - } - - /* generate reasonable number of CPU partitions based on total number - * of CPUs, Preferred N should be power2 and match this condition: - * 2 * (N - 1)^2 < NCPUS <= 2 * N^2 - */ - for (ncpt = 2; ncpu > 2 * ncpt * ncpt; ncpt <<= 1) - ; - - if (ncpt <= nnode) { /* fat numa system */ - while (nnode > ncpt) - nnode >>= 1; - - } else { /* ncpt > nnode */ - while ((nnode << 1) <= ncpt) - nnode <<= 1; - } - - ncpt = nnode; - -out: -#if (BITS_PER_LONG == 32) 
- /* config many CPU partitions on 32-bit system could consume - * too much memory - */ - ncpt = min(2U, ncpt); -#endif - while (ncpu % ncpt) - ncpt--; /* worst case is 1 */ - - return ncpt; -} - -static struct cfs_cpt_table * -cfs_cpt_table_create(int ncpt) -{ - struct cfs_cpt_table *cptab = NULL; - cpumask_var_t mask; - int cpt = 0; - int num; - int rc; - int i; - - rc = cfs_cpt_num_estimate(); - if (ncpt <= 0) - ncpt = rc; - - if (ncpt > num_online_cpus() || ncpt > 4 * rc) { - CWARN("CPU partition number %d is larger than suggested value (%d), your system may have performance issue or run out of memory while under pressure\n", - ncpt, rc); - } - - if (num_online_cpus() % ncpt) { - CERROR("CPU number %d is not multiple of cpu_npartition %d, please try different cpu_npartitions value or set pattern string by cpu_pattern=STRING\n", - (int)num_online_cpus(), ncpt); - goto failed; - } - - cptab = cfs_cpt_table_alloc(ncpt); - if (!cptab) { - CERROR("Failed to allocate CPU map(%d)\n", ncpt); - goto failed; - } - - num = num_online_cpus() / ncpt; - if (!num) { - CERROR("CPU changed while setting CPU partition\n"); - goto failed; - } - - if (!zalloc_cpumask_var(&mask, GFP_NOFS)) { - CERROR("Failed to allocate scratch cpumask\n"); - goto failed; - } - - for_each_online_node(i) { - cfs_node_to_cpumask(i, mask); - - while (!cpumask_empty(mask)) { - struct cfs_cpu_partition *part; - int n; - - /* - * Each emulated NUMA node has all allowed CPUs in - * the mask. - * End loop when all partitions have assigned CPUs. - */ - if (cpt == ncpt) - break; - - part = &cptab->ctb_parts[cpt]; - - n = num - cpumask_weight(part->cpt_cpumask); - LASSERT(n > 0); - - rc = cfs_cpt_choose_ncpus(cptab, cpt, mask, n); - if (rc < 0) - goto failed_mask; - - LASSERT(num >= cpumask_weight(part->cpt_cpumask)); - if (num == cpumask_weight(part->cpt_cpumask)) - cpt++; - } - } - - if (cpt != ncpt || - num != cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)) { - CERROR("Expect %d(%d) CPU partitions but got %d(%d), CPU hotplug/unplug while setting?\n", - cptab->ctb_nparts, num, cpt, - cpumask_weight(cptab->ctb_parts[ncpt - 1].cpt_cpumask)); - goto failed_mask; - } - - free_cpumask_var(mask); - - return cptab; - - failed_mask: - free_cpumask_var(mask); - failed: - CERROR("Failed to setup CPU-partition-table with %d CPU-partitions, online HW nodes: %d, HW cpus: %d.\n", - ncpt, num_online_nodes(), num_online_cpus()); - - if (cptab) - cfs_cpt_table_free(cptab); - - return NULL; -} - -static struct cfs_cpt_table * -cfs_cpt_table_create_pattern(char *pattern) -{ - struct cfs_cpt_table *cptab; - char *str; - int node = 0; - int high; - int ncpt = 0; - int cpt; - int rc; - int c; - int i; - - str = strim(pattern); - if (*str == 'n' || *str == 'N') { - pattern = str + 1; - if (*pattern != '\0') { - node = 1; - } else { /* shortcut to create CPT from NUMA & CPU topology */ - node = -1; - ncpt = num_online_nodes(); - } - } - - if (!ncpt) { /* scanning bracket which is mark of partition */ - for (str = pattern;; str++, ncpt++) { - str = strchr(str, '['); - if (!str) - break; - } - } - - if (!ncpt || - (node && ncpt > num_online_nodes()) || - (!node && ncpt > num_online_cpus())) { - CERROR("Invalid pattern %s, or too many partitions %d\n", - pattern, ncpt); - return NULL; - } - - cptab = cfs_cpt_table_alloc(ncpt); - if (!cptab) { - CERROR("Failed to allocate cpu partition table\n"); - return NULL; - } - - if (node < 0) { /* shortcut to create CPT from NUMA & CPU topology */ - cpt = 0; - - for_each_online_node(i) { - if (cpt >= ncpt) { - 
CERROR("CPU changed while setting CPU partition table, %d/%d\n", - cpt, ncpt); - goto failed; - } - - rc = cfs_cpt_set_node(cptab, cpt++, i); - if (!rc) - goto failed; - } - return cptab; - } - - high = node ? MAX_NUMNODES - 1 : nr_cpu_ids - 1; - - for (str = strim(pattern), c = 0;; c++) { - struct cfs_range_expr *range; - struct cfs_expr_list *el; - char *bracket = strchr(str, '['); - int n; - - if (!bracket) { - if (*str) { - CERROR("Invalid pattern %s\n", str); - goto failed; - } - if (c != ncpt) { - CERROR("expect %d partitions but found %d\n", - ncpt, c); - goto failed; - } - break; - } - - if (sscanf(str, "%d%n", &cpt, &n) < 1) { - CERROR("Invalid cpu pattern %s\n", str); - goto failed; - } - - if (cpt < 0 || cpt >= ncpt) { - CERROR("Invalid partition id %d, total partitions %d\n", - cpt, ncpt); - goto failed; - } - - if (cfs_cpt_weight(cptab, cpt)) { - CERROR("Partition %d has already been set.\n", cpt); - goto failed; - } - - str = strim(str + n); - if (str != bracket) { - CERROR("Invalid pattern %s\n", str); - goto failed; - } - - bracket = strchr(str, ']'); - if (!bracket) { - CERROR("missing right bracket for cpt %d, %s\n", - cpt, str); - goto failed; - } - - if (cfs_expr_list_parse(str, (bracket - str) + 1, - 0, high, &el)) { - CERROR("Can't parse number range: %s\n", str); - goto failed; - } - - list_for_each_entry(range, &el->el_exprs, re_link) { - for (i = range->re_lo; i <= range->re_hi; i++) { - if ((i - range->re_lo) % range->re_stride) - continue; - - rc = node ? cfs_cpt_set_node(cptab, cpt, i) : - cfs_cpt_set_cpu(cptab, cpt, i); - if (!rc) { - cfs_expr_list_free(el); - goto failed; - } - } - } - - cfs_expr_list_free(el); - - if (!cfs_cpt_online(cptab, cpt)) { - CERROR("No online CPU is found on partition %d\n", cpt); - goto failed; - } - - str = strim(bracket + 1); - } - - return cptab; - - failed: - cfs_cpt_table_free(cptab); - return NULL; -} - -#ifdef CONFIG_HOTPLUG_CPU -static enum cpuhp_state lustre_cpu_online; - -static void cfs_cpu_incr_cpt_version(void) -{ - spin_lock(&cpt_data.cpt_lock); - cpt_data.cpt_version++; - spin_unlock(&cpt_data.cpt_lock); -} - -static int cfs_cpu_online(unsigned int cpu) -{ - cfs_cpu_incr_cpt_version(); - return 0; -} - -static int cfs_cpu_dead(unsigned int cpu) -{ - bool warn; - - cfs_cpu_incr_cpt_version(); - - mutex_lock(&cpt_data.cpt_mutex); - /* if all HTs in a core are offline, it may break affinity */ - cpumask_copy(cpt_data.cpt_cpumask, topology_sibling_cpumask(cpu)); - warn = cpumask_any_and(cpt_data.cpt_cpumask, - cpu_online_mask) >= nr_cpu_ids; - mutex_unlock(&cpt_data.cpt_mutex); - CDEBUG(warn ? 
D_WARNING : D_INFO, - "Lustre: can't support CPU plug-out well now, performance and stability could be impacted [CPU %u]\n", - cpu); - return 0; -} -#endif - -void -cfs_cpu_fini(void) -{ - if (cfs_cpt_tab) - cfs_cpt_table_free(cfs_cpt_tab); - -#ifdef CONFIG_HOTPLUG_CPU - if (lustre_cpu_online > 0) - cpuhp_remove_state_nocalls(lustre_cpu_online); - cpuhp_remove_state_nocalls(CPUHP_LUSTRE_CFS_DEAD); -#endif - free_cpumask_var(cpt_data.cpt_cpumask); -} - -int -cfs_cpu_init(void) -{ - int ret = 0; - - LASSERT(!cfs_cpt_tab); - - memset(&cpt_data, 0, sizeof(cpt_data)); - - if (!zalloc_cpumask_var(&cpt_data.cpt_cpumask, GFP_NOFS)) { - CERROR("Failed to allocate scratch buffer\n"); - return -1; - } - - spin_lock_init(&cpt_data.cpt_lock); - mutex_init(&cpt_data.cpt_mutex); - -#ifdef CONFIG_HOTPLUG_CPU - ret = cpuhp_setup_state_nocalls(CPUHP_LUSTRE_CFS_DEAD, - "staging/lustre/cfe:dead", NULL, - cfs_cpu_dead); - if (ret < 0) - goto failed; - ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "staging/lustre/cfe:online", - cfs_cpu_online, NULL); - if (ret < 0) - goto failed; - lustre_cpu_online = ret; -#endif - ret = -EINVAL; - - if (*cpu_pattern) { - char *cpu_pattern_dup = kstrdup(cpu_pattern, GFP_KERNEL); - - if (!cpu_pattern_dup) { - CERROR("Failed to duplicate cpu_pattern\n"); - goto failed; - } - - cfs_cpt_tab = cfs_cpt_table_create_pattern(cpu_pattern_dup); - kfree(cpu_pattern_dup); - if (!cfs_cpt_tab) { - CERROR("Failed to create cptab from pattern %s\n", - cpu_pattern); - goto failed; - } - - } else { - cfs_cpt_tab = cfs_cpt_table_create(cpu_npartitions); - if (!cfs_cpt_tab) { - CERROR("Failed to create ptable with npartitions %d\n", - cpu_npartitions); - goto failed; - } - } - - spin_lock(&cpt_data.cpt_lock); - if (cfs_cpt_tab->ctb_version != cpt_data.cpt_version) { - spin_unlock(&cpt_data.cpt_lock); - CERROR("CPU hotplug/unplug during setup\n"); - goto failed; - } - spin_unlock(&cpt_data.cpt_lock); - - LCONSOLE(0, "HW nodes: %d, HW CPU cores: %d, npartitions: %d\n", - num_online_nodes(), num_online_cpus(), - cfs_cpt_number(cfs_cpt_tab)); - return 0; - - failed: - cfs_cpu_fini(); - return ret; -} diff --git a/drivers/staging/lustre/lnet/libcfs/libcfs_lock.c b/drivers/staging/lustre/lnet/libcfs/libcfs_lock.c deleted file mode 100644 index 223505c3754510fe833cc04dabe4ff53d179e85f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/libcfs_lock.c +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
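The cfs_cpu_init()/cfs_cpu_fini() pair above is the standard cpuhp state-machine pattern: a fixed CPUHP_*_DEAD slot for teardown plus a dynamic CPUHP_AP_ONLINE_DYN slot whose id is handed back at registration time. A minimal self-contained sketch of the dynamic half — the demo_* names are invented, the cpuhp_* calls are the real kernel API:

#include <linux/cpuhotplug.h>
#include <linux/module.h>

static enum cpuhp_state demo_state;

static int demo_cpu_online(unsigned int cpu)
{
	pr_info("demo: CPU %u is coming online\n", cpu);
	return 0;	/* non-zero would veto the hotplug operation */
}

static int demo_cpu_offline(unsigned int cpu)
{
	pr_info("demo: CPU %u is going down\n", cpu);
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/* _nocalls: skip invoking the callback for already-online CPUs */
	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "demo:online",
					demo_cpu_online, demo_cpu_offline);
	if (ret < 0)
		return ret;
	demo_state = ret;	/* dynamic ids are assigned, not chosen */
	return 0;
}

static void __exit demo_exit(void)
{
	cpuhp_remove_state_nocalls(demo_state);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");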
- * - * Author: liang@whamcloud.com - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include -#include - -/** destroy cpu-partition lock, see libcfs_private.h for more detail */ -void -cfs_percpt_lock_free(struct cfs_percpt_lock *pcl) -{ - LASSERT(pcl->pcl_locks); - LASSERT(!pcl->pcl_locked); - - cfs_percpt_free(pcl->pcl_locks); - kfree(pcl); -} -EXPORT_SYMBOL(cfs_percpt_lock_free); - -/** - * create cpu-partition lock, see libcfs_private.h for more detail. - * - * cpu-partition lock is designed for large-scale SMP system, so we need to - * reduce cacheline conflict as possible as we can, that's the - * reason we always allocate cacheline-aligned memory block. - */ -struct cfs_percpt_lock * -cfs_percpt_lock_create(struct cfs_cpt_table *cptab, - struct lock_class_key *keys) -{ - struct cfs_percpt_lock *pcl; - spinlock_t *lock; - int i; - - /* NB: cptab can be NULL, pcl will be for HW CPUs on that case */ - pcl = kzalloc(sizeof(*pcl), GFP_NOFS); - if (!pcl) - return NULL; - - pcl->pcl_cptab = cptab; - pcl->pcl_locks = cfs_percpt_alloc(cptab, sizeof(*lock)); - if (!pcl->pcl_locks) { - kfree(pcl); - return NULL; - } - - if (!keys) - CWARN("Cannot setup class key for percpt lock, you may see recursive locking warnings which are actually fake.\n"); - - cfs_percpt_for_each(lock, i, pcl->pcl_locks) { - spin_lock_init(lock); - if (keys) - lockdep_set_class(lock, &keys[i]); - } - - return pcl; -} -EXPORT_SYMBOL(cfs_percpt_lock_create); - -/** - * lock a CPU partition - * - * \a index != CFS_PERCPT_LOCK_EX - * hold private lock indexed by \a index - * - * \a index == CFS_PERCPT_LOCK_EX - * exclusively lock @pcl and nobody can take private lock - */ -void -cfs_percpt_lock(struct cfs_percpt_lock *pcl, int index) - __acquires(pcl->pcl_locks) -{ - int ncpt = cfs_cpt_number(pcl->pcl_cptab); - int i; - - LASSERT(index >= CFS_PERCPT_LOCK_EX && index < ncpt); - - if (ncpt == 1) { - index = 0; - } else { /* serialize with exclusive lock */ - while (pcl->pcl_locked) - cpu_relax(); - } - - if (likely(index != CFS_PERCPT_LOCK_EX)) { - spin_lock(pcl->pcl_locks[index]); - return; - } - - /* exclusive lock request */ - for (i = 0; i < ncpt; i++) { - spin_lock(pcl->pcl_locks[i]); - if (!i) { - LASSERT(!pcl->pcl_locked); - /* nobody should take private lock after this - * so I wouldn't starve for too long time - */ - pcl->pcl_locked = 1; - } - } -} -EXPORT_SYMBOL(cfs_percpt_lock); - -/** unlock a CPU partition */ -void -cfs_percpt_unlock(struct cfs_percpt_lock *pcl, int index) - __releases(pcl->pcl_locks) -{ - int ncpt = cfs_cpt_number(pcl->pcl_cptab); - int i; - - index = ncpt == 1 ? 0 : index; - - if (likely(index != CFS_PERCPT_LOCK_EX)) { - spin_unlock(pcl->pcl_locks[index]); - return; - } - - for (i = ncpt - 1; i >= 0; i--) { - if (!i) { - LASSERT(pcl->pcl_locked); - pcl->pcl_locked = 0; - } - spin_unlock(pcl->pcl_locks[i]); - } -} -EXPORT_SYMBOL(cfs_percpt_unlock); diff --git a/drivers/staging/lustre/lnet/libcfs/libcfs_mem.c b/drivers/staging/lustre/lnet/libcfs/libcfs_mem.c deleted file mode 100644 index 2d533be9bb30182cbc6d5a03591ba709aab6c927..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/libcfs_mem.c +++ /dev/null @@ -1,171 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
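The functions above are the entire life cycle of a per-CPT lock, so a usage sketch needs nothing beyond them. A minimal example, assuming a valid cptab (or NULL for one lock per HW CPU) and hard-coding partition 0 purely for illustration:

static int demo_percpt_locking(struct cfs_cpt_table *cptab)
{
	struct cfs_percpt_lock *pcl;

	pcl = cfs_percpt_lock_create(cptab, NULL);
	if (!pcl)
		return -ENOMEM;

	/* common path: only this partition's private spinlock is taken */
	cfs_percpt_lock(pcl, 0);
	/* ... touch data owned by partition 0 ... */
	cfs_percpt_unlock(pcl, 0);

	/* rare global path: every private lock is taken, in index order */
	cfs_percpt_lock(pcl, CFS_PERCPT_LOCK_EX);
	/* ... walk state across all partitions ... */
	cfs_percpt_unlock(pcl, CFS_PERCPT_LOCK_EX);

	cfs_percpt_lock_free(pcl);
	return 0;
}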
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Author: liang@whamcloud.com - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include -#include -#include - -struct cfs_var_array { - unsigned int va_count; /* # of buffers */ - unsigned int va_size; /* size of each var */ - struct cfs_cpt_table *va_cptab; /* cpu partition table */ - void *va_ptrs[0]; /* buffer addresses */ -}; - -/* - * free per-cpu data, see more detail in cfs_percpt_free - */ -void -cfs_percpt_free(void *vars) -{ - struct cfs_var_array *arr; - int i; - - arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); - - for (i = 0; i < arr->va_count; i++) - kfree(arr->va_ptrs[i]); - - kvfree(arr); -} -EXPORT_SYMBOL(cfs_percpt_free); - -/* - * allocate per cpu-partition variables, returned value is an array of pointers, - * variable can be indexed by CPU partition ID, i.e: - * - * arr = cfs_percpt_alloc(cfs_cpu_pt, size); - * then caller can access memory block for CPU 0 by arr[0], - * memory block for CPU 1 by arr[1]... - * memory block for CPU N by arr[N]... - * - * cacheline aligned. - */ -void * -cfs_percpt_alloc(struct cfs_cpt_table *cptab, unsigned int size) -{ - struct cfs_var_array *arr; - int count; - int i; - - count = cfs_cpt_number(cptab); - - arr = kvzalloc(offsetof(struct cfs_var_array, va_ptrs[count]), - GFP_KERNEL); - if (!arr) - return NULL; - - size = L1_CACHE_ALIGN(size); - arr->va_size = size; - arr->va_count = count; - arr->va_cptab = cptab; - - for (i = 0; i < count; i++) { - arr->va_ptrs[i] = kzalloc_node(size, GFP_KERNEL, - cfs_cpt_spread_node(cptab, i)); - if (!arr->va_ptrs[i]) { - cfs_percpt_free((void *)&arr->va_ptrs[0]); - return NULL; - } - } - - return (void *)&arr->va_ptrs[0]; -} -EXPORT_SYMBOL(cfs_percpt_alloc); - -/* - * return number of CPUs (or number of elements in per-cpu data) - * according to cptab of @vars - */ -int -cfs_percpt_number(void *vars) -{ - struct cfs_var_array *arr; - - arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); - - return arr->va_count; -} -EXPORT_SYMBOL(cfs_percpt_number); - -/* - * free variable array, see more detail in cfs_array_alloc - */ -void -cfs_array_free(void *vars) -{ - struct cfs_var_array *arr; - int i; - - arr = container_of(vars, struct cfs_var_array, va_ptrs[0]); - - for (i = 0; i < arr->va_count; i++) { - if (!arr->va_ptrs[i]) - continue; - - kvfree(arr->va_ptrs[i]); - } - kvfree(arr); -} -EXPORT_SYMBOL(cfs_array_free); - -/* - * allocate a variable array, returned value is an array of pointers. - * Caller can specify length of array by @count, @size is size of each - * memory block in array. 
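Tying cfs_percpt_alloc() to its iterator: the returned pointer array indexes one cacheline-aligned, NUMA-local block per partition. A short sketch with an assumed demo_counter type, reusing the cfs_percpt_for_each() helper seen in libcfs_lock.c above:

struct demo_counter {
	u64 dc_hits;
};

static int demo_percpt_counters(struct cfs_cpt_table *cptab)
{
	struct demo_counter **counters;
	struct demo_counter *c;
	int i;

	counters = cfs_percpt_alloc(cptab, sizeof(*c));
	if (!counters)
		return -ENOMEM;

	/* one block per partition: counters[0] .. counters[ncpt - 1] */
	cfs_percpt_for_each(c, i, counters)
		c->dc_hits = 0;

	cfs_percpt_free(counters);
	return 0;
}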
- */ -void * -cfs_array_alloc(int count, unsigned int size) -{ - struct cfs_var_array *arr; - int i; - - arr = kvmalloc(offsetof(struct cfs_var_array, va_ptrs[count]), GFP_KERNEL); - if (!arr) - return NULL; - - arr->va_count = count; - arr->va_size = size; - - for (i = 0; i < count; i++) { - arr->va_ptrs[i] = kvzalloc(size, GFP_KERNEL); - - if (!arr->va_ptrs[i]) { - cfs_array_free((void *)&arr->va_ptrs[0]); - return NULL; - } - } - - return (void *)&arr->va_ptrs[0]; -} -EXPORT_SYMBOL(cfs_array_alloc); diff --git a/drivers/staging/lustre/lnet/libcfs/libcfs_string.c b/drivers/staging/lustre/lnet/libcfs/libcfs_string.c deleted file mode 100644 index e1fb1263e3ae5f61202a712343386ab4b4122fba..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/libcfs_string.c +++ /dev/null @@ -1,562 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * String manipulation functions. - * - * libcfs/libcfs/libcfs_string.c - * - * Author: Nathan Rutman - */ - -#include -#include -#include -#include -#include -#include -#include - -/* Convert a text string to a bitmask */ -int cfs_str2mask(const char *str, const char *(*bit2str)(int bit), - int *oldmask, int minmask, int allmask) -{ - const char *debugstr; - char op = '\0'; - int newmask = minmask, i, len, found = 0; - - /* must be a list of tokens separated by whitespace - * and optionally an operator ('+' or '-'). If an operator - * appears first in , '*oldmask' is used as the starting point - * (relative), otherwise minmask is used (absolute). An operator - * applies to all following tokens up to the next operator. 
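Concretely, with the parsing loop below and a hypothetical three-bit bit2str callback, the absolute/relative rule plays out like this (a sketch, not from the original tree):

static const char *demo_bit2str(int bit)
{
	switch (bit) {
	case 0: return "error";
	case 1: return "warning";
	case 2: return "info";
	default: return NULL;
	}
}

static void demo_masks(void)
{
	int mask = 1 << 1;			/* "warning" currently set */

	/* first token has no operator: absolute, rebuilt from minmask */
	cfs_str2mask("error", demo_bit2str, &mask, 0, 0x7);
	/* mask == 0x1 */

	/* leading operator: relative, edits the previous mask in place */
	cfs_str2mask("+info -error", demo_bit2str, &mask, 0, 0x7);
	/* mask == 0x4 (info added, error cleared again) */
}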
- */ - while (*str != '\0') { - while (isspace(*str)) - str++; - if (*str == '\0') - break; - if (*str == '+' || *str == '-') { - op = *str++; - if (!found) - /* only if first token is relative */ - newmask = *oldmask; - while (isspace(*str)) - str++; - if (*str == '\0') /* trailing op */ - return -EINVAL; - } - - /* find token length */ - len = 0; - while (str[len] != '\0' && !isspace(str[len]) && - str[len] != '+' && str[len] != '-') - len++; - - /* match token */ - found = 0; - for (i = 0; i < 32; i++) { - debugstr = bit2str(i); - if (debugstr && strlen(debugstr) == len && - !strncasecmp(str, debugstr, len)) { - if (op == '-') - newmask &= ~(1 << i); - else - newmask |= (1 << i); - found = 1; - break; - } - } - if (!found && len == 3 && - !strncasecmp(str, "ALL", len)) { - if (op == '-') - newmask = minmask; - else - newmask = allmask; - found = 1; - } - if (!found) { - CWARN("unknown mask '%.*s'.\n" - "mask usage: [+|-] ...\n", len, str); - return -EINVAL; - } - str += len; - } - - *oldmask = newmask; - return 0; -} - -/* get the first string out of @str */ -char *cfs_firststr(char *str, size_t size) -{ - size_t i = 0; - char *end; - - /* trim leading spaces */ - while (i < size && *str && isspace(*str)) { - ++i; - ++str; - } - - /* string with all spaces */ - if (*str == '\0') - goto out; - - end = str; - while (i < size && *end != '\0' && !isspace(*end)) { - ++i; - ++end; - } - - *end = '\0'; -out: - return str; -} -EXPORT_SYMBOL(cfs_firststr); - -/** - * Extracts tokens from strings. - * - * Looks for \a delim in string \a next, sets \a res to point to - * substring before the delimiter, sets \a next right after the found - * delimiter. - * - * \retval 1 if \a res points to a string of non-whitespace characters - * \retval 0 otherwise - */ -int -cfs_gettok(struct cfs_lstr *next, char delim, struct cfs_lstr *res) -{ - char *end; - - if (!next->ls_str) - return 0; - - /* skip leading white spaces */ - while (next->ls_len) { - if (!isspace(*next->ls_str)) - break; - next->ls_str++; - next->ls_len--; - } - - if (!next->ls_len) /* whitespaces only */ - return 0; - - if (*next->ls_str == delim) { - /* first non-writespace is the delimiter */ - return 0; - } - - res->ls_str = next->ls_str; - end = memchr(next->ls_str, delim, next->ls_len); - if (!end) { - /* there is no the delimeter in the string */ - end = next->ls_str + next->ls_len; - next->ls_str = NULL; - } else { - next->ls_str = end + 1; - next->ls_len -= (end - res->ls_str + 1); - } - - /* skip ending whitespaces */ - while (--end != res->ls_str) { - if (!isspace(*end)) - break; - } - - res->ls_len = end - res->ls_str + 1; - return 1; -} -EXPORT_SYMBOL(cfs_gettok); - -/** - * Converts string to integer. - * - * Accepts decimal and hexadecimal number recordings. - * - * \retval 1 if first \a nob chars of \a str convert to decimal or - * hexadecimal integer in the range [\a min, \a max] - * \retval 0 otherwise - */ -int -cfs_str2num_check(char *str, int nob, unsigned int *num, - unsigned int min, unsigned int max) -{ - bool all_numbers = true; - char *endp, cache; - int rc; - - /** - * kstrouint can only handle strings composed - * of only numbers. We need to scan the string - * passed in for the first non-digit character - * and end the string at that location. If we - * don't find any non-digit character we still - * need to place a '\0' at position nob since - * we are not interested in the rest of the - * string which is longer than nob in size. 
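For reference, cfs_gettok() above trims whitespace on both sides of each token and rejects an empty leading token; a sketch of its calling convention, with an arbitrary sample string:

static void demo_gettok(void)
{
	char buf[] = "  lo ,  tcp0 , o2ib1 ";
	struct cfs_lstr next, tok;

	next.ls_str = buf;
	next.ls_len = strlen(buf);

	while (cfs_gettok(&next, ',', &tok))
		pr_info("token '%.*s'\n", tok.ls_len, tok.ls_str);
	/* prints 'lo', 'tcp0' and 'o2ib1' */
}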
- * After we are done the character at the - * position we placed '\0' must be restored. - */ - for (endp = str; endp < str + nob; endp++) { - if (!isdigit(*endp)) { - all_numbers = false; - break; - } - } - cache = *endp; - *endp = '\0'; - - rc = kstrtouint(str, 10, num); - *endp = cache; - if (rc || !all_numbers) - return 0; - - return (*num >= min && *num <= max); -} -EXPORT_SYMBOL(cfs_str2num_check); - -/** - * Parses \ token of the syntax. If \a bracketed is false, - * \a src should only have a single token which can be \ or \* - * - * \retval pointer to allocated range_expr and initialized - * range_expr::re_lo, range_expr::re_hi and range_expr:re_stride if \a - `* src parses to - * \ | - * \ '-' \ | - * \ '-' \ '/' \ - * \retval 0 will be returned if it can be parsed, otherwise -EINVAL or - * -ENOMEM will be returned. - */ -static int -cfs_range_expr_parse(struct cfs_lstr *src, unsigned int min, unsigned int max, - int bracketed, struct cfs_range_expr **expr) -{ - struct cfs_range_expr *re; - struct cfs_lstr tok; - - re = kzalloc(sizeof(*re), GFP_NOFS); - if (!re) - return -ENOMEM; - - if (src->ls_len == 1 && src->ls_str[0] == '*') { - re->re_lo = min; - re->re_hi = max; - re->re_stride = 1; - goto out; - } - - if (cfs_str2num_check(src->ls_str, src->ls_len, - &re->re_lo, min, max)) { - /* is parsed */ - re->re_hi = re->re_lo; - re->re_stride = 1; - goto out; - } - - if (!bracketed || !cfs_gettok(src, '-', &tok)) - goto failed; - - if (!cfs_str2num_check(tok.ls_str, tok.ls_len, - &re->re_lo, min, max)) - goto failed; - - /* - */ - if (cfs_str2num_check(src->ls_str, src->ls_len, - &re->re_hi, min, max)) { - /* - is parsed */ - re->re_stride = 1; - goto out; - } - - /* go to check '-' '/' */ - if (cfs_gettok(src, '/', &tok)) { - if (!cfs_str2num_check(tok.ls_str, tok.ls_len, - &re->re_hi, min, max)) - goto failed; - - /* - / ... */ - if (cfs_str2num_check(src->ls_str, src->ls_len, - &re->re_stride, min, max)) { - /* - / is parsed */ - goto out; - } - } - - out: - *expr = re; - return 0; - - failed: - kfree(re); - return -EINVAL; -} - -/** - * Print the range expression \a re into specified \a buffer. - * If \a bracketed is true, expression does not need additional - * brackets. - * - * \retval number of characters written - */ -static int -cfs_range_expr_print(char *buffer, int count, struct cfs_range_expr *expr, - bool bracketed) -{ - int i; - char s[] = "["; - char e[] = "]"; - - if (bracketed) { - s[0] = '\0'; - e[0] = '\0'; - } - - if (expr->re_lo == expr->re_hi) - i = scnprintf(buffer, count, "%u", expr->re_lo); - else if (expr->re_stride == 1) - i = scnprintf(buffer, count, "%s%u-%u%s", - s, expr->re_lo, expr->re_hi, e); - else - i = scnprintf(buffer, count, "%s%u-%u/%u%s", - s, expr->re_lo, expr->re_hi, expr->re_stride, e); - return i; -} - -/** - * Print a list of range expressions (\a expr_list) into specified \a buffer. - * If the list contains several expressions, separate them with comma - * and surround the list with brackets. 
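Putting the grammar above together with cfs_expr_list_match() further down: "[0-7/2]" parses to re_lo = 0, re_hi = 7, re_stride = 2, and a value matches when it lies in range and (value - re_lo) % re_stride == 0. A sketch built only on the helpers in this file:

static void demo_stride(void)
{
	struct cfs_expr_list *el;
	char expr[] = "[0-7/2]";
	u32 v;

	if (cfs_expr_list_parse(expr, strlen(expr), 0, 15, &el))
		return;

	for (v = 0; v <= 7; v++)
		if (cfs_expr_list_match(v, el))
			pr_info("%u\n", v);	/* 0, 2, 4, 6 */

	cfs_expr_list_free(el);
}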
- * - * \retval number of characters written - */ -int -cfs_expr_list_print(char *buffer, int count, struct cfs_expr_list *expr_list) -{ - struct cfs_range_expr *expr; - int i = 0, j = 0; - int numexprs = 0; - - if (count <= 0) - return 0; - - list_for_each_entry(expr, &expr_list->el_exprs, re_link) - numexprs++; - - if (numexprs > 1) - i += scnprintf(buffer + i, count - i, "["); - - list_for_each_entry(expr, &expr_list->el_exprs, re_link) { - if (j++) - i += scnprintf(buffer + i, count - i, ","); - i += cfs_range_expr_print(buffer + i, count - i, expr, - numexprs > 1); - } - - if (numexprs > 1) - i += scnprintf(buffer + i, count - i, "]"); - - return i; -} -EXPORT_SYMBOL(cfs_expr_list_print); - -/** - * Matches value (\a value) against ranges expression list \a expr_list. - * - * \retval 1 if \a value matches - * \retval 0 otherwise - */ -int -cfs_expr_list_match(u32 value, struct cfs_expr_list *expr_list) -{ - struct cfs_range_expr *expr; - - list_for_each_entry(expr, &expr_list->el_exprs, re_link) { - if (value >= expr->re_lo && value <= expr->re_hi && - !((value - expr->re_lo) % expr->re_stride)) - return 1; - } - - return 0; -} -EXPORT_SYMBOL(cfs_expr_list_match); - -/** - * Convert express list (\a expr_list) to an array of all matched values - * - * \retval N N is total number of all matched values - * \retval 0 if expression list is empty - * \retval < 0 for failure - */ -int -cfs_expr_list_values(struct cfs_expr_list *expr_list, int max, u32 **valpp) -{ - struct cfs_range_expr *expr; - u32 *val; - int count = 0; - int i; - - list_for_each_entry(expr, &expr_list->el_exprs, re_link) { - for (i = expr->re_lo; i <= expr->re_hi; i++) { - if (!((i - expr->re_lo) % expr->re_stride)) - count++; - } - } - - if (!count) /* empty expression list */ - return 0; - - if (count > max) { - CERROR("Number of values %d exceeds max allowed %d\n", - max, count); - return -EINVAL; - } - - val = kvmalloc_array(count, sizeof(val[0]), GFP_KERNEL | __GFP_ZERO); - if (!val) - return -ENOMEM; - - count = 0; - list_for_each_entry(expr, &expr_list->el_exprs, re_link) { - for (i = expr->re_lo; i <= expr->re_hi; i++) { - if (!((i - expr->re_lo) % expr->re_stride)) - val[count++] = i; - } - } - - *valpp = val; - return count; -} -EXPORT_SYMBOL(cfs_expr_list_values); - -/** - * Frees cfs_range_expr structures of \a expr_list. - * - * \retval none - */ -void -cfs_expr_list_free(struct cfs_expr_list *expr_list) -{ - while (!list_empty(&expr_list->el_exprs)) { - struct cfs_range_expr *expr; - - expr = list_entry(expr_list->el_exprs.next, - struct cfs_range_expr, re_link); - list_del(&expr->re_link); - kfree(expr); - } - - kfree(expr_list); -} -EXPORT_SYMBOL(cfs_expr_list_free); - -/** - * Parses \ token of the syntax. 
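A typical parse/expand/free pipeline built from the helpers above (cfs_expr_list_parse() itself follows below):

static int demo_expand(void)
{
	struct cfs_expr_list *el;
	char expr[] = "[0-3,7]";
	u32 *vals = NULL;
	int n, i;

	n = cfs_expr_list_parse(expr, strlen(expr), 0, 31, &el);
	if (n)
		return n;

	n = cfs_expr_list_values(el, 32, &vals);	/* -> 0 1 2 3 7 */
	if (n > 0) {
		for (i = 0; i < n; i++)
			pr_info("%u\n", vals[i]);
		kvfree(vals);	/* allocated with kvmalloc_array() above */
	}

	cfs_expr_list_free(el);
	return n < 0 ? n : 0;
}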
- * - * \retval 0 if \a str parses to \ | \ - * \retval -errno otherwise - */ -int -cfs_expr_list_parse(char *str, int len, unsigned int min, unsigned int max, - struct cfs_expr_list **elpp) -{ - struct cfs_expr_list *expr_list; - struct cfs_range_expr *expr; - struct cfs_lstr src; - int rc; - - expr_list = kzalloc(sizeof(*expr_list), GFP_NOFS); - if (!expr_list) - return -ENOMEM; - - src.ls_str = str; - src.ls_len = len; - - INIT_LIST_HEAD(&expr_list->el_exprs); - - if (src.ls_str[0] == '[' && - src.ls_str[src.ls_len - 1] == ']') { - src.ls_str++; - src.ls_len -= 2; - - rc = -EINVAL; - while (src.ls_str) { - struct cfs_lstr tok; - - if (!cfs_gettok(&src, ',', &tok)) { - rc = -EINVAL; - break; - } - - rc = cfs_range_expr_parse(&tok, min, max, 1, &expr); - if (rc) - break; - - list_add_tail(&expr->re_link, &expr_list->el_exprs); - } - } else { - rc = cfs_range_expr_parse(&src, min, max, 0, &expr); - if (!rc) - list_add_tail(&expr->re_link, &expr_list->el_exprs); - } - - if (rc) - cfs_expr_list_free(expr_list); - else - *elpp = expr_list; - - return rc; -} -EXPORT_SYMBOL(cfs_expr_list_parse); - -/** - * Frees cfs_expr_list structures of \a list. - * - * For each struct cfs_expr_list structure found on \a list it frees - * range_expr list attached to it and frees the cfs_expr_list itself. - * - * \retval none - */ -void -cfs_expr_list_free_list(struct list_head *list) -{ - struct cfs_expr_list *el; - - while (!list_empty(list)) { - el = list_entry(list->next, struct cfs_expr_list, el_link); - list_del(&el->el_link); - cfs_expr_list_free(el); - } -} -EXPORT_SYMBOL(cfs_expr_list_free_list); diff --git a/drivers/staging/lustre/lnet/libcfs/linux-crypto-adler.c b/drivers/staging/lustre/lnet/libcfs/linux-crypto-adler.c deleted file mode 100644 index db81ed527452850508f7d5e067d74ff7d86c9a3d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/linux-crypto-adler.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - -/* - * Copyright 2012 Xyratex Technology Limited - */ - -/* - * This is crypto api shash wrappers to zlib_adler32. 
- */ - -#include -#include -#include -#include "linux-crypto.h" - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -static int adler32_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = 1; - - return 0; -} - -static int adler32_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - *mctx = *(u32 *)key; - return 0; -} - -static int adler32_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *cksump = shash_desc_ctx(desc); - - *cksump = *mctx; - - return 0; -} - -static int adler32_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *cksump = shash_desc_ctx(desc); - - *cksump = zlib_adler32(*cksump, data, len); - return 0; -} - -static int __adler32_finup(u32 *cksump, const u8 *data, unsigned int len, - u8 *out) -{ - *(u32 *)out = zlib_adler32(*cksump, data, len); - return 0; -} - -static int adler32_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __adler32_finup(shash_desc_ctx(desc), data, len, out); -} - -static int adler32_final(struct shash_desc *desc, u8 *out) -{ - u32 *cksump = shash_desc_ctx(desc); - - *(u32 *)out = *cksump; - return 0; -} - -static int adler32_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __adler32_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static struct shash_alg alg = { - .setkey = adler32_setkey, - .init = adler32_init, - .update = adler32_update, - .final = adler32_final, - .finup = adler32_finup, - .digest = adler32_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "adler32", - .cra_driver_name = "adler32-zlib", - .cra_priority = 100, - .cra_flags = CRYPTO_ALG_OPTIONAL_KEY, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = adler32_cra_init, - } -}; - -int cfs_crypto_adler32_register(void) -{ - return crypto_register_shash(&alg); -} - -void cfs_crypto_adler32_unregister(void) -{ - crypto_unregister_shash(&alg); -} diff --git a/drivers/staging/lustre/lnet/libcfs/linux-crypto.c b/drivers/staging/lustre/lnet/libcfs/linux-crypto.c deleted file mode 100644 index 21ff9bf6da47d3e3a0b92b2410f3139c2b36d95c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/linux-crypto.c +++ /dev/null @@ -1,447 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. 
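For a quick sanity reference on what the shash wrapper above computes: Adler-32 keeps two 16-bit sums modulo 65521, seeded with s1 = 1 (the `*key = 1` in adler32_cra_init()). A standalone userspace check — this is the textbook zlib formula, not the kernel code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ADLER_MOD 65521u		/* largest prime below 2^16 */

static uint32_t adler32(uint32_t adler, const uint8_t *p, size_t len)
{
	uint32_t s1 = adler & 0xffff;
	uint32_t s2 = (adler >> 16) & 0xffff;

	while (len--) {
		s1 = (s1 + *p++) % ADLER_MOD;
		s2 = (s2 + s1) % ADLER_MOD;
	}
	return (s2 << 16) | s1;
}

int main(void)
{
	const char *msg = "abc";

	printf("%08x\n", adler32(1, (const uint8_t *)msg, strlen(msg)));
	/* prints 024d0127 */
	return 0;
}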
- * - * GPL HEADER END - */ - -/* - * Copyright 2012 Xyratex Technology Limited - * - * Copyright (c) 2012, Intel Corporation. - */ - -#include -#include -#include -#include -#include -#include -#include "linux-crypto.h" - -/** - * Array of hash algorithm speed in MByte per second - */ -static int cfs_crypto_hash_speeds[CFS_HASH_ALG_MAX]; - -/** - * Initialize the state descriptor for the specified hash algorithm. - * - * An internal routine to allocate the hash-specific state in \a req for - * use with cfs_crypto_hash_digest() to compute the hash of a single message, - * though possibly in multiple chunks. The descriptor internal state should - * be freed with cfs_crypto_hash_final(). - * - * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) - * \param[out] type pointer to the hash description in hash_types[] - * array - * \param[in,out] req hash state descriptor to be initialized - * \param[in] key initial hash value/state, NULL to use default - * value - * \param[in] key_len length of \a key - * - * \retval 0 on success - * \retval negative errno on failure - */ -static int cfs_crypto_hash_alloc(enum cfs_crypto_hash_alg hash_alg, - const struct cfs_crypto_hash_type **type, - struct ahash_request **req, - unsigned char *key, - unsigned int key_len) -{ - struct crypto_ahash *tfm; - int err = 0; - - *type = cfs_crypto_hash_type(hash_alg); - - if (!*type) { - CWARN("Unsupported hash algorithm id = %d, max id is %d\n", - hash_alg, CFS_HASH_ALG_MAX); - return -EINVAL; - } - tfm = crypto_alloc_ahash((*type)->cht_name, 0, CRYPTO_ALG_ASYNC); - - if (IS_ERR(tfm)) { - CDEBUG(D_INFO, "Failed to alloc crypto hash %s\n", - (*type)->cht_name); - return PTR_ERR(tfm); - } - - *req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!*req) { - CDEBUG(D_INFO, "Failed to alloc ahash_request for %s\n", - (*type)->cht_name); - crypto_free_ahash(tfm); - return -ENOMEM; - } - - ahash_request_set_callback(*req, 0, NULL, NULL); - - if (key) - err = crypto_ahash_setkey(tfm, key, key_len); - else if ((*type)->cht_key) - err = crypto_ahash_setkey(tfm, - (unsigned char *)&((*type)->cht_key), - (*type)->cht_size); - - if (err) { - ahash_request_free(*req); - crypto_free_ahash(tfm); - return err; - } - - CDEBUG(D_INFO, "Using crypto hash: %s (%s) speed %d MB/s\n", - crypto_ahash_alg_name(tfm), crypto_ahash_driver_name(tfm), - cfs_crypto_hash_speeds[hash_alg]); - - err = crypto_ahash_init(*req); - if (err) { - ahash_request_free(*req); - crypto_free_ahash(tfm); - } - return err; -} - -/** - * Calculate hash digest for the passed buffer. - * - * This should be used when computing the hash on a single contiguous buffer. - * It combines the hash initialization, computation, and cleanup. - * - * \param[in] hash_alg id of hash algorithm (CFS_HASH_ALG_*) - * \param[in] buf data buffer on which to compute hash - * \param[in] buf_len length of \a buf in bytes - * \param[in] key initial value/state for algorithm, - * if \a key = NULL use default initial value - * \param[in] key_len length of \a key in bytes - * \param[out] hash pointer to computed hash value, - * if \a hash = NULL then \a hash_len is to digest - * size in bytes, retval -ENOSPC - * \param[in,out] hash_len size of \a hash buffer - * - * \retval -EINVAL \a buf, \a buf_len, \a hash_len, - * \a hash_alg invalid - * \retval -ENOENT \a hash_alg is unsupported - * \retval -ENOSPC \a hash is NULL, or \a hash_len less than - * digest size - * \retval 0 for success - * \retval negative errno for other errors from lower - * layers. 
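A call-site sketch for the one-shot interface documented above; CFS_HASH_ALG_CRC32C is assumed here to be one of the registered algorithm ids:

static int demo_digest(const void *buf, unsigned int len)
{
	unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX];
	unsigned int hash_len = sizeof(hash);
	int rc;

	/* allocates the tfm, hashes one contiguous buffer, tears down */
	rc = cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32C, buf, len,
				    NULL, 0, hash, &hash_len);
	if (rc == -ENOSPC)
		return rc;	/* hash[] smaller than the digest size */

	return rc;
}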
- */ -int cfs_crypto_hash_digest(enum cfs_crypto_hash_alg hash_alg, - const void *buf, unsigned int buf_len, - unsigned char *key, unsigned int key_len, - unsigned char *hash, unsigned int *hash_len) -{ - struct scatterlist sl; - struct ahash_request *req; - int err; - const struct cfs_crypto_hash_type *type; - - if (!buf || !buf_len || !hash_len) - return -EINVAL; - - err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); - if (err) - return err; - - if (!hash || *hash_len < type->cht_size) { - *hash_len = type->cht_size; - crypto_free_ahash(crypto_ahash_reqtfm(req)); - ahash_request_free(req); - return -ENOSPC; - } - sg_init_one(&sl, buf, buf_len); - - ahash_request_set_crypt(req, &sl, hash, sl.length); - err = crypto_ahash_digest(req); - crypto_free_ahash(crypto_ahash_reqtfm(req)); - ahash_request_free(req); - - return err; -} -EXPORT_SYMBOL(cfs_crypto_hash_digest); - -/** - * Allocate and initialize descriptor for hash algorithm. - * - * This should be used to initialize a hash descriptor for multiple calls - * to a single hash function when computing the hash across multiple - * separate buffers or pages using cfs_crypto_hash_update{,_page}(). - * - * The hash descriptor should be freed with cfs_crypto_hash_final(). - * - * \param[in] hash_alg algorithm id (CFS_HASH_ALG_*) - * \param[in] key initial value/state for algorithm, if \a key = NULL - * use default initial value - * \param[in] key_len length of \a key in bytes - * - * \retval pointer to descriptor of hash instance - * \retval ERR_PTR(errno) in case of error - */ -struct ahash_request * -cfs_crypto_hash_init(enum cfs_crypto_hash_alg hash_alg, - unsigned char *key, unsigned int key_len) -{ - struct ahash_request *req; - int err; - const struct cfs_crypto_hash_type *type; - - err = cfs_crypto_hash_alloc(hash_alg, &type, &req, key, key_len); - - if (err) - return ERR_PTR(err); - return req; -} -EXPORT_SYMBOL(cfs_crypto_hash_init); - -/** - * Update hash digest computed on data within the given \a page - * - * \param[in] hreq hash state descriptor - * \param[in] page data page on which to compute the hash - * \param[in] offset offset within \a page at which to start hash - * \param[in] len length of data on which to compute hash - * - * \retval 0 for success - * \retval negative errno on failure - */ -int cfs_crypto_hash_update_page(struct ahash_request *req, - struct page *page, unsigned int offset, - unsigned int len) -{ - struct scatterlist sl; - - sg_init_table(&sl, 1); - sg_set_page(&sl, page, len, offset & ~PAGE_MASK); - - ahash_request_set_crypt(req, &sl, NULL, sl.length); - return crypto_ahash_update(req); -} -EXPORT_SYMBOL(cfs_crypto_hash_update_page); - -/** - * Update hash digest computed on the specified data - * - * \param[in] req hash state descriptor - * \param[in] buf data buffer on which to compute the hash - * \param[in] buf_len length of \buf on which to compute hash - * - * \retval 0 for success - * \retval negative errno on failure - */ -int cfs_crypto_hash_update(struct ahash_request *req, - const void *buf, unsigned int buf_len) -{ - struct scatterlist sl; - - sg_init_one(&sl, buf, buf_len); - - ahash_request_set_crypt(req, &sl, NULL, sl.length); - return crypto_ahash_update(req); -} -EXPORT_SYMBOL(cfs_crypto_hash_update); - -/** - * Finish hash calculation, copy hash digest to buffer, clean up hash descriptor - * - * \param[in] req hash descriptor - * \param[out] hash pointer to hash buffer to store hash digest - * \param[in,out] hash_len pointer to hash buffer size, if \a req = NULL - * only 
free \a req instead of computing the hash - * - * \retval 0 for success - * \retval -EOVERFLOW if hash_len is too small for the hash digest - * \retval negative errno for other errors from lower layers - */ -int cfs_crypto_hash_final(struct ahash_request *req, - unsigned char *hash, unsigned int *hash_len) -{ - int err; - int size = crypto_ahash_digestsize(crypto_ahash_reqtfm(req)); - - if (!hash || !hash_len) { - err = 0; - goto free_ahash; - } - if (*hash_len < size) { - err = -EOVERFLOW; - goto free_ahash; - } - - ahash_request_set_crypt(req, NULL, hash, 0); - err = crypto_ahash_final(req); - if (!err) - *hash_len = size; -free_ahash: - crypto_free_ahash(crypto_ahash_reqtfm(req)); - ahash_request_free(req); - return err; -} -EXPORT_SYMBOL(cfs_crypto_hash_final); - -/** - * Compute the speed of specified hash function - * - * Run a speed test on the given hash algorithm on buffer of the given size. - * The speed is stored internally in the cfs_crypto_hash_speeds[] array, and - * is available through the cfs_crypto_hash_speed() function. - * - * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) - * \param[in] buf data buffer on which to compute the hash - * \param[in] buf_len length of \buf on which to compute hash - */ -static void cfs_crypto_performance_test(enum cfs_crypto_hash_alg hash_alg) -{ - int buf_len = max(PAGE_SIZE, 1048576UL); - void *buf; - unsigned long start, end; - int bcount, err = 0; - struct page *page; - unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; - unsigned int hash_len = sizeof(hash); - - page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; - goto out_err; - } - - buf = kmap(page); - memset(buf, 0xAD, PAGE_SIZE); - kunmap(page); - - for (start = jiffies, end = start + msecs_to_jiffies(MSEC_PER_SEC), - bcount = 0; time_before(jiffies, end); bcount++) { - struct ahash_request *hdesc; - int i; - - hdesc = cfs_crypto_hash_init(hash_alg, NULL, 0); - if (IS_ERR(hdesc)) { - err = PTR_ERR(hdesc); - break; - } - - for (i = 0; i < buf_len / PAGE_SIZE; i++) { - err = cfs_crypto_hash_update_page(hdesc, page, 0, - PAGE_SIZE); - if (err) - break; - } - - err = cfs_crypto_hash_final(hdesc, hash, &hash_len); - if (err) - break; - } - end = jiffies; - __free_page(page); -out_err: - if (err) { - cfs_crypto_hash_speeds[hash_alg] = err; - CDEBUG(D_INFO, "Crypto hash algorithm %s test error: rc = %d\n", - cfs_crypto_hash_name(hash_alg), err); - } else { - unsigned long tmp; - - tmp = ((bcount * buf_len / jiffies_to_msecs(end - start)) * - 1000) / (1024 * 1024); - cfs_crypto_hash_speeds[hash_alg] = (int)tmp; - CDEBUG(D_CONFIG, "Crypto hash algorithm %s speed = %d MB/s\n", - cfs_crypto_hash_name(hash_alg), - cfs_crypto_hash_speeds[hash_alg]); - } -} - -/** - * hash speed in Mbytes per second for valid hash algorithm - * - * Return the performance of the specified \a hash_alg that was previously - * computed using cfs_crypto_performance_test(). - * - * \param[in] hash_alg hash algorithm id (CFS_HASH_ALG_*) - * - * \retval positive speed of the hash function in MB/s - * \retval -ENOENT if \a hash_alg is unsupported - * \retval negative errno if \a hash_alg speed is unavailable - */ -int cfs_crypto_hash_speed(enum cfs_crypto_hash_alg hash_alg) -{ - if (hash_alg < CFS_HASH_ALG_MAX) - return cfs_crypto_hash_speeds[hash_alg]; - return -ENOENT; -} -EXPORT_SYMBOL(cfs_crypto_hash_speed); - -/** - * Run the performance test for all hash algorithms. - * - * Run the cfs_crypto_performance_test() benchmark for all of the available - * hash functions using a 1MB buffer size. 
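The performance loop above is also the canonical multi-buffer calling sequence: init, update per page, final. Extracted as a sketch — demo_hash_pages() is invented and the adler32 algorithm id is an assumption:

static int demo_hash_pages(struct page **pages, int npages)
{
	unsigned char hash[CFS_CRYPTO_HASH_DIGESTSIZE_MAX];
	unsigned int hash_len = sizeof(hash);
	struct ahash_request *req;
	int i, rc;

	req = cfs_crypto_hash_init(CFS_HASH_ALG_ADLER32, NULL, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	for (i = 0; i < npages; i++) {
		rc = cfs_crypto_hash_update_page(req, pages[i], 0, PAGE_SIZE);
		if (rc) {
			/* NULL hash: just release the tfm and request */
			cfs_crypto_hash_final(req, NULL, NULL);
			return rc;
		}
	}

	return cfs_crypto_hash_final(req, hash, &hash_len);
}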
This is a reasonable buffer size - * for Lustre RPCs, even if the actual RPC size is larger or smaller. - * - * Since the setup cost and computation speed of various hash algorithms is - * a function of the buffer size (and possibly internal contention of offload - * engines), this speed only represents an estimate of the actual speed under - * actual usage, but is reasonable for comparing available algorithms. - * - * The actual speeds are available via cfs_crypto_hash_speed() for later - * comparison. - * - * \retval 0 on success - * \retval -ENOMEM if no memory is available for test buffer - */ -static int cfs_crypto_test_hashes(void) -{ - enum cfs_crypto_hash_alg hash_alg; - - for (hash_alg = 0; hash_alg < CFS_HASH_ALG_MAX; hash_alg++) - cfs_crypto_performance_test(hash_alg); - - return 0; -} - -static int adler32; - -/** - * Register available hash functions - * - * \retval 0 - */ -int cfs_crypto_register(void) -{ - request_module("crc32c"); - - if (cfs_crypto_adler32_register() == 0) - adler32 = 1; - - /* check all algorithms and do performance test */ - cfs_crypto_test_hashes(); - return 0; -} - -/** - * Unregister previously registered hash functions - */ -void cfs_crypto_unregister(void) -{ - if (adler32) - cfs_crypto_adler32_unregister(); - adler32 = 0; -} diff --git a/drivers/staging/lustre/lnet/libcfs/linux-crypto.h b/drivers/staging/lustre/lnet/libcfs/linux-crypto.h deleted file mode 100644 index 5616e9ea145056a547012d9795625e7922458697..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/linux-crypto.h +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * Please visit http://www.xyratex.com/contact if you need additional - * information or have any questions. - * - * GPL HEADER END - */ - -/** - * Functions for start/stop shash adler32 algorithm. - */ -int cfs_crypto_adler32_register(void); -void cfs_crypto_adler32_unregister(void); diff --git a/drivers/staging/lustre/lnet/libcfs/linux-debug.c b/drivers/staging/lustre/lnet/libcfs/linux-debug.c deleted file mode 100644 index 15ab849374c20c53e120d194533f06b3ea9f0098..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/linux-debug.c +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/libcfs/linux/linux-debug.c - * - * Author: Phil Schwan - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -# define DEBUG_SUBSYSTEM S_LNET - -#include "tracefile.h" - -#include - -char lnet_debug_log_upcall[1024] = "/usr/lib/lustre/lnet_debug_log_upcall"; - -/** - * Upcall function once a Lustre log has been dumped. - * - * \param file path of the dumped log - */ -void libcfs_run_debug_log_upcall(char *file) -{ - char *argv[3]; - int rc; - static const char * const envp[] = { - "HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL - }; - - argv[0] = lnet_debug_log_upcall; - - LASSERTF(file, "called on a null filename\n"); - argv[1] = file; /* only need to pass the path of the file */ - - argv[2] = NULL; - - rc = call_usermodehelper(argv[0], argv, (char **)envp, 1); - if (rc < 0 && rc != -ENOENT) { - CERROR("Error %d invoking LNET debug log upcall %s %s; check /sys/kernel/debug/lnet/debug_log_upcall\n", - rc, argv[0], argv[1]); - } else { - CDEBUG(D_HA, "Invoked LNET debug log upcall %s %s\n", - argv[0], argv[1]); - } -} - -/* coverity[+kill] */ -void __noreturn lbug_with_loc(struct libcfs_debug_msg_data *msgdata) -{ - libcfs_catastrophe = 1; - libcfs_debug_msg(msgdata, "LBUG\n"); - - if (in_interrupt()) { - panic("LBUG in interrupt.\n"); - /* not reached */ - } - - dump_stack(); - if (!libcfs_panic_on_lbug) - libcfs_debug_dumplog(); - if (libcfs_panic_on_lbug) - panic("LBUG"); - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) - schedule(); -} -EXPORT_SYMBOL(lbug_with_loc); - -static int panic_notifier(struct notifier_block *self, unsigned long unused1, - void *unused2) -{ - if (libcfs_panic_in_progress) - return 0; - - libcfs_panic_in_progress = 1; - mb(); - - return 0; -} - -static struct notifier_block libcfs_panic_notifier = { - .notifier_call = panic_notifier, - .next = NULL, - .priority = 10000, -}; - -void libcfs_register_panic_notifier(void) -{ - atomic_notifier_chain_register(&panic_notifier_list, - &libcfs_panic_notifier); -} - -void libcfs_unregister_panic_notifier(void) -{ - atomic_notifier_chain_unregister(&panic_notifier_list, - &libcfs_panic_notifier); -} diff --git a/drivers/staging/lustre/lnet/libcfs/linux-tracefile.c b/drivers/staging/lustre/lnet/libcfs/linux-tracefile.c deleted file mode 100644 index 347138409eba33a88cdcdff4a19277f39b11b4b7..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/linux-tracefile.c +++ /dev/null @@ -1,258 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
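The panic-notifier registration above follows the generic notifier-chain pattern; a minimal stand-alone module doing the same (demo_* names assumed, header locations as in kernels of this era):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int demo_panic_cb(struct notifier_block *nb, unsigned long event,
			 void *unused)
{
	/* the machine is dying: no locks, no allocations here */
	pr_emerg("demo: panic notifier fired\n");
	return NOTIFY_DONE;
}

static struct notifier_block demo_panic_nb = {
	.notifier_call	= demo_panic_cb,
	.priority	= 10000,	/* run early, like libcfs above */
};

static int __init demo_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &demo_panic_nb);
	return 0;
}

static void __exit demo_exit(void)
{
	atomic_notifier_chain_unregister(&panic_notifier_list,
					 &demo_panic_nb);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");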
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LNET -#define LUSTRE_TRACEFILE_PRIVATE - -#include -#include -#include "tracefile.h" - -/* percents to share the total debug memory for each type */ -static unsigned int pages_factor[CFS_TCD_TYPE_MAX] = { - 80, /* 80% pages for CFS_TCD_TYPE_PROC */ - 10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */ - 10 /* 10% pages for CFS_TCD_TYPE_IRQ */ -}; - -char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; - -static DECLARE_RWSEM(cfs_tracefile_sem); - -int cfs_tracefile_init_arch(void) -{ - int i; - int j; - struct cfs_trace_cpu_data *tcd; - - /* initialize trace_data */ - memset(cfs_trace_data, 0, sizeof(cfs_trace_data)); - for (i = 0; i < CFS_TCD_TYPE_MAX; i++) { - cfs_trace_data[i] = - kmalloc_array(num_possible_cpus(), - sizeof(union cfs_trace_data_union), - GFP_KERNEL); - if (!cfs_trace_data[i]) - goto out; - } - - /* arch related info initialized */ - cfs_tcd_for_each(tcd, i, j) { - spin_lock_init(&tcd->tcd_lock); - tcd->tcd_pages_factor = pages_factor[i]; - tcd->tcd_type = i; - tcd->tcd_cpu = j; - } - - for (i = 0; i < num_possible_cpus(); i++) - for (j = 0; j < 3; j++) { - cfs_trace_console_buffers[i][j] = - kmalloc(CFS_TRACE_CONSOLE_BUFFER_SIZE, - GFP_KERNEL); - - if (!cfs_trace_console_buffers[i][j]) - goto out; - } - - return 0; - -out: - cfs_tracefile_fini_arch(); - pr_err("lnet: Not enough memory\n"); - return -ENOMEM; -} - -void cfs_tracefile_fini_arch(void) -{ - int i; - int j; - - for (i = 0; i < num_possible_cpus(); i++) - for (j = 0; j < 3; j++) { - kfree(cfs_trace_console_buffers[i][j]); - cfs_trace_console_buffers[i][j] = NULL; - } - - for (i = 0; cfs_trace_data[i]; i++) { - kfree(cfs_trace_data[i]); - cfs_trace_data[i] = NULL; - } -} - -void cfs_tracefile_read_lock(void) -{ - down_read(&cfs_tracefile_sem); -} - -void cfs_tracefile_read_unlock(void) -{ - up_read(&cfs_tracefile_sem); -} - -void cfs_tracefile_write_lock(void) -{ - down_write(&cfs_tracefile_sem); -} - -void cfs_tracefile_write_unlock(void) -{ - up_write(&cfs_tracefile_sem); -} - -enum cfs_trace_buf_type cfs_trace_buf_idx_get(void) -{ - if (in_irq()) - return CFS_TCD_TYPE_IRQ; - if (in_softirq()) - return CFS_TCD_TYPE_SOFTIRQ; - return CFS_TCD_TYPE_PROC; -} - -/* - * The walking argument indicates the locking comes from all tcd types - * iterator and we must lock it and dissable local irqs to avoid deadlocks - * with other interrupt locks that might be happening. See LU-1311 - * for details. 
- */ -int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking) - __acquires(&tcd->tc_lock) -{ - __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); - if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) - spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags); - else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) - spin_lock_bh(&tcd->tcd_lock); - else if (unlikely(walking)) - spin_lock_irq(&tcd->tcd_lock); - else - spin_lock(&tcd->tcd_lock); - return 1; -} - -void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking) - __releases(&tcd->tcd_lock) -{ - __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_MAX); - if (tcd->tcd_type == CFS_TCD_TYPE_IRQ) - spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags); - else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ) - spin_unlock_bh(&tcd->tcd_lock); - else if (unlikely(walking)) - spin_unlock_irq(&tcd->tcd_lock); - else - spin_unlock(&tcd->tcd_lock); -} - -void -cfs_set_ptldebug_header(struct ptldebug_header *header, - struct libcfs_debug_msg_data *msgdata, - unsigned long stack) -{ - struct timespec64 ts; - - ktime_get_real_ts64(&ts); - - header->ph_subsys = msgdata->msg_subsys; - header->ph_mask = msgdata->msg_mask; - header->ph_cpu_id = smp_processor_id(); - header->ph_type = cfs_trace_buf_idx_get(); - /* y2038 safe since all user space treats this as unsigned, but - * will overflow in 2106 - */ - header->ph_sec = (u32)ts.tv_sec; - header->ph_usec = ts.tv_nsec / NSEC_PER_USEC; - header->ph_stack = stack; - header->ph_pid = current->pid; - header->ph_line_num = msgdata->msg_line; - header->ph_extern_pid = 0; -} - -static char * -dbghdr_to_err_string(struct ptldebug_header *hdr) -{ - switch (hdr->ph_subsys) { - case S_LND: - case S_LNET: - return "LNetError"; - default: - return "LustreError"; - } -} - -static char * -dbghdr_to_info_string(struct ptldebug_header *hdr) -{ - switch (hdr->ph_subsys) { - case S_LND: - case S_LNET: - return "LNet"; - default: - return "Lustre"; - } -} - -void cfs_print_to_console(struct ptldebug_header *hdr, int mask, - const char *buf, int len, const char *file, - const char *fn) -{ - char *prefix = "Lustre", *ptype = NULL; - - if (mask & D_EMERG) { - prefix = dbghdr_to_err_string(hdr); - ptype = KERN_EMERG; - } else if (mask & D_ERROR) { - prefix = dbghdr_to_err_string(hdr); - ptype = KERN_ERR; - } else if (mask & D_WARNING) { - prefix = dbghdr_to_info_string(hdr); - ptype = KERN_WARNING; - } else if (mask & (D_CONSOLE | libcfs_printk)) { - prefix = dbghdr_to_info_string(hdr); - ptype = KERN_INFO; - } - - if (mask & D_CONSOLE) { - pr_info("%s%s: %.*s", ptype, prefix, len, buf); - } else { - pr_info("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, - hdr->ph_pid, hdr->ph_extern_pid, file, - hdr->ph_line_num, fn, len, buf); - } -} - -int cfs_trace_max_debug_mb(void) -{ - int total_mb = (totalram_pages >> (20 - PAGE_SHIFT)); - - return max(512, (total_mb * 80) / 100); -} diff --git a/drivers/staging/lustre/lnet/libcfs/module.c b/drivers/staging/lustre/lnet/libcfs/module.c deleted file mode 100644 index 5dc7de9e6478cc76ef5f4428388f7c27e450e9a4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/module.c +++ /dev/null @@ -1,758 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
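To make the cfs_trace_max_debug_mb() sizing above concrete, here is the same arithmetic as a userspace check, for a hypothetical 16 GiB machine with 4 KiB pages:

#include <stdio.h>

int main(void)
{
	unsigned long totalram_pages = 4UL << 20;	/* 16 GiB / 4 KiB */
	int page_shift = 12;
	int total_mb = totalram_pages >> (20 - page_shift);
	int cap = total_mb * 80 / 100;

	printf("total %d MB, debug cap %d MB\n",
	       total_mb, cap > 512 ? cap : 512);
	/* prints: total 16384 MB, debug cap 13107 MB */
	return 0;
}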
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include -#include - -# define DEBUG_SUBSYSTEM S_LNET - -#include - -#include -#include -#include -#include "tracefile.h" - -struct lnet_debugfs_symlink_def { - char *name; - char *target; -}; - -static struct dentry *lnet_debugfs_root; - -BLOCKING_NOTIFIER_HEAD(libcfs_ioctl_list); -EXPORT_SYMBOL(libcfs_ioctl_list); - -static inline size_t libcfs_ioctl_packlen(struct libcfs_ioctl_data *data) -{ - size_t len = sizeof(*data); - - len += cfs_size_round(data->ioc_inllen1); - len += cfs_size_round(data->ioc_inllen2); - return len; -} - -static inline bool libcfs_ioctl_is_invalid(struct libcfs_ioctl_data *data) -{ - if (data->ioc_hdr.ioc_len > BIT(30)) { - CERROR("LIBCFS ioctl: ioc_len larger than 1<<30\n"); - return true; - } - if (data->ioc_inllen1 > BIT(30)) { - CERROR("LIBCFS ioctl: ioc_inllen1 larger than 1<<30\n"); - return true; - } - if (data->ioc_inllen2 > BIT(30)) { - CERROR("LIBCFS ioctl: ioc_inllen2 larger than 1<<30\n"); - return true; - } - if (data->ioc_inlbuf1 && !data->ioc_inllen1) { - CERROR("LIBCFS ioctl: inlbuf1 pointer but 0 length\n"); - return true; - } - if (data->ioc_inlbuf2 && !data->ioc_inllen2) { - CERROR("LIBCFS ioctl: inlbuf2 pointer but 0 length\n"); - return true; - } - if (data->ioc_pbuf1 && !data->ioc_plen1) { - CERROR("LIBCFS ioctl: pbuf1 pointer but 0 length\n"); - return true; - } - if (data->ioc_pbuf2 && !data->ioc_plen2) { - CERROR("LIBCFS ioctl: pbuf2 pointer but 0 length\n"); - return true; - } - if (data->ioc_plen1 && !data->ioc_pbuf1) { - CERROR("LIBCFS ioctl: plen1 nonzero but no pbuf1 pointer\n"); - return true; - } - if (data->ioc_plen2 && !data->ioc_pbuf2) { - CERROR("LIBCFS ioctl: plen2 nonzero but no pbuf2 pointer\n"); - return true; - } - if ((u32)libcfs_ioctl_packlen(data) != data->ioc_hdr.ioc_len) { - CERROR("LIBCFS ioctl: packlen != ioc_len\n"); - return true; - } - if (data->ioc_inllen1 && - data->ioc_bulk[data->ioc_inllen1 - 1] != '\0') { - CERROR("LIBCFS ioctl: inlbuf1 not 0 terminated\n"); - return true; - } - if (data->ioc_inllen2 && - data->ioc_bulk[cfs_size_round(data->ioc_inllen1) + - data->ioc_inllen2 - 1] != '\0') { - CERROR("LIBCFS ioctl: inlbuf2 not 0 terminated\n"); - return true; - } - return false; -} - -static int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data) -{ - if (libcfs_ioctl_is_invalid(data)) { - CERROR("libcfs ioctl: parameter not correctly formatted\n"); - return -EINVAL; - } - - if (data->ioc_inllen1) - data->ioc_inlbuf1 = &data->ioc_bulk[0]; - - if (data->ioc_inllen2) - data->ioc_inlbuf2 = &data->ioc_bulk[0] + - cfs_size_round(data->ioc_inllen1); - - 
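	/*
	 * Illustrative note (not from the original file): after the two
	 * assignments above, the trailing ioc_bulk[] region is laid out as
	 *
	 *	[ inlbuf1, padded to cfs_size_round(ioc_inllen1) ][ inlbuf2 ]
	 *
	 * which is exactly the sum libcfs_ioctl_packlen() checked against
	 * ioc_hdr.ioc_len, so both in-line pointers stay within the buffer.
	 */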
return 0; -} - -static int libcfs_ioctl_getdata(struct libcfs_ioctl_hdr **hdr_pp, - const struct libcfs_ioctl_hdr __user *uhdr) -{ - struct libcfs_ioctl_hdr hdr; - int err; - - if (copy_from_user(&hdr, uhdr, sizeof(hdr))) - return -EFAULT; - - if (hdr.ioc_version != LIBCFS_IOCTL_VERSION && - hdr.ioc_version != LIBCFS_IOCTL_VERSION2) { - CERROR("libcfs ioctl: version mismatch expected %#x, got %#x\n", - LIBCFS_IOCTL_VERSION, hdr.ioc_version); - return -EINVAL; - } - - if (hdr.ioc_len < sizeof(hdr)) { - CERROR("libcfs ioctl: user buffer too small for ioctl\n"); - return -EINVAL; - } - - if (hdr.ioc_len > LIBCFS_IOC_DATA_MAX) { - CERROR("libcfs ioctl: user buffer is too large %d/%d\n", - hdr.ioc_len, LIBCFS_IOC_DATA_MAX); - return -EINVAL; - } - - *hdr_pp = kvmalloc(hdr.ioc_len, GFP_KERNEL); - if (!*hdr_pp) - return -ENOMEM; - - if (copy_from_user(*hdr_pp, uhdr, hdr.ioc_len)) { - err = -EFAULT; - goto free; - } - - if ((*hdr_pp)->ioc_version != hdr.ioc_version || - (*hdr_pp)->ioc_len != hdr.ioc_len) { - err = -EINVAL; - goto free; - } - - return 0; - -free: - kvfree(*hdr_pp); - return err; -} - -static int libcfs_ioctl(unsigned long cmd, void __user *uparam) -{ - struct libcfs_ioctl_data *data = NULL; - struct libcfs_ioctl_hdr *hdr; - int err; - - /* 'cmd' and permissions get checked in our arch-specific caller */ - err = libcfs_ioctl_getdata(&hdr, uparam); - if (err) { - CDEBUG_LIMIT(D_ERROR, - "libcfs ioctl: data header error %d\n", err); - return err; - } - - if (hdr->ioc_version == LIBCFS_IOCTL_VERSION) { - /* - * The libcfs_ioctl_data_adjust() function performs adjustment - * operations on the libcfs_ioctl_data structure to make - * it usable by the code. This doesn't need to be called - * for new data structures added. - */ - data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); - err = libcfs_ioctl_data_adjust(data); - if (err) - goto out; - } - - CDEBUG(D_IOCTL, "libcfs ioctl cmd %lu\n", cmd); - switch (cmd) { - case IOC_LIBCFS_CLEAR_DEBUG: - libcfs_debug_clear_buffer(); - break; - - case IOC_LIBCFS_MARK_DEBUG: - if (!data || !data->ioc_inlbuf1 || - data->ioc_inlbuf1[data->ioc_inllen1 - 1] != '\0') { - err = -EINVAL; - goto out; - } - libcfs_debug_mark_buffer(data->ioc_inlbuf1); - break; - - default: - err = blocking_notifier_call_chain(&libcfs_ioctl_list, - cmd, hdr); - if (!(err & NOTIFY_STOP_MASK)) - /* No-one claimed the ioctl */ - err = -EINVAL; - else - err = notifier_to_errno(err); - if (!err) - if (copy_to_user(uparam, hdr, hdr->ioc_len)) - err = -EFAULT; - break; - } -out: - kvfree(hdr); - return err; -} - -static long -libcfs_psdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (_IOC_TYPE(cmd) != IOC_LIBCFS_TYPE || - _IOC_NR(cmd) < IOC_LIBCFS_MIN_NR || - _IOC_NR(cmd) > IOC_LIBCFS_MAX_NR) { - CDEBUG(D_IOCTL, "invalid ioctl ( type %d, nr %d, size %d )\n", - _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); - return -EINVAL; - } - - return libcfs_ioctl(cmd, (void __user *)arg); -} - -static const struct file_operations libcfs_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = libcfs_psdev_ioctl, -}; - -static struct miscdevice libcfs_dev = { - .minor = MISC_DYNAMIC_MINOR, - .name = "lnet", - .fops = &libcfs_fops, -}; - -static int libcfs_dev_registered; - -int lprocfs_call_handler(void *data, int write, loff_t *ppos, - void __user *buffer, size_t *lenp, - int (*handler)(void *data, int write, loff_t pos, - void __user *buffer, int len)) -{ - int rc = handler(data, write, *ppos, buffer, *lenp); - - 
if (rc < 0) - return rc; - - if (write) { - *ppos += *lenp; - } else { - *lenp = rc; - *ppos += rc; - } - return 0; -} -EXPORT_SYMBOL(lprocfs_call_handler); - -static int __proc_dobitmasks(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - const int tmpstrlen = 512; - char *tmpstr; - int rc; - unsigned int *mask = data; - int is_subsys = (mask == &libcfs_subsystem_debug) ? 1 : 0; - int is_printk = (mask == &libcfs_printk) ? 1 : 0; - - rc = cfs_trace_allocate_string_buffer(&tmpstr, tmpstrlen); - if (rc < 0) - return rc; - - if (!write) { - libcfs_debug_mask2str(tmpstr, tmpstrlen, *mask, is_subsys); - rc = strlen(tmpstr); - - if (pos >= rc) { - rc = 0; - } else { - rc = cfs_trace_copyout_string(buffer, nob, - tmpstr + pos, "\n"); - } - } else { - rc = cfs_trace_copyin_string(tmpstr, tmpstrlen, buffer, nob); - if (rc < 0) { - kfree(tmpstr); - return rc; - } - - rc = libcfs_debug_str2mask(mask, tmpstr, is_subsys); - /* Always print LBUG/LASSERT to console, so keep this mask */ - if (is_printk) - *mask |= D_EMERG; - } - - kfree(tmpstr); - return rc; -} - -static int proc_dobitmasks(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_dobitmasks); -} - -static int __proc_dump_kernel(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - if (!write) - return 0; - - return cfs_trace_dump_debug_buffer_usrstr(buffer, nob); -} - -static int proc_dump_kernel(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_dump_kernel); -} - -static int __proc_daemon_file(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - if (!write) { - int len = strlen(cfs_tracefile); - - if (pos >= len) - return 0; - - return cfs_trace_copyout_string(buffer, nob, - cfs_tracefile + pos, "\n"); - } - - return cfs_trace_daemon_command_usrstr(buffer, nob); -} - -static int proc_daemon_file(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_daemon_file); -} - -static int libcfs_force_lbug(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - if (write) - LBUG(); - return 0; -} - -static int proc_fail_loc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - int rc; - long old_fail_loc = cfs_fail_loc; - - rc = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); - if (old_fail_loc != cfs_fail_loc) - wake_up(&cfs_race_waitq); - return rc; -} - -static int __proc_cpt_table(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - char *buf = NULL; - int len = 4096; - int rc = 0; - - if (write) - return -EPERM; - - while (1) { - buf = kzalloc(len, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rc = cfs_cpt_table_print(cfs_cpt_tab, buf, len); - if (rc >= 0) - break; - - if (rc == -EFBIG) { - kfree(buf); - len <<= 1; - continue; - } - goto out; - } - - if (pos >= rc) { - rc = 0; - goto out; - } - - rc = cfs_trace_copyout_string(buffer, nob, buf + pos, NULL); - out: - kfree(buf); - return rc; -} - -static int proc_cpt_table(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_cpt_table); -} - -static struct ctl_table 
lnet_table[] = { - { - .procname = "debug", - .data = &libcfs_debug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dobitmasks, - }, - { - .procname = "subsystem_debug", - .data = &libcfs_subsystem_debug, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dobitmasks, - }, - { - .procname = "printk", - .data = &libcfs_printk, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dobitmasks, - }, - { - .procname = "cpu_partition_table", - .maxlen = 128, - .mode = 0444, - .proc_handler = &proc_cpt_table, - }, - { - .procname = "debug_log_upcall", - .data = lnet_debug_log_upcall, - .maxlen = sizeof(lnet_debug_log_upcall), - .mode = 0644, - .proc_handler = &proc_dostring, - }, - { - .procname = "catastrophe", - .data = &libcfs_catastrophe, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, - }, - { - .procname = "dump_kernel", - .maxlen = 256, - .mode = 0200, - .proc_handler = &proc_dump_kernel, - }, - { - .procname = "daemon_file", - .mode = 0644, - .maxlen = 256, - .proc_handler = &proc_daemon_file, - }, - { - .procname = "force_lbug", - .data = NULL, - .maxlen = 0, - .mode = 0200, - .proc_handler = &libcfs_force_lbug - }, - { - .procname = "fail_loc", - .data = &cfs_fail_loc, - .maxlen = sizeof(cfs_fail_loc), - .mode = 0644, - .proc_handler = &proc_fail_loc - }, - { - .procname = "fail_val", - .data = &cfs_fail_val, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .procname = "fail_err", - .data = &cfs_fail_err, - .maxlen = sizeof(cfs_fail_err), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - } -}; - -static const struct lnet_debugfs_symlink_def lnet_debugfs_symlinks[] = { - { "console_ratelimit", - "/sys/module/libcfs/parameters/libcfs_console_ratelimit"}, - { "debug_path", - "/sys/module/libcfs/parameters/libcfs_debug_file_path"}, - { "panic_on_lbug", - "/sys/module/libcfs/parameters/libcfs_panic_on_lbug"}, - { "libcfs_console_backoff", - "/sys/module/libcfs/parameters/libcfs_console_backoff"}, - { "debug_mb", - "/sys/module/libcfs/parameters/libcfs_debug_mb"}, - { "console_min_delay_centisecs", - "/sys/module/libcfs/parameters/libcfs_console_min_delay"}, - { "console_max_delay_centisecs", - "/sys/module/libcfs/parameters/libcfs_console_max_delay"}, - {}, -}; - -static ssize_t lnet_debugfs_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) -{ - struct ctl_table *table = filp->private_data; - int error; - - error = table->proc_handler(table, 0, (void __user *)buf, &count, ppos); - if (!error) - error = count; - - return error; -} - -static ssize_t lnet_debugfs_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct ctl_table *table = filp->private_data; - int error; - - error = table->proc_handler(table, 1, (void __user *)buf, &count, ppos); - if (!error) - error = count; - - return error; -} - -static const struct file_operations lnet_debugfs_file_operations_rw = { - .open = simple_open, - .read = lnet_debugfs_read, - .write = lnet_debugfs_write, - .llseek = default_llseek, -}; - -static const struct file_operations lnet_debugfs_file_operations_ro = { - .open = simple_open, - .read = lnet_debugfs_read, - .llseek = default_llseek, -}; - -static const struct file_operations lnet_debugfs_file_operations_wo = { - .open = simple_open, - .write = lnet_debugfs_write, - .llseek = default_llseek, -}; - -static const struct file_operations *lnet_debugfs_fops_select(umode_t mode) -{ - if (!(mode & 0222)) - return 
&lnet_debugfs_file_operations_ro; - - if (!(mode & 0444)) - return &lnet_debugfs_file_operations_wo; - - return &lnet_debugfs_file_operations_rw; -} - -void lustre_insert_debugfs(struct ctl_table *table) -{ - if (!lnet_debugfs_root) - lnet_debugfs_root = debugfs_create_dir("lnet", NULL); - - /* Even if we cannot create it, just ignore it altogether. */ - if (IS_ERR_OR_NULL(lnet_debugfs_root)) - return; - - /* - * We don't save the dentry returned because we don't call - * debugfs_remove() but rather remove_recursive() - */ - for (; table->procname; table++) - debugfs_create_file(table->procname, table->mode, - lnet_debugfs_root, table, - lnet_debugfs_fops_select(table->mode)); -} -EXPORT_SYMBOL_GPL(lustre_insert_debugfs); - -static void lustre_insert_debugfs_links( - const struct lnet_debugfs_symlink_def *symlinks) -{ - for (; symlinks && symlinks->name; symlinks++) - debugfs_create_symlink(symlinks->name, lnet_debugfs_root, - symlinks->target); -} - -static void lustre_remove_debugfs(void) -{ - debugfs_remove_recursive(lnet_debugfs_root); - - lnet_debugfs_root = NULL; -} - -static DEFINE_MUTEX(libcfs_startup); -static int libcfs_active; - -int libcfs_setup(void) -{ - int rc = -EINVAL; - - mutex_lock(&libcfs_startup); - if (libcfs_active) - goto out; - - if (!libcfs_dev_registered) - goto err; - - rc = libcfs_debug_init(5 * 1024 * 1024); - if (rc < 0) { - pr_err("LustreError: libcfs_debug_init: %d\n", rc); - goto err; - } - - rc = cfs_cpu_init(); - if (rc) - goto err; - - cfs_rehash_wq = alloc_workqueue("cfs_rh", WQ_SYSFS, 4); - if (!cfs_rehash_wq) { - CERROR("Failed to start rehash workqueue.\n"); - rc = -ENOMEM; - goto err; - } - - rc = cfs_crypto_register(); - if (rc) { - CERROR("cfs_crypto_register: error %d\n", rc); - goto err; - } - - lustre_insert_debugfs(lnet_table); - if (!IS_ERR_OR_NULL(lnet_debugfs_root)) - lustre_insert_debugfs_links(lnet_debugfs_symlinks); - - CDEBUG(D_OTHER, "portals setup OK\n"); -out: - libcfs_active = 1; - mutex_unlock(&libcfs_startup); - return 0; -err: - cfs_crypto_unregister(); - if (cfs_rehash_wq) - destroy_workqueue(cfs_rehash_wq); - cfs_cpu_fini(); - libcfs_debug_cleanup(); - mutex_unlock(&libcfs_startup); - return rc; -} -EXPORT_SYMBOL(libcfs_setup); - -static int libcfs_init(void) -{ - int rc; - - rc = misc_register(&libcfs_dev); - if (rc) - CERROR("misc_register: error %d\n", rc); - else - libcfs_dev_registered = 1; - return rc; -} - -static void libcfs_exit(void) -{ - int rc; - - lustre_remove_debugfs(); - - if (cfs_rehash_wq) - destroy_workqueue(cfs_rehash_wq); - - cfs_crypto_unregister(); - - if (libcfs_dev_registered) - misc_deregister(&libcfs_dev); - - cfs_cpu_fini(); - - rc = libcfs_debug_cleanup(); - if (rc) - pr_err("LustreError: libcfs_debug_cleanup: %d\n", rc); -} - -MODULE_AUTHOR("OpenSFS, Inc. <http://www.lustre.org/>"); -MODULE_DESCRIPTION("Lustre helper library"); -MODULE_VERSION(LIBCFS_VERSION); -MODULE_LICENSE("GPL"); - -module_init(libcfs_init); -module_exit(libcfs_exit); diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.c b/drivers/staging/lustre/lnet/libcfs/tracefile.c deleted file mode 100644 index 7ca562e156f078a96242c8e287a01ad0b2959b8c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/tracefile.c +++ /dev/null @@ -1,1198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * libcfs/libcfs/tracefile.c - * - * Author: Zach Brown - * Author: Phil Schwan - */ - -#define DEBUG_SUBSYSTEM S_LNET -#define LUSTRE_TRACEFILE_PRIVATE -#define pr_fmt(fmt) "Lustre: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include "tracefile.h" - -/* XXX move things up to the top, comment */ -union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS] __cacheline_aligned; - -char cfs_tracefile[TRACEFILE_NAME_SIZE]; -long long cfs_tracefile_size = CFS_TRACEFILE_SIZE; -static struct tracefiled_ctl trace_tctl; -static DEFINE_MUTEX(cfs_trace_thread_mutex); -static int thread_running; - -static atomic_t cfs_tage_allocated = ATOMIC_INIT(0); - -struct page_collection { - struct list_head pc_pages; - /* - * if this flag is set, collect_pages() will spill both - * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise, - * only ->tcd_pages are spilled. - */ - int pc_want_daemon_pages; -}; - -struct tracefiled_ctl { - struct completion tctl_start; - struct completion tctl_stop; - wait_queue_head_t tctl_waitq; - pid_t tctl_pid; - atomic_t tctl_shutdown; -}; - -/* - * small data-structure for each page owned by tracefiled. - */ -struct cfs_trace_page { - /* - * page itself - */ - struct page *page; - /* - * linkage into one of the lists in trace_data_union or - * page_collection - */ - struct list_head linkage; - /* - * number of bytes used within this page - */ - unsigned int used; - /* - * cpu that owns this page - */ - unsigned short cpu; - /* - * type(context) of this page - */ - unsigned short type; -}; - -static void put_pages_on_tcd_daemon_list(struct page_collection *pc, - struct cfs_trace_cpu_data *tcd); - -static inline struct cfs_trace_page * -cfs_tage_from_list(struct list_head *list) -{ - return list_entry(list, struct cfs_trace_page, linkage); -} - -static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp) -{ - struct page *page; - struct cfs_trace_page *tage; - - /* My caller is trying to free memory */ - if (!in_interrupt() && (current->flags & PF_MEMALLOC)) - return NULL; - - /* - * Don't spam console with allocation failures: they will be reported - * by upper layer anyway. 
- */ - gfp |= __GFP_NOWARN; - page = alloc_page(gfp); - if (!page) - return NULL; - - tage = kmalloc(sizeof(*tage), gfp); - if (!tage) { - __free_page(page); - return NULL; - } - - tage->page = page; - atomic_inc(&cfs_tage_allocated); - return tage; -} - -static void cfs_tage_free(struct cfs_trace_page *tage) -{ - __free_page(tage->page); - kfree(tage); - atomic_dec(&cfs_tage_allocated); -} - -static void cfs_tage_to_tail(struct cfs_trace_page *tage, - struct list_head *queue) -{ - list_move_tail(&tage->linkage, queue); -} - -int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, - struct list_head *stock) -{ - int i; - - /* - * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) - * from here: this will lead to infinite recursion. - */ - - for (i = 0; i + tcd->tcd_cur_stock_pages < TCD_STOCK_PAGES ; ++i) { - struct cfs_trace_page *tage; - - tage = cfs_tage_alloc(gfp); - if (!tage) - break; - list_add_tail(&tage->linkage, stock); - } - return i; -} - -/* return a page that has 'len' bytes left at the end */ -static struct cfs_trace_page * -cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len) -{ - struct cfs_trace_page *tage; - - if (tcd->tcd_cur_pages > 0) { - __LASSERT(!list_empty(&tcd->tcd_pages)); - tage = cfs_tage_from_list(tcd->tcd_pages.prev); - if (tage->used + len <= PAGE_SIZE) - return tage; - } - - if (tcd->tcd_cur_pages < tcd->tcd_max_pages) { - if (tcd->tcd_cur_stock_pages > 0) { - tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev); - --tcd->tcd_cur_stock_pages; - list_del_init(&tage->linkage); - } else { - tage = cfs_tage_alloc(GFP_ATOMIC); - if (unlikely(!tage)) { - if (!(current->flags & PF_MEMALLOC) || - in_interrupt()) - pr_warn_ratelimited("cannot allocate a tage (%ld)\n", - tcd->tcd_cur_pages); - return NULL; - } - } - - tage->used = 0; - tage->cpu = smp_processor_id(); - tage->type = tcd->tcd_type; - list_add_tail(&tage->linkage, &tcd->tcd_pages); - tcd->tcd_cur_pages++; - - if (tcd->tcd_cur_pages > 8 && thread_running) { - struct tracefiled_ctl *tctl = &trace_tctl; - /* - * wake up tracefiled to process some pages. - */ - wake_up(&tctl->tctl_waitq); - } - return tage; - } - return NULL; -} - -static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd) -{ - int pgcount = tcd->tcd_cur_pages / 10; - struct page_collection pc; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - - /* - * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) - * from here: this will lead to infinite recursion. - */ - - pr_warn_ratelimited("debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n", - pgcount + 1, tcd->tcd_cur_pages); - - INIT_LIST_HEAD(&pc.pc_pages); - - list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) { - if (!pgcount--) - break; - - list_move_tail(&tage->linkage, &pc.pc_pages); - tcd->tcd_cur_pages--; - } - put_pages_on_tcd_daemon_list(&pc, tcd); -} - -/* return a page that has 'len' bytes left at the end */ -static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd, - unsigned long len) -{ - struct cfs_trace_page *tage; - - /* - * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT) - * from here: this will lead to infinite recursion. 
- */ - - if (len > PAGE_SIZE) { - pr_err("cowardly refusing to write %lu bytes in a page\n", len); - return NULL; - } - - tage = cfs_trace_get_tage_try(tcd, len); - if (tage) - return tage; - if (thread_running) - cfs_tcd_shrink(tcd); - if (tcd->tcd_cur_pages > 0) { - tage = cfs_tage_from_list(tcd->tcd_pages.next); - tage->used = 0; - cfs_tage_to_tail(tage, &tcd->tcd_pages); - } - return tage; -} - -int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata, - const char *format, ...) -{ - va_list args; - int rc; - - va_start(args, format); - rc = libcfs_debug_vmsg2(msgdata, format, args, NULL); - va_end(args); - - return rc; -} -EXPORT_SYMBOL(libcfs_debug_msg); - -int libcfs_debug_vmsg2(struct libcfs_debug_msg_data *msgdata, - const char *format1, va_list args, - const char *format2, ...) -{ - struct cfs_trace_cpu_data *tcd = NULL; - struct ptldebug_header header = { 0 }; - struct cfs_trace_page *tage; - /* string_buf is used only if tcd != NULL, and is always set then */ - char *string_buf = NULL; - char *debug_buf; - int known_size; - int needed = 85; /* average message length */ - int max_nob; - va_list ap; - int depth; - int i; - int remain; - int mask = msgdata->msg_mask; - const char *file = kbasename(msgdata->msg_file); - struct cfs_debug_limit_state *cdls = msgdata->msg_cdls; - - tcd = cfs_trace_get_tcd(); - - /* cfs_trace_get_tcd() grabs a lock, which disables preemption and - * pins us to a particular CPU. This avoids an smp_processor_id() - * warning on Linux when debugging is enabled. - */ - cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK()); - - if (!tcd) /* arch may not log in IRQ context */ - goto console; - - if (!tcd->tcd_cur_pages) - header.ph_flags |= PH_FLAG_FIRST_RECORD; - - if (tcd->tcd_shutting_down) { - cfs_trace_put_tcd(tcd); - tcd = NULL; - goto console; - } - - depth = 0; - known_size = strlen(file) + 1 + depth; - if (msgdata->msg_fn) - known_size += strlen(msgdata->msg_fn) + 1; - - if (libcfs_debug_binary) - known_size += sizeof(header); - - /* - * '2' is used because vsnprintf returns the real size required for the - * output _without_ the terminating NUL, so a second pass is needed if - * 'needed' was too small for this format. - */ - for (i = 0; i < 2; i++) { - tage = cfs_trace_get_tage(tcd, needed + known_size + 1); - if (!tage) { - if (needed + known_size > PAGE_SIZE) - mask |= D_ERROR; - - cfs_trace_put_tcd(tcd); - tcd = NULL; - goto console; - } - - string_buf = (char *)page_address(tage->page) + - tage->used + known_size; - - max_nob = PAGE_SIZE - tage->used - known_size; - if (max_nob <= 0) { - pr_emerg("negative max_nob: %d\n", max_nob); - mask |= D_ERROR; - cfs_trace_put_tcd(tcd); - tcd = NULL; - goto console; - } - - needed = 0; - if (format1) { - va_copy(ap, args); - needed = vsnprintf(string_buf, max_nob, format1, ap); - va_end(ap); - } - - if (format2) { - remain = max_nob - needed; - if (remain < 0) - remain = 0; - - va_start(ap, format2); - needed += vsnprintf(string_buf + needed, remain, - format2, ap); - va_end(ap); - } - - if (needed < max_nob) /* well. printing ok.. 
*/ - break; - } - - if (*(string_buf + needed - 1) != '\n') - pr_info("format at %s:%d:%s doesn't end in newline\n", file, - msgdata->msg_line, msgdata->msg_fn); - - header.ph_len = known_size + needed; - debug_buf = (char *)page_address(tage->page) + tage->used; - - if (libcfs_debug_binary) { - memcpy(debug_buf, &header, sizeof(header)); - tage->used += sizeof(header); - debug_buf += sizeof(header); - } - - /* indent message according to the nesting level */ - while (depth-- > 0) { - *(debug_buf++) = '.'; - ++tage->used; - } - - strcpy(debug_buf, file); - tage->used += strlen(file) + 1; - debug_buf += strlen(file) + 1; - - if (msgdata->msg_fn) { - strcpy(debug_buf, msgdata->msg_fn); - tage->used += strlen(msgdata->msg_fn) + 1; - debug_buf += strlen(msgdata->msg_fn) + 1; - } - - __LASSERT(debug_buf == string_buf); - - tage->used += needed; - __LASSERT(tage->used <= PAGE_SIZE); - -console: - if (!(mask & libcfs_printk)) { - /* no console output requested */ - if (tcd) - cfs_trace_put_tcd(tcd); - return 1; - } - - if (cdls) { - if (libcfs_console_ratelimit && - cdls->cdls_next && /* not first time ever */ - !time_after(jiffies, cdls->cdls_next)) { - /* skipping a console message */ - cdls->cdls_count++; - if (tcd) - cfs_trace_put_tcd(tcd); - return 1; - } - - if (time_after(jiffies, - cdls->cdls_next + libcfs_console_max_delay + - 10 * HZ)) { - /* last timeout was a long time ago */ - cdls->cdls_delay /= libcfs_console_backoff * 4; - } else { - cdls->cdls_delay *= libcfs_console_backoff; - } - - if (cdls->cdls_delay < libcfs_console_min_delay) - cdls->cdls_delay = libcfs_console_min_delay; - else if (cdls->cdls_delay > libcfs_console_max_delay) - cdls->cdls_delay = libcfs_console_max_delay; - - /* ensure cdls_next is never zero after it's been seen */ - cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1; - } - - if (tcd) { - cfs_print_to_console(&header, mask, string_buf, needed, file, - msgdata->msg_fn); - cfs_trace_put_tcd(tcd); - } else { - string_buf = cfs_trace_get_console_buffer(); - - needed = 0; - if (format1) { - va_copy(ap, args); - needed = vsnprintf(string_buf, - CFS_TRACE_CONSOLE_BUFFER_SIZE, - format1, ap); - va_end(ap); - } - if (format2) { - remain = CFS_TRACE_CONSOLE_BUFFER_SIZE - needed; - if (remain > 0) { - va_start(ap, format2); - needed += vsnprintf(string_buf + needed, remain, - format2, ap); - va_end(ap); - } - } - cfs_print_to_console(&header, mask, - string_buf, needed, file, msgdata->msg_fn); - - put_cpu(); - } - - if (cdls && cdls->cdls_count) { - string_buf = cfs_trace_get_console_buffer(); - - needed = snprintf(string_buf, CFS_TRACE_CONSOLE_BUFFER_SIZE, - "Skipped %d previous similar message%s\n", - cdls->cdls_count, - (cdls->cdls_count > 1) ? "s" : ""); - - cfs_print_to_console(&header, mask, - string_buf, needed, file, msgdata->msg_fn); - - put_cpu(); - cdls->cdls_count = 0; - } - - return 0; -} -EXPORT_SYMBOL(libcfs_debug_vmsg2); - -void -cfs_trace_assertion_failed(const char *str, - struct libcfs_debug_msg_data *msgdata) -{ - struct ptldebug_header hdr; - - libcfs_panic_in_progress = 1; - libcfs_catastrophe = 1; - mb(); - - cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK()); - - cfs_print_to_console(&hdr, D_EMERG, str, strlen(str), - msgdata->msg_file, msgdata->msg_fn); - - panic("Lustre debug assertion failure\n"); - - /* not reached */ -} - -static void -panic_collect_pages(struct page_collection *pc) -{ - /* Do the collect_pages job on a single CPU: assumes that all other - * CPUs have been stopped during a panic. 
If this isn't true for some - * arch, this will have to be implemented separately in each arch. - */ - struct cfs_trace_cpu_data *tcd; - int i; - int j; - - INIT_LIST_HEAD(&pc->pc_pages); - - cfs_tcd_for_each(tcd, i, j) { - list_splice_init(&tcd->tcd_pages, &pc->pc_pages); - tcd->tcd_cur_pages = 0; - - if (pc->pc_want_daemon_pages) { - list_splice_init(&tcd->tcd_daemon_pages, &pc->pc_pages); - tcd->tcd_cur_daemon_pages = 0; - } - } -} - -static void collect_pages_on_all_cpus(struct page_collection *pc) -{ - struct cfs_trace_cpu_data *tcd; - int i, cpu; - - for_each_possible_cpu(cpu) { - cfs_tcd_for_each_type_lock(tcd, i, cpu) { - list_splice_init(&tcd->tcd_pages, &pc->pc_pages); - tcd->tcd_cur_pages = 0; - if (pc->pc_want_daemon_pages) { - list_splice_init(&tcd->tcd_daemon_pages, - &pc->pc_pages); - tcd->tcd_cur_daemon_pages = 0; - } - } - } -} - -static void collect_pages(struct page_collection *pc) -{ - INIT_LIST_HEAD(&pc->pc_pages); - - if (libcfs_panic_in_progress) - panic_collect_pages(pc); - else - collect_pages_on_all_cpus(pc); -} - -static void put_pages_back_on_all_cpus(struct page_collection *pc) -{ - struct cfs_trace_cpu_data *tcd; - struct list_head *cur_head; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - int i, cpu; - - for_each_possible_cpu(cpu) { - cfs_tcd_for_each_type_lock(tcd, i, cpu) { - cur_head = tcd->tcd_pages.next; - - list_for_each_entry_safe(tage, tmp, &pc->pc_pages, - linkage) { - __LASSERT_TAGE_INVARIANT(tage); - - if (tage->cpu != cpu || tage->type != i) - continue; - - cfs_tage_to_tail(tage, cur_head); - tcd->tcd_cur_pages++; - } - } - } -} - -static void put_pages_back(struct page_collection *pc) -{ - if (!libcfs_panic_in_progress) - put_pages_back_on_all_cpus(pc); -} - -/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that - * we have a good amount of data at all times for dumping during an LBUG, even - * if we have been steadily writing (and otherwise discarding) pages via the - * debug daemon. 
- */ -static void put_pages_on_tcd_daemon_list(struct page_collection *pc, - struct cfs_trace_cpu_data *tcd) -{ - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - - list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) { - __LASSERT_TAGE_INVARIANT(tage); - - if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type) - continue; - - cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages); - tcd->tcd_cur_daemon_pages++; - - if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) { - struct cfs_trace_page *victim; - - __LASSERT(!list_empty(&tcd->tcd_daemon_pages)); - victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next); - - __LASSERT_TAGE_INVARIANT(victim); - - list_del(&victim->linkage); - cfs_tage_free(victim); - tcd->tcd_cur_daemon_pages--; - } - } -} - -static void put_pages_on_daemon_list(struct page_collection *pc) -{ - struct cfs_trace_cpu_data *tcd; - int i, cpu; - - for_each_possible_cpu(cpu) { - cfs_tcd_for_each_type_lock(tcd, i, cpu) - put_pages_on_tcd_daemon_list(pc, tcd); - } -} - -void cfs_trace_debug_print(void) -{ - struct page_collection pc; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - - pc.pc_want_daemon_pages = 1; - collect_pages(&pc); - list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { - char *p, *file, *fn; - struct page *page; - - __LASSERT_TAGE_INVARIANT(tage); - - page = tage->page; - p = page_address(page); - while (p < ((char *)page_address(page) + tage->used)) { - struct ptldebug_header *hdr; - int len; - - hdr = (void *)p; - p += sizeof(*hdr); - file = p; - p += strlen(file) + 1; - fn = p; - p += strlen(fn) + 1; - len = hdr->ph_len - (int)(p - (char *)hdr); - - cfs_print_to_console(hdr, D_EMERG, p, len, file, fn); - - p += len; - } - - list_del(&tage->linkage); - cfs_tage_free(tage); - } -} - -int cfs_tracefile_dump_all_pages(char *filename) -{ - struct page_collection pc; - struct file *filp; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - char *buf; - mm_segment_t __oldfs; - int rc; - - cfs_tracefile_write_lock(); - - filp = filp_open(filename, O_CREAT | O_EXCL | O_WRONLY | O_LARGEFILE, - 0600); - if (IS_ERR(filp)) { - rc = PTR_ERR(filp); - filp = NULL; - pr_err("LustreError: can't open %s for dump: rc %d\n", - filename, rc); - goto out; - } - - pc.pc_want_daemon_pages = 1; - collect_pages(&pc); - if (list_empty(&pc.pc_pages)) { - rc = 0; - goto close; - } - __oldfs = get_fs(); - set_fs(get_ds()); - - /* ok, for now, just write the pages. 
in the future we'll be building - * iobufs with the pages and calling generic_direct_IO - */ - list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { - __LASSERT_TAGE_INVARIANT(tage); - - buf = kmap(tage->page); - rc = kernel_write(filp, buf, tage->used, &filp->f_pos); - kunmap(tage->page); - - if (rc != (int)tage->used) { - pr_warn("wanted to write %u but wrote %d\n", tage->used, - rc); - put_pages_back(&pc); - __LASSERT(list_empty(&pc.pc_pages)); - break; - } - list_del(&tage->linkage); - cfs_tage_free(tage); - } - set_fs(__oldfs); - rc = vfs_fsync(filp, 1); - if (rc) - pr_err("sync returns %d\n", rc); -close: - filp_close(filp, NULL); -out: - cfs_tracefile_write_unlock(); - return rc; -} - -void cfs_trace_flush_pages(void) -{ - struct page_collection pc; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - - pc.pc_want_daemon_pages = 1; - collect_pages(&pc); - list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { - __LASSERT_TAGE_INVARIANT(tage); - - list_del(&tage->linkage); - cfs_tage_free(tage); - } -} - -int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, - const char __user *usr_buffer, int usr_buffer_nob) -{ - int nob; - - if (usr_buffer_nob > knl_buffer_nob) - return -EOVERFLOW; - - if (copy_from_user((void *)knl_buffer, - usr_buffer, usr_buffer_nob)) - return -EFAULT; - - nob = strnlen(knl_buffer, usr_buffer_nob); - while (--nob >= 0) /* strip trailing whitespace */ - if (!isspace(knl_buffer[nob])) - break; - - if (nob < 0) /* empty string */ - return -EINVAL; - - if (nob == knl_buffer_nob) /* no space to terminate */ - return -EOVERFLOW; - - knl_buffer[nob + 1] = 0; /* terminate */ - return 0; -} -EXPORT_SYMBOL(cfs_trace_copyin_string); - -int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, - const char *knl_buffer, char *append) -{ - /* - * NB if 'append' != NULL, it's a single character to append to the - * copied out string - usually "\n" or "" (i.e. 
a terminating zero byte) - */ - int nob = strlen(knl_buffer); - - if (nob > usr_buffer_nob) - nob = usr_buffer_nob; - - if (copy_to_user(usr_buffer, knl_buffer, nob)) - return -EFAULT; - - if (append && nob < usr_buffer_nob) { - if (copy_to_user(usr_buffer + nob, append, 1)) - return -EFAULT; - - nob++; - } - - return nob; -} -EXPORT_SYMBOL(cfs_trace_copyout_string); - -int cfs_trace_allocate_string_buffer(char **str, int nob) -{ - if (nob > 2 * PAGE_SIZE) /* string must be "sensible" */ - return -EINVAL; - - *str = kmalloc(nob, GFP_KERNEL | __GFP_ZERO); - if (!*str) - return -ENOMEM; - - return 0; -} - -int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob) -{ - char *str; - int rc; - - rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); - if (rc) - return rc; - - rc = cfs_trace_copyin_string(str, usr_str_nob + 1, - usr_str, usr_str_nob); - if (rc) - goto out; - - if (str[0] != '/') { - rc = -EINVAL; - goto out; - } - rc = cfs_tracefile_dump_all_pages(str); -out: - kfree(str); - return rc; -} - -int cfs_trace_daemon_command(char *str) -{ - int rc = 0; - - cfs_tracefile_write_lock(); - - if (!strcmp(str, "stop")) { - cfs_tracefile_write_unlock(); - cfs_trace_stop_thread(); - cfs_tracefile_write_lock(); - memset(cfs_tracefile, 0, sizeof(cfs_tracefile)); - - } else if (!strncmp(str, "size=", 5)) { - unsigned long tmp; - - rc = kstrtoul(str + 5, 10, &tmp); - if (!rc) { - if (tmp < 10 || tmp > 20480) - cfs_tracefile_size = CFS_TRACEFILE_SIZE; - else - cfs_tracefile_size = tmp << 20; - } - } else if (strlen(str) >= sizeof(cfs_tracefile)) { - rc = -ENAMETOOLONG; - } else if (str[0] != '/') { - rc = -EINVAL; - } else { - strcpy(cfs_tracefile, str); - - pr_info("debug daemon will attempt to start writing to %s (%lukB max)\n", - cfs_tracefile, - (long)(cfs_tracefile_size >> 10)); - - cfs_trace_start_thread(); - } - - cfs_tracefile_write_unlock(); - return rc; -} - -int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob) -{ - char *str; - int rc; - - rc = cfs_trace_allocate_string_buffer(&str, usr_str_nob + 1); - if (rc) - return rc; - - rc = cfs_trace_copyin_string(str, usr_str_nob + 1, - usr_str, usr_str_nob); - if (!rc) - rc = cfs_trace_daemon_command(str); - - kfree(str); - return rc; -} - -int cfs_trace_set_debug_mb(int mb) -{ - int i; - int j; - int pages; - int limit = cfs_trace_max_debug_mb(); - struct cfs_trace_cpu_data *tcd; - - if (mb < num_possible_cpus()) { - pr_warn("%d MB is too small for debug buffer size, setting it to %d MB.\n", - mb, num_possible_cpus()); - mb = num_possible_cpus(); - } - - if (mb > limit) { - pr_warn("%d MB is too large for debug buffer size, setting it to %d MB.\n", - mb, limit); - mb = limit; - } - - mb /= num_possible_cpus(); - pages = mb << (20 - PAGE_SHIFT); - - cfs_tracefile_write_lock(); - - cfs_tcd_for_each(tcd, i, j) - tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100; - - cfs_tracefile_write_unlock(); - - return 0; -} - -int cfs_trace_get_debug_mb(void) -{ - int i; - int j; - struct cfs_trace_cpu_data *tcd; - int total_pages = 0; - - cfs_tracefile_read_lock(); - - cfs_tcd_for_each(tcd, i, j) - total_pages += tcd->tcd_max_pages; - - cfs_tracefile_read_unlock(); - - return (total_pages >> (20 - PAGE_SHIFT)) + 1; -} - -static int tracefiled(void *arg) -{ - struct page_collection pc; - struct tracefiled_ctl *tctl = arg; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - struct file *filp; - char *buf; - int last_loop = 0; - int rc; - - /* we're started late enough that we pick up 
init's fs context */ - /* this is so broken in uml? what on earth is going on? */ - - complete(&tctl->tctl_start); - - while (1) { - wait_queue_entry_t __wait; - - pc.pc_want_daemon_pages = 0; - collect_pages(&pc); - if (list_empty(&pc.pc_pages)) - goto end_loop; - - filp = NULL; - cfs_tracefile_read_lock(); - if (cfs_tracefile[0]) { - filp = filp_open(cfs_tracefile, - O_CREAT | O_RDWR | O_LARGEFILE, - 0600); - if (IS_ERR(filp)) { - rc = PTR_ERR(filp); - filp = NULL; - pr_warn("couldn't open %s: %d\n", cfs_tracefile, - rc); - } - } - cfs_tracefile_read_unlock(); - if (!filp) { - put_pages_on_daemon_list(&pc); - __LASSERT(list_empty(&pc.pc_pages)); - goto end_loop; - } - - list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) { - static loff_t f_pos; - - __LASSERT_TAGE_INVARIANT(tage); - - if (f_pos >= (off_t)cfs_tracefile_size) - f_pos = 0; - else if (f_pos > i_size_read(file_inode(filp))) - f_pos = i_size_read(file_inode(filp)); - - buf = kmap(tage->page); - rc = kernel_write(filp, buf, tage->used, &f_pos); - kunmap(tage->page); - - if (rc != (int)tage->used) { - pr_warn("wanted to write %u but wrote %d\n", - tage->used, rc); - put_pages_back(&pc); - __LASSERT(list_empty(&pc.pc_pages)); - break; - } - } - - filp_close(filp, NULL); - put_pages_on_daemon_list(&pc); - if (!list_empty(&pc.pc_pages)) { - int i; - - pr_alert("trace pages aren't empty\n"); - pr_err("total cpus(%d): ", num_possible_cpus()); - for (i = 0; i < num_possible_cpus(); i++) - if (cpu_online(i)) - pr_cont("%d(on) ", i); - else - pr_cont("%d(off) ", i); - pr_cont("\n"); - - i = 0; - list_for_each_entry_safe(tage, tmp, &pc.pc_pages, - linkage) - pr_err("page %d belongs to cpu %d\n", - ++i, tage->cpu); - pr_err("There are %d pages unwritten\n", i); - } - __LASSERT(list_empty(&pc.pc_pages)); -end_loop: - if (atomic_read(&tctl->tctl_shutdown)) { - if (!last_loop) { - last_loop = 1; - continue; - } else { - break; - } - } - init_waitqueue_entry(&__wait, current); - add_wait_queue(&tctl->tctl_waitq, &__wait); - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - remove_wait_queue(&tctl->tctl_waitq, &__wait); - } - complete(&tctl->tctl_stop); - return 0; -} - -int cfs_trace_start_thread(void) -{ - struct tracefiled_ctl *tctl = &trace_tctl; - struct task_struct *task; - int rc = 0; - - mutex_lock(&cfs_trace_thread_mutex); - if (thread_running) - goto out; - - init_completion(&tctl->tctl_start); - init_completion(&tctl->tctl_stop); - init_waitqueue_head(&tctl->tctl_waitq); - atomic_set(&tctl->tctl_shutdown, 0); - - task = kthread_run(tracefiled, tctl, "ktracefiled"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - goto out; - } - - wait_for_completion(&tctl->tctl_start); - thread_running = 1; -out: - mutex_unlock(&cfs_trace_thread_mutex); - return rc; -} - -void cfs_trace_stop_thread(void) -{ - struct tracefiled_ctl *tctl = &trace_tctl; - - mutex_lock(&cfs_trace_thread_mutex); - if (thread_running) { - pr_info("shutting down debug daemon thread...\n"); - atomic_set(&tctl->tctl_shutdown, 1); - wait_for_completion(&tctl->tctl_stop); - thread_running = 0; - } - mutex_unlock(&cfs_trace_thread_mutex); -} - -int cfs_tracefile_init(int max_pages) -{ - struct cfs_trace_cpu_data *tcd; - int i; - int j; - int rc; - int factor; - - rc = cfs_tracefile_init_arch(); - if (rc) - return rc; - - cfs_tcd_for_each(tcd, i, j) { - /* tcd_pages_factor is initialized in tracefile_init_arch. 
*/ - factor = tcd->tcd_pages_factor; - INIT_LIST_HEAD(&tcd->tcd_pages); - INIT_LIST_HEAD(&tcd->tcd_stock_pages); - INIT_LIST_HEAD(&tcd->tcd_daemon_pages); - tcd->tcd_cur_pages = 0; - tcd->tcd_cur_stock_pages = 0; - tcd->tcd_cur_daemon_pages = 0; - tcd->tcd_max_pages = (max_pages * factor) / 100; - LASSERT(tcd->tcd_max_pages > 0); - tcd->tcd_shutting_down = 0; - } - - return 0; -} - -static void trace_cleanup_on_all_cpus(void) -{ - struct cfs_trace_cpu_data *tcd; - struct cfs_trace_page *tage; - struct cfs_trace_page *tmp; - int i, cpu; - - for_each_possible_cpu(cpu) { - cfs_tcd_for_each_type_lock(tcd, i, cpu) { - tcd->tcd_shutting_down = 1; - - list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, - linkage) { - __LASSERT_TAGE_INVARIANT(tage); - - list_del(&tage->linkage); - cfs_tage_free(tage); - } - - tcd->tcd_cur_pages = 0; - } - } -} - -static void cfs_trace_cleanup(void) -{ - struct page_collection pc; - - INIT_LIST_HEAD(&pc.pc_pages); - - trace_cleanup_on_all_cpus(); - - cfs_tracefile_fini_arch(); -} - -void cfs_tracefile_exit(void) -{ - cfs_trace_stop_thread(); - cfs_trace_cleanup(); -} diff --git a/drivers/staging/lustre/lnet/libcfs/tracefile.h b/drivers/staging/lustre/lnet/libcfs/tracefile.h deleted file mode 100644 index 0608240d897ff2f97d19a2bf324942dfee47c913..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/libcfs/tracefile.h +++ /dev/null @@ -1,274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LIBCFS_TRACEFILE_H__ -#define __LIBCFS_TRACEFILE_H__ - -#include -#include -#include -#include -#include -#include -#include - -enum cfs_trace_buf_type { - CFS_TCD_TYPE_PROC = 0, - CFS_TCD_TYPE_SOFTIRQ, - CFS_TCD_TYPE_IRQ, - CFS_TCD_TYPE_MAX -}; - -/* trace file lock routines */ - -#define TRACEFILE_NAME_SIZE 1024 -extern char cfs_tracefile[TRACEFILE_NAME_SIZE]; -extern long long cfs_tracefile_size; - -/** - * The path of debug log dump upcall script. 
- */ -extern char lnet_debug_log_upcall[1024]; - -void libcfs_run_debug_log_upcall(char *file); - -int cfs_tracefile_init_arch(void); -void cfs_tracefile_fini_arch(void); - -void cfs_tracefile_read_lock(void); -void cfs_tracefile_read_unlock(void); -void cfs_tracefile_write_lock(void); -void cfs_tracefile_write_unlock(void); - -int cfs_tracefile_dump_all_pages(char *filename); -void cfs_trace_debug_print(void); -void cfs_trace_flush_pages(void); -int cfs_trace_start_thread(void); -void cfs_trace_stop_thread(void); -int cfs_tracefile_init(int max_pages); -void cfs_tracefile_exit(void); - -int cfs_trace_copyin_string(char *knl_buffer, int knl_buffer_nob, - const char __user *usr_buffer, int usr_buffer_nob); -int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob, - const char *knl_str, char *append); -int cfs_trace_allocate_string_buffer(char **str, int nob); -int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob); -int cfs_trace_daemon_command(char *str); -int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob); -int cfs_trace_set_debug_mb(int mb); -int cfs_trace_get_debug_mb(void); - -void libcfs_debug_dumplog_internal(void *arg); -void libcfs_register_panic_notifier(void); -void libcfs_unregister_panic_notifier(void); -extern int libcfs_panic_in_progress; -int cfs_trace_max_debug_mb(void); - -#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) -#define TCD_STOCK_PAGES (TCD_MAX_PAGES) -#define CFS_TRACEFILE_SIZE (500 << 20) - -#ifdef LUSTRE_TRACEFILE_PRIVATE - -/* - * Private declarations for tracefile - */ -#define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT)) -#define TCD_STOCK_PAGES (TCD_MAX_PAGES) - -#define CFS_TRACEFILE_SIZE (500 << 20) - -/* - * Size of a buffer for printing console messages if we can't get a page - * from system - */ -#define CFS_TRACE_CONSOLE_BUFFER_SIZE 1024 - -union cfs_trace_data_union { - struct cfs_trace_cpu_data { - /* - * Even though this structure is meant to be per-CPU, locking - * is needed because in some places the data may be accessed - * from other CPUs. This lock is directly used in trace_get_tcd - * and trace_put_tcd, which are called in libcfs_debug_vmsg2 and - * tcd_for_each_type_lock - */ - spinlock_t tcd_lock; - unsigned long tcd_lock_flags; - - /* - * pages with trace records not yet processed by tracefiled. - */ - struct list_head tcd_pages; - /* number of pages on ->tcd_pages */ - unsigned long tcd_cur_pages; - - /* - * pages with trace records already processed by - * tracefiled. These pages are kept in memory, so that some - * portion of log can be written in the event of LBUG. This - * list is maintained in LRU order. - * - * Pages are moved to ->tcd_daemon_pages by tracefiled() - * (put_pages_on_daemon_list()). LRU pages from this list are - * discarded when list grows too large. - */ - struct list_head tcd_daemon_pages; - /* number of pages on ->tcd_daemon_pages */ - unsigned long tcd_cur_daemon_pages; - - /* - * Maximal number of pages allowed on ->tcd_pages and - * ->tcd_daemon_pages each. - * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current - * implementation. - */ - unsigned long tcd_max_pages; - - /* - * preallocated pages to write trace records into. Pages from - * ->tcd_stock_pages are moved to ->tcd_pages by - * portals_debug_msg(). - * - * This list is necessary, because on some platforms it's - * impossible to perform efficient atomic page allocation in a - * non-blockable context. 
- * - * Such platforms fill ->tcd_stock_pages "on occasion", when - * tracing code is entered in blockable context. - * - * trace_get_tage_try() tries to get a page from - * ->tcd_stock_pages first and resorts to atomic page - * allocation only if this queue is empty. ->tcd_stock_pages - * is replenished when tracing code is entered in blocking - * context (darwin-tracefile.c:trace_get_tcd()). We try to - * maintain TCD_STOCK_PAGES (40 by default) pages in this - * queue. Atomic allocation is only required if more than - * TCD_STOCK_PAGES pagesful are consumed by trace records all - * emitted in non-blocking contexts. Which is quite unlikely. - */ - struct list_head tcd_stock_pages; - /* number of pages on ->tcd_stock_pages */ - unsigned long tcd_cur_stock_pages; - - unsigned short tcd_shutting_down; - unsigned short tcd_cpu; - unsigned short tcd_type; - /* The factors to share debug memory. */ - unsigned short tcd_pages_factor; - } tcd; - char __pad[L1_CACHE_ALIGN(sizeof(struct cfs_trace_cpu_data))]; -}; - -#define TCD_MAX_TYPES 8 -extern union cfs_trace_data_union (*cfs_trace_data[TCD_MAX_TYPES])[NR_CPUS]; - -#define cfs_tcd_for_each(tcd, i, j) \ - for (i = 0; cfs_trace_data[i]; i++) \ - for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd); \ - j < num_possible_cpus(); \ - j++, (tcd) = &(*cfs_trace_data[i])[j].tcd) - -#define cfs_tcd_for_each_type_lock(tcd, i, cpu) \ - for (i = 0; cfs_trace_data[i] && \ - (tcd = &(*cfs_trace_data[i])[cpu].tcd) && \ - cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++) - -void cfs_set_ptldebug_header(struct ptldebug_header *header, - struct libcfs_debug_msg_data *m, - unsigned long stack); -void cfs_print_to_console(struct ptldebug_header *hdr, int mask, - const char *buf, int len, const char *file, - const char *fn); - -int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking); -void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking); - -extern char *cfs_trace_console_buffers[NR_CPUS][CFS_TCD_TYPE_MAX]; -enum cfs_trace_buf_type cfs_trace_buf_idx_get(void); - -static inline char * -cfs_trace_get_console_buffer(void) -{ - unsigned int i = get_cpu(); - unsigned int j = cfs_trace_buf_idx_get(); - - return cfs_trace_console_buffers[i][j]; -} - -static inline struct cfs_trace_cpu_data * -cfs_trace_get_tcd(void) -{ - struct cfs_trace_cpu_data *tcd = - &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd; - - cfs_trace_lock_tcd(tcd, 0); - - return tcd; -} - -static inline void cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd) -{ - cfs_trace_unlock_tcd(tcd, 0); - - put_cpu(); -} - -int cfs_trace_refill_stock(struct cfs_trace_cpu_data *tcd, gfp_t gfp, - struct list_head *stock); - -void cfs_trace_assertion_failed(const char *str, - struct libcfs_debug_msg_data *m); - -/* ASSERTION that is safe to use within the debug system */ -#define __LASSERT(cond) \ -do { \ - if (unlikely(!(cond))) { \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_EMERG, NULL); \ - cfs_trace_assertion_failed("ASSERTION("#cond") failed", \ - &msgdata); \ - } \ -} while (0) - -#define __LASSERT_TAGE_INVARIANT(tage) \ -do { \ - __LASSERT(tage); \ - __LASSERT(tage->page); \ - __LASSERT(tage->used <= PAGE_SIZE); \ - __LASSERT(page_count(tage->page) > 0); \ -} while (0) - -#endif /* LUSTRE_TRACEFILE_PRIVATE */ - -#endif /* __LIBCFS_TRACEFILE_H__ */ diff --git a/drivers/staging/lustre/lnet/lnet/Makefile b/drivers/staging/lustre/lnet/lnet/Makefile deleted file mode 100644 index 0a9d70924fe0006735142cd41c596b648d7831df..0000000000000000000000000000000000000000 --- 
a/drivers/staging/lustre/lnet/lnet/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LNET) += lnet.o - -lnet-y := api-ni.o config.o nidstrings.o net_fault.o \ - lib-me.o lib-msg.o lib-eq.o lib-md.o lib-ptl.o \ - lib-socket.o lib-move.o module.o lo.o \ - router.o router_proc.o acceptor.o peer.o diff --git a/drivers/staging/lustre/lnet/lnet/acceptor.c b/drivers/staging/lustre/lnet/lnet/acceptor.c deleted file mode 100644 index 5648f17eddc043f92054fee4ff1b3605a0a1b5e6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/acceptor.c +++ /dev/null @@ -1,501 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LNET -#include -#include -#include - -static int accept_port = 988; -static int accept_backlog = 127; -static int accept_timeout = 5; - -static struct { - int pta_shutdown; - struct socket *pta_sock; - struct completion pta_signal; -} lnet_acceptor_state = { - .pta_shutdown = 1 -}; - -int -lnet_acceptor_port(void) -{ - return accept_port; -} -EXPORT_SYMBOL(lnet_acceptor_port); - -static inline int -lnet_accept_magic(__u32 magic, __u32 constant) -{ - return (magic == constant || - magic == __swab32(constant)); -} - -static char *accept = "secure"; - -module_param(accept, charp, 0444); -MODULE_PARM_DESC(accept, "Accept connections (secure|all|none)"); -module_param(accept_port, int, 0444); -MODULE_PARM_DESC(accept_port, "Acceptor's port (same on all nodes)"); -module_param(accept_backlog, int, 0444); -MODULE_PARM_DESC(accept_backlog, "Acceptor's listen backlog"); -module_param(accept_timeout, int, 0644); -MODULE_PARM_DESC(accept_timeout, "Acceptor's timeout (seconds)"); - -static char *accept_type; - -static int -lnet_acceptor_get_tunables(void) -{ - /* - * Userland acceptor uses 'accept_type' instead of 'accept', due to - * conflict with 'accept(2)', but kernel acceptor still uses 'accept' - * for compatibility. Hence the trick. 
- */ - accept_type = accept; - return 0; -} - -int -lnet_acceptor_timeout(void) -{ - return accept_timeout; -} -EXPORT_SYMBOL(lnet_acceptor_timeout); - -void -lnet_connect_console_error(int rc, lnet_nid_t peer_nid, - __u32 peer_ip, int peer_port) -{ - switch (rc) { - /* "normal" errors */ - case -ECONNREFUSED: - CNETERR("Connection to %s at host %pI4h on port %d was refused: check that Lustre is running on that node.\n", - libcfs_nid2str(peer_nid), - &peer_ip, peer_port); - break; - case -EHOSTUNREACH: - case -ENETUNREACH: - CNETERR("Connection to %s at host %pI4h was unreachable: the network or that node may be down, or Lustre may be misconfigured.\n", - libcfs_nid2str(peer_nid), &peer_ip); - break; - case -ETIMEDOUT: - CNETERR("Connection to %s at host %pI4h on port %d took too long: that node may be hung or experiencing high load.\n", - libcfs_nid2str(peer_nid), - &peer_ip, peer_port); - break; - case -ECONNRESET: - LCONSOLE_ERROR_MSG(0x11b, "Connection to %s at host %pI4h on port %d was reset: is it running a compatible version of Lustre and is %s one of its NIDs?\n", - libcfs_nid2str(peer_nid), - &peer_ip, peer_port, - libcfs_nid2str(peer_nid)); - break; - case -EPROTO: - LCONSOLE_ERROR_MSG(0x11c, "Protocol error connecting to %s at host %pI4h on port %d: is it running a compatible version of Lustre?\n", - libcfs_nid2str(peer_nid), - &peer_ip, peer_port); - break; - case -EADDRINUSE: - LCONSOLE_ERROR_MSG(0x11d, "No privileged ports available to connect to %s at host %pI4h on port %d\n", - libcfs_nid2str(peer_nid), - &peer_ip, peer_port); - break; - default: - LCONSOLE_ERROR_MSG(0x11e, "Unexpected error %d connecting to %s at host %pI4h on port %d\n", - rc, libcfs_nid2str(peer_nid), - &peer_ip, peer_port); - break; - } -} -EXPORT_SYMBOL(lnet_connect_console_error); - -int -lnet_connect(struct socket **sockp, lnet_nid_t peer_nid, - __u32 local_ip, __u32 peer_ip, int peer_port) -{ - struct lnet_acceptor_connreq cr; - struct socket *sock; - int rc; - int port; - int fatal; - - BUILD_BUG_ON(sizeof(cr) > 16); /* too big to be on the stack */ - - for (port = LNET_ACCEPTOR_MAX_RESERVED_PORT; - port >= LNET_ACCEPTOR_MIN_RESERVED_PORT; - --port) { - /* Iterate through reserved ports. 
*/ - - rc = lnet_sock_connect(&sock, &fatal, local_ip, port, peer_ip, - peer_port); - if (rc) { - if (fatal) - goto failed; - continue; - } - - BUILD_BUG_ON(LNET_PROTO_ACCEPTOR_VERSION != 1); - - cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; - cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; - cr.acr_nid = peer_nid; - - if (the_lnet.ln_testprotocompat) { - /* single-shot proto check */ - lnet_net_lock(LNET_LOCK_EX); - if (the_lnet.ln_testprotocompat & 4) { - cr.acr_version++; - the_lnet.ln_testprotocompat &= ~4; - } - if (the_lnet.ln_testprotocompat & 8) { - cr.acr_magic = LNET_PROTO_MAGIC; - the_lnet.ln_testprotocompat &= ~8; - } - lnet_net_unlock(LNET_LOCK_EX); - } - - rc = lnet_sock_write(sock, &cr, sizeof(cr), accept_timeout); - if (rc) - goto failed_sock; - - *sockp = sock; - return 0; - } - - rc = -EADDRINUSE; - goto failed; - - failed_sock: - sock_release(sock); - failed: - lnet_connect_console_error(rc, peer_nid, peer_ip, peer_port); - return rc; -} -EXPORT_SYMBOL(lnet_connect); - -static int -lnet_accept(struct socket *sock, __u32 magic) -{ - struct lnet_acceptor_connreq cr; - __u32 peer_ip; - int peer_port; - int rc; - int flip; - struct lnet_ni *ni; - char *str; - - LASSERT(sizeof(cr) <= 16); /* not too big for the stack */ - - rc = lnet_sock_getaddr(sock, 1, &peer_ip, &peer_port); - LASSERT(!rc); /* we succeeded before */ - - if (!lnet_accept_magic(magic, LNET_PROTO_ACCEPTOR_MAGIC)) { - if (lnet_accept_magic(magic, LNET_PROTO_MAGIC)) { - /* - * future version compatibility! - * When LNET unifies protocols over all LNDs, the first - * thing sent will be a version query. I send back - * LNET_PROTO_ACCEPTOR_MAGIC to tell her I'm "old" - */ - memset(&cr, 0, sizeof(cr)); - cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; - cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; - rc = lnet_sock_write(sock, &cr, sizeof(cr), - accept_timeout); - - if (rc) - CERROR("Error sending magic+version in response to LNET magic from %pI4h: %d\n", - &peer_ip, rc); - return -EPROTO; - } - - if (lnet_accept_magic(magic, LNET_PROTO_TCP_MAGIC)) - str = "'old' socknal/tcpnal"; - else - str = "unrecognised"; - - LCONSOLE_ERROR_MSG(0x11f, "Refusing connection from %pI4h magic %08x: %s acceptor protocol\n", - &peer_ip, magic, str); - return -EPROTO; - } - - flip = (magic != LNET_PROTO_ACCEPTOR_MAGIC); - - rc = lnet_sock_read(sock, &cr.acr_version, sizeof(cr.acr_version), - accept_timeout); - if (rc) { - CERROR("Error %d reading connection request version from %pI4h\n", - rc, &peer_ip); - return -EIO; - } - - if (flip) - __swab32s(&cr.acr_version); - - if (cr.acr_version != LNET_PROTO_ACCEPTOR_VERSION) { - /* - * future version compatibility! - * An acceptor-specific protocol rev will first send a version - * query. I send back my current version to tell her I'm - * "old". 
- */ - int peer_version = cr.acr_version; - - memset(&cr, 0, sizeof(cr)); - cr.acr_magic = LNET_PROTO_ACCEPTOR_MAGIC; - cr.acr_version = LNET_PROTO_ACCEPTOR_VERSION; - - rc = lnet_sock_write(sock, &cr, sizeof(cr), accept_timeout); - if (rc) - CERROR("Error sending magic+version in response to version %d from %pI4h: %d\n", - peer_version, &peer_ip, rc); - return -EPROTO; - } - - rc = lnet_sock_read(sock, &cr.acr_nid, - sizeof(cr) - - offsetof(struct lnet_acceptor_connreq, acr_nid), - accept_timeout); - if (rc) { - CERROR("Error %d reading connection request from %pI4h\n", - rc, &peer_ip); - return -EIO; - } - - if (flip) - __swab64s(&cr.acr_nid); - - ni = lnet_net2ni(LNET_NIDNET(cr.acr_nid)); - if (!ni || /* no matching net */ - ni->ni_nid != cr.acr_nid) { /* right NET, wrong NID! */ - if (ni) - lnet_ni_decref(ni); - LCONSOLE_ERROR_MSG(0x120, "Refusing connection from %pI4h for %s: No matching NI\n", - &peer_ip, libcfs_nid2str(cr.acr_nid)); - return -EPERM; - } - - if (!ni->ni_lnd->lnd_accept) { - /* This catches a request for the loopback LND */ - lnet_ni_decref(ni); - LCONSOLE_ERROR_MSG(0x121, "Refusing connection from %pI4h for %s: NI does not accept IP connections\n", - &peer_ip, libcfs_nid2str(cr.acr_nid)); - return -EPERM; - } - - CDEBUG(D_NET, "Accept %s from %pI4h\n", - libcfs_nid2str(cr.acr_nid), &peer_ip); - - rc = ni->ni_lnd->lnd_accept(ni, sock); - - lnet_ni_decref(ni); - return rc; -} - -static int -lnet_acceptor(void *arg) -{ - struct socket *newsock; - int rc; - __u32 magic; - __u32 peer_ip; - int peer_port; - int secure = (int)((long)arg); - - LASSERT(!lnet_acceptor_state.pta_sock); - - rc = lnet_sock_listen(&lnet_acceptor_state.pta_sock, 0, accept_port, - accept_backlog); - if (rc) { - if (rc == -EADDRINUSE) - LCONSOLE_ERROR_MSG(0x122, "Can't start acceptor on port %d: port already in use\n", - accept_port); - else - LCONSOLE_ERROR_MSG(0x123, "Can't start acceptor on port %d: unexpected error %d\n", - accept_port, rc); - - lnet_acceptor_state.pta_sock = NULL; - } else { - LCONSOLE(0, "Accept %s, port %d\n", accept_type, accept_port); - } - - /* set init status and unblock parent */ - lnet_acceptor_state.pta_shutdown = rc; - complete(&lnet_acceptor_state.pta_signal); - - if (rc) - return rc; - - while (!lnet_acceptor_state.pta_shutdown) { - rc = lnet_sock_accept(&newsock, lnet_acceptor_state.pta_sock); - if (rc) { - if (rc != -EAGAIN) { - CWARN("Accept error %d: pausing...\n", rc); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } - continue; - } - - /* maybe the LNet acceptor thread has been woken */ - if (lnet_acceptor_state.pta_shutdown) { - sock_release(newsock); - break; - } - - rc = lnet_sock_getaddr(newsock, 1, &peer_ip, &peer_port); - if (rc) { - CERROR("Can't determine new connection's address\n"); - goto failed; - } - - if (secure && peer_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) { - CERROR("Refusing connection from %pI4h: insecure port %d\n", - &peer_ip, peer_port); - goto failed; - } - - rc = lnet_sock_read(newsock, &magic, sizeof(magic), - accept_timeout); - if (rc) { - CERROR("Error %d reading connection request from %pI4h\n", - rc, &peer_ip); - goto failed; - } - - rc = lnet_accept(newsock, magic); - if (rc) - goto failed; - - continue; - -failed: - sock_release(newsock); - } - - sock_release(lnet_acceptor_state.pta_sock); - lnet_acceptor_state.pta_sock = NULL; - - CDEBUG(D_NET, "Acceptor stopping\n"); - - /* unblock lnet_acceptor_stop() */ - complete(&lnet_acceptor_state.pta_signal); - return 0; -} - -static inline int -accept2secure(const 
char *acc, long *sec) -{ - if (!strcmp(acc, "secure")) { - *sec = 1; - return 1; - } else if (!strcmp(acc, "all")) { - *sec = 0; - return 1; - } else if (!strcmp(acc, "none")) { - return 0; - } - - LCONSOLE_ERROR_MSG(0x124, "Can't parse 'accept=\"%s\"'\n", - acc); - return -EINVAL; -} - -int -lnet_acceptor_start(void) -{ - struct task_struct *task; - int rc; - long rc2; - long secure; - - /* if acceptor is already running return immediately */ - if (!lnet_acceptor_state.pta_shutdown) - return 0; - - LASSERT(!lnet_acceptor_state.pta_sock); - - rc = lnet_acceptor_get_tunables(); - if (rc) - return rc; - - init_completion(&lnet_acceptor_state.pta_signal); - rc = accept2secure(accept_type, &secure); - if (rc <= 0) - return rc; - - if (!lnet_count_acceptor_nis()) /* not required */ - return 0; - - task = kthread_run(lnet_acceptor, (void *)(uintptr_t)secure, - "acceptor_%03ld", secure); - if (IS_ERR(task)) { - rc2 = PTR_ERR(task); - CERROR("Can't start acceptor thread: %ld\n", rc2); - - return -ESRCH; - } - - /* wait for acceptor to startup */ - wait_for_completion(&lnet_acceptor_state.pta_signal); - - if (!lnet_acceptor_state.pta_shutdown) { - /* started OK */ - LASSERT(lnet_acceptor_state.pta_sock); - return 0; - } - - LASSERT(!lnet_acceptor_state.pta_sock); - - return -ENETDOWN; -} - -void -lnet_acceptor_stop(void) -{ - struct sock *sk; - - if (lnet_acceptor_state.pta_shutdown) /* not running */ - return; - - lnet_acceptor_state.pta_shutdown = 1; - - sk = lnet_acceptor_state.pta_sock->sk; - - /* awake any sleepers using safe method */ - sk->sk_state_change(sk); - - /* block until acceptor signals exit */ - wait_for_completion(&lnet_acceptor_state.pta_signal); -} diff --git a/drivers/staging/lustre/lnet/lnet/api-ni.c b/drivers/staging/lustre/lnet/lnet/api-ni.c deleted file mode 100644 index f9ed6977056ce3ec7f5dbd0ef3f34ad3b8aacb8b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/api-ni.c +++ /dev/null @@ -1,2307 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
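
(A minimal sketch of the start-up/shutdown handshake used by lnet_acceptor_start() and lnet_acceptor_stop() above, assuming nothing beyond standard kthread/completion APIs: the parent starts a kthread and blocks on a completion; the thread publishes its init status and completes once, then completes again when told to exit. All names here are illustrative, not from the original file.)

#include <linux/completion.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct {
	struct completion signal;
	int		  shutdown;	/* doubles as the init status */
} sketch_state;

static int sketch_thread(void *arg)
{
	int rc = 0;			/* real code would listen() here */

	sketch_state.shutdown = rc;	/* publish init status ... */
	complete(&sketch_state.signal);	/* ... and unblock the parent */
	if (rc)
		return rc;

	while (!sketch_state.shutdown)	/* the real thread blocks in accept() */
		schedule_timeout_interruptible(HZ);

	complete(&sketch_state.signal);	/* unblock the stopper */
	return 0;
}

static int sketch_start(void)
{
	struct task_struct *task;

	init_completion(&sketch_state.signal);
	task = kthread_run(sketch_thread, NULL, "sketch_acceptor");
	if (IS_ERR(task))
		return PTR_ERR(task);
	wait_for_completion(&sketch_state.signal);	/* init result is in */
	return sketch_state.shutdown ? -ENETDOWN : 0;
}
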
- */ - -#define DEBUG_SUBSYSTEM S_LNET -#include -#include - -#include -#include - -#define D_LNI D_CONSOLE - -struct lnet the_lnet; /* THE state of the network */ -EXPORT_SYMBOL(the_lnet); - -static char *ip2nets = ""; -module_param(ip2nets, charp, 0444); -MODULE_PARM_DESC(ip2nets, "LNET network <- IP table"); - -static char *networks = ""; -module_param(networks, charp, 0444); -MODULE_PARM_DESC(networks, "local networks"); - -static char *routes = ""; -module_param(routes, charp, 0444); -MODULE_PARM_DESC(routes, "routes to non-local networks"); - -static int rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; -module_param(rnet_htable_size, int, 0444); -MODULE_PARM_DESC(rnet_htable_size, "size of remote network hash table"); - -static int lnet_ping(struct lnet_process_id id, int timeout_ms, - struct lnet_process_id __user *ids, int n_ids); - -static char * -lnet_get_routes(void) -{ - return routes; -} - -static char * -lnet_get_networks(void) -{ - char *nets; - int rc; - - if (*networks && *ip2nets) { - LCONSOLE_ERROR_MSG(0x101, "Please specify EITHER 'networks' or 'ip2nets' but not both at once\n"); - return NULL; - } - - if (*ip2nets) { - rc = lnet_parse_ip2nets(&nets, ip2nets); - return !rc ? nets : NULL; - } - - if (*networks) - return networks; - - return "tcp"; -} - -static void -lnet_init_locks(void) -{ - spin_lock_init(&the_lnet.ln_eq_wait_lock); - init_waitqueue_head(&the_lnet.ln_eq_waitq); - init_waitqueue_head(&the_lnet.ln_rc_waitq); - mutex_init(&the_lnet.ln_lnd_mutex); - mutex_init(&the_lnet.ln_api_mutex); -} - -static int -lnet_create_remote_nets_table(void) -{ - int i; - struct list_head *hash; - - LASSERT(!the_lnet.ln_remote_nets_hash); - LASSERT(the_lnet.ln_remote_nets_hbits > 0); - hash = kvmalloc_array(LNET_REMOTE_NETS_HASH_SIZE, sizeof(*hash), - GFP_KERNEL); - if (!hash) { - CERROR("Failed to create remote nets hash table\n"); - return -ENOMEM; - } - - for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) - INIT_LIST_HEAD(&hash[i]); - the_lnet.ln_remote_nets_hash = hash; - return 0; -} - -static void -lnet_destroy_remote_nets_table(void) -{ - int i; - - if (!the_lnet.ln_remote_nets_hash) - return; - - for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) - LASSERT(list_empty(&the_lnet.ln_remote_nets_hash[i])); - - kvfree(the_lnet.ln_remote_nets_hash); - the_lnet.ln_remote_nets_hash = NULL; -} - -static void -lnet_destroy_locks(void) -{ - if (the_lnet.ln_res_lock) { - cfs_percpt_lock_free(the_lnet.ln_res_lock); - the_lnet.ln_res_lock = NULL; - } - - if (the_lnet.ln_net_lock) { - cfs_percpt_lock_free(the_lnet.ln_net_lock); - the_lnet.ln_net_lock = NULL; - } -} - -static int -lnet_create_locks(void) -{ - lnet_init_locks(); - - the_lnet.ln_res_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); - if (!the_lnet.ln_res_lock) - goto failed; - - the_lnet.ln_net_lock = cfs_percpt_lock_alloc(lnet_cpt_table()); - if (!the_lnet.ln_net_lock) - goto failed; - - return 0; - - failed: - lnet_destroy_locks(); - return -ENOMEM; -} - -static void lnet_assert_wire_constants(void) -{ - /* - * Wire protocol assertions generated by 'wirecheck' - * running on Linux robert.bartonsoftware.com 2.6.8-1.521 - * #1 Mon Aug 16 09:01:18 EDT 2004 i686 athlon i386 GNU/Linux - * with gcc version 3.3.3 20040412 (Red Hat Linux 3.3.3-7) - */ - - /* Constants... 
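
(The block below pins the wire protocol at compile time. As a self-contained illustration of the technique — offsetof() and sizeof() are integer constant expressions, so any layout drift fails the build rather than corrupting traffic — here is the userspace C11 equivalent with an invented struct:)

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct wire_handle {			/* invented example layout */
	uint64_t interface_cookie;
	uint64_t object_cookie;
};

static_assert(sizeof(struct wire_handle) == 16,
	      "wire size changed; peers would disagree");
static_assert(offsetof(struct wire_handle, object_cookie) == 8,
	      "field moved; on-wire compatibility would break");
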
*/ - BUILD_BUG_ON(LNET_PROTO_TCP_MAGIC != 0xeebc0ded); - BUILD_BUG_ON(LNET_PROTO_TCP_VERSION_MAJOR != 1); - BUILD_BUG_ON(LNET_PROTO_TCP_VERSION_MINOR != 0); - BUILD_BUG_ON(LNET_MSG_ACK != 0); - BUILD_BUG_ON(LNET_MSG_PUT != 1); - BUILD_BUG_ON(LNET_MSG_GET != 2); - BUILD_BUG_ON(LNET_MSG_REPLY != 3); - BUILD_BUG_ON(LNET_MSG_HELLO != 4); - - /* Checks for struct ptl_handle_wire_t */ - BUILD_BUG_ON((int)sizeof(struct lnet_handle_wire) != 16); - BUILD_BUG_ON((int)offsetof(struct lnet_handle_wire, wh_interface_cookie) != 0); - BUILD_BUG_ON((int)sizeof(((struct lnet_handle_wire *)0)->wh_interface_cookie) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_handle_wire, wh_object_cookie) != 8); - BUILD_BUG_ON((int)sizeof(((struct lnet_handle_wire *)0)->wh_object_cookie) != 8); - - /* Checks for struct struct lnet_magicversion */ - BUILD_BUG_ON((int)sizeof(struct lnet_magicversion) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, magic) != 0); - BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->magic) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, version_major) != 4); - BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->version_major) != 2); - BUILD_BUG_ON((int)offsetof(struct lnet_magicversion, version_minor) != 6); - BUILD_BUG_ON((int)sizeof(((struct lnet_magicversion *)0)->version_minor) != 2); - - /* Checks for struct struct lnet_hdr */ - BUILD_BUG_ON((int)sizeof(struct lnet_hdr) != 72); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, dest_nid) != 0); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->dest_nid) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, src_nid) != 8); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->src_nid) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, dest_pid) != 16); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->dest_pid) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, src_pid) != 20); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->src_pid) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, type) != 24); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->type) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, payload_length) != 28); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->payload_length) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg) != 32); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg) != 40); - - /* Ack */ - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.ack.dst_wmd) != 32); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.ack.dst_wmd) != 16); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.ack.match_bits) != 48); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.ack.match_bits) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.ack.mlength) != 56); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.ack.mlength) != 4); - - /* Put */ - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.put.ack_wmd) != 32); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.put.ack_wmd) != 16); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.put.match_bits) != 48); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.put.match_bits) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.put.hdr_data) != 56); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.put.hdr_data) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.put.ptl_index) != 64); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.put.ptl_index) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.put.offset) != 68); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr 
*)0)->msg.put.offset) != 4); - - /* Get */ - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.get.return_wmd) != 32); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.get.return_wmd) != 16); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.get.match_bits) != 48); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.get.match_bits) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.get.ptl_index) != 56); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.get.ptl_index) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.get.src_offset) != 60); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.get.src_offset) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.get.sink_length) != 64); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.get.sink_length) != 4); - - /* Reply */ - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.reply.dst_wmd) != 32); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.reply.dst_wmd) != 16); - - /* Hello */ - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.hello.incarnation) != 32); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.hello.incarnation) != 8); - BUILD_BUG_ON((int)offsetof(struct lnet_hdr, msg.hello.type) != 40); - BUILD_BUG_ON((int)sizeof(((struct lnet_hdr *)0)->msg.hello.type) != 4); -} - -static struct lnet_lnd * -lnet_find_lnd_by_type(__u32 type) -{ - struct lnet_lnd *lnd; - struct list_head *tmp; - - /* holding lnd mutex */ - list_for_each(tmp, &the_lnet.ln_lnds) { - lnd = list_entry(tmp, struct lnet_lnd, lnd_list); - - if (lnd->lnd_type == type) - return lnd; - } - - return NULL; -} - -void -lnet_register_lnd(struct lnet_lnd *lnd) -{ - mutex_lock(&the_lnet.ln_lnd_mutex); - - LASSERT(libcfs_isknown_lnd(lnd->lnd_type)); - LASSERT(!lnet_find_lnd_by_type(lnd->lnd_type)); - - list_add_tail(&lnd->lnd_list, &the_lnet.ln_lnds); - lnd->lnd_refcount = 0; - - CDEBUG(D_NET, "%s LND registered\n", libcfs_lnd2str(lnd->lnd_type)); - - mutex_unlock(&the_lnet.ln_lnd_mutex); -} -EXPORT_SYMBOL(lnet_register_lnd); - -void -lnet_unregister_lnd(struct lnet_lnd *lnd) -{ - mutex_lock(&the_lnet.ln_lnd_mutex); - - LASSERT(lnet_find_lnd_by_type(lnd->lnd_type) == lnd); - LASSERT(!lnd->lnd_refcount); - - list_del(&lnd->lnd_list); - CDEBUG(D_NET, "%s LND unregistered\n", libcfs_lnd2str(lnd->lnd_type)); - - mutex_unlock(&the_lnet.ln_lnd_mutex); -} -EXPORT_SYMBOL(lnet_unregister_lnd); - -void -lnet_counters_get(struct lnet_counters *counters) -{ - struct lnet_counters *ctr; - int i; - - memset(counters, 0, sizeof(*counters)); - - lnet_net_lock(LNET_LOCK_EX); - - cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) { - counters->msgs_max += ctr->msgs_max; - counters->msgs_alloc += ctr->msgs_alloc; - counters->errors += ctr->errors; - counters->send_count += ctr->send_count; - counters->recv_count += ctr->recv_count; - counters->route_count += ctr->route_count; - counters->drop_count += ctr->drop_count; - counters->send_length += ctr->send_length; - counters->recv_length += ctr->recv_length; - counters->route_length += ctr->route_length; - counters->drop_length += ctr->drop_length; - } - lnet_net_unlock(LNET_LOCK_EX); -} -EXPORT_SYMBOL(lnet_counters_get); - -void -lnet_counters_reset(void) -{ - struct lnet_counters *counters; - int i; - - lnet_net_lock(LNET_LOCK_EX); - - cfs_percpt_for_each(counters, i, the_lnet.ln_counters) - memset(counters, 0, sizeof(struct lnet_counters)); - - lnet_net_unlock(LNET_LOCK_EX); -} - -static char * -lnet_res_type2str(int type) -{ - switch (type) { - default: - LBUG(); - case LNET_COOKIE_TYPE_MD: - return 
"MD"; - case LNET_COOKIE_TYPE_ME: - return "ME"; - case LNET_COOKIE_TYPE_EQ: - return "EQ"; - } -} - -static void -lnet_res_container_cleanup(struct lnet_res_container *rec) -{ - int count = 0; - - if (!rec->rec_type) /* not set yet, it's uninitialized */ - return; - - while (!list_empty(&rec->rec_active)) { - struct list_head *e = rec->rec_active.next; - - list_del_init(e); - if (rec->rec_type == LNET_COOKIE_TYPE_EQ) { - kfree(list_entry(e, struct lnet_eq, eq_list)); - - } else if (rec->rec_type == LNET_COOKIE_TYPE_MD) { - kfree(list_entry(e, struct lnet_libmd, md_list)); - - } else { /* NB: Active MEs should be attached on portals */ - LBUG(); - } - count++; - } - - if (count > 0) { - /* - * Found alive MD/ME/EQ, user really should unlink/free - * all of them before finalize LNet, but if someone didn't, - * we have to recycle garbage for him - */ - CERROR("%d active elements on exit of %s container\n", - count, lnet_res_type2str(rec->rec_type)); - } - - kfree(rec->rec_lh_hash); - rec->rec_lh_hash = NULL; - - rec->rec_type = 0; /* mark it as finalized */ -} - -static int -lnet_res_container_setup(struct lnet_res_container *rec, int cpt, int type) -{ - int rc = 0; - int i; - - LASSERT(!rec->rec_type); - - rec->rec_type = type; - INIT_LIST_HEAD(&rec->rec_active); - rec->rec_lh_cookie = (cpt << LNET_COOKIE_TYPE_BITS) | type; - - /* Arbitrary choice of hash table size */ - rec->rec_lh_hash = kvmalloc_cpt(LNET_LH_HASH_SIZE * sizeof(rec->rec_lh_hash[0]), - GFP_KERNEL, cpt); - if (!rec->rec_lh_hash) { - rc = -ENOMEM; - goto out; - } - - for (i = 0; i < LNET_LH_HASH_SIZE; i++) - INIT_LIST_HEAD(&rec->rec_lh_hash[i]); - - return 0; - -out: - CERROR("Failed to setup %s resource container\n", - lnet_res_type2str(type)); - lnet_res_container_cleanup(rec); - return rc; -} - -static void -lnet_res_containers_destroy(struct lnet_res_container **recs) -{ - struct lnet_res_container *rec; - int i; - - cfs_percpt_for_each(rec, i, recs) - lnet_res_container_cleanup(rec); - - cfs_percpt_free(recs); -} - -static struct lnet_res_container ** -lnet_res_containers_create(int type) -{ - struct lnet_res_container **recs; - struct lnet_res_container *rec; - int rc; - int i; - - recs = cfs_percpt_alloc(lnet_cpt_table(), sizeof(*rec)); - if (!recs) { - CERROR("Failed to allocate %s resource containers\n", - lnet_res_type2str(type)); - return NULL; - } - - cfs_percpt_for_each(rec, i, recs) { - rc = lnet_res_container_setup(rec, i, type); - if (rc) { - lnet_res_containers_destroy(recs); - return NULL; - } - } - - return recs; -} - -struct lnet_libhandle * -lnet_res_lh_lookup(struct lnet_res_container *rec, __u64 cookie) -{ - /* ALWAYS called with lnet_res_lock held */ - struct list_head *head; - struct lnet_libhandle *lh; - unsigned int hash; - - if ((cookie & LNET_COOKIE_MASK) != rec->rec_type) - return NULL; - - hash = cookie >> (LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS); - head = &rec->rec_lh_hash[hash & LNET_LH_HASH_MASK]; - - list_for_each_entry(lh, head, lh_hash_chain) { - if (lh->lh_cookie == cookie) - return lh; - } - - return NULL; -} - -void -lnet_res_lh_initialize(struct lnet_res_container *rec, - struct lnet_libhandle *lh) -{ - /* ALWAYS called with lnet_res_lock held */ - unsigned int ibits = LNET_COOKIE_TYPE_BITS + LNET_CPT_BITS; - unsigned int hash; - - lh->lh_cookie = rec->rec_lh_cookie; - rec->rec_lh_cookie += 1 << ibits; - - hash = (lh->lh_cookie >> ibits) & LNET_LH_HASH_MASK; - - list_add(&lh->lh_hash_chain, &rec->rec_lh_hash[hash]); -} - -static int lnet_unprepare(void); - -static int 
-lnet_prepare(lnet_pid_t requested_pid) -{ - /* Prepare to bring up the network */ - struct lnet_res_container **recs; - int rc = 0; - - if (requested_pid == LNET_PID_ANY) { - /* Don't instantiate LNET just for me */ - return -ENETDOWN; - } - - LASSERT(!the_lnet.ln_refcount); - - the_lnet.ln_routing = 0; - - LASSERT(!(requested_pid & LNET_PID_USERFLAG)); - the_lnet.ln_pid = requested_pid; - - INIT_LIST_HEAD(&the_lnet.ln_test_peers); - INIT_LIST_HEAD(&the_lnet.ln_nis); - INIT_LIST_HEAD(&the_lnet.ln_nis_cpt); - INIT_LIST_HEAD(&the_lnet.ln_nis_zombie); - INIT_LIST_HEAD(&the_lnet.ln_routers); - INIT_LIST_HEAD(&the_lnet.ln_drop_rules); - INIT_LIST_HEAD(&the_lnet.ln_delay_rules); - - rc = lnet_create_remote_nets_table(); - if (rc) - goto failed; - /* - * NB the interface cookie in wire handles guards against delayed - * replies and ACKs appearing valid after reboot. - */ - the_lnet.ln_interface_cookie = ktime_get_ns(); - - the_lnet.ln_counters = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct lnet_counters)); - if (!the_lnet.ln_counters) { - CERROR("Failed to allocate counters for LNet\n"); - rc = -ENOMEM; - goto failed; - } - - rc = lnet_peer_tables_create(); - if (rc) - goto failed; - - rc = lnet_msg_containers_create(); - if (rc) - goto failed; - - rc = lnet_res_container_setup(&the_lnet.ln_eq_container, 0, - LNET_COOKIE_TYPE_EQ); - if (rc) - goto failed; - - recs = lnet_res_containers_create(LNET_COOKIE_TYPE_ME); - if (!recs) { - rc = -ENOMEM; - goto failed; - } - - the_lnet.ln_me_containers = recs; - - recs = lnet_res_containers_create(LNET_COOKIE_TYPE_MD); - if (!recs) { - rc = -ENOMEM; - goto failed; - } - - the_lnet.ln_md_containers = recs; - - rc = lnet_portals_create(); - if (rc) { - CERROR("Failed to create portals for LNet: %d\n", rc); - goto failed; - } - - return 0; - - failed: - lnet_unprepare(); - return rc; -} - -static int -lnet_unprepare(void) -{ - /* - * NB no LNET_LOCK since this is the last reference. 
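
(Aside: lnet_prepare() above unwinds every failure through a single teardown routine, lnet_unprepare(), rather than one goto label per resource. That only works because each destructor tolerates "never allocated", as in this minimal userspace illustration with invented names:)

#include <stdlib.h>

struct ctx { int *a; int *b; };

static void ctx_teardown(struct ctx *c)
{
	free(c->a);			/* free(NULL) is a no-op, so a */
	c->a = NULL;			/* partially built ctx unwinds */
	free(c->b);			/* just as safely as a full one */
	c->b = NULL;
}

static int ctx_setup(struct ctx *c)
{
	c->a = calloc(1, sizeof(*c->a));
	if (!c->a)
		goto failed;
	c->b = calloc(1, sizeof(*c->b));
	if (!c->b)
		goto failed;
	return 0;

failed:
	ctx_teardown(c);		/* safe whatever got allocated */
	return -1;
}
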
All LND instances - * have shut down already, so it is safe to unlink and free all - * descriptors, even those that appear committed to a network op (eg MD - * with non-zero pending count) - */ - lnet_fail_nid(LNET_NID_ANY, 0); - - LASSERT(!the_lnet.ln_refcount); - LASSERT(list_empty(&the_lnet.ln_test_peers)); - LASSERT(list_empty(&the_lnet.ln_nis)); - LASSERT(list_empty(&the_lnet.ln_nis_cpt)); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); - - lnet_portals_destroy(); - - if (the_lnet.ln_md_containers) { - lnet_res_containers_destroy(the_lnet.ln_md_containers); - the_lnet.ln_md_containers = NULL; - } - - if (the_lnet.ln_me_containers) { - lnet_res_containers_destroy(the_lnet.ln_me_containers); - the_lnet.ln_me_containers = NULL; - } - - lnet_res_container_cleanup(&the_lnet.ln_eq_container); - - lnet_msg_containers_destroy(); - lnet_peer_tables_destroy(); - lnet_rtrpools_free(0); - - if (the_lnet.ln_counters) { - cfs_percpt_free(the_lnet.ln_counters); - the_lnet.ln_counters = NULL; - } - lnet_destroy_remote_nets_table(); - - return 0; -} - -struct lnet_ni * -lnet_net2ni_locked(__u32 net, int cpt) -{ - struct list_head *tmp; - struct lnet_ni *ni; - - LASSERT(cpt != LNET_LOCK_EX); - - list_for_each(tmp, &the_lnet.ln_nis) { - ni = list_entry(tmp, struct lnet_ni, ni_list); - - if (LNET_NIDNET(ni->ni_nid) == net) { - lnet_ni_addref_locked(ni, cpt); - return ni; - } - } - - return NULL; -} - -struct lnet_ni * -lnet_net2ni(__u32 net) -{ - struct lnet_ni *ni; - - lnet_net_lock(0); - ni = lnet_net2ni_locked(net, 0); - lnet_net_unlock(0); - - return ni; -} -EXPORT_SYMBOL(lnet_net2ni); - -static unsigned int -lnet_nid_cpt_hash(lnet_nid_t nid, unsigned int number) -{ - __u64 key = nid; - unsigned int val; - - LASSERT(number >= 1 && number <= LNET_CPT_NUMBER); - - if (number == 1) - return 0; - - val = hash_long(key, LNET_CPT_BITS); - /* NB: LNET_CP_NUMBER doesn't have to be PO2 */ - if (val < number) - return val; - - return (unsigned int)(key + val + (val >> 1)) % number; -} - -int -lnet_cpt_of_nid_locked(lnet_nid_t nid) -{ - struct lnet_ni *ni; - - /* must called with hold of lnet_net_lock */ - if (LNET_CPT_NUMBER == 1) - return 0; /* the only one */ - - /* take lnet_net_lock(any) would be OK */ - if (!list_empty(&the_lnet.ln_nis_cpt)) { - list_for_each_entry(ni, &the_lnet.ln_nis_cpt, ni_cptlist) { - if (LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) - continue; - - LASSERT(ni->ni_cpts); - return ni->ni_cpts[lnet_nid_cpt_hash - (nid, ni->ni_ncpts)]; - } - } - - return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); -} - -int -lnet_cpt_of_nid(lnet_nid_t nid) -{ - int cpt; - int cpt2; - - if (LNET_CPT_NUMBER == 1) - return 0; /* the only one */ - - if (list_empty(&the_lnet.ln_nis_cpt)) - return lnet_nid_cpt_hash(nid, LNET_CPT_NUMBER); - - cpt = lnet_net_lock_current(); - cpt2 = lnet_cpt_of_nid_locked(nid); - lnet_net_unlock(cpt); - - return cpt2; -} -EXPORT_SYMBOL(lnet_cpt_of_nid); - -int -lnet_islocalnet(__u32 net) -{ - struct lnet_ni *ni; - int cpt; - - cpt = lnet_net_lock_current(); - - ni = lnet_net2ni_locked(net, cpt); - if (ni) - lnet_ni_decref_locked(ni, cpt); - - lnet_net_unlock(cpt); - - return !!ni; -} - -struct lnet_ni * -lnet_nid2ni_locked(lnet_nid_t nid, int cpt) -{ - struct lnet_ni *ni; - struct list_head *tmp; - - LASSERT(cpt != LNET_LOCK_EX); - - list_for_each(tmp, &the_lnet.ln_nis) { - ni = list_entry(tmp, struct lnet_ni, ni_list); - - if (ni->ni_nid == nid) { - lnet_ni_addref_locked(ni, cpt); - return ni; - } - } - - return NULL; -} - -int -lnet_islocalnid(lnet_nid_t nid) -{ - struct lnet_ni 
*ni; - int cpt; - - cpt = lnet_net_lock_current(); - ni = lnet_nid2ni_locked(nid, cpt); - if (ni) - lnet_ni_decref_locked(ni, cpt); - lnet_net_unlock(cpt); - - return !!ni; -} - -int -lnet_count_acceptor_nis(void) -{ - /* Return the # of NIs that need the acceptor. */ - int count = 0; - struct list_head *tmp; - struct lnet_ni *ni; - int cpt; - - cpt = lnet_net_lock_current(); - list_for_each(tmp, &the_lnet.ln_nis) { - ni = list_entry(tmp, struct lnet_ni, ni_list); - - if (ni->ni_lnd->lnd_accept) - count++; - } - - lnet_net_unlock(cpt); - - return count; -} - -static struct lnet_ping_info * -lnet_ping_info_create(int num_ni) -{ - struct lnet_ping_info *ping_info; - unsigned int infosz; - - infosz = offsetof(struct lnet_ping_info, pi_ni[num_ni]); - ping_info = kvzalloc(infosz, GFP_KERNEL); - if (!ping_info) { - CERROR("Can't allocate ping info[%d]\n", num_ni); - return NULL; - } - - ping_info->pi_nnis = num_ni; - ping_info->pi_pid = the_lnet.ln_pid; - ping_info->pi_magic = LNET_PROTO_PING_MAGIC; - ping_info->pi_features = LNET_PING_FEAT_NI_STATUS; - - return ping_info; -} - -static inline int -lnet_get_ni_count(void) -{ - struct lnet_ni *ni; - int count = 0; - - lnet_net_lock(0); - - list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) - count++; - - lnet_net_unlock(0); - - return count; -} - -static inline void -lnet_ping_info_free(struct lnet_ping_info *pinfo) -{ - kvfree(pinfo); -} - -static void -lnet_ping_info_destroy(void) -{ - struct lnet_ni *ni; - - lnet_net_lock(LNET_LOCK_EX); - - list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { - lnet_ni_lock(ni); - ni->ni_status = NULL; - lnet_ni_unlock(ni); - } - - lnet_ping_info_free(the_lnet.ln_ping_info); - the_lnet.ln_ping_info = NULL; - - lnet_net_unlock(LNET_LOCK_EX); -} - -static void -lnet_ping_event_handler(struct lnet_event *event) -{ - struct lnet_ping_info *pinfo = event->md.user_ptr; - - if (event->unlinked) - pinfo->pi_features = LNET_PING_FEAT_INVAL; -} - -static int -lnet_ping_info_setup(struct lnet_ping_info **ppinfo, - struct lnet_handle_md *md_handle, - int ni_count, bool set_eq) -{ - struct lnet_process_id id = {LNET_NID_ANY, LNET_PID_ANY}; - struct lnet_handle_me me_handle; - struct lnet_md md = { NULL }; - int rc, rc2; - - if (set_eq) { - rc = LNetEQAlloc(0, lnet_ping_event_handler, - &the_lnet.ln_ping_target_eq); - if (rc) { - CERROR("Can't allocate ping EQ: %d\n", rc); - return rc; - } - } - - *ppinfo = lnet_ping_info_create(ni_count); - if (!*ppinfo) { - rc = -ENOMEM; - goto failed_0; - } - - rc = LNetMEAttach(LNET_RESERVED_PORTAL, id, - LNET_PROTO_PING_MATCHBITS, 0, - LNET_UNLINK, LNET_INS_AFTER, - &me_handle); - if (rc) { - CERROR("Can't create ping ME: %d\n", rc); - goto failed_1; - } - - /* initialize md content */ - md.start = *ppinfo; - md.length = offsetof(struct lnet_ping_info, - pi_ni[(*ppinfo)->pi_nnis]); - md.threshold = LNET_MD_THRESH_INF; - md.max_size = 0; - md.options = LNET_MD_OP_GET | LNET_MD_TRUNCATE | - LNET_MD_MANAGE_REMOTE; - md.user_ptr = NULL; - md.eq_handle = the_lnet.ln_ping_target_eq; - md.user_ptr = *ppinfo; - - rc = LNetMDAttach(me_handle, md, LNET_RETAIN, md_handle); - if (rc) { - CERROR("Can't attach ping MD: %d\n", rc); - goto failed_2; - } - - return 0; - -failed_2: - rc2 = LNetMEUnlink(me_handle); - LASSERT(!rc2); -failed_1: - lnet_ping_info_free(*ppinfo); - *ppinfo = NULL; -failed_0: - if (set_eq) - LNetEQFree(the_lnet.ln_ping_target_eq); - return rc; -} - -static void -lnet_ping_md_unlink(struct lnet_ping_info *pinfo, - struct lnet_handle_md *md_handle) -{ - LNetMDUnlink(*md_handle); - 
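
(Aside on lnet_ping_info_create() above: the buffer is sized with the flexible-array idiom — header plus n trailing elements in one allocation. A userspace sketch with invented types; the strictly conforming offsetof(..., ni) + n * sizeof(elem) below equals the kernel's offsetof(..., ni[n]) form:)

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

struct ping_info {
	uint32_t magic;
	uint32_t nnis;
	struct { uint64_t nid; uint32_t status; } ni[];	/* flexible array */
};

static struct ping_info *ping_info_alloc(unsigned int n)
{
	struct ping_info *pi =
		calloc(1, offsetof(struct ping_info, ni) +
			  n * sizeof(pi->ni[0]));

	if (pi)
		pi->nnis = n;
	return pi;
}
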
LNetInvalidateMDHandle(md_handle); - - /* NB md could be busy; this just starts the unlink */ - while (pinfo->pi_features != LNET_PING_FEAT_INVAL) { - CDEBUG(D_NET, "Still waiting for ping MD to unlink\n"); - set_current_state(TASK_NOLOAD); - schedule_timeout(HZ); - } -} - -static void -lnet_ping_info_install_locked(struct lnet_ping_info *ping_info) -{ - struct lnet_ni_status *ns; - struct lnet_ni *ni; - int i = 0; - - list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { - LASSERT(i < ping_info->pi_nnis); - - ns = &ping_info->pi_ni[i]; - - ns->ns_nid = ni->ni_nid; - - lnet_ni_lock(ni); - ns->ns_status = (ni->ni_status) ? - ni->ni_status->ns_status : LNET_NI_STATUS_UP; - ni->ni_status = ns; - lnet_ni_unlock(ni); - - i++; - } -} - -static void -lnet_ping_target_update(struct lnet_ping_info *pinfo, - struct lnet_handle_md md_handle) -{ - struct lnet_ping_info *old_pinfo = NULL; - struct lnet_handle_md old_md; - - /* switch the NIs to point to the new ping info created */ - lnet_net_lock(LNET_LOCK_EX); - - if (!the_lnet.ln_routing) - pinfo->pi_features |= LNET_PING_FEAT_RTE_DISABLED; - lnet_ping_info_install_locked(pinfo); - - if (the_lnet.ln_ping_info) { - old_pinfo = the_lnet.ln_ping_info; - old_md = the_lnet.ln_ping_target_md; - } - the_lnet.ln_ping_target_md = md_handle; - the_lnet.ln_ping_info = pinfo; - - lnet_net_unlock(LNET_LOCK_EX); - - if (old_pinfo) { - /* unlink the old ping info */ - lnet_ping_md_unlink(old_pinfo, &old_md); - lnet_ping_info_free(old_pinfo); - } -} - -static void -lnet_ping_target_fini(void) -{ - int rc; - - lnet_ping_md_unlink(the_lnet.ln_ping_info, - &the_lnet.ln_ping_target_md); - - rc = LNetEQFree(the_lnet.ln_ping_target_eq); - LASSERT(!rc); - - lnet_ping_info_destroy(); -} - -static int -lnet_ni_tq_credits(struct lnet_ni *ni) -{ - int credits; - - LASSERT(ni->ni_ncpts >= 1); - - if (ni->ni_ncpts == 1) - return ni->ni_maxtxcredits; - - credits = ni->ni_maxtxcredits / ni->ni_ncpts; - credits = max(credits, 8 * ni->ni_peertxcredits); - credits = min(credits, ni->ni_maxtxcredits); - - return credits; -} - -static void -lnet_ni_unlink_locked(struct lnet_ni *ni) -{ - if (!list_empty(&ni->ni_cptlist)) { - list_del_init(&ni->ni_cptlist); - lnet_ni_decref_locked(ni, 0); - } - - /* move it to zombie list and nobody can find it anymore */ - LASSERT(!list_empty(&ni->ni_list)); - list_move(&ni->ni_list, &the_lnet.ln_nis_zombie); - lnet_ni_decref_locked(ni, 0); /* drop ln_nis' ref */ -} - -static void -lnet_clear_zombies_nis_locked(void) -{ - int i; - int islo; - struct lnet_ni *ni; - struct lnet_ni *temp; - - /* - * Now wait for the NI's I just nuked to show up on ln_zombie_nis - * and shut them down in guaranteed thread context - */ - i = 2; - list_for_each_entry_safe(ni, temp, &the_lnet.ln_nis_zombie, ni_list) { - int *ref; - int j; - - list_del_init(&ni->ni_list); - cfs_percpt_for_each(ref, j, ni->ni_refs) { - if (!*ref) - continue; - /* still busy, add it back to zombie list */ - list_add(&ni->ni_list, &the_lnet.ln_nis_zombie); - break; - } - - if (!list_empty(&ni->ni_list)) { - lnet_net_unlock(LNET_LOCK_EX); - ++i; - if ((i & (-i)) == i) { - CDEBUG(D_WARNING, "Waiting for zombie LNI %s\n", - libcfs_nid2str(ni->ni_nid)); - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - lnet_net_lock(LNET_LOCK_EX); - continue; - } - - ni->ni_lnd->lnd_refcount--; - lnet_net_unlock(LNET_LOCK_EX); - - islo = ni->ni_lnd->lnd_type == LOLND; - - LASSERT(!in_interrupt()); - ni->ni_lnd->lnd_shutdown(ni); - - /* - * can't deref lnd anymore now; it might have unregistered 
- * itself... - */ - if (!islo) - CDEBUG(D_LNI, "Removed LNI %s\n", - libcfs_nid2str(ni->ni_nid)); - - lnet_ni_free(ni); - i = 2; - - lnet_net_lock(LNET_LOCK_EX); - } -} - -static void -lnet_shutdown_lndnis(void) -{ - struct lnet_ni *ni; - struct lnet_ni *temp; - int i; - - /* NB called holding the global mutex */ - - /* All quiet on the API front */ - LASSERT(!the_lnet.ln_shutdown); - LASSERT(!the_lnet.ln_refcount); - LASSERT(list_empty(&the_lnet.ln_nis_zombie)); - - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_shutdown = 1; /* flag shutdown */ - - /* Unlink NIs from the global table */ - list_for_each_entry_safe(ni, temp, &the_lnet.ln_nis, ni_list) { - lnet_ni_unlink_locked(ni); - } - - /* Drop the cached loopback NI. */ - if (the_lnet.ln_loni) { - lnet_ni_decref_locked(the_lnet.ln_loni, 0); - the_lnet.ln_loni = NULL; - } - - lnet_net_unlock(LNET_LOCK_EX); - - /* - * Clear lazy portals and drop delayed messages which hold refs - * on their lnet_msg::msg_rxpeer - */ - for (i = 0; i < the_lnet.ln_nportals; i++) - LNetClearLazyPortal(i); - - /* - * Clear the peer table and wait for all peers to go (they hold refs on - * their NIs) - */ - lnet_peer_tables_cleanup(NULL); - - lnet_net_lock(LNET_LOCK_EX); - - lnet_clear_zombies_nis_locked(); - the_lnet.ln_shutdown = 0; - lnet_net_unlock(LNET_LOCK_EX); -} - -/* shutdown down the NI and release refcount */ -static void -lnet_shutdown_lndni(struct lnet_ni *ni) -{ - int i; - - lnet_net_lock(LNET_LOCK_EX); - lnet_ni_unlink_locked(ni); - lnet_net_unlock(LNET_LOCK_EX); - - /* clear messages for this NI on the lazy portal */ - for (i = 0; i < the_lnet.ln_nportals; i++) - lnet_clear_lazy_portal(ni, i, "Shutting down NI"); - - /* Do peer table cleanup for this ni */ - lnet_peer_tables_cleanup(ni); - - lnet_net_lock(LNET_LOCK_EX); - lnet_clear_zombies_nis_locked(); - lnet_net_unlock(LNET_LOCK_EX); -} - -static int -lnet_startup_lndni(struct lnet_ni *ni, struct lnet_ioctl_config_data *conf) -{ - struct lnet_ioctl_config_lnd_tunables *lnd_tunables = NULL; - int rc = -EINVAL; - int lnd_type; - struct lnet_lnd *lnd; - struct lnet_tx_queue *tq; - int i; - u32 seed; - - lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); - - LASSERT(libcfs_isknown_lnd(lnd_type)); - - if (lnd_type == CIBLND || lnd_type == OPENIBLND || - lnd_type == IIBLND || lnd_type == VIBLND) { - CERROR("LND %s obsoleted\n", libcfs_lnd2str(lnd_type)); - goto failed0; - } - - /* Make sure this new NI is unique. 
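
(Aside on lnet_clear_zombies_nis_locked() above: it warns only when the retry counter is a power of two — the (i & (-i)) == i test — so a stuck NI yields exponentially spaced messages instead of one per second. Reduced to plain C with invented names:)

#include <stdio.h>

static void drain_with_backoff(int (*still_busy)(void))
{
	int i = 2;

	while (still_busy()) {
		++i;
		if ((i & (-i)) == i)	/* true only for powers of two */
			fprintf(stderr, "still draining after %d tries\n", i);
		/* the real loop sleeps ~1s here: schedule_timeout(HZ) */
	}
}
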
*/ - lnet_net_lock(LNET_LOCK_EX); - rc = lnet_net_unique(LNET_NIDNET(ni->ni_nid), &the_lnet.ln_nis); - lnet_net_unlock(LNET_LOCK_EX); - if (!rc) { - if (lnd_type == LOLND) { - lnet_ni_free(ni); - return 0; - } - - CERROR("Net %s is not unique\n", - libcfs_net2str(LNET_NIDNET(ni->ni_nid))); - rc = -EEXIST; - goto failed0; - } - - mutex_lock(&the_lnet.ln_lnd_mutex); - lnd = lnet_find_lnd_by_type(lnd_type); - - if (!lnd) { - mutex_unlock(&the_lnet.ln_lnd_mutex); - rc = request_module("%s", libcfs_lnd2modname(lnd_type)); - mutex_lock(&the_lnet.ln_lnd_mutex); - - lnd = lnet_find_lnd_by_type(lnd_type); - if (!lnd) { - mutex_unlock(&the_lnet.ln_lnd_mutex); - CERROR("Can't load LND %s, module %s, rc=%d\n", - libcfs_lnd2str(lnd_type), - libcfs_lnd2modname(lnd_type), rc); - rc = -EINVAL; - goto failed0; - } - } - - lnet_net_lock(LNET_LOCK_EX); - lnd->lnd_refcount++; - lnet_net_unlock(LNET_LOCK_EX); - - ni->ni_lnd = lnd; - - if (conf && conf->cfg_hdr.ioc_len > sizeof(*conf)) - lnd_tunables = (struct lnet_ioctl_config_lnd_tunables *)conf->cfg_bulk; - - if (lnd_tunables) { - ni->ni_lnd_tunables = kzalloc(sizeof(*ni->ni_lnd_tunables), - GFP_NOFS); - if (!ni->ni_lnd_tunables) { - mutex_unlock(&the_lnet.ln_lnd_mutex); - rc = -ENOMEM; - goto failed0; - } - memcpy(ni->ni_lnd_tunables, lnd_tunables, - sizeof(*ni->ni_lnd_tunables)); - } - - /* - * If given some LND tunable parameters, parse those now to - * override the values in the NI structure. - */ - if (conf) { - if (conf->cfg_config_u.cfg_net.net_peer_rtr_credits >= 0) - ni->ni_peerrtrcredits = - conf->cfg_config_u.cfg_net.net_peer_rtr_credits; - if (conf->cfg_config_u.cfg_net.net_peer_timeout >= 0) - ni->ni_peertimeout = - conf->cfg_config_u.cfg_net.net_peer_timeout; - if (conf->cfg_config_u.cfg_net.net_peer_tx_credits != -1) - ni->ni_peertxcredits = - conf->cfg_config_u.cfg_net.net_peer_tx_credits; - if (conf->cfg_config_u.cfg_net.net_max_tx_credits >= 0) - ni->ni_maxtxcredits = - conf->cfg_config_u.cfg_net.net_max_tx_credits; - } - - rc = lnd->lnd_startup(ni); - - mutex_unlock(&the_lnet.ln_lnd_mutex); - - if (rc) { - LCONSOLE_ERROR_MSG(0x105, "Error %d starting up LNI %s\n", - rc, libcfs_lnd2str(lnd->lnd_type)); - lnet_net_lock(LNET_LOCK_EX); - lnd->lnd_refcount--; - lnet_net_unlock(LNET_LOCK_EX); - goto failed0; - } - - LASSERT(ni->ni_peertimeout <= 0 || lnd->lnd_query); - - lnet_net_lock(LNET_LOCK_EX); - /* refcount for ln_nis */ - lnet_ni_addref_locked(ni, 0); - list_add_tail(&ni->ni_list, &the_lnet.ln_nis); - if (ni->ni_cpts) { - lnet_ni_addref_locked(ni, 0); - list_add_tail(&ni->ni_cptlist, &the_lnet.ln_nis_cpt); - } - - lnet_net_unlock(LNET_LOCK_EX); - - if (lnd->lnd_type == LOLND) { - lnet_ni_addref(ni); - LASSERT(!the_lnet.ln_loni); - the_lnet.ln_loni = ni; - return 0; - } - - if (!ni->ni_peertxcredits || !ni->ni_maxtxcredits) { - LCONSOLE_ERROR_MSG(0x107, "LNI %s has no %scredits\n", - libcfs_lnd2str(lnd->lnd_type), - !ni->ni_peertxcredits ? - "" : "per-peer "); - /* - * shutdown the NI since if we get here then it must've already - * been started - */ - lnet_shutdown_lndni(ni); - return -EINVAL; - } - - cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { - tq->tq_credits_min = - tq->tq_credits_max = - tq->tq_credits = lnet_ni_tq_credits(ni); - } - - /* Nodes with small feet have little entropy. The NID for this - * node gives the most entropy in the low bits. 
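
(The lookup/load/re-lookup dance in lnet_startup_lndni() above is a common kernel pattern for on-demand drivers; a sketch in which struct my_driver and registry_find() are hypothetical stand-ins for the LND registry:)

#include <linux/kmod.h>
#include <linux/mutex.h>

struct my_driver;					/* hypothetical */
struct my_driver *registry_find(const char *name);	/* hypothetical */

static DEFINE_MUTEX(reg_mutex);

static struct my_driver *find_or_load(const char *name)
{
	struct my_driver *drv;

	mutex_lock(&reg_mutex);
	drv = registry_find(name);
	if (!drv) {
		/* drop the lock: the module's init path takes it again
		 * when it registers itself */
		mutex_unlock(&reg_mutex);
		request_module("%s", name);
		mutex_lock(&reg_mutex);
		drv = registry_find(name);	/* did loading register it? */
	}
	mutex_unlock(&reg_mutex);
	return drv;
}
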
- */ - seed = LNET_NIDADDR(ni->ni_nid); - add_device_randomness(&seed, sizeof(seed)); - - CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", - libcfs_nid2str(ni->ni_nid), ni->ni_peertxcredits, - lnet_ni_tq_credits(ni) * LNET_CPT_NUMBER, - ni->ni_peerrtrcredits, ni->ni_peertimeout); - - return 0; -failed0: - lnet_ni_free(ni); - return rc; -} - -static int -lnet_startup_lndnis(struct list_head *nilist) -{ - struct lnet_ni *ni; - int rc; - int ni_count = 0; - - while (!list_empty(nilist)) { - ni = list_entry(nilist->next, struct lnet_ni, ni_list); - list_del(&ni->ni_list); - rc = lnet_startup_lndni(ni, NULL); - - if (rc < 0) - goto failed; - - ni_count++; - } - - return ni_count; -failed: - lnet_shutdown_lndnis(); - - return rc; -} - -/** - * Initialize LNet library. - * - * Automatically called at module loading time. Caller has to call - * lnet_lib_exit() after a call to lnet_lib_init(), if and only if the - * latter returned 0. It must be called exactly once. - * - * \retval 0 on success - * \retval -ve on failures. - */ -int lnet_lib_init(void) -{ - int rc; - - lnet_assert_wire_constants(); - - memset(&the_lnet, 0, sizeof(the_lnet)); - - /* refer to global cfs_cpt_tab for now */ - the_lnet.ln_cpt_table = cfs_cpt_tab; - the_lnet.ln_cpt_number = cfs_cpt_number(cfs_cpt_tab); - - LASSERT(the_lnet.ln_cpt_number > 0); - if (the_lnet.ln_cpt_number > LNET_CPT_MAX) { - /* we are under risk of consuming all lh_cookie */ - CERROR("Can't have %d CPTs for LNet (max allowed is %d), please change setting of CPT-table and retry\n", - the_lnet.ln_cpt_number, LNET_CPT_MAX); - return -E2BIG; - } - - while ((1 << the_lnet.ln_cpt_bits) < the_lnet.ln_cpt_number) - the_lnet.ln_cpt_bits++; - - rc = lnet_create_locks(); - if (rc) { - CERROR("Can't create LNet global locks: %d\n", rc); - return rc; - } - - the_lnet.ln_refcount = 0; - LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh); - INIT_LIST_HEAD(&the_lnet.ln_lnds); - INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie); - INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow); - - /* - * The hash table size is the number of bits it takes to express the set - * ln_num_routes, minus 1 (better to under estimate than over so we - * don't waste memory). - */ - if (rnet_htable_size <= 0) - rnet_htable_size = LNET_REMOTE_NETS_HASH_DEFAULT; - else if (rnet_htable_size > LNET_REMOTE_NETS_HASH_MAX) - rnet_htable_size = LNET_REMOTE_NETS_HASH_MAX; - the_lnet.ln_remote_nets_hbits = max_t(int, 1, - order_base_2(rnet_htable_size) - 1); - - /* - * All LNDs apart from the LOLND are in separate modules. They - * register themselves when their module loads, and unregister - * themselves when their module is unloaded. - */ - lnet_register_lnd(&the_lolnd); - return 0; -} - -/** - * Finalize LNet library. - * - * \pre lnet_lib_init() called with success. - * \pre All LNet users called LNetNIFini() for matching LNetNIInit() calls. - */ -void lnet_lib_exit(void) -{ - LASSERT(!the_lnet.ln_refcount); - - while (!list_empty(&the_lnet.ln_lnds)) - lnet_unregister_lnd(list_entry(the_lnet.ln_lnds.next, - struct lnet_lnd, lnd_list)); - lnet_destroy_locks(); -} - -/** - * Set LNet PID and start LNet interfaces, routing, and forwarding. - * - * Users must call this function at least once before any other functions. - * For each successful call there must be a corresponding call to - * LNetNIFini(). For subsequent calls to LNetNIInit(), \a requested_pid is - * ignored. - * - * The PID used by LNet may be different from the one requested. - * See LNetGetId(). - * - * \param requested_pid PID requested by the caller. 
- * - * \return >= 0 on success, and < 0 error code on failures. - */ -int -LNetNIInit(lnet_pid_t requested_pid) -{ - int im_a_router = 0; - int rc; - int ni_count; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; - struct list_head net_head; - - INIT_LIST_HEAD(&net_head); - - mutex_lock(&the_lnet.ln_api_mutex); - - CDEBUG(D_OTHER, "refs %d\n", the_lnet.ln_refcount); - - if (the_lnet.ln_refcount > 0) { - rc = the_lnet.ln_refcount++; - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - } - - rc = lnet_prepare(requested_pid); - if (rc) { - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - } - - /* Add in the loopback network */ - if (!lnet_ni_alloc(LNET_MKNET(LOLND, 0), NULL, &net_head)) { - rc = -ENOMEM; - goto err_empty_list; - } - - /* - * If LNet is being initialized via DLC it is possible - * that the user requests not to load module parameters (ones which - * are supported by DLC) on initialization. Therefore, make sure not - * to load networks, routes and forwarding from module parameters - * in this case. On cleanup in case of failure only clean up - * routes if it has been loaded - */ - if (!the_lnet.ln_nis_from_mod_params) { - rc = lnet_parse_networks(&net_head, lnet_get_networks()); - if (rc < 0) - goto err_empty_list; - } - - ni_count = lnet_startup_lndnis(&net_head); - if (ni_count < 0) { - rc = ni_count; - goto err_empty_list; - } - - if (!the_lnet.ln_nis_from_mod_params) { - rc = lnet_parse_routes(lnet_get_routes(), &im_a_router); - if (rc) - goto err_shutdown_lndnis; - - rc = lnet_check_routes(); - if (rc) - goto err_destroy_routes; - - rc = lnet_rtrpools_alloc(im_a_router); - if (rc) - goto err_destroy_routes; - } - - rc = lnet_acceptor_start(); - if (rc) - goto err_destroy_routes; - - the_lnet.ln_refcount = 1; - /* Now I may use my own API functions... */ - - rc = lnet_ping_info_setup(&pinfo, &md_handle, ni_count, true); - if (rc) - goto err_acceptor_stop; - - lnet_ping_target_update(pinfo, md_handle); - - rc = lnet_router_checker_start(); - if (rc) - goto err_stop_ping; - - lnet_fault_init(); - lnet_router_debugfs_init(); - - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; - -err_stop_ping: - lnet_ping_target_fini(); -err_acceptor_stop: - the_lnet.ln_refcount = 0; - lnet_acceptor_stop(); -err_destroy_routes: - if (!the_lnet.ln_nis_from_mod_params) - lnet_destroy_routes(); -err_shutdown_lndnis: - lnet_shutdown_lndnis(); -err_empty_list: - lnet_unprepare(); - LASSERT(rc < 0); - mutex_unlock(&the_lnet.ln_api_mutex); - while (!list_empty(&net_head)) { - struct lnet_ni *ni; - - ni = list_entry(net_head.next, struct lnet_ni, ni_list); - list_del_init(&ni->ni_list); - lnet_ni_free(ni); - } - return rc; -} -EXPORT_SYMBOL(LNetNIInit); - -/** - * Stop LNet interfaces, routing, and forwarding. - * - * Users must call this function once for each successful call to LNetNIInit(). - * Once the LNetNIFini() operation has been started, the results of pending - * API operations are undefined. - * - * \return always 0 for current implementation. 
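
(LNetNIInit() above uses one err_* label per successfully completed stage, unwinding in reverse order — compare the single-teardown style of lnet_prepare() earlier. A skeleton of the idiom with hypothetical stages:)

int step_a(void), step_b(void), step_c(void);	/* hypothetical stages */
void undo_a(void), undo_b(void);

static int bring_up(void)
{
	int rc;

	rc = step_a();
	if (rc)
		goto out;
	rc = step_b();
	if (rc)
		goto err_a;
	rc = step_c();
	if (rc)
		goto err_b;
	return 0;			/* every stage is up */

err_b:
	undo_b();			/* reverse order of setup */
err_a:
	undo_a();
out:
	return rc;			/* nothing half-started remains */
}
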
- */ -int -LNetNIFini(void) -{ - mutex_lock(&the_lnet.ln_api_mutex); - - LASSERT(the_lnet.ln_refcount > 0); - - if (the_lnet.ln_refcount != 1) { - the_lnet.ln_refcount--; - } else { - LASSERT(!the_lnet.ln_niinit_self); - - lnet_fault_fini(); - lnet_router_debugfs_fini(); - lnet_router_checker_stop(); - lnet_ping_target_fini(); - - /* Teardown fns that use my own API functions BEFORE here */ - the_lnet.ln_refcount = 0; - - lnet_acceptor_stop(); - lnet_destroy_routes(); - lnet_shutdown_lndnis(); - lnet_unprepare(); - } - - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; -} -EXPORT_SYMBOL(LNetNIFini); - -/** - * Grabs the ni data from the ni structure and fills the out - * parameters - * - * \param[in] ni network interface structure - * \param[out] config NI configuration - */ -static void -lnet_fill_ni_info(struct lnet_ni *ni, struct lnet_ioctl_config_data *config) -{ - struct lnet_ioctl_config_lnd_tunables *lnd_cfg = NULL; - struct lnet_ioctl_net_config *net_config; - size_t min_size, tunable_size = 0; - int i; - - if (!ni || !config) - return; - - net_config = (struct lnet_ioctl_net_config *)config->cfg_bulk; - if (!net_config) - return; - - BUILD_BUG_ON(ARRAY_SIZE(ni->ni_interfaces) != - ARRAY_SIZE(net_config->ni_interfaces)); - - for (i = 0; i < ARRAY_SIZE(ni->ni_interfaces); i++) { - if (!ni->ni_interfaces[i]) - break; - - strncpy(net_config->ni_interfaces[i], - ni->ni_interfaces[i], - sizeof(net_config->ni_interfaces[i])); - } - - config->cfg_nid = ni->ni_nid; - config->cfg_config_u.cfg_net.net_peer_timeout = ni->ni_peertimeout; - config->cfg_config_u.cfg_net.net_max_tx_credits = ni->ni_maxtxcredits; - config->cfg_config_u.cfg_net.net_peer_tx_credits = ni->ni_peertxcredits; - config->cfg_config_u.cfg_net.net_peer_rtr_credits = ni->ni_peerrtrcredits; - - net_config->ni_status = ni->ni_status->ns_status; - - if (ni->ni_cpts) { - int num_cpts = min(ni->ni_ncpts, LNET_MAX_SHOW_NUM_CPT); - - for (i = 0; i < num_cpts; i++) - net_config->ni_cpts[i] = ni->ni_cpts[i]; - - config->cfg_ncpts = num_cpts; - } - - /* - * See if user land tools sent in a newer and larger version - * of struct lnet_tunables than what the kernel uses. 
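
(A sketch of the size negotiation this comment describes: the header's ioc_len tells the kernel how large a structure userland sent, and only min(user size, kernel size) is copied, so old tools keep working against new kernels and vice versa. Types are invented:)

#include <stddef.h>
#include <string.h>

struct tunables_v2 { int a; int b; };	/* the kernel's current layout */

static void copy_tunables(struct tunables_v2 *dst, const void *src,
			  size_t user_len)
{
	size_t n = user_len < sizeof(*dst) ? user_len : sizeof(*dst);

	memcpy(dst, src, n);		/* never read past either side */
}
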
- */ - min_size = sizeof(*config) + sizeof(*net_config); - - if (config->cfg_hdr.ioc_len > min_size) - tunable_size = config->cfg_hdr.ioc_len - min_size; - - /* Don't copy to much data to user space */ - min_size = min(tunable_size, sizeof(*ni->ni_lnd_tunables)); - lnd_cfg = (struct lnet_ioctl_config_lnd_tunables *)net_config->cfg_bulk; - - if (ni->ni_lnd_tunables && lnd_cfg && min_size) { - memcpy(lnd_cfg, ni->ni_lnd_tunables, min_size); - config->cfg_config_u.cfg_net.net_interface_count = 1; - - /* Tell user land that kernel side has less data */ - if (tunable_size > sizeof(*ni->ni_lnd_tunables)) { - min_size = tunable_size - sizeof(ni->ni_lnd_tunables); - config->cfg_hdr.ioc_len -= min_size; - } - } -} - -static int -lnet_get_net_config(struct lnet_ioctl_config_data *config) -{ - struct lnet_ni *ni; - struct list_head *tmp; - int idx = config->cfg_count; - int cpt, i = 0; - int rc = -ENOENT; - - cpt = lnet_net_lock_current(); - - list_for_each(tmp, &the_lnet.ln_nis) { - if (i++ != idx) - continue; - - ni = list_entry(tmp, struct lnet_ni, ni_list); - lnet_ni_lock(ni); - lnet_fill_ni_info(ni, config); - lnet_ni_unlock(ni); - rc = 0; - break; - } - - lnet_net_unlock(cpt); - return rc; -} - -int -lnet_dyn_add_ni(lnet_pid_t requested_pid, struct lnet_ioctl_config_data *conf) -{ - char *nets = conf->cfg_config_u.cfg_net.net_intf; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; - struct lnet_ni *ni; - struct list_head net_head; - struct lnet_remotenet *rnet; - int rc; - - INIT_LIST_HEAD(&net_head); - - /* Create a ni structure for the network string */ - rc = lnet_parse_networks(&net_head, nets); - if (rc <= 0) - return !rc ? -EINVAL : rc; - - mutex_lock(&the_lnet.ln_api_mutex); - - if (rc > 1) { - rc = -EINVAL; /* only add one interface per call */ - goto failed0; - } - - ni = list_entry(net_head.next, struct lnet_ni, ni_list); - - lnet_net_lock(LNET_LOCK_EX); - rnet = lnet_find_net_locked(LNET_NIDNET(ni->ni_nid)); - lnet_net_unlock(LNET_LOCK_EX); - /* - * make sure that the net added doesn't invalidate the current - * configuration LNet is keeping - */ - if (rnet) { - CERROR("Adding net %s will invalidate routing configuration\n", - nets); - rc = -EUSERS; - goto failed0; - } - - rc = lnet_ping_info_setup(&pinfo, &md_handle, 1 + lnet_get_ni_count(), - false); - if (rc) - goto failed0; - - list_del_init(&ni->ni_list); - - rc = lnet_startup_lndni(ni, conf); - if (rc) - goto failed1; - - if (ni->ni_lnd->lnd_accept) { - rc = lnet_acceptor_start(); - if (rc < 0) { - /* shutdown the ni that we just started */ - CERROR("Failed to start up acceptor thread\n"); - lnet_shutdown_lndni(ni); - goto failed1; - } - } - - lnet_ping_target_update(pinfo, md_handle); - mutex_unlock(&the_lnet.ln_api_mutex); - - return 0; - -failed1: - lnet_ping_md_unlink(pinfo, &md_handle); - lnet_ping_info_free(pinfo); -failed0: - mutex_unlock(&the_lnet.ln_api_mutex); - while (!list_empty(&net_head)) { - ni = list_entry(net_head.next, struct lnet_ni, ni_list); - list_del_init(&ni->ni_list); - lnet_ni_free(ni); - } - return rc; -} - -int -lnet_dyn_del_ni(__u32 net) -{ - struct lnet_ni *ni; - struct lnet_ping_info *pinfo; - struct lnet_handle_md md_handle; - int rc; - - /* don't allow userspace to shutdown the LOLND */ - if (LNET_NETTYP(net) == LOLND) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - /* create and link a new ping info, before removing the old one */ - rc = lnet_ping_info_setup(&pinfo, &md_handle, - lnet_get_ni_count() - 1, false); - if (rc) - goto out; - - ni = lnet_net2ni(net); - if 
(!ni) { - rc = -EINVAL; - goto failed; - } - - /* decrement the reference counter taken by lnet_net2ni() */ - lnet_ni_decref_locked(ni, 0); - - lnet_shutdown_lndni(ni); - - if (!lnet_count_acceptor_nis()) - lnet_acceptor_stop(); - - lnet_ping_target_update(pinfo, md_handle); - goto out; -failed: - lnet_ping_md_unlink(pinfo, &md_handle); - lnet_ping_info_free(pinfo); -out: - mutex_unlock(&the_lnet.ln_api_mutex); - - return rc; -} - -/** - * LNet ioctl handler. - * - */ -int -LNetCtl(unsigned int cmd, void *arg) -{ - struct libcfs_ioctl_data *data = arg; - struct lnet_ioctl_config_data *config; - struct lnet_process_id id = {0}; - struct lnet_ni *ni; - int rc; - unsigned long secs_passed; - - BUILD_BUG_ON(LIBCFS_IOC_DATA_MAX < - sizeof(struct lnet_ioctl_net_config) + - sizeof(struct lnet_ioctl_config_data)); - - switch (cmd) { - case IOC_LIBCFS_GET_NI: - rc = LNetGetId(data->ioc_count, &id); - data->ioc_nid = id.nid; - return rc; - - case IOC_LIBCFS_FAIL_NID: - return lnet_fail_nid(data->ioc_nid, data->ioc_count); - - case IOC_LIBCFS_ADD_ROUTE: - config = arg; - - if (config->cfg_hdr.ioc_len < sizeof(*config)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_add_route(config->cfg_net, - config->cfg_config_u.cfg_route.rtr_hop, - config->cfg_nid, - config->cfg_config_u.cfg_route.rtr_priority); - if (!rc) { - rc = lnet_check_routes(); - if (rc) - lnet_del_route(config->cfg_net, - config->cfg_nid); - } - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - - case IOC_LIBCFS_DEL_ROUTE: - config = arg; - - if (config->cfg_hdr.ioc_len < sizeof(*config)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_del_route(config->cfg_net, config->cfg_nid); - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - - case IOC_LIBCFS_GET_ROUTE: - config = arg; - - if (config->cfg_hdr.ioc_len < sizeof(*config)) - return -EINVAL; - - return lnet_get_route(config->cfg_count, - &config->cfg_net, - &config->cfg_config_u.cfg_route.rtr_hop, - &config->cfg_nid, - &config->cfg_config_u.cfg_route.rtr_flags, - &config->cfg_config_u.cfg_route.rtr_priority); - - case IOC_LIBCFS_GET_NET: { - size_t total = sizeof(*config) + - sizeof(struct lnet_ioctl_net_config); - config = arg; - - if (config->cfg_hdr.ioc_len < total) - return -EINVAL; - - return lnet_get_net_config(config); - } - - case IOC_LIBCFS_GET_LNET_STATS: { - struct lnet_ioctl_lnet_stats *lnet_stats = arg; - - if (lnet_stats->st_hdr.ioc_len < sizeof(*lnet_stats)) - return -EINVAL; - - lnet_counters_get(&lnet_stats->st_cntrs); - return 0; - } - - case IOC_LIBCFS_CONFIG_RTR: - config = arg; - - if (config->cfg_hdr.ioc_len < sizeof(*config)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - if (config->cfg_config_u.cfg_buffers.buf_enable) { - rc = lnet_rtrpools_enable(); - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - } - lnet_rtrpools_disable(); - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - - case IOC_LIBCFS_ADD_BUF: - config = arg; - - if (config->cfg_hdr.ioc_len < sizeof(*config)) - return -EINVAL; - - mutex_lock(&the_lnet.ln_api_mutex); - rc = lnet_rtrpools_adjust(config->cfg_config_u.cfg_buffers.buf_tiny, - config->cfg_config_u.cfg_buffers.buf_small, - config->cfg_config_u.cfg_buffers.buf_large); - mutex_unlock(&the_lnet.ln_api_mutex); - return rc; - - case IOC_LIBCFS_GET_BUF: { - struct lnet_ioctl_pool_cfg *pool_cfg; - size_t total = sizeof(*config) + sizeof(*pool_cfg); - - config = arg; - - if (config->cfg_hdr.ioc_len < total) - return -EINVAL; - - pool_cfg = (struct lnet_ioctl_pool_cfg 
*)config->cfg_bulk; - return lnet_get_rtr_pool_cfg(config->cfg_count, pool_cfg); - } - - case IOC_LIBCFS_GET_PEER_INFO: { - struct lnet_ioctl_peer *peer_info = arg; - - if (peer_info->pr_hdr.ioc_len < sizeof(*peer_info)) - return -EINVAL; - - return lnet_get_peer_info(peer_info->pr_count, - &peer_info->pr_nid, - peer_info->pr_lnd_u.pr_peer_credits.cr_aliveness, - &peer_info->pr_lnd_u.pr_peer_credits.cr_ncpt, - &peer_info->pr_lnd_u.pr_peer_credits.cr_refcount, - &peer_info->pr_lnd_u.pr_peer_credits.cr_ni_peer_tx_credits, - &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_credits, - &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_rtr_credits, - &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_min_rtr_credits, - &peer_info->pr_lnd_u.pr_peer_credits.cr_peer_tx_qnob); - } - - case IOC_LIBCFS_NOTIFY_ROUTER: - secs_passed = (ktime_get_real_seconds() - data->ioc_u64[0]); - secs_passed *= msecs_to_jiffies(MSEC_PER_SEC); - - return lnet_notify(NULL, data->ioc_nid, data->ioc_flags, - jiffies - secs_passed); - - case IOC_LIBCFS_LNET_DIST: - rc = LNetDist(data->ioc_nid, &data->ioc_nid, &data->ioc_u32[1]); - if (rc < 0 && rc != -EHOSTUNREACH) - return rc; - - data->ioc_u32[0] = rc; - return 0; - - case IOC_LIBCFS_TESTPROTOCOMPAT: - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_testprotocompat = data->ioc_flags; - lnet_net_unlock(LNET_LOCK_EX); - return 0; - - case IOC_LIBCFS_LNET_FAULT: - return lnet_fault_ctl(data->ioc_flags, data); - - case IOC_LIBCFS_PING: - id.nid = data->ioc_nid; - id.pid = data->ioc_u32[0]; - rc = lnet_ping(id, data->ioc_u32[1], /* timeout */ - data->ioc_pbuf1, - data->ioc_plen1 / sizeof(struct lnet_process_id)); - if (rc < 0) - return rc; - data->ioc_count = rc; - return 0; - - default: - ni = lnet_net2ni(data->ioc_net); - if (!ni) - return -EINVAL; - - if (!ni->ni_lnd->lnd_ctl) - rc = -EINVAL; - else - rc = ni->ni_lnd->lnd_ctl(ni, cmd, arg); - - lnet_ni_decref(ni); - return rc; - } - /* not reached */ -} -EXPORT_SYMBOL(LNetCtl); - -void LNetDebugPeer(struct lnet_process_id id) -{ - lnet_debug_peer(id.nid); -} -EXPORT_SYMBOL(LNetDebugPeer); - -/** - * Retrieve the lnet_process_id ID of LNet interface at \a index. Note that - * all interfaces share a same PID, as requested by LNetNIInit(). - * - * \param index Index of the interface to look up. - * \param id On successful return, this location will hold the - * lnet_process_id ID of the interface. - * - * \retval 0 If an interface exists at \a index. - * \retval -ENOENT If no interface has been found. 
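
(A hypothetical in-kernel caller of LNetGetId(), enumerating local interfaces by increasing index until -ENOENT, as the contract above describes — illustration only:)

#include <linux/errno.h>
#include <linux/printk.h>

static void print_local_ids(void)
{
	struct lnet_process_id id;
	unsigned int i;

	for (i = 0; LNetGetId(i, &id) != -ENOENT; i++)
		pr_info("interface %u: nid %llu pid %u\n",
			i, (unsigned long long)id.nid, id.pid);
}
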
- */ -int -LNetGetId(unsigned int index, struct lnet_process_id *id) -{ - struct lnet_ni *ni; - struct list_head *tmp; - int cpt; - int rc = -ENOENT; - - LASSERT(the_lnet.ln_refcount > 0); - - cpt = lnet_net_lock_current(); - - list_for_each(tmp, &the_lnet.ln_nis) { - if (index--) - continue; - - ni = list_entry(tmp, struct lnet_ni, ni_list); - - id->nid = ni->ni_nid; - id->pid = the_lnet.ln_pid; - rc = 0; - break; - } - - lnet_net_unlock(cpt); - return rc; -} -EXPORT_SYMBOL(LNetGetId); - -static int lnet_ping(struct lnet_process_id id, int timeout_ms, - struct lnet_process_id __user *ids, int n_ids) -{ - struct lnet_handle_eq eqh; - struct lnet_handle_md mdh; - struct lnet_event event; - struct lnet_md md = { NULL }; - int which; - int unlinked = 0; - int replied = 0; - const int a_long_time = 60000; /* mS */ - int infosz; - struct lnet_ping_info *info; - struct lnet_process_id tmpid; - int i; - int nob; - int rc; - int rc2; - - infosz = offsetof(struct lnet_ping_info, pi_ni[n_ids]); - - if (n_ids <= 0 || - id.nid == LNET_NID_ANY || - timeout_ms > 500000 || /* arbitrary limit! */ - n_ids > 20) /* arbitrary limit! */ - return -EINVAL; - - if (id.pid == LNET_PID_ANY) - id.pid = LNET_PID_LUSTRE; - - info = kzalloc(infosz, GFP_KERNEL); - if (!info) - return -ENOMEM; - - /* NB 2 events max (including any unlink event) */ - rc = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE, &eqh); - if (rc) { - CERROR("Can't allocate EQ: %d\n", rc); - goto out_0; - } - - /* initialize md content */ - md.start = info; - md.length = infosz; - md.threshold = 2; /*GET/REPLY*/ - md.max_size = 0; - md.options = LNET_MD_TRUNCATE; - md.user_ptr = NULL; - md.eq_handle = eqh; - - rc = LNetMDBind(md, LNET_UNLINK, &mdh); - if (rc) { - CERROR("Can't bind MD: %d\n", rc); - goto out_1; - } - - rc = LNetGet(LNET_NID_ANY, mdh, id, - LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); - - if (rc) { - /* Don't CERROR; this could be deliberate! */ - - rc2 = LNetMDUnlink(mdh); - LASSERT(!rc2); - - /* NB must wait for the UNLINK event below... */ - unlinked = 1; - timeout_ms = a_long_time; - } - - do { - /* MUST block for unlink to complete */ - - rc2 = LNetEQPoll(&eqh, 1, timeout_ms, !unlinked, - &event, &which); - - CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2, - (rc2 <= 0) ? -1 : event.type, - (rc2 <= 0) ? -1 : event.status, - (rc2 > 0 && event.unlinked) ? " unlinked" : ""); - - LASSERT(rc2 != -EOVERFLOW); /* can't miss anything */ - - if (rc2 <= 0 || event.status) { - /* timeout or error */ - if (!replied && !rc) - rc = (rc2 < 0) ? rc2 : - !rc2 ? -ETIMEDOUT : - event.status; - - if (!unlinked) { - /* Ensure completion in finite time... */ - LNetMDUnlink(mdh); - /* No assertion (racing with network) */ - unlinked = 1; - timeout_ms = a_long_time; - } else if (!rc2) { - /* timed out waiting for unlink */ - CWARN("ping %s: late network completion\n", - libcfs_id2str(id)); - } - } else if (event.type == LNET_EVENT_REPLY) { - replied = 1; - rc = event.mlength; - } - - } while (rc2 <= 0 || !event.unlinked); - - if (!replied) { - if (rc >= 0) - CWARN("%s: Unexpected rc >= 0 but no reply!\n", - libcfs_id2str(id)); - rc = -EIO; - goto out_1; - } - - nob = rc; - LASSERT(nob >= 0 && nob <= infosz); - - rc = -EPROTO; /* if I can't parse... 
*/ - - if (nob < 8) { - /* can't check magic/version */ - CERROR("%s: ping info too short %d\n", - libcfs_id2str(id), nob); - goto out_1; - } - - if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) { - lnet_swap_pinginfo(info); - } else if (info->pi_magic != LNET_PROTO_PING_MAGIC) { - CERROR("%s: Unexpected magic %08x\n", - libcfs_id2str(id), info->pi_magic); - goto out_1; - } - - if (!(info->pi_features & LNET_PING_FEAT_NI_STATUS)) { - CERROR("%s: ping w/o NI status: 0x%x\n", - libcfs_id2str(id), info->pi_features); - goto out_1; - } - - if (nob < offsetof(struct lnet_ping_info, pi_ni[0])) { - CERROR("%s: Short reply %d(%d min)\n", libcfs_id2str(id), - nob, (int)offsetof(struct lnet_ping_info, pi_ni[0])); - goto out_1; - } - - if (info->pi_nnis < n_ids) - n_ids = info->pi_nnis; - - if (nob < offsetof(struct lnet_ping_info, pi_ni[n_ids])) { - CERROR("%s: Short reply %d(%d expected)\n", libcfs_id2str(id), - nob, (int)offsetof(struct lnet_ping_info, pi_ni[n_ids])); - goto out_1; - } - - rc = -EFAULT; /* If I SEGV... */ - - memset(&tmpid, 0, sizeof(tmpid)); - for (i = 0; i < n_ids; i++) { - tmpid.pid = info->pi_pid; - tmpid.nid = info->pi_ni[i].ns_nid; - if (copy_to_user(&ids[i], &tmpid, sizeof(tmpid))) - goto out_1; - } - rc = info->pi_nnis; - - out_1: - rc2 = LNetEQFree(eqh); - if (rc2) - CERROR("rc2 %d\n", rc2); - LASSERT(!rc2); - - out_0: - kfree(info); - return rc; -} diff --git a/drivers/staging/lustre/lnet/lnet/config.c b/drivers/staging/lustre/lnet/lnet/config.c deleted file mode 100644 index 55ecc1998b7e203922e071338ed93bdbc3046a5d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/config.c +++ /dev/null @@ -1,1235 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#define DEBUG_SUBSYSTEM S_LNET -#include -#include -#include -#include - -struct lnet_text_buf { /* tmp struct for parsing routes */ - struct list_head ltb_list; /* stash on lists */ - int ltb_size; /* allocated size */ - char ltb_text[0]; /* text buffer */ -}; - -static int lnet_tbnob; /* track text buf allocation */ -#define LNET_MAX_TEXTBUF_NOB (64 << 10) /* bound allocation */ -#define LNET_SINGLE_TEXTBUF_NOB (4 << 10) - -static void -lnet_syntax(char *name, char *str, int offset, int width) -{ - static char dots[LNET_SINGLE_TEXTBUF_NOB]; - static char dashes[LNET_SINGLE_TEXTBUF_NOB]; - - memset(dots, '.', sizeof(dots)); - dots[sizeof(dots) - 1] = 0; - memset(dashes, '-', sizeof(dashes)); - dashes[sizeof(dashes) - 1] = 0; - - LCONSOLE_ERROR_MSG(0x10f, "Error parsing '%s=\"%s\"'\n", name, str); - LCONSOLE_ERROR_MSG(0x110, "here...........%.*s..%.*s|%.*s|\n", - (int)strlen(name), dots, offset, dots, - (width < 1) ? 0 : width - 1, dashes); -} - -static int -lnet_issep(char c) -{ - switch (c) { - case '\n': - case '\r': - case ';': - return 1; - default: - return 0; - } -} - -int -lnet_net_unique(__u32 net, struct list_head *nilist) -{ - struct list_head *tmp; - struct lnet_ni *ni; - - list_for_each(tmp, nilist) { - ni = list_entry(tmp, struct lnet_ni, ni_list); - - if (LNET_NIDNET(ni->ni_nid) == net) - return 0; - } - - return 1; -} - -void -lnet_ni_free(struct lnet_ni *ni) -{ - int i; - - if (ni->ni_refs) - cfs_percpt_free(ni->ni_refs); - - if (ni->ni_tx_queues) - cfs_percpt_free(ni->ni_tx_queues); - - if (ni->ni_cpts) - cfs_expr_list_values_free(ni->ni_cpts, ni->ni_ncpts); - - kfree(ni->ni_lnd_tunables); - - for (i = 0; i < LNET_MAX_INTERFACES && ni->ni_interfaces[i]; i++) - kfree(ni->ni_interfaces[i]); - - /* release reference to net namespace */ - if (ni->ni_net_ns) - put_net(ni->ni_net_ns); - - kfree(ni); -} - -struct lnet_ni * -lnet_ni_alloc(__u32 net, struct cfs_expr_list *el, struct list_head *nilist) -{ - struct lnet_tx_queue *tq; - struct lnet_ni *ni; - int rc; - int i; - - if (!lnet_net_unique(net, nilist)) { - LCONSOLE_ERROR_MSG(0x111, "Duplicate network specified: %s\n", - libcfs_net2str(net)); - return NULL; - } - - ni = kzalloc(sizeof(*ni), GFP_NOFS); - if (!ni) { - CERROR("Out of memory creating network %s\n", - libcfs_net2str(net)); - return NULL; - } - - spin_lock_init(&ni->ni_lock); - INIT_LIST_HEAD(&ni->ni_cptlist); - ni->ni_refs = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*ni->ni_refs[0])); - if (!ni->ni_refs) - goto failed; - - ni->ni_tx_queues = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*ni->ni_tx_queues[0])); - if (!ni->ni_tx_queues) - goto failed; - - cfs_percpt_for_each(tq, i, ni->ni_tx_queues) - INIT_LIST_HEAD(&tq->tq_delayed); - - if (!el) { - ni->ni_cpts = NULL; - ni->ni_ncpts = LNET_CPT_NUMBER; - } else { - rc = cfs_expr_list_values(el, LNET_CPT_NUMBER, &ni->ni_cpts); - if (rc <= 0) { - CERROR("Failed to set CPTs for NI %s: %d\n", - libcfs_net2str(net), rc); - goto failed; - } - - LASSERT(rc <= LNET_CPT_NUMBER); - if (rc == LNET_CPT_NUMBER) { - cfs_expr_list_values_free(ni->ni_cpts, LNET_CPT_NUMBER); - ni->ni_cpts = NULL; - } - - ni->ni_ncpts = rc; - } - - /* LND will fill in the address part of the NID */ - ni->ni_nid = LNET_MKNID(net, 0); - - /* Store net namespace in which current ni is being created */ - if (current->nsproxy->net_ns) - ni->ni_net_ns = get_net(current->nsproxy->net_ns); - else - ni->ni_net_ns = NULL; - - ni->ni_last_alive = ktime_get_real_seconds(); - list_add_tail(&ni->ni_list, nilist); - return ni; - failed: - 
lnet_ni_free(ni);
-	return NULL;
-}
-
-int
-lnet_parse_networks(struct list_head *nilist, char *networks)
-{
-	struct cfs_expr_list *el = NULL;
-	char *tokens;
-	char *str;
-	char *tmp;
-	struct lnet_ni *ni;
-	__u32 net;
-	int nnets = 0;
-	struct list_head *temp_node;
-
-	if (!networks) {
-		CERROR("networks string is undefined\n");
-		return -EINVAL;
-	}
-
-	if (strlen(networks) > LNET_SINGLE_TEXTBUF_NOB) {
-		/* _WAY_ conservative */
-		LCONSOLE_ERROR_MSG(0x112,
-				   "Can't parse networks: string too long\n");
-		return -EINVAL;
-	}
-
-	tokens = kstrdup(networks, GFP_KERNEL);
-	if (!tokens) {
-		CERROR("Can't allocate net tokens\n");
-		return -ENOMEM;
-	}
-
-	tmp = tokens;
-	str = tokens;
-
-	while (str && *str) {
-		char *comma = strchr(str, ',');
-		char *bracket = strchr(str, '(');
-		char *square = strchr(str, '[');
-		char *iface;
-		int niface;
-		int rc;
-
-		/*
-		 * NB we don't check interface conflicts here; it's the LND's
-		 * responsibility (if it cares at all)
-		 */
-		if (square && (!comma || square < comma)) {
-			/*
-			 * i.e: o2ib0(ib0)[1,2], the numbers between the square
-			 * brackets are the CPTs this NI needs to be bound to
-			 */
-			if (bracket && bracket > square) {
-				tmp = square;
-				goto failed_syntax;
-			}
-
-			tmp = strchr(square, ']');
-			if (!tmp) {
-				tmp = square;
-				goto failed_syntax;
-			}
-
-			rc = cfs_expr_list_parse(square, tmp - square + 1,
-						 0, LNET_CPT_NUMBER - 1, &el);
-			if (rc) {
-				tmp = square;
-				goto failed_syntax;
-			}
-
-			while (square <= tmp)
-				*square++ = ' ';
-		}
-
-		if (!bracket || (comma && comma < bracket)) {
-			/* no interface list specified */
-
-			if (comma)
-				*comma++ = 0;
-			net = libcfs_str2net(strim(str));
-
-			if (net == LNET_NIDNET(LNET_NID_ANY)) {
-				LCONSOLE_ERROR_MSG(0x113,
-						   "Unrecognised network type\n");
-				tmp = str;
-				goto failed_syntax;
-			}
-
-			if (LNET_NETTYP(net) != LOLND && /* LO is implicit */
-			    !lnet_ni_alloc(net, el, nilist))
-				goto failed;
-
-			if (el) {
-				cfs_expr_list_free(el);
-				el = NULL;
-			}
-
-			str = comma;
-			continue;
-		}
-
-		*bracket = 0;
-		net = libcfs_str2net(strim(str));
-		if (net == LNET_NIDNET(LNET_NID_ANY)) {
-			tmp = str;
-			goto failed_syntax;
-		}
-
-		ni = lnet_ni_alloc(net, el, nilist);
-		if (!ni)
-			goto failed;
-
-		if (el) {
-			cfs_expr_list_free(el);
-			el = NULL;
-		}
-
-		niface = 0;
-		iface = bracket + 1;
-
-		bracket = strchr(iface, ')');
-		if (!bracket) {
-			tmp = iface;
-			goto failed_syntax;
-		}
-
-		*bracket = 0;
-		do {
-			comma = strchr(iface, ',');
-			if (comma)
-				*comma++ = 0;
-
-			iface = strim(iface);
-			if (!*iface) {
-				tmp = iface;
-				goto failed_syntax;
-			}
-
-			if (niface == LNET_MAX_INTERFACES) {
-				LCONSOLE_ERROR_MSG(0x115,
-						   "Too many interfaces for net %s\n",
-						   libcfs_net2str(net));
-				goto failed;
-			}
-
-			/*
-			 * Allocate a separate piece of memory and copy
-			 * the string into it, so we don't have
-			 * a dependency on the tokens string. This way we
-			 * can free the tokens at the end of the function.
- * The newly allocated ni_interfaces[] can be - * freed when freeing the NI - */ - ni->ni_interfaces[niface] = kstrdup(iface, GFP_KERNEL); - if (!ni->ni_interfaces[niface]) { - CERROR("Can't allocate net interface name\n"); - goto failed; - } - niface++; - iface = comma; - } while (iface); - - str = bracket + 1; - comma = strchr(bracket + 1, ','); - if (comma) { - *comma = 0; - str = strim(str); - if (*str) { - tmp = str; - goto failed_syntax; - } - str = comma + 1; - continue; - } - - str = strim(str); - if (*str) { - tmp = str; - goto failed_syntax; - } - } - - list_for_each(temp_node, nilist) - nnets++; - - kfree(tokens); - return nnets; - - failed_syntax: - lnet_syntax("networks", networks, (int)(tmp - tokens), strlen(tmp)); - failed: - while (!list_empty(nilist)) { - ni = list_entry(nilist->next, struct lnet_ni, ni_list); - - list_del(&ni->ni_list); - lnet_ni_free(ni); - } - - if (el) - cfs_expr_list_free(el); - - kfree(tokens); - - return -EINVAL; -} - -static struct lnet_text_buf * -lnet_new_text_buf(int str_len) -{ - struct lnet_text_buf *ltb; - int nob; - - /* NB allocate space for the terminating 0 */ - nob = offsetof(struct lnet_text_buf, ltb_text[str_len + 1]); - if (nob > LNET_SINGLE_TEXTBUF_NOB) { - /* _way_ conservative for "route net gateway..." */ - CERROR("text buffer too big\n"); - return NULL; - } - - if (lnet_tbnob + nob > LNET_MAX_TEXTBUF_NOB) { - CERROR("Too many text buffers\n"); - return NULL; - } - - ltb = kzalloc(nob, GFP_KERNEL); - if (!ltb) - return NULL; - - ltb->ltb_size = nob; - ltb->ltb_text[0] = 0; - lnet_tbnob += nob; - return ltb; -} - -static void -lnet_free_text_buf(struct lnet_text_buf *ltb) -{ - lnet_tbnob -= ltb->ltb_size; - kfree(ltb); -} - -static void -lnet_free_text_bufs(struct list_head *tbs) -{ - struct lnet_text_buf *ltb; - - while (!list_empty(tbs)) { - ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); - - list_del(<b->ltb_list); - lnet_free_text_buf(ltb); - } -} - -static int -lnet_str2tbs_sep(struct list_head *tbs, char *str) -{ - struct list_head pending; - char *sep; - int nob; - int i; - struct lnet_text_buf *ltb; - - INIT_LIST_HEAD(&pending); - - /* Split 'str' into separate commands */ - for (;;) { - /* skip leading whitespace */ - while (isspace(*str)) - str++; - - /* scan for separator or comment */ - for (sep = str; *sep; sep++) - if (lnet_issep(*sep) || *sep == '#') - break; - - nob = (int)(sep - str); - if (nob > 0) { - ltb = lnet_new_text_buf(nob); - if (!ltb) { - lnet_free_text_bufs(&pending); - return -ENOMEM; - } - - for (i = 0; i < nob; i++) - if (isspace(str[i])) - ltb->ltb_text[i] = ' '; - else - ltb->ltb_text[i] = str[i]; - - ltb->ltb_text[nob] = 0; - - list_add_tail(<b->ltb_list, &pending); - } - - if (*sep == '#') { - /* scan for separator */ - do { - sep++; - } while (*sep && !lnet_issep(*sep)); - } - - if (!*sep) - break; - - str = sep + 1; - } - - list_splice(&pending, tbs->prev); - return 0; -} - -static int -lnet_expand1tb(struct list_head *list, - char *str, char *sep1, char *sep2, - char *item, int itemlen) -{ - int len1 = (int)(sep1 - str); - int len2 = strlen(sep2 + 1); - struct lnet_text_buf *ltb; - - LASSERT(*sep1 == '['); - LASSERT(*sep2 == ']'); - - ltb = lnet_new_text_buf(len1 + itemlen + len2); - if (!ltb) - return -ENOMEM; - - memcpy(ltb->ltb_text, str, len1); - memcpy(<b->ltb_text[len1], item, itemlen); - memcpy(<b->ltb_text[len1 + itemlen], sep2 + 1, len2); - ltb->ltb_text[len1 + itemlen + len2] = 0; - - list_add_tail(<b->ltb_list, list); - return 0; -} - -static int 
-lnet_str2tbs_expand(struct list_head *tbs, char *str) -{ - char num[16]; - struct list_head pending; - char *sep; - char *sep2; - char *parsed; - char *enditem; - int lo; - int hi; - int stride; - int i; - int nob; - int scanned; - - INIT_LIST_HEAD(&pending); - - sep = strchr(str, '['); - if (!sep) /* nothing to expand */ - return 0; - - sep2 = strchr(sep, ']'); - if (!sep2) - goto failed; - - for (parsed = sep; parsed < sep2; parsed = enditem) { - enditem = ++parsed; - while (enditem < sep2 && *enditem != ',') - enditem++; - - if (enditem == parsed) /* no empty items */ - goto failed; - - if (sscanf(parsed, "%d-%d/%d%n", &lo, &hi, - &stride, &scanned) < 3) { - if (sscanf(parsed, "%d-%d%n", &lo, &hi, &scanned) < 2) { - /* simple string enumeration */ - if (lnet_expand1tb(&pending, str, sep, sep2, - parsed, - (int)(enditem - parsed))) { - goto failed; - } - continue; - } - - stride = 1; - } - - /* range expansion */ - - if (enditem != parsed + scanned) /* no trailing junk */ - goto failed; - - if (hi < 0 || lo < 0 || stride < 0 || hi < lo || - (hi - lo) % stride) - goto failed; - - for (i = lo; i <= hi; i += stride) { - snprintf(num, sizeof(num), "%d", i); - nob = strlen(num); - if (nob + 1 == sizeof(num)) - goto failed; - - if (lnet_expand1tb(&pending, str, sep, sep2, - num, nob)) - goto failed; - } - } - - list_splice(&pending, tbs->prev); - return 1; - - failed: - lnet_free_text_bufs(&pending); - return -EINVAL; -} - -static int -lnet_parse_hops(char *str, unsigned int *hops) -{ - int len = strlen(str); - int nob = len; - - return (sscanf(str, "%u%n", hops, &nob) >= 1 && - nob == len && - *hops > 0 && *hops < 256); -} - -#define LNET_PRIORITY_SEPARATOR (':') - -static int -lnet_parse_priority(char *str, unsigned int *priority, char **token) -{ - int nob; - char *sep; - int len; - - sep = strchr(str, LNET_PRIORITY_SEPARATOR); - if (!sep) { - *priority = 0; - return 0; - } - len = strlen(sep + 1); - - if ((sscanf((sep + 1), "%u%n", priority, &nob) < 1) || (len != nob)) { - /* - * Update the caller's token pointer so it treats the found - * priority as the token to report in the error message. - */ - *token += sep - str + 1; - return -EINVAL; - } - - CDEBUG(D_NET, "gateway %s, priority %d, nob %d\n", str, *priority, nob); - - /* - * Change priority separator to \0 to be able to parse NID - */ - *sep = '\0'; - return 0; -} - -static int -lnet_parse_route(char *str, int *im_a_router) -{ - /* static scratch buffer OK (single threaded) */ - static char cmd[LNET_SINGLE_TEXTBUF_NOB]; - - struct list_head nets; - struct list_head gateways; - struct list_head *tmp1; - struct list_head *tmp2; - __u32 net; - lnet_nid_t nid; - struct lnet_text_buf *ltb; - int rc; - char *sep; - char *token = str; - int ntokens = 0; - int myrc = -1; - __u32 hops; - int got_hops = 0; - unsigned int priority = 0; - - INIT_LIST_HEAD(&gateways); - INIT_LIST_HEAD(&nets); - - /* save a copy of the string for error messages */ - strncpy(cmd, str, sizeof(cmd)); - cmd[sizeof(cmd) - 1] = '\0'; - - sep = str; - for (;;) { - /* scan for token start */ - while (isspace(*sep)) - sep++; - if (!*sep) { - if (ntokens < (got_hops ? 
3 : 2)) - goto token_error; - break; - } - - ntokens++; - token = sep++; - - /* scan for token end */ - while (*sep && !isspace(*sep)) - sep++; - if (*sep) - *sep++ = 0; - - if (ntokens == 1) { - tmp2 = &nets; /* expanding nets */ - } else if (ntokens == 2 && - lnet_parse_hops(token, &hops)) { - got_hops = 1; /* got a hop count */ - continue; - } else { - tmp2 = &gateways; /* expanding gateways */ - } - - ltb = lnet_new_text_buf(strlen(token)); - if (!ltb) - goto out; - - strcpy(ltb->ltb_text, token); - tmp1 = <b->ltb_list; - list_add_tail(tmp1, tmp2); - - while (tmp1 != tmp2) { - ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); - - rc = lnet_str2tbs_expand(tmp1->next, ltb->ltb_text); - if (rc < 0) - goto token_error; - - tmp1 = tmp1->next; - - if (rc > 0) { /* expanded! */ - list_del(<b->ltb_list); - lnet_free_text_buf(ltb); - continue; - } - - if (ntokens == 1) { - net = libcfs_str2net(ltb->ltb_text); - if (net == LNET_NIDNET(LNET_NID_ANY) || - LNET_NETTYP(net) == LOLND) - goto token_error; - } else { - rc = lnet_parse_priority(ltb->ltb_text, - &priority, &token); - if (rc < 0) - goto token_error; - - nid = libcfs_str2nid(ltb->ltb_text); - if (nid == LNET_NID_ANY || - LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) - goto token_error; - } - } - } - - /** - * if there are no hops set then we want to flag this value as - * unset since hops is an optional parameter - */ - if (!got_hops) - hops = LNET_UNDEFINED_HOPS; - - LASSERT(!list_empty(&nets)); - LASSERT(!list_empty(&gateways)); - - list_for_each(tmp1, &nets) { - ltb = list_entry(tmp1, struct lnet_text_buf, ltb_list); - net = libcfs_str2net(ltb->ltb_text); - LASSERT(net != LNET_NIDNET(LNET_NID_ANY)); - - list_for_each(tmp2, &gateways) { - ltb = list_entry(tmp2, struct lnet_text_buf, ltb_list); - nid = libcfs_str2nid(ltb->ltb_text); - LASSERT(nid != LNET_NID_ANY); - - if (lnet_islocalnid(nid)) { - *im_a_router = 1; - continue; - } - - rc = lnet_add_route(net, hops, nid, priority); - if (rc && rc != -EEXIST && rc != -EHOSTUNREACH) { - CERROR("Can't create route to %s via %s\n", - libcfs_net2str(net), - libcfs_nid2str(nid)); - goto out; - } - } - } - - myrc = 0; - goto out; - - token_error: - lnet_syntax("routes", cmd, (int)(token - str), strlen(token)); - out: - lnet_free_text_bufs(&nets); - lnet_free_text_bufs(&gateways); - return myrc; -} - -static int -lnet_parse_route_tbs(struct list_head *tbs, int *im_a_router) -{ - struct lnet_text_buf *ltb; - - while (!list_empty(tbs)) { - ltb = list_entry(tbs->next, struct lnet_text_buf, ltb_list); - - if (lnet_parse_route(ltb->ltb_text, im_a_router) < 0) { - lnet_free_text_bufs(tbs); - return -EINVAL; - } - - list_del(<b->ltb_list); - lnet_free_text_buf(ltb); - } - - return 0; -} - -int -lnet_parse_routes(char *routes, int *im_a_router) -{ - struct list_head tbs; - int rc = 0; - - *im_a_router = 0; - - INIT_LIST_HEAD(&tbs); - - if (lnet_str2tbs_sep(&tbs, routes) < 0) { - CERROR("Error parsing routes\n"); - rc = -EINVAL; - } else { - rc = lnet_parse_route_tbs(&tbs, im_a_router); - } - - LASSERT(!lnet_tbnob); - return rc; -} - -static int -lnet_match_network_token(char *token, int len, __u32 *ipaddrs, int nip) -{ - LIST_HEAD(list); - int rc; - int i; - - rc = cfs_ip_addr_parse(token, len, &list); - if (rc) - return rc; - - for (rc = i = 0; !rc && i < nip; i++) - rc = cfs_ip_addr_match(ipaddrs[i], &list); - - cfs_expr_list_free_list(&list); - - return rc; -} - -static int -lnet_match_network_tokens(char *net_entry, __u32 *ipaddrs, int nip) -{ - static char tokens[LNET_SINGLE_TEXTBUF_NOB]; - - int 
matched = 0; - int ntokens = 0; - int len; - char *net = NULL; - char *sep; - char *token; - int rc; - - LASSERT(strlen(net_entry) < sizeof(tokens)); - - /* work on a copy of the string */ - strcpy(tokens, net_entry); - sep = tokens; - for (;;) { - /* scan for token start */ - while (isspace(*sep)) - sep++; - if (!*sep) - break; - - token = sep++; - - /* scan for token end */ - while (*sep && !isspace(*sep)) - sep++; - if (*sep) - *sep++ = 0; - - if (!ntokens++) { - net = token; - continue; - } - - len = strlen(token); - - rc = lnet_match_network_token(token, len, ipaddrs, nip); - if (rc < 0) { - lnet_syntax("ip2nets", net_entry, - (int)(token - tokens), len); - return rc; - } - - if (rc) - matched |= 1; - } - - if (!matched) - return 0; - - strcpy(net_entry, net); /* replace with matched net */ - return 1; -} - -static __u32 -lnet_netspec2net(char *netspec) -{ - char *bracket = strchr(netspec, '('); - __u32 net; - - if (bracket) - *bracket = 0; - - net = libcfs_str2net(netspec); - - if (bracket) - *bracket = '('; - - return net; -} - -static int -lnet_splitnets(char *source, struct list_head *nets) -{ - int offset = 0; - int offset2; - int len; - struct lnet_text_buf *tb; - struct lnet_text_buf *tb2; - struct list_head *t; - char *sep; - char *bracket; - __u32 net; - - LASSERT(!list_empty(nets)); - LASSERT(nets->next == nets->prev); /* single entry */ - - tb = list_entry(nets->next, struct lnet_text_buf, ltb_list); - - for (;;) { - sep = strchr(tb->ltb_text, ','); - bracket = strchr(tb->ltb_text, '('); - - if (sep && bracket && bracket < sep) { - /* netspec lists interfaces... */ - - offset2 = offset + (int)(bracket - tb->ltb_text); - len = strlen(bracket); - - bracket = strchr(bracket + 1, ')'); - - if (!bracket || - !(bracket[1] == ',' || !bracket[1])) { - lnet_syntax("ip2nets", source, offset2, len); - return -EINVAL; - } - - sep = !bracket[1] ? 
NULL : bracket + 1; - } - - if (sep) - *sep++ = 0; - - net = lnet_netspec2net(tb->ltb_text); - if (net == LNET_NIDNET(LNET_NID_ANY)) { - lnet_syntax("ip2nets", source, offset, - strlen(tb->ltb_text)); - return -EINVAL; - } - - list_for_each(t, nets) { - tb2 = list_entry(t, struct lnet_text_buf, ltb_list); - - if (tb2 == tb) - continue; - - if (net == lnet_netspec2net(tb2->ltb_text)) { - /* duplicate network */ - lnet_syntax("ip2nets", source, offset, - strlen(tb->ltb_text)); - return -EINVAL; - } - } - - if (!sep) - return 0; - - offset += (int)(sep - tb->ltb_text); - len = strlen(sep); - tb2 = lnet_new_text_buf(len); - if (!tb2) - return -ENOMEM; - - strncpy(tb2->ltb_text, sep, len); - tb2->ltb_text[len] = '\0'; - list_add_tail(&tb2->ltb_list, nets); - - tb = tb2; - } -} - -static int -lnet_match_networks(char **networksp, char *ip2nets, __u32 *ipaddrs, int nip) -{ - static char networks[LNET_SINGLE_TEXTBUF_NOB]; - static char source[LNET_SINGLE_TEXTBUF_NOB]; - - struct list_head raw_entries; - struct list_head matched_nets; - struct list_head current_nets; - struct list_head *t; - struct list_head *t2; - struct lnet_text_buf *tb; - struct lnet_text_buf *temp; - struct lnet_text_buf *tb2; - __u32 net1; - __u32 net2; - int len; - int count; - int dup; - int rc; - - INIT_LIST_HEAD(&raw_entries); - if (lnet_str2tbs_sep(&raw_entries, ip2nets) < 0) { - CERROR("Error parsing ip2nets\n"); - LASSERT(!lnet_tbnob); - return -EINVAL; - } - - INIT_LIST_HEAD(&matched_nets); - INIT_LIST_HEAD(¤t_nets); - networks[0] = 0; - count = 0; - len = 0; - rc = 0; - - list_for_each_entry_safe(tb, temp, &raw_entries, ltb_list) { - strncpy(source, tb->ltb_text, sizeof(source)); - source[sizeof(source) - 1] = '\0'; - - /* replace ltb_text with the network(s) add on match */ - rc = lnet_match_network_tokens(tb->ltb_text, ipaddrs, nip); - if (rc < 0) - break; - - list_del(&tb->ltb_list); - - if (!rc) { /* no match */ - lnet_free_text_buf(tb); - continue; - } - - /* split into separate networks */ - INIT_LIST_HEAD(¤t_nets); - list_add(&tb->ltb_list, ¤t_nets); - rc = lnet_splitnets(source, ¤t_nets); - if (rc < 0) - break; - - dup = 0; - list_for_each(t, ¤t_nets) { - tb = list_entry(t, struct lnet_text_buf, ltb_list); - net1 = lnet_netspec2net(tb->ltb_text); - LASSERT(net1 != LNET_NIDNET(LNET_NID_ANY)); - - list_for_each(t2, &matched_nets) { - tb2 = list_entry(t2, struct lnet_text_buf, - ltb_list); - net2 = lnet_netspec2net(tb2->ltb_text); - LASSERT(net2 != LNET_NIDNET(LNET_NID_ANY)); - - if (net1 == net2) { - dup = 1; - break; - } - } - - if (dup) - break; - } - - if (dup) { - lnet_free_text_bufs(¤t_nets); - continue; - } - - list_for_each_safe(t, t2, ¤t_nets) { - tb = list_entry(t, struct lnet_text_buf, ltb_list); - - list_del(&tb->ltb_list); - list_add_tail(&tb->ltb_list, &matched_nets); - - len += snprintf(networks + len, sizeof(networks) - len, - "%s%s", !len ? 
"" : ",", - tb->ltb_text); - - if (len >= sizeof(networks)) { - CERROR("Too many matched networks\n"); - rc = -E2BIG; - goto out; - } - } - - count++; - } - - out: - lnet_free_text_bufs(&raw_entries); - lnet_free_text_bufs(&matched_nets); - lnet_free_text_bufs(¤t_nets); - LASSERT(!lnet_tbnob); - - if (rc < 0) - return rc; - - *networksp = networks; - return count; -} - -static int -lnet_ipaddr_enumerate(__u32 **ipaddrsp) -{ - int up; - __u32 netmask; - __u32 *ipaddrs; - __u32 *ipaddrs2; - int nip; - char **ifnames; - int nif = lnet_ipif_enumerate(&ifnames); - int i; - int rc; - - if (nif <= 0) - return nif; - - ipaddrs = kcalloc(nif, sizeof(*ipaddrs), GFP_KERNEL); - if (!ipaddrs) { - CERROR("Can't allocate ipaddrs[%d]\n", nif); - lnet_ipif_free_enumeration(ifnames, nif); - return -ENOMEM; - } - - for (i = nip = 0; i < nif; i++) { - if (!strcmp(ifnames[i], "lo")) - continue; - - rc = lnet_ipif_query(ifnames[i], &up, &ipaddrs[nip], &netmask); - if (rc) { - CWARN("Can't query interface %s: %d\n", - ifnames[i], rc); - continue; - } - - if (!up) { - CWARN("Ignoring interface %s: it's down\n", - ifnames[i]); - continue; - } - - nip++; - } - - lnet_ipif_free_enumeration(ifnames, nif); - - if (nip == nif) { - *ipaddrsp = ipaddrs; - } else { - if (nip > 0) { - ipaddrs2 = kcalloc(nip, sizeof(*ipaddrs2), - GFP_KERNEL); - if (!ipaddrs2) { - CERROR("Can't allocate ipaddrs[%d]\n", nip); - nip = -ENOMEM; - } else { - memcpy(ipaddrs2, ipaddrs, - nip * sizeof(*ipaddrs)); - *ipaddrsp = ipaddrs2; - rc = nip; - } - } - kfree(ipaddrs); - } - return nip; -} - -int -lnet_parse_ip2nets(char **networksp, char *ip2nets) -{ - __u32 *ipaddrs = NULL; - int nip = lnet_ipaddr_enumerate(&ipaddrs); - int rc; - - if (nip < 0) { - LCONSOLE_ERROR_MSG(0x117, - "Error %d enumerating local IP interfaces for ip2nets to match\n", - nip); - return nip; - } - - if (!nip) { - LCONSOLE_ERROR_MSG(0x118, - "No local IP interfaces for ip2nets to match\n"); - return -ENOENT; - } - - rc = lnet_match_networks(networksp, ip2nets, ipaddrs, nip); - kfree(ipaddrs); - - if (rc < 0) { - LCONSOLE_ERROR_MSG(0x119, "Error %d parsing ip2nets\n", rc); - return rc; - } - - if (!rc) { - LCONSOLE_ERROR_MSG(0x11a, - "ip2nets does not match any local IP interfaces\n"); - return -ENOENT; - } - - return 0; -} diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c deleted file mode 100644 index c78e70373ab4382f82a0668fe38c1cd4aba346de..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-eq.c +++ /dev/null @@ -1,426 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Use is subject to license terms.
- *
- * Copyright (c) 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lnet/lnet/lib-eq.c
- *
- * Library level Event queue management routines
- */
-
-#define DEBUG_SUBSYSTEM S_LNET
-
-#include <linux/lnet/lib-lnet.h>
-
-/**
- * Create an event queue that has room for \a count number of events.
- *
- * The event queue is circular and older events will be overwritten by new
- * ones if they are not removed in time by the user using the functions
- * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to
- * determine the appropriate size of the event queue to prevent this loss
- * of events. Note that when an EQ handler is specified in \a callback, no
- * event loss can happen, since the handler is run for each event deposited
- * into the EQ.
- *
- * \param count The number of events to be stored in the event queue. It
- * will be rounded up to the next power of two.
- * \param callback A handler function that runs when an event is deposited
- * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to
- * indicate that no event handler is desired.
- * \param handle On successful return, this location will hold a handle for
- * the newly created EQ.
- *
- * \retval 0 On success.
- * \retval -EINVAL If a parameter is not valid.
- * \retval -ENOMEM If memory for the EQ can't be allocated.
- *
- * \see lnet_eq_handler_t for the discussion on EQ handler semantics.
- */
-int
-LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback,
-	    struct lnet_handle_eq *handle)
-{
-	struct lnet_eq *eq;
-
-	LASSERT(the_lnet.ln_refcount > 0);
-
-	/*
-	 * We need count to be a power of 2 so that when eq_{enq,deq}_seq
-	 * overflow, they don't skip entries, so the queue has the same
-	 * apparent capacity at all times
-	 */
-	if (count)
-		count = roundup_pow_of_two(count);
-
-	if (callback != LNET_EQ_HANDLER_NONE && count)
-		CWARN("An EQ callback is guaranteed to get every event; do you still want eqcount %d for polling events, which adds locking overhead? Please contact the developers to confirm\n", count);
-
-	/*
-	 * count can be 0 if the caller only needs the callback; that
-	 * eliminates the overhead of enqueuing events
-	 */
-	if (!count && callback == LNET_EQ_HANDLER_NONE)
-		return -EINVAL;
-
-	eq = kzalloc(sizeof(*eq), GFP_NOFS);
-	if (!eq)
-		return -ENOMEM;
-
-	if (count) {
-		eq->eq_events = kvmalloc_array(count, sizeof(struct lnet_event),
-					       GFP_KERNEL | __GFP_ZERO);
-		if (!eq->eq_events)
-			goto failed;
-		/*
-		 * NB allocator has set all event sequence numbers to 0,
-		 * so they are all earlier than eq_deq_seq
-		 */
-	}
-
-	eq->eq_deq_seq = 1;
-	eq->eq_enq_seq = 1;
-	eq->eq_size = count;
-	eq->eq_callback = callback;
-
-	eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(),
-				       sizeof(*eq->eq_refs[0]));
-	if (!eq->eq_refs)
-		goto failed;
-
-	/* MUST hold the exclusive lnet_res_lock */
-	lnet_res_lock(LNET_LOCK_EX);
-	/*
-	 * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
-	 * both EQ lookup and poll event with only lnet_eq_wait_lock
-	 */
-	lnet_eq_wait_lock();
-
-	lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh);
-	list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active);
-
-	lnet_eq_wait_unlock();
-	lnet_res_unlock(LNET_LOCK_EX);
-
-	lnet_eq2handle(handle, eq);
-	return 0;
-
-failed:
-	kvfree(eq->eq_events);
-
-	if (eq->eq_refs)
-		cfs_percpt_free(eq->eq_refs);
-
-	kfree(eq);
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(LNetEQAlloc);
-
-/**
- * Release the resources associated with an event queue if it's idle;
- * otherwise do nothing and it's up to the user to try again.
- *
- * \param eqh A handle for the event queue to be released.
- *
- * \retval 0 If the EQ is not in use and freed.
- * \retval -ENOENT If \a eqh does not point to a valid EQ.
- * \retval -EBUSY If the EQ is still in use by some MDs.
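- *
- * A minimal teardown sketch (illustrative only; eqh is a handle from
- * LNetEQAlloc() and all MDs using the EQ have been unlinked first):
- *
- *	int rc;
- *
- *	rc = LNetEQFree(eqh);
- *	if (rc == -EBUSY)
- *		CWARN("EQ still referenced by an MD; retry later\n");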
- */
-int
-LNetEQFree(struct lnet_handle_eq eqh)
-{
-	struct lnet_eq *eq;
-	struct lnet_event *events = NULL;
-	int **refs = NULL;
-	int *ref;
-	int rc = 0;
-	int size = 0;
-	int i;
-
-	LASSERT(the_lnet.ln_refcount > 0);
-
-	lnet_res_lock(LNET_LOCK_EX);
-	/*
-	 * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do
-	 * both EQ lookup and poll event with only lnet_eq_wait_lock
-	 */
-	lnet_eq_wait_lock();
-
-	eq = lnet_handle2eq(&eqh);
-	if (!eq) {
-		rc = -ENOENT;
-		goto out;
-	}
-
-	cfs_percpt_for_each(ref, i, eq->eq_refs) {
-		LASSERT(*ref >= 0);
-		if (!*ref)
-			continue;
-
-		CDEBUG(D_NET, "Event queue (%d: %d) busy on destroy.\n",
-		       i, *ref);
-		rc = -EBUSY;
-		goto out;
-	}
-
-	/* stash for free after lock dropped */
-	events = eq->eq_events;
-	size = eq->eq_size;
-	refs = eq->eq_refs;
-
-	lnet_res_lh_invalidate(&eq->eq_lh);
-	list_del(&eq->eq_list);
-	kfree(eq);
- out:
-	lnet_eq_wait_unlock();
-	lnet_res_unlock(LNET_LOCK_EX);
-
-	kvfree(events);
-	if (refs)
-		cfs_percpt_free(refs);
-
-	return rc;
-}
-EXPORT_SYMBOL(LNetEQFree);
-
-void
-lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev)
-{
-	/* MUST be called with the resource lock held but w/o lnet_eq_wait_lock */
-	int index;
-
-	if (!eq->eq_size) {
-		LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE);
-		eq->eq_callback(ev);
-		return;
-	}
-
-	lnet_eq_wait_lock();
-	ev->sequence = eq->eq_enq_seq++;
-
-	LASSERT(is_power_of_2(eq->eq_size));
-	index = ev->sequence & (eq->eq_size - 1);
-
-	eq->eq_events[index] = *ev;
-
-	if (eq->eq_callback != LNET_EQ_HANDLER_NONE)
-		eq->eq_callback(ev);
-
-	/* Wake anyone waiting in LNetEQPoll() */
-	if (waitqueue_active(&the_lnet.ln_eq_waitq))
-		wake_up_all(&the_lnet.ln_eq_waitq);
-	lnet_eq_wait_unlock();
-}
-
-static int
-lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev)
-{
-	int new_index = eq->eq_deq_seq & (eq->eq_size - 1);
-	struct lnet_event *new_event = &eq->eq_events[new_index];
-	int rc;
-
-	/* must be called with lnet_eq_wait_lock held */
-	if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence))
-		return 0;
-
-	/* We've got a new event... */
-	*ev = *new_event;
-
-	CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n",
-	       new_event, eq->eq_deq_seq, eq->eq_size);
-
-	/* ...but did it overwrite an event we've not seen yet? */
-	if (eq->eq_deq_seq == new_event->sequence) {
-		rc = 1;
-	} else {
-		/*
-		 * don't complain with CERROR: some EQs are sized small
-		 * anyway; if it's important, the caller should complain
-		 */
-		CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n",
-		       eq->eq_deq_seq, new_event->sequence);
-		rc = -EOVERFLOW;
-	}
-
-	eq->eq_deq_seq = new_event->sequence + 1;
-	return rc;
-}
-
-/**
- * A nonblocking function that can be used to get the next event in an EQ.
- * If an event handler is associated with the EQ, the handler will run before
- * this function returns successfully. The event is removed from the queue.
- *
- * \param eventq A handle for the event queue.
- * \param event On successful return (1 or -EOVERFLOW), this location will
- * hold the next event in the EQ.
- *
- * \retval 0 No pending event in the EQ.
- * \retval 1 Indicates success.
- * \retval -ENOENT If \a eventq does not point to a valid EQ.
- * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that
- * at least one event between this event and the last event obtained from the
- * EQ has been dropped due to limited space in the EQ.
- */
-
-/**
- * Block the calling process until there is an event in the EQ.
- * If an event handler is associated with the EQ, the handler will run before - * this function returns successfully. This function returns the next event - * in the EQ and removes it from the EQ. - * - * \param eventq A handle for the event queue. - * \param event On successful return (1 or -EOVERFLOW), this location will - * hold the next event in the EQ. - * - * \retval 1 Indicates success. - * \retval -ENOENT If \a eventq does not point to a valid EQ. - * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that - * at least one event between this event and the last event obtained from the - * EQ has been dropped due to limited space in the EQ. - */ - -static int -lnet_eq_wait_locked(int *timeout_ms, long state) -__must_hold(&the_lnet.ln_eq_wait_lock) -{ - int tms = *timeout_ms; - int wait; - wait_queue_entry_t wl; - unsigned long now; - - if (!tms) - return -ENXIO; /* don't want to wait and no new event */ - - init_waitqueue_entry(&wl, current); - set_current_state(state); - add_wait_queue(&the_lnet.ln_eq_waitq, &wl); - - lnet_eq_wait_unlock(); - - if (tms < 0) { - schedule(); - } else { - now = jiffies; - schedule_timeout(msecs_to_jiffies(tms)); - tms -= jiffies_to_msecs(jiffies - now); - if (tms < 0) /* no more wait but may have new event */ - tms = 0; - } - - wait = tms; /* might need to call here again */ - *timeout_ms = tms; - - lnet_eq_wait_lock(); - remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); - - return wait; -} - -/** - * Block the calling process until there's an event from a set of EQs or - * timeout happens. - * - * If an event handler is associated with the EQ, the handler will run before - * this function returns successfully, in which case the corresponding event - * is consumed. - * - * LNetEQPoll() provides a timeout to allow applications to poll, block for a - * fixed period, or block indefinitely. - * - * \param eventqs,neq An array of EQ handles, and size of the array. - * \param timeout_ms Time in milliseconds to wait for an event to occur on - * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an - * infinite timeout. - * \param interruptible, if true, use TASK_INTERRUPTIBLE, else TASK_NOLOAD - * \param event,which On successful return (1 or -EOVERFLOW), \a event will - * hold the next event in the EQs, and \a which will contain the index of the - * EQ from which the event was taken. - * - * \retval 0 No pending event in the EQs after timeout. - * \retval 1 Indicates success. - * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that - * at least one event between this event and the last event obtained from the - * EQ indicated by \a which has been dropped due to limited space in the EQ. - * \retval -ENOENT If there's an invalid handle in \a eventqs. 
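- *
- * A minimal polling sketch (illustrative only; a single EQ handle eqh from
- * LNetEQAlloc(), a 100ms timeout and no interruptible sleep):
- *
- *	struct lnet_event ev;
- *	int which;
- *	int rc;
- *
- *	rc = LNetEQPoll(&eqh, 1, 100, 0, &ev, &which);
- *	if (rc == 1 || rc == -EOVERFLOW)
- *		consume_event(&ev);	/* hypothetical consumer */
- *	else if (!rc)
- *		;	/* timed out, no event pending */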
- */ -int -LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, int timeout_ms, - int interruptible, - struct lnet_event *event, int *which) -{ - int wait = 1; - int rc; - int i; - - LASSERT(the_lnet.ln_refcount > 0); - - if (neq < 1) - return -ENOENT; - - lnet_eq_wait_lock(); - - for (;;) { - for (i = 0; i < neq; i++) { - struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); - - if (!eq) { - lnet_eq_wait_unlock(); - return -ENOENT; - } - - rc = lnet_eq_dequeue_event(eq, event); - if (rc) { - lnet_eq_wait_unlock(); - *which = i; - return rc; - } - } - - if (!wait) - break; - - /* - * return value of lnet_eq_wait_locked: - * -1 : did nothing and it's sure no new event - * 1 : sleep inside and wait until new event - * 0 : don't want to wait anymore, but might have new event - * so need to call dequeue again - */ - wait = lnet_eq_wait_locked(&timeout_ms, - interruptible ? TASK_INTERRUPTIBLE - : TASK_NOLOAD); - if (wait < 0) /* no new event */ - break; - } - - lnet_eq_wait_unlock(); - return 0; -} diff --git a/drivers/staging/lustre/lnet/lnet/lib-md.c b/drivers/staging/lustre/lnet/lnet/lib-md.c deleted file mode 100644 index 8a22514aaf713ab7c00dfc71114c250de71d8cfb..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-md.c +++ /dev/null @@ -1,463 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/lnet/lib-md.c - * - * Memory Descriptor management routines - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include - -/* must be called with lnet_res_lock held */ -void -lnet_md_unlink(struct lnet_libmd *md) -{ - if (!(md->md_flags & LNET_MD_FLAG_ZOMBIE)) { - /* first unlink attempt... 
*/ - struct lnet_me *me = md->md_me; - - md->md_flags |= LNET_MD_FLAG_ZOMBIE; - - /* - * Disassociate from ME (if any), - * and unlink it if it was created - * with LNET_UNLINK - */ - if (me) { - /* detach MD from portal */ - lnet_ptl_detach_md(me, md); - if (me->me_unlink == LNET_UNLINK) - lnet_me_unlink(me); - } - - /* ensure all future handle lookups fail */ - lnet_res_lh_invalidate(&md->md_lh); - } - - if (md->md_refcount) { - CDEBUG(D_NET, "Queueing unlink of md %p\n", md); - return; - } - - CDEBUG(D_NET, "Unlinking md %p\n", md); - - if (md->md_eq) { - int cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); - - LASSERT(*md->md_eq->eq_refs[cpt] > 0); - (*md->md_eq->eq_refs[cpt])--; - } - - LASSERT(!list_empty(&md->md_list)); - list_del_init(&md->md_list); - kfree(md); -} - -static int -lnet_md_build(struct lnet_libmd *lmd, struct lnet_md *umd, int unlink) -{ - int i; - unsigned int niov; - int total_length = 0; - - lmd->md_me = NULL; - lmd->md_start = umd->start; - lmd->md_offset = 0; - lmd->md_max_size = umd->max_size; - lmd->md_options = umd->options; - lmd->md_user_ptr = umd->user_ptr; - lmd->md_eq = NULL; - lmd->md_threshold = umd->threshold; - lmd->md_refcount = 0; - lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0; - - if (umd->options & LNET_MD_IOVEC) { - if (umd->options & LNET_MD_KIOV) /* Can't specify both */ - return -EINVAL; - - niov = umd->length; - lmd->md_niov = umd->length; - memcpy(lmd->md_iov.iov, umd->start, - niov * sizeof(lmd->md_iov.iov[0])); - - for (i = 0; i < (int)niov; i++) { - /* We take the base address on trust */ - /* invalid length */ - if (lmd->md_iov.iov[i].iov_len <= 0) - return -EINVAL; - - total_length += lmd->md_iov.iov[i].iov_len; - } - - lmd->md_length = total_length; - - if ((umd->options & LNET_MD_MAX_SIZE) && /* use max size */ - (umd->max_size < 0 || - umd->max_size > total_length)) /* illegal max_size */ - return -EINVAL; - - } else if (umd->options & LNET_MD_KIOV) { - niov = umd->length; - lmd->md_niov = umd->length; - memcpy(lmd->md_iov.kiov, umd->start, - niov * sizeof(lmd->md_iov.kiov[0])); - - for (i = 0; i < (int)niov; i++) { - /* We take the page pointer on trust */ - if (lmd->md_iov.kiov[i].bv_offset + - lmd->md_iov.kiov[i].bv_len > PAGE_SIZE) - return -EINVAL; /* invalid length */ - - total_length += lmd->md_iov.kiov[i].bv_len; - } - - lmd->md_length = total_length; - - if ((umd->options & LNET_MD_MAX_SIZE) && /* max size used */ - (umd->max_size < 0 || - umd->max_size > total_length)) /* illegal max_size */ - return -EINVAL; - } else { /* contiguous */ - lmd->md_length = umd->length; - niov = 1; - lmd->md_niov = 1; - lmd->md_iov.iov[0].iov_base = umd->start; - lmd->md_iov.iov[0].iov_len = umd->length; - - if ((umd->options & LNET_MD_MAX_SIZE) && /* max size used */ - (umd->max_size < 0 || - umd->max_size > (int)umd->length)) /* illegal max_size */ - return -EINVAL; - } - - return 0; -} - -/* must be called with resource lock held */ -static int -lnet_md_link(struct lnet_libmd *md, struct lnet_handle_eq eq_handle, int cpt) -{ - struct lnet_res_container *container = the_lnet.ln_md_containers[cpt]; - - /* - * NB we are passed an allocated, but inactive md. - * if we return success, caller may lnet_md_unlink() it. - * otherwise caller may only kfree() it. - */ - /* - * This implementation doesn't know how to create START events or - * disable END events. Best to LASSERT our caller is compliant so - * we find out quickly... 
- */ - /* - * TODO - reevaluate what should be here in light of - * the removal of the start and end events - * maybe there we shouldn't even allow LNET_EQ_NONE!) - * LASSERT(!eq); - */ - if (!LNetEQHandleIsInvalid(eq_handle)) { - md->md_eq = lnet_handle2eq(&eq_handle); - - if (!md->md_eq) - return -ENOENT; - - (*md->md_eq->eq_refs[cpt])++; - } - - lnet_res_lh_initialize(container, &md->md_lh); - - LASSERT(list_empty(&md->md_list)); - list_add(&md->md_list, &container->rec_active); - - return 0; -} - -/* must be called with lnet_res_lock held */ -void -lnet_md_deconstruct(struct lnet_libmd *lmd, struct lnet_md *umd) -{ - /* NB this doesn't copy out all the iov entries so when a - * discontiguous MD is copied out, the target gets to know the - * original iov pointer (in start) and the number of entries it had - * and that's all. - */ - umd->start = lmd->md_start; - umd->length = !(lmd->md_options & - (LNET_MD_IOVEC | LNET_MD_KIOV)) ? - lmd->md_length : lmd->md_niov; - umd->threshold = lmd->md_threshold; - umd->max_size = lmd->md_max_size; - umd->options = lmd->md_options; - umd->user_ptr = lmd->md_user_ptr; - lnet_eq2handle(&umd->eq_handle, lmd->md_eq); -} - -static int -lnet_md_validate(struct lnet_md *umd) -{ - if (!umd->start && umd->length) { - CERROR("MD start pointer can not be NULL with length %u\n", - umd->length); - return -EINVAL; - } - - if ((umd->options & (LNET_MD_KIOV | LNET_MD_IOVEC)) && - umd->length > LNET_MAX_IOV) { - CERROR("Invalid option: too many fragments %u, %d max\n", - umd->length, LNET_MAX_IOV); - return -EINVAL; - } - - return 0; -} - -/** - * Create a memory descriptor and attach it to a ME - * - * \param meh A handle for a ME to associate the new MD with. - * \param umd Provides initial values for the user-visible parts of a MD. - * Other than its use for initialization, there is no linkage between this - * structure and the MD maintained by the LNet. - * \param unlink A flag to indicate whether the MD is automatically unlinked - * when it becomes inactive, either because the operation threshold drops to - * zero or because the available memory becomes less than \a umd.max_size. - * (Note that the check for unlinking a MD only occurs after the completion - * of a successful operation on the MD.) The value LNET_UNLINK enables auto - * unlinking; the value LNET_RETAIN disables it. - * \param handle On successful returns, a handle to the newly created MD is - * saved here. This handle can be used later in LNetMDUnlink(). - * - * \retval 0 On success. - * \retval -EINVAL If \a umd is not valid. - * \retval -ENOMEM If new MD cannot be allocated. - * \retval -ENOENT Either \a meh or \a umd.eq_handle does not point to a - * valid object. Note that it's OK to supply a NULL \a umd.eq_handle by - * calling LNetInvalidateHandle() on it. - * \retval -EBUSY If the ME pointed to by \a meh is already associated with - * a MD. 
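- *
- * A minimal attach sketch (illustrative only; meh comes from a prior
- * LNetMEAttach(), eqh from LNetEQAlloc(), and buf/len are hypothetical):
- *
- *	struct lnet_md umd = {
- *		.start		= buf,
- *		.length		= len,
- *		.threshold	= 1,
- *		.options	= LNET_MD_OP_PUT,
- *		.eq_handle	= eqh,
- *	};
- *	struct lnet_handle_md mdh;
- *	int rc;
- *
- *	rc = LNetMDAttach(meh, umd, LNET_UNLINK, &mdh);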
- */ -int -LNetMDAttach(struct lnet_handle_me meh, struct lnet_md umd, - enum lnet_unlink unlink, struct lnet_handle_md *handle) -{ - LIST_HEAD(matches); - LIST_HEAD(drops); - struct lnet_me *me; - struct lnet_libmd *md; - int cpt; - int rc; - - LASSERT(the_lnet.ln_refcount > 0); - - if (lnet_md_validate(&umd)) - return -EINVAL; - - if (!(umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT))) { - CERROR("Invalid option: no MD_OP set\n"); - return -EINVAL; - } - - md = lnet_md_alloc(&umd); - if (!md) - return -ENOMEM; - - rc = lnet_md_build(md, &umd, unlink); - if (rc) - goto out_free; - - cpt = lnet_cpt_of_cookie(meh.cookie); - - lnet_res_lock(cpt); - - me = lnet_handle2me(&meh); - if (!me) - rc = -ENOENT; - else if (me->me_md) - rc = -EBUSY; - else - rc = lnet_md_link(md, umd.eq_handle, cpt); - - if (rc) - goto out_unlock; - - /* - * attach this MD to portal of ME and check if it matches any - * blocked msgs on this portal - */ - lnet_ptl_attach_md(me, md, &matches, &drops); - - lnet_md2handle(handle, md); - - lnet_res_unlock(cpt); - - lnet_drop_delayed_msg_list(&drops, "Bad match"); - lnet_recv_delayed_msg_list(&matches); - - return 0; - -out_unlock: - lnet_res_unlock(cpt); -out_free: - kfree(md); - return rc; -} -EXPORT_SYMBOL(LNetMDAttach); - -/** - * Create a "free floating" memory descriptor - a MD that is not associated - * with a ME. Such MDs are usually used in LNetPut() and LNetGet() operations. - * - * \param umd,unlink See the discussion for LNetMDAttach(). - * \param handle On successful returns, a handle to the newly created MD is - * saved here. This handle can be used later in LNetMDUnlink(), LNetPut(), - * and LNetGet() operations. - * - * \retval 0 On success. - * \retval -EINVAL If \a umd is not valid. - * \retval -ENOMEM If new MD cannot be allocated. - * \retval -ENOENT \a umd.eq_handle does not point to a valid EQ. Note that - * it's OK to supply a NULL \a umd.eq_handle by calling - * LNetInvalidateHandle() on it. - */ -int -LNetMDBind(struct lnet_md umd, enum lnet_unlink unlink, - struct lnet_handle_md *handle) -{ - struct lnet_libmd *md; - int cpt; - int rc; - - LASSERT(the_lnet.ln_refcount > 0); - - if (lnet_md_validate(&umd)) - return -EINVAL; - - if ((umd.options & (LNET_MD_OP_GET | LNET_MD_OP_PUT))) { - CERROR("Invalid option: GET|PUT illegal on active MDs\n"); - return -EINVAL; - } - - md = lnet_md_alloc(&umd); - if (!md) - return -ENOMEM; - - rc = lnet_md_build(md, &umd, unlink); - if (rc) - goto out_free; - - cpt = lnet_res_lock_current(); - - rc = lnet_md_link(md, umd.eq_handle, cpt); - if (rc) - goto out_unlock; - - lnet_md2handle(handle, md); - - lnet_res_unlock(cpt); - return 0; - -out_unlock: - lnet_res_unlock(cpt); -out_free: - kfree(md); - - return rc; -} -EXPORT_SYMBOL(LNetMDBind); - -/** - * Unlink the memory descriptor from any ME it may be linked to and release - * the internal resources associated with it. As a result, active messages - * associated with the MD may get aborted. - * - * This function does not free the memory region associated with the MD; - * i.e., the memory the user allocated for this MD. If the ME associated with - * this MD is not NULL and was created with auto unlink enabled, the ME is - * unlinked as well (see LNetMEAttach()). - * - * Explicitly unlinking a MD via this function call has the same behavior as - * a MD that has been automatically unlinked, except that no LNET_EVENT_UNLINK - * is generated in the latter case. 
- * - * An unlinked event can be reported in two ways: - * - If there's no pending operations on the MD, it's unlinked immediately - * and an LNET_EVENT_UNLINK event is logged before this function returns. - * - Otherwise, the MD is only marked for deletion when this function - * returns, and the unlinked event will be piggybacked on the event of - * the completion of the last operation by setting the unlinked field of - * the event. No dedicated LNET_EVENT_UNLINK event is generated. - * - * Note that in both cases the unlinked field of the event is always set; no - * more event will happen on the MD after such an event is logged. - * - * \param mdh A handle for the MD to be unlinked. - * - * \retval 0 On success. - * \retval -ENOENT If \a mdh does not point to a valid MD object. - */ -int -LNetMDUnlink(struct lnet_handle_md mdh) -{ - struct lnet_event ev; - struct lnet_libmd *md; - int cpt; - - LASSERT(the_lnet.ln_refcount > 0); - - cpt = lnet_cpt_of_cookie(mdh.cookie); - lnet_res_lock(cpt); - - md = lnet_handle2md(&mdh); - if (!md) { - lnet_res_unlock(cpt); - return -ENOENT; - } - - md->md_flags |= LNET_MD_FLAG_ABORTED; - /* - * If the MD is busy, lnet_md_unlink just marks it for deletion, and - * when the LND is done, the completion event flags that the MD was - * unlinked. Otherwise, we enqueue an event now... - */ - if (md->md_eq && !md->md_refcount) { - lnet_build_unlink_event(md, &ev); - lnet_eq_enqueue_event(md->md_eq, &ev); - } - - lnet_md_unlink(md); - - lnet_res_unlock(cpt); - return 0; -} -EXPORT_SYMBOL(LNetMDUnlink); diff --git a/drivers/staging/lustre/lnet/lnet/lib-me.c b/drivers/staging/lustre/lnet/lnet/lib-me.c deleted file mode 100644 index 672e37bdd045c443e3dace6b9081f60621d81d54..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-me.c +++ /dev/null @@ -1,274 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/lnet/lib-me.c - * - * Match Entry management routines - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include - -/** - * Create and attach a match entry to the match list of \a portal. The new - * ME is empty, i.e. not associated with a memory descriptor. LNetMDAttach() - * can be used to attach a MD to an empty ME. - * - * \param portal The portal table index where the ME should be attached. - * \param match_id Specifies the match criteria for the process ID of - * the requester. 
The constants LNET_PID_ANY and LNET_NID_ANY can be - * used to wildcard either of the identifiers in the lnet_process_id - * structure. - * \param match_bits,ignore_bits Specify the match criteria to apply - * to the match bits in the incoming request. The ignore bits are used - * to mask out insignificant bits in the incoming match bits. The resulting - * bits are then compared to the ME's match bits to determine if the - * incoming request meets the match criteria. - * \param unlink Indicates whether the ME should be unlinked when the memory - * descriptor associated with it is unlinked (Note that the check for - * unlinking a ME only occurs when the memory descriptor is unlinked.). - * Valid values are LNET_RETAIN and LNET_UNLINK. - * \param pos Indicates whether the new ME should be prepended or - * appended to the match list. Allowed constants: LNET_INS_BEFORE, - * LNET_INS_AFTER. - * \param handle On successful returns, a handle to the newly created ME - * object is saved here. This handle can be used later in LNetMEInsert(), - * LNetMEUnlink(), or LNetMDAttach() functions. - * - * \retval 0 On success. - * \retval -EINVAL If \a portal is invalid. - * \retval -ENOMEM If new ME object cannot be allocated. - */ -int -LNetMEAttach(unsigned int portal, - struct lnet_process_id match_id, - __u64 match_bits, __u64 ignore_bits, - enum lnet_unlink unlink, enum lnet_ins_pos pos, - struct lnet_handle_me *handle) -{ - struct lnet_match_table *mtable; - struct lnet_me *me; - struct list_head *head; - - LASSERT(the_lnet.ln_refcount > 0); - - if ((int)portal >= the_lnet.ln_nportals) - return -EINVAL; - - mtable = lnet_mt_of_attach(portal, match_id, - match_bits, ignore_bits, pos); - if (!mtable) /* can't match portal type */ - return -EPERM; - - me = kzalloc(sizeof(*me), GFP_NOFS); - if (!me) - return -ENOMEM; - - lnet_res_lock(mtable->mt_cpt); - - me->me_portal = portal; - me->me_match_id = match_id; - me->me_match_bits = match_bits; - me->me_ignore_bits = ignore_bits; - me->me_unlink = unlink; - me->me_md = NULL; - - lnet_res_lh_initialize(the_lnet.ln_me_containers[mtable->mt_cpt], - &me->me_lh); - if (ignore_bits) - head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; - else - head = lnet_mt_match_head(mtable, match_id, match_bits); - - me->me_pos = head - &mtable->mt_mhash[0]; - if (pos == LNET_INS_AFTER || pos == LNET_INS_LOCAL) - list_add_tail(&me->me_list, head); - else - list_add(&me->me_list, head); - - lnet_me2handle(handle, me); - - lnet_res_unlock(mtable->mt_cpt); - return 0; -} -EXPORT_SYMBOL(LNetMEAttach); - -/** - * Create a match entry and insert it before or after the ME pointed to by - * \a current_meh. The new ME is empty, i.e. not associated with a memory - * descriptor. LNetMDAttach() can be used to attach a MD to an empty ME. - * - * This function is identical to LNetMEAttach() except for the position - * where the new ME is inserted. - * - * \param current_meh A handle for a ME. The new ME will be inserted - * immediately before or immediately after this ME. - * \param match_id,match_bits,ignore_bits,unlink,pos,handle See the discussion - * for LNetMEAttach(). - * - * \retval 0 On success. - * \retval -ENOMEM If new ME object cannot be allocated. - * \retval -ENOENT If \a current_meh does not point to a valid match entry.
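/*
 * The match rule documented above, restated as a standalone predicate
 * (a sketch for reference, not the deleted matching code): the ignore
 * bits mask out insignificant bits before comparison, and LNET_NID_ANY
 * / LNET_PID_ANY wildcard the peer identifiers.
 */
static bool me_would_match(const struct lnet_me *me,
			   struct lnet_process_id src, __u64 mbits)
{
	if (me->me_match_id.nid != LNET_NID_ANY &&
	    me->me_match_id.nid != src.nid)
		return false;
	if (me->me_match_id.pid != LNET_PID_ANY &&
	    me->me_match_id.pid != src.pid)
		return false;
	return !((me->me_match_bits ^ mbits) & ~me->me_ignore_bits);
}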
- */ -int -LNetMEInsert(struct lnet_handle_me current_meh, - struct lnet_process_id match_id, - __u64 match_bits, __u64 ignore_bits, - enum lnet_unlink unlink, enum lnet_ins_pos pos, - struct lnet_handle_me *handle) -{ - struct lnet_me *current_me; - struct lnet_me *new_me; - struct lnet_portal *ptl; - int cpt; - - LASSERT(the_lnet.ln_refcount > 0); - - if (pos == LNET_INS_LOCAL) - return -EPERM; - - new_me = kzalloc(sizeof(*new_me), GFP_NOFS); - if (!new_me) - return -ENOMEM; - - cpt = lnet_cpt_of_cookie(current_meh.cookie); - - lnet_res_lock(cpt); - - current_me = lnet_handle2me(&current_meh); - if (!current_me) { - kfree(new_me); - - lnet_res_unlock(cpt); - return -ENOENT; - } - - LASSERT(current_me->me_portal < the_lnet.ln_nportals); - - ptl = the_lnet.ln_portals[current_me->me_portal]; - if (lnet_ptl_is_unique(ptl)) { - /* no sense to insert on a unique portal */ - kfree(new_me); - lnet_res_unlock(cpt); - return -EPERM; - } - - new_me->me_pos = current_me->me_pos; - new_me->me_portal = current_me->me_portal; - new_me->me_match_id = match_id; - new_me->me_match_bits = match_bits; - new_me->me_ignore_bits = ignore_bits; - new_me->me_unlink = unlink; - new_me->me_md = NULL; - - lnet_res_lh_initialize(the_lnet.ln_me_containers[cpt], &new_me->me_lh); - - if (pos == LNET_INS_AFTER) - list_add(&new_me->me_list, &current_me->me_list); - else - list_add_tail(&new_me->me_list, &current_me->me_list); - - lnet_me2handle(handle, new_me); - - lnet_res_unlock(cpt); - - return 0; -} -EXPORT_SYMBOL(LNetMEInsert); - -/** - * Unlink a match entry from its match list. - * - * This operation also releases any resources associated with the ME. If a - * memory descriptor is attached to the ME, then it will be unlinked as well - * and an unlink event will be generated. It is an error to use the ME handle - * after calling LNetMEUnlink(). - * - * \param meh A handle for the ME to be unlinked. - * - * \retval 0 On success. - * \retval -ENOENT If \a meh does not point to a valid ME. - * \see LNetMDUnlink() for the discussion on delivering unlink event. - */ -int -LNetMEUnlink(struct lnet_handle_me meh) -{ - struct lnet_me *me; - struct lnet_libmd *md; - struct lnet_event ev; - int cpt; - - LASSERT(the_lnet.ln_refcount > 0); - - cpt = lnet_cpt_of_cookie(meh.cookie); - lnet_res_lock(cpt); - - me = lnet_handle2me(&meh); - if (!me) { - lnet_res_unlock(cpt); - return -ENOENT; - } - - md = me->me_md; - if (md) { - md->md_flags |= LNET_MD_FLAG_ABORTED; - if (md->md_eq && !md->md_refcount) { - lnet_build_unlink_event(md, &ev); - lnet_eq_enqueue_event(md->md_eq, &ev); - } - } - - lnet_me_unlink(me); - - lnet_res_unlock(cpt); - return 0; -} -EXPORT_SYMBOL(LNetMEUnlink); - -/* call with lnet_res_lock please */ -void -lnet_me_unlink(struct lnet_me *me) -{ - list_del(&me->me_list); - - if (me->me_md) { - struct lnet_libmd *md = me->me_md; - - /* detach MD from portal of this ME */ - lnet_ptl_detach_md(me, md); - lnet_md_unlink(md); - } - - lnet_res_lh_invalidate(&me->me_lh); - kfree(me); -} diff --git a/drivers/staging/lustre/lnet/lnet/lib-move.c b/drivers/staging/lustre/lnet/lnet/lib-move.c deleted file mode 100644 index f8eaf8ff8d8d11cebc0694641275c62f2a5b3013..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-move.c +++ /dev/null @@ -1,2386 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/lnet/lib-move.c - * - * Data movement routines - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include - -static int local_nid_dist_zero = 1; -module_param(local_nid_dist_zero, int, 0444); -MODULE_PARM_DESC(local_nid_dist_zero, "Reserved"); - -int -lnet_fail_nid(lnet_nid_t nid, unsigned int threshold) -{ - struct lnet_test_peer *tp; - struct lnet_test_peer *temp; - struct list_head *el; - struct list_head *next; - struct list_head cull; - - /* NB: use lnet_net_lock(0) to serialize operations on test peers */ - if (threshold) { - /* Adding a new entry */ - tp = kzalloc(sizeof(*tp), GFP_NOFS); - if (!tp) - return -ENOMEM; - - tp->tp_nid = nid; - tp->tp_threshold = threshold; - - lnet_net_lock(0); - list_add_tail(&tp->tp_list, &the_lnet.ln_test_peers); - lnet_net_unlock(0); - return 0; - } - - /* removing entries */ - INIT_LIST_HEAD(&cull); - - lnet_net_lock(0); - - list_for_each_safe(el, next, &the_lnet.ln_test_peers) { - tp = list_entry(el, struct lnet_test_peer, tp_list); - - if (!tp->tp_threshold || /* needs culling anyway */ - nid == LNET_NID_ANY || /* removing all entries */ - tp->tp_nid == nid) { /* matched this one */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); - } - } - - lnet_net_unlock(0); - - list_for_each_entry_safe(tp, temp, &cull, tp_list) { - list_del(&tp->tp_list); - kfree(tp); - } - return 0; -} - -static int -fail_peer(lnet_nid_t nid, int outgoing) -{ - struct lnet_test_peer *tp; - struct lnet_test_peer *temp; - struct list_head *el; - struct list_head *next; - struct list_head cull; - int fail = 0; - - INIT_LIST_HEAD(&cull); - - /* NB: use lnet_net_lock(0) to serialize operations on test peers */ - lnet_net_lock(0); - - list_for_each_safe(el, next, &the_lnet.ln_test_peers) { - tp = list_entry(el, struct lnet_test_peer, tp_list); - - if (!tp->tp_threshold) { - /* zombie entry */ - if (outgoing) { - /* - * only cull zombies on outgoing tests, - * since we may be at interrupt priority on - * incoming messages. 
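/*
 * Hypothetical use of the fault-injection hook above (the helper and
 * the NID are illustrative): fail the next three messages involving
 * one peer, then clear the whole test-peer table.
 */
static void demo_fault_injection(lnet_nid_t flaky_nid)
{
	/* threshold > 0 adds an entry; each simulated failure decrements it */
	lnet_fail_nid(flaky_nid, 3);

	/* ... run traffic; fail_peer() reports 1 for matching peers ... */

	/* threshold == 0 culls entries; LNET_NID_ANY removes them all */
	lnet_fail_nid(LNET_NID_ANY, 0);
}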
- */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); - } - continue; - } - - if (tp->tp_nid == LNET_NID_ANY || /* fail every peer */ - nid == tp->tp_nid) { /* fail this peer */ - fail = 1; - - if (tp->tp_threshold != LNET_MD_THRESH_INF) { - tp->tp_threshold--; - if (outgoing && - !tp->tp_threshold) { - /* see above */ - list_del(&tp->tp_list); - list_add(&tp->tp_list, &cull); - } - } - break; - } - } - - lnet_net_unlock(0); - - list_for_each_entry_safe(tp, temp, &cull, tp_list) { - list_del(&tp->tp_list); - - kfree(tp); - } - - return fail; -} - -unsigned int -lnet_iov_nob(unsigned int niov, struct kvec *iov) -{ - unsigned int nob = 0; - - LASSERT(!niov || iov); - while (niov-- > 0) - nob += (iov++)->iov_len; - - return nob; -} -EXPORT_SYMBOL(lnet_iov_nob); - -void -lnet_copy_iov2iter(struct iov_iter *to, - unsigned int nsiov, const struct kvec *siov, - unsigned int soffset, unsigned int nob) -{ - /* NB diov, siov are READ-ONLY */ - const char *s; - size_t left; - - if (!nob) - return; - - /* skip complete frags before 'soffset' */ - LASSERT(nsiov > 0); - while (soffset >= siov->iov_len) { - soffset -= siov->iov_len; - siov++; - nsiov--; - LASSERT(nsiov > 0); - } - - s = (char *)siov->iov_base + soffset; - left = siov->iov_len - soffset; - do { - size_t n, copy = left; - - LASSERT(nsiov > 0); - - if (copy > nob) - copy = nob; - n = copy_to_iter(s, copy, to); - if (n != copy) - return; - nob -= n; - - siov++; - s = (char *)siov->iov_base; - left = siov->iov_len; - nsiov--; - } while (nob > 0); -} -EXPORT_SYMBOL(lnet_copy_iov2iter); - -void -lnet_copy_kiov2iter(struct iov_iter *to, - unsigned int nsiov, const struct bio_vec *siov, - unsigned int soffset, unsigned int nob) -{ - if (!nob) - return; - - LASSERT(!in_interrupt()); - - LASSERT(nsiov > 0); - while (soffset >= siov->bv_len) { - soffset -= siov->bv_len; - siov++; - nsiov--; - LASSERT(nsiov > 0); - } - - do { - size_t copy = siov->bv_len - soffset, n; - - LASSERT(nsiov > 0); - - if (copy > nob) - copy = nob; - n = copy_page_to_iter(siov->bv_page, - siov->bv_offset + soffset, - copy, to); - if (n != copy) - return; - nob -= n; - siov++; - nsiov--; - soffset = 0; - } while (nob > 0); -} -EXPORT_SYMBOL(lnet_copy_kiov2iter); - -int -lnet_extract_iov(int dst_niov, struct kvec *dst, - int src_niov, const struct kvec *src, - unsigned int offset, unsigned int len) -{ - /* - * Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. 
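/*
 * A sketch of how the fragment helpers in this file compose (buffer
 * names and the byte range are hypothetical; LNET_MAX_IOV comes from
 * the removed LNet headers): size a kvec array with lnet_iov_nob(),
 * then describe a subrange of it with lnet_extract_iov(), which never
 * modifies the source vector.
 */
static int demo_subrange(struct kvec *src, unsigned int src_niov)
{
	struct kvec dst[LNET_MAX_IOV];
	unsigned int total = lnet_iov_nob(src_niov, src);

	if (total < 128 + 512)
		return -EINVAL;

	/* on success, dst[] describes bytes [128, 640) of src[] */
	return lnet_extract_iov(LNET_MAX_IOV, dst, src_niov, src, 128, 512);
}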
- * NB not destructive to 'src' - */ - unsigned int frag_len; - unsigned int niov; - - if (!len) /* no data => */ - return 0; /* no frags */ - - LASSERT(src_niov > 0); - while (offset >= src->iov_len) { /* skip initial frags */ - offset -= src->iov_len; - src_niov--; - src++; - LASSERT(src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT(src_niov > 0); - LASSERT((int)niov <= dst_niov); - - frag_len = src->iov_len - offset; - dst->iov_base = ((char *)src->iov_base) + offset; - - if (len <= frag_len) { - dst->iov_len = len; - return niov; - } - - dst->iov_len = frag_len; - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -EXPORT_SYMBOL(lnet_extract_iov); - -unsigned int -lnet_kiov_nob(unsigned int niov, struct bio_vec *kiov) -{ - unsigned int nob = 0; - - LASSERT(!niov || kiov); - while (niov-- > 0) - nob += (kiov++)->bv_len; - - return nob; -} -EXPORT_SYMBOL(lnet_kiov_nob); - -int -lnet_extract_kiov(int dst_niov, struct bio_vec *dst, - int src_niov, const struct bio_vec *src, - unsigned int offset, unsigned int len) -{ - /* - * Initialise 'dst' to the subset of 'src' starting at 'offset', - * for exactly 'len' bytes, and return the number of entries. - * NB not destructive to 'src' - */ - unsigned int frag_len; - unsigned int niov; - - if (!len) /* no data => */ - return 0; /* no frags */ - - LASSERT(src_niov > 0); - while (offset >= src->bv_len) { /* skip initial frags */ - offset -= src->bv_len; - src_niov--; - src++; - LASSERT(src_niov > 0); - } - - niov = 1; - for (;;) { - LASSERT(src_niov > 0); - LASSERT((int)niov <= dst_niov); - - frag_len = src->bv_len - offset; - dst->bv_page = src->bv_page; - dst->bv_offset = src->bv_offset + offset; - - if (len <= frag_len) { - dst->bv_len = len; - LASSERT(dst->bv_offset + dst->bv_len - <= PAGE_SIZE); - return niov; - } - - dst->bv_len = frag_len; - LASSERT(dst->bv_offset + dst->bv_len <= PAGE_SIZE); - - len -= frag_len; - dst++; - src++; - niov++; - src_niov--; - offset = 0; - } -} -EXPORT_SYMBOL(lnet_extract_kiov); - -void -lnet_ni_recv(struct lnet_ni *ni, void *private, struct lnet_msg *msg, - int delayed, unsigned int offset, unsigned int mlen, - unsigned int rlen) -{ - unsigned int niov = 0; - struct kvec *iov = NULL; - struct bio_vec *kiov = NULL; - struct iov_iter to; - int rc; - - LASSERT(!in_interrupt()); - LASSERT(!mlen || msg); - - if (msg) { - LASSERT(msg->msg_receiving); - LASSERT(!msg->msg_sending); - LASSERT(rlen == msg->msg_len); - LASSERT(mlen <= msg->msg_len); - LASSERT(msg->msg_offset == offset); - LASSERT(msg->msg_wanted == mlen); - - msg->msg_receiving = 0; - - if (mlen) { - niov = msg->msg_niov; - iov = msg->msg_iov; - kiov = msg->msg_kiov; - - LASSERT(niov > 0); - LASSERT(!iov != !kiov); - } - } - - if (iov) { - iov_iter_kvec(&to, ITER_KVEC | READ, iov, niov, mlen + offset); - iov_iter_advance(&to, offset); - } else { - iov_iter_bvec(&to, ITER_BVEC | READ, kiov, niov, mlen + offset); - iov_iter_advance(&to, offset); - } - rc = ni->ni_lnd->lnd_recv(ni, private, msg, delayed, &to, rlen); - if (rc < 0) - lnet_finalize(ni, msg, rc); -} - -static void -lnet_setpayloadbuffer(struct lnet_msg *msg) -{ - struct lnet_libmd *md = msg->msg_md; - - LASSERT(msg->msg_len > 0); - LASSERT(!msg->msg_routing); - LASSERT(md); - LASSERT(!msg->msg_niov); - LASSERT(!msg->msg_iov); - LASSERT(!msg->msg_kiov); - - msg->msg_niov = md->md_niov; - if (md->md_options & LNET_MD_KIOV) - msg->msg_kiov = md->md_iov.kiov; - else - msg->msg_iov = md->md_iov.iov; -} - -void -lnet_prep_send(struct lnet_msg *msg, int type, 
struct lnet_process_id target, - unsigned int offset, unsigned int len) -{ - msg->msg_type = type; - msg->msg_target = target; - msg->msg_len = len; - msg->msg_offset = offset; - - if (len) - lnet_setpayloadbuffer(msg); - - memset(&msg->msg_hdr, 0, sizeof(msg->msg_hdr)); - msg->msg_hdr.type = cpu_to_le32(type); - msg->msg_hdr.dest_nid = cpu_to_le64(target.nid); - msg->msg_hdr.dest_pid = cpu_to_le32(target.pid); - /* src_nid will be set later */ - msg->msg_hdr.src_pid = cpu_to_le32(the_lnet.ln_pid); - msg->msg_hdr.payload_length = cpu_to_le32(len); -} - -static void -lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) -{ - void *priv = msg->msg_private; - int rc; - - LASSERT(!in_interrupt()); - LASSERT(LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || - (msg->msg_txcredit && msg->msg_peertxcredit)); - - rc = ni->ni_lnd->lnd_send(ni, priv, msg); - if (rc < 0) - lnet_finalize(ni, msg, rc); -} - -static int -lnet_ni_eager_recv(struct lnet_ni *ni, struct lnet_msg *msg) -{ - int rc; - - LASSERT(!msg->msg_sending); - LASSERT(msg->msg_receiving); - LASSERT(!msg->msg_rx_ready_delay); - LASSERT(ni->ni_lnd->lnd_eager_recv); - - msg->msg_rx_ready_delay = 1; - rc = ni->ni_lnd->lnd_eager_recv(ni, msg->msg_private, msg, - &msg->msg_private); - if (rc) { - CERROR("recv from %s / send to %s aborted: eager_recv failed %d\n", - libcfs_nid2str(msg->msg_rxpeer->lp_nid), - libcfs_id2str(msg->msg_target), rc); - LASSERT(rc < 0); /* required by my callers */ - } - - return rc; -} - -/* NB: caller shall hold a ref on 'lp' as I'd drop lnet_net_lock */ -static void -lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer *lp) -{ - unsigned long last_alive = 0; - - LASSERT(lnet_peer_aliveness_enabled(lp)); - LASSERT(ni->ni_lnd->lnd_query); - - lnet_net_unlock(lp->lp_cpt); - ni->ni_lnd->lnd_query(ni, lp->lp_nid, &last_alive); - lnet_net_lock(lp->lp_cpt); - - lp->lp_last_query = jiffies; - - if (last_alive) /* NI has updated timestamp */ - lp->lp_last_alive = last_alive; -} - -/* NB: always called with lnet_net_lock held */ -static inline int -lnet_peer_is_alive(struct lnet_peer *lp, unsigned long now) -{ - int alive; - unsigned long deadline; - - LASSERT(lnet_peer_aliveness_enabled(lp)); - - /* Trust lnet_notify() if it has more recent aliveness news, but - * ignore the initial assumed death (see lnet_peers_start_down()). - */ - if (!lp->lp_alive && lp->lp_alive_count > 0 && - time_after_eq(lp->lp_timestamp, lp->lp_last_alive)) - return 0; - - deadline = lp->lp_last_alive + lp->lp_ni->ni_peertimeout * HZ; - alive = time_after(deadline, now); - - /* Update obsolete lp_alive except for routers assumed to be dead - * initially, because router checker would update aliveness in this - * case, and moreover lp_last_alive at peer creation is assumed. - */ - if (alive && !lp->lp_alive && - !(lnet_isrouter(lp) && !lp->lp_alive_count)) - lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); - - return alive; -} - -/* - * NB: returns 1 when alive, 0 when dead, negative when error; - * may drop the lnet_net_lock - */ -static int -lnet_peer_alive_locked(struct lnet_peer *lp) -{ - unsigned long now = jiffies; - - if (!lnet_peer_aliveness_enabled(lp)) - return -ENODEV; - - if (lnet_peer_is_alive(lp, now)) - return 1; - - /* - * Peer appears dead, but we should avoid frequent NI queries (at - * most once per lnet_queryinterval seconds). 
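/*
 * The core timeout rule behind lnet_peer_is_alive() above, restated as
 * a pure helper (a sketch; the real code additionally trusts fresher
 * lnet_notify() news and special-cases routers): a peer is presumed
 * alive while jiffies has not passed last_alive + peertimeout.
 */
static inline bool peer_presumed_alive(unsigned long last_alive_jiffies,
				       int peertimeout_secs,
				       unsigned long now_jiffies)
{
	unsigned long deadline = last_alive_jiffies + peertimeout_secs * HZ;

	return time_after(deadline, now_jiffies);
}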
- */ - if (lp->lp_last_query) { - static const int lnet_queryinterval = 1; - - unsigned long next_query = - lp->lp_last_query + lnet_queryinterval * HZ; - - if (time_before(now, next_query)) { - if (lp->lp_alive) - CWARN("Unexpected aliveness of peer %s: %d < %d (%d/%d)\n", - libcfs_nid2str(lp->lp_nid), - (int)now, (int)next_query, - lnet_queryinterval, - lp->lp_ni->ni_peertimeout); - return 0; - } - } - - /* query NI for latest aliveness news */ - lnet_ni_query_locked(lp->lp_ni, lp); - - if (lnet_peer_is_alive(lp, now)) - return 1; - - lnet_notify_locked(lp, 0, 0, lp->lp_last_alive); - return 0; -} - -/** - * \param msg The message to be sent. - * \param do_send True if lnet_ni_send() should be called in this function. - * lnet_send() is going to lnet_net_unlock immediately after this, so - * it sets do_send FALSE and I don't do the unlock/send/lock bit. - * - * \retval LNET_CREDIT_OK If \a msg sent or OK to send. - * \retval LNET_CREDIT_WAIT If \a msg blocked for credit. - * \retval -EHOSTUNREACH If the next hop of the message appears dead. - * \retval -ECANCELED If the MD of the message has been unlinked. - */ -static int -lnet_post_send_locked(struct lnet_msg *msg, int do_send) -{ - struct lnet_peer *lp = msg->msg_txpeer; - struct lnet_ni *ni = lp->lp_ni; - int cpt = msg->msg_tx_cpt; - struct lnet_tx_queue *tq = ni->ni_tx_queues[cpt]; - - /* non-lnet_send() callers have checked before */ - LASSERT(!do_send || msg->msg_tx_delayed); - LASSERT(!msg->msg_receiving); - LASSERT(msg->msg_tx_committed); - - /* NB 'lp' is always the next hop */ - if (!(msg->msg_target.pid & LNET_PID_USERFLAG) && - !lnet_peer_alive_locked(lp)) { - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; - lnet_net_unlock(cpt); - - CNETERR("Dropping message for %s: peer not alive\n", - libcfs_id2str(msg->msg_target)); - if (do_send) - lnet_finalize(ni, msg, -EHOSTUNREACH); - - lnet_net_lock(cpt); - return -EHOSTUNREACH; - } - - if (msg->msg_md && - (msg->msg_md->md_flags & LNET_MD_FLAG_ABORTED)) { - lnet_net_unlock(cpt); - - CNETERR("Aborting message for %s: LNetM[DE]Unlink() already called on the MD/ME.\n", - libcfs_id2str(msg->msg_target)); - if (do_send) - lnet_finalize(ni, msg, -ECANCELED); - - lnet_net_lock(cpt); - return -ECANCELED; - } - - if (!msg->msg_peertxcredit) { - LASSERT((lp->lp_txcredits < 0) == - !list_empty(&lp->lp_txq)); - - msg->msg_peertxcredit = 1; - lp->lp_txqnob += msg->msg_len + sizeof(struct lnet_hdr); - lp->lp_txcredits--; - - if (lp->lp_txcredits < lp->lp_mintxcredits) - lp->lp_mintxcredits = lp->lp_txcredits; - - if (lp->lp_txcredits < 0) { - msg->msg_tx_delayed = 1; - list_add_tail(&msg->msg_list, &lp->lp_txq); - return LNET_CREDIT_WAIT; - } - } - - if (!msg->msg_txcredit) { - LASSERT((tq->tq_credits < 0) == - !list_empty(&tq->tq_delayed)); - - msg->msg_txcredit = 1; - tq->tq_credits--; - - if (tq->tq_credits < tq->tq_credits_min) - tq->tq_credits_min = tq->tq_credits; - - if (tq->tq_credits < 0) { - msg->msg_tx_delayed = 1; - list_add_tail(&msg->msg_list, &tq->tq_delayed); - return LNET_CREDIT_WAIT; - } - } - - if (do_send) { - lnet_net_unlock(cpt); - lnet_ni_send(ni, msg); - lnet_net_lock(cpt); - } - return LNET_CREDIT_OK; -} - -static struct lnet_rtrbufpool * -lnet_msg2bufpool(struct lnet_msg *msg) -{ - struct lnet_rtrbufpool *rbp; - int cpt; - - LASSERT(msg->msg_rx_committed); - - cpt = msg->msg_rx_cpt; - rbp = &the_lnet.ln_rtrpools[cpt][0]; - - LASSERT(msg->msg_len <= LNET_MTU); - while (msg->msg_len > (unsigned int)rbp->rbp_npages * 
PAGE_SIZE) { - rbp++; - LASSERT(rbp < &the_lnet.ln_rtrpools[cpt][LNET_NRBPOOLS]); - } - - return rbp; -} - -static int -lnet_post_routed_recv_locked(struct lnet_msg *msg, int do_recv) -{ - /* - * lnet_parse is going to lnet_net_unlock immediately after this, so it - * sets do_recv FALSE and I don't do the unlock/send/lock bit. - * I return LNET_CREDIT_WAIT if msg blocked and LNET_CREDIT_OK if - * received or OK to receive - */ - struct lnet_peer *lp = msg->msg_rxpeer; - struct lnet_rtrbufpool *rbp; - struct lnet_rtrbuf *rb; - - LASSERT(!msg->msg_iov); - LASSERT(!msg->msg_kiov); - LASSERT(!msg->msg_niov); - LASSERT(msg->msg_routing); - LASSERT(msg->msg_receiving); - LASSERT(!msg->msg_sending); - - /* non-lnet_parse callers only receive delayed messages */ - LASSERT(!do_recv || msg->msg_rx_delayed); - - if (!msg->msg_peerrtrcredit) { - LASSERT((lp->lp_rtrcredits < 0) == - !list_empty(&lp->lp_rtrq)); - - msg->msg_peerrtrcredit = 1; - lp->lp_rtrcredits--; - if (lp->lp_rtrcredits < lp->lp_minrtrcredits) - lp->lp_minrtrcredits = lp->lp_rtrcredits; - - if (lp->lp_rtrcredits < 0) { - /* must have checked eager_recv before here */ - LASSERT(msg->msg_rx_ready_delay); - msg->msg_rx_delayed = 1; - list_add_tail(&msg->msg_list, &lp->lp_rtrq); - return LNET_CREDIT_WAIT; - } - } - - rbp = lnet_msg2bufpool(msg); - - if (!msg->msg_rtrcredit) { - msg->msg_rtrcredit = 1; - rbp->rbp_credits--; - if (rbp->rbp_credits < rbp->rbp_mincredits) - rbp->rbp_mincredits = rbp->rbp_credits; - - if (rbp->rbp_credits < 0) { - /* must have checked eager_recv before here */ - LASSERT(msg->msg_rx_ready_delay); - msg->msg_rx_delayed = 1; - list_add_tail(&msg->msg_list, &rbp->rbp_msgs); - return LNET_CREDIT_WAIT; - } - } - - LASSERT(!list_empty(&rbp->rbp_bufs)); - rb = list_entry(rbp->rbp_bufs.next, struct lnet_rtrbuf, rb_list); - list_del(&rb->rb_list); - - msg->msg_niov = rbp->rbp_npages; - msg->msg_kiov = &rb->rb_kiov[0]; - - if (do_recv) { - int cpt = msg->msg_rx_cpt; - - lnet_net_unlock(cpt); - lnet_ni_recv(lp->lp_ni, msg->msg_private, msg, 1, - 0, msg->msg_len, msg->msg_len); - lnet_net_lock(cpt); - } - return LNET_CREDIT_OK; -} - -void -lnet_return_tx_credits_locked(struct lnet_msg *msg) -{ - struct lnet_peer *txpeer = msg->msg_txpeer; - struct lnet_msg *msg2; - - if (msg->msg_txcredit) { - struct lnet_ni *ni = txpeer->lp_ni; - struct lnet_tx_queue *tq = ni->ni_tx_queues[msg->msg_tx_cpt]; - - /* give back NI txcredits */ - msg->msg_txcredit = 0; - - LASSERT((tq->tq_credits < 0) == - !list_empty(&tq->tq_delayed)); - - tq->tq_credits++; - if (tq->tq_credits <= 0) { - msg2 = list_entry(tq->tq_delayed.next, - struct lnet_msg, msg_list); - list_del(&msg2->msg_list); - - LASSERT(msg2->msg_txpeer->lp_ni == ni); - LASSERT(msg2->msg_tx_delayed); - - (void)lnet_post_send_locked(msg2, 1); - } - } - - if (msg->msg_peertxcredit) { - /* give back peer txcredits */ - msg->msg_peertxcredit = 0; - - LASSERT((txpeer->lp_txcredits < 0) == - !list_empty(&txpeer->lp_txq)); - - txpeer->lp_txqnob -= msg->msg_len + sizeof(struct lnet_hdr); - LASSERT(txpeer->lp_txqnob >= 0); - - txpeer->lp_txcredits++; - if (txpeer->lp_txcredits <= 0) { - msg2 = list_entry(txpeer->lp_txq.next, - struct lnet_msg, msg_list); - list_del(&msg2->msg_list); - - LASSERT(msg2->msg_txpeer == txpeer); - LASSERT(msg2->msg_tx_delayed); - - (void)lnet_post_send_locked(msg2, 1); - } - } - - if (txpeer) { - msg->msg_txpeer = NULL; - lnet_peer_decref_locked(txpeer); - } -} - -void -lnet_schedule_blocked_locked(struct lnet_rtrbufpool *rbp) -{ - struct lnet_msg *msg; - - if 
(list_empty(&rbp->rbp_msgs)) - return; - msg = list_entry(rbp->rbp_msgs.next, - struct lnet_msg, msg_list); - list_del(&msg->msg_list); - - (void)lnet_post_routed_recv_locked(msg, 1); -} - -void -lnet_drop_routed_msgs_locked(struct list_head *list, int cpt) -{ - struct list_head drop; - struct lnet_msg *msg; - struct lnet_msg *tmp; - - INIT_LIST_HEAD(&drop); - - list_splice_init(list, &drop); - - lnet_net_unlock(cpt); - - list_for_each_entry_safe(msg, tmp, &drop, msg_list) { - lnet_ni_recv(msg->msg_rxpeer->lp_ni, msg->msg_private, NULL, - 0, 0, 0, msg->msg_hdr.payload_length); - list_del_init(&msg->msg_list); - lnet_finalize(NULL, msg, -ECANCELED); - } - - lnet_net_lock(cpt); -} - -void -lnet_return_rx_credits_locked(struct lnet_msg *msg) -{ - struct lnet_peer *rxpeer = msg->msg_rxpeer; - struct lnet_msg *msg2; - - if (msg->msg_rtrcredit) { - /* give back global router credits */ - struct lnet_rtrbuf *rb; - struct lnet_rtrbufpool *rbp; - - /* - * NB If a msg ever blocks for a buffer in rbp_msgs, it stays - * there until it gets one allocated, or aborts the wait - * itself - */ - LASSERT(msg->msg_kiov); - - rb = container_of(msg->msg_kiov, struct lnet_rtrbuf, rb_kiov[0]); - rbp = rb->rb_pool; - - msg->msg_kiov = NULL; - msg->msg_rtrcredit = 0; - - LASSERT(rbp == lnet_msg2bufpool(msg)); - - LASSERT((rbp->rbp_credits > 0) == - !list_empty(&rbp->rbp_bufs)); - - /* - * If routing is now turned off, we just drop this buffer and - * don't bother trying to return credits. - */ - if (!the_lnet.ln_routing) { - lnet_destroy_rtrbuf(rb, rbp->rbp_npages); - goto routing_off; - } - - /* - * It is possible that a user has lowered the desired number of - * buffers in this pool. Make sure we never put back - * more buffers than the stated number. - */ - if (unlikely(rbp->rbp_credits >= rbp->rbp_req_nbuffers)) { - /* Discard this buffer so we don't have too many. */ - lnet_destroy_rtrbuf(rb, rbp->rbp_npages); - rbp->rbp_nbuffers--; - } else { - list_add(&rb->rb_list, &rbp->rbp_bufs); - rbp->rbp_credits++; - if (rbp->rbp_credits <= 0) - lnet_schedule_blocked_locked(rbp); - } - } - -routing_off: - if (msg->msg_peerrtrcredit) { - /* give back peer router credits */ - msg->msg_peerrtrcredit = 0; - - LASSERT((rxpeer->lp_rtrcredits < 0) == - !list_empty(&rxpeer->lp_rtrq)); - - rxpeer->lp_rtrcredits++; - /* - * drop all messages which are queued to be routed on that - * peer. - */ - if (!the_lnet.ln_routing) { - lnet_drop_routed_msgs_locked(&rxpeer->lp_rtrq, - msg->msg_rx_cpt); - } else if (rxpeer->lp_rtrcredits <= 0) { - msg2 = list_entry(rxpeer->lp_rtrq.next, - struct lnet_msg, msg_list); - list_del(&msg2->msg_list); - - (void)lnet_post_routed_recv_locked(msg2, 1); - } - } - if (rxpeer) { - msg->msg_rxpeer = NULL; - lnet_peer_decref_locked(rxpeer); - } -} - -static int -lnet_compare_routes(struct lnet_route *r1, struct lnet_route *r2) -{ - struct lnet_peer *p1 = r1->lr_gateway; - struct lnet_peer *p2 = r2->lr_gateway; - int r1_hops = (r1->lr_hops == LNET_UNDEFINED_HOPS) ? 1 : r1->lr_hops; - int r2_hops = (r2->lr_hops == LNET_UNDEFINED_HOPS) ? 
1 : r2->lr_hops; - - if (r1->lr_priority < r2->lr_priority) - return 1; - - if (r1->lr_priority > r2->lr_priority) - return -ERANGE; - - if (r1_hops < r2_hops) - return 1; - - if (r1_hops > r2_hops) - return -ERANGE; - - if (p1->lp_txqnob < p2->lp_txqnob) - return 1; - - if (p1->lp_txqnob > p2->lp_txqnob) - return -ERANGE; - - if (p1->lp_txcredits > p2->lp_txcredits) - return 1; - - if (p1->lp_txcredits < p2->lp_txcredits) - return -ERANGE; - - if (r1->lr_seq - r2->lr_seq <= 0) - return 1; - - return -ERANGE; -} - -static struct lnet_peer * -lnet_find_route_locked(struct lnet_ni *ni, lnet_nid_t target, - lnet_nid_t rtr_nid) -{ - struct lnet_remotenet *rnet; - struct lnet_route *route; - struct lnet_route *best_route; - struct lnet_route *last_route; - struct lnet_peer *lp_best; - struct lnet_peer *lp; - int rc; - - /* - * If @rtr_nid is not LNET_NID_ANY, return the gateway with - * rtr_nid nid, otherwise find the best gateway I can use - */ - rnet = lnet_find_net_locked(LNET_NIDNET(target)); - if (!rnet) - return NULL; - - lp_best = NULL; - best_route = NULL; - last_route = NULL; - list_for_each_entry(route, &rnet->lrn_routes, lr_list) { - lp = route->lr_gateway; - - if (!lnet_is_route_alive(route)) - continue; - - if (ni && lp->lp_ni != ni) - continue; - - if (lp->lp_nid == rtr_nid) /* it's pre-determined router */ - return lp; - - if (!lp_best) { - best_route = route; - last_route = route; - lp_best = lp; - continue; - } - - /* no protection on below fields, but it's harmless */ - if (last_route->lr_seq - route->lr_seq < 0) - last_route = route; - - rc = lnet_compare_routes(route, best_route); - if (rc < 0) - continue; - - best_route = route; - lp_best = lp; - } - - /* - * set sequence number on the best router to the latest sequence + 1 - * so we can round-robin all routers, it's race and inaccurate but - * harmless and functional - */ - if (best_route) - best_route->lr_seq = last_route->lr_seq + 1; - return lp_best; -} - -int -lnet_send(lnet_nid_t src_nid, struct lnet_msg *msg, lnet_nid_t rtr_nid) -{ - lnet_nid_t dst_nid = msg->msg_target.nid; - struct lnet_ni *src_ni; - struct lnet_ni *local_ni; - struct lnet_peer *lp; - int cpt; - int cpt2; - int rc; - - /* - * NB: rtr_nid is set to LNET_NID_ANY for all current use-cases, - * but we might want to use pre-determined router for ACK/REPLY - * in the future - */ - /* NB: ni == interface pre-determined (ACK/REPLY) */ - LASSERT(!msg->msg_txpeer); - LASSERT(!msg->msg_sending); - LASSERT(!msg->msg_target_is_router); - LASSERT(!msg->msg_receiving); - - msg->msg_sending = 1; - - LASSERT(!msg->msg_tx_committed); - cpt = lnet_cpt_of_nid(rtr_nid == LNET_NID_ANY ? dst_nid : rtr_nid); - again: - lnet_net_lock(cpt); - - if (the_lnet.ln_shutdown) { - lnet_net_unlock(cpt); - return -ESHUTDOWN; - } - - if (src_nid == LNET_NID_ANY) { - src_ni = NULL; - } else { - src_ni = lnet_nid2ni_locked(src_nid, cpt); - if (!src_ni) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("Can't send to %s: src %s is not a local nid\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EINVAL; - } - LASSERT(!msg->msg_routing); - } - - /* Is this for someone on a local network? 
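/*
 * Gateway preference as encoded by lnet_compare_routes() above, in one
 * standalone helper for reference (not the deleted code itself): lower
 * priority value wins, then fewer hops, then the shorter tx queue,
 * then more tx credits; full ties fall back to lr_seq round-robin,
 * which is omitted here.
 */
static bool route_r1_preferred(int pri1, int pri2, int hops1, int hops2,
			       int qnob1, int qnob2, int cred1, int cred2)
{
	if (pri1 != pri2)
		return pri1 < pri2;
	if (hops1 != hops2)
		return hops1 < hops2;
	if (qnob1 != qnob2)
		return qnob1 < qnob2;
	if (cred1 != cred2)
		return cred1 > cred2;
	return false;	/* equal on all counts: round-robin on lr_seq */
}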
*/ - local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid), cpt); - - if (local_ni) { - if (!src_ni) { - src_ni = local_ni; - src_nid = src_ni->ni_nid; - } else if (src_ni == local_ni) { - lnet_ni_decref_locked(local_ni, cpt); - } else { - lnet_ni_decref_locked(local_ni, cpt); - lnet_ni_decref_locked(src_ni, cpt); - lnet_net_unlock(cpt); - LCONSOLE_WARN("No route to %s via from %s\n", - libcfs_nid2str(dst_nid), - libcfs_nid2str(src_nid)); - return -EINVAL; - } - - LASSERT(src_nid != LNET_NID_ANY); - lnet_msg_commit(msg, cpt); - - if (!msg->msg_routing) - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); - - if (src_ni == the_lnet.ln_loni) { - /* No send credit hassles with LOLND */ - lnet_net_unlock(cpt); - lnet_ni_send(src_ni, msg); - - lnet_net_lock(cpt); - lnet_ni_decref_locked(src_ni, cpt); - lnet_net_unlock(cpt); - return 0; - } - - rc = lnet_nid2peer_locked(&lp, dst_nid, cpt); - /* lp has ref on src_ni; lose mine */ - lnet_ni_decref_locked(src_ni, cpt); - if (rc) { - lnet_net_unlock(cpt); - LCONSOLE_WARN("Error %d finding peer %s\n", rc, - libcfs_nid2str(dst_nid)); - /* ENOMEM or shutting down */ - return rc; - } - LASSERT(lp->lp_ni == src_ni); - } else { - /* sending to a remote network */ - lp = lnet_find_route_locked(src_ni, dst_nid, rtr_nid); - if (!lp) { - if (src_ni) - lnet_ni_decref_locked(src_ni, cpt); - lnet_net_unlock(cpt); - - LCONSOLE_WARN("No route to %s via %s (all routers down)\n", - libcfs_id2str(msg->msg_target), - libcfs_nid2str(src_nid)); - return -EHOSTUNREACH; - } - - /* - * rtr_nid is LNET_NID_ANY or NID of pre-determined router, - * it's possible that rtr_nid isn't LNET_NID_ANY and lp isn't - * pre-determined router, this can happen if router table - * was changed when we release the lock - */ - if (rtr_nid != lp->lp_nid) { - cpt2 = lnet_cpt_of_nid_locked(lp->lp_nid); - if (cpt2 != cpt) { - if (src_ni) - lnet_ni_decref_locked(src_ni, cpt); - lnet_net_unlock(cpt); - - rtr_nid = lp->lp_nid; - cpt = cpt2; - goto again; - } - } - - CDEBUG(D_NET, "Best route to %s via %s for %s %d\n", - libcfs_nid2str(dst_nid), libcfs_nid2str(lp->lp_nid), - lnet_msgtyp2str(msg->msg_type), msg->msg_len); - - if (!src_ni) { - src_ni = lp->lp_ni; - src_nid = src_ni->ni_nid; - } else { - LASSERT(src_ni == lp->lp_ni); - lnet_ni_decref_locked(src_ni, cpt); - } - - lnet_peer_addref_locked(lp); - - LASSERT(src_nid != LNET_NID_ANY); - lnet_msg_commit(msg, cpt); - - if (!msg->msg_routing) { - /* I'm the source and now I know which NI to send on */ - msg->msg_hdr.src_nid = cpu_to_le64(src_nid); - } - - msg->msg_target_is_router = 1; - msg->msg_target.nid = lp->lp_nid; - msg->msg_target.pid = LNET_PID_LUSTRE; - } - - /* 'lp' is our best choice of peer */ - - LASSERT(!msg->msg_peertxcredit); - LASSERT(!msg->msg_txcredit); - LASSERT(!msg->msg_txpeer); - - msg->msg_txpeer = lp; /* msg takes my ref on lp */ - - rc = lnet_post_send_locked(msg, 0); - lnet_net_unlock(cpt); - - if (rc < 0) - return rc; - - if (rc == LNET_CREDIT_OK) - lnet_ni_send(src_ni, msg); - - return 0; /* rc == LNET_CREDIT_OK or LNET_CREDIT_WAIT */ -} - -void -lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob) -{ - lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += nob; - lnet_net_unlock(cpt); - - lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); -} - -static void -lnet_recv_put(struct lnet_ni *ni, struct lnet_msg *msg) -{ - struct lnet_hdr *hdr = &msg->msg_hdr; - - if (msg->msg_wanted) - lnet_setpayloadbuffer(msg); - - lnet_build_msg_event(msg, 
LNET_EVENT_PUT); - - /* - * Must I ACK? If so I'll grab the ack_wmd out of the header and put - * it back into the ACK during lnet_finalize() - */ - msg->msg_ack = !lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && - !(msg->msg_md->md_options & LNET_MD_ACK_DISABLE); - - lnet_ni_recv(ni, msg->msg_private, msg, msg->msg_rx_delayed, - msg->msg_offset, msg->msg_wanted, hdr->payload_length); -} - -static int -lnet_parse_put(struct lnet_ni *ni, struct lnet_msg *msg) -{ - struct lnet_hdr *hdr = &msg->msg_hdr; - struct lnet_match_info info; - bool ready_delay; - int rc; - - /* Convert put fields to host byte order */ - le64_to_cpus(&hdr->msg.put.match_bits); - le32_to_cpus(&hdr->msg.put.ptl_index); - le32_to_cpus(&hdr->msg.put.offset); - - info.mi_id.nid = hdr->src_nid; - info.mi_id.pid = hdr->src_pid; - info.mi_opc = LNET_MD_OP_PUT; - info.mi_portal = hdr->msg.put.ptl_index; - info.mi_rlength = hdr->payload_length; - info.mi_roffset = hdr->msg.put.offset; - info.mi_mbits = hdr->msg.put.match_bits; - - msg->msg_rx_ready_delay = !ni->ni_lnd->lnd_eager_recv; - ready_delay = msg->msg_rx_ready_delay; - - again: - rc = lnet_ptl_match_md(&info, msg); - switch (rc) { - default: - LBUG(); - - case LNET_MATCHMD_OK: - lnet_recv_put(ni, msg); - return 0; - - case LNET_MATCHMD_NONE: - /** - * no eager_recv or has already called it, should - * have been attached on delayed list - */ - if (ready_delay) - return 0; - - rc = lnet_ni_eager_recv(ni, msg); - if (!rc) { - ready_delay = true; - goto again; - } - /* fall through */ - - case LNET_MATCHMD_DROP: - CNETERR("Dropping PUT from %s portal %d match %llu offset %d length %d: %d\n", - libcfs_id2str(info.mi_id), info.mi_portal, - info.mi_mbits, info.mi_roffset, info.mi_rlength, rc); - - return -ENOENT; /* -ve: OK but no match */ - } -} - -static int -lnet_parse_get(struct lnet_ni *ni, struct lnet_msg *msg, int rdma_get) -{ - struct lnet_match_info info; - struct lnet_hdr *hdr = &msg->msg_hdr; - struct lnet_handle_wire reply_wmd; - int rc; - - /* Convert get fields to host byte order */ - le64_to_cpus(&hdr->msg.get.match_bits); - le32_to_cpus(&hdr->msg.get.ptl_index); - le32_to_cpus(&hdr->msg.get.sink_length); - le32_to_cpus(&hdr->msg.get.src_offset); - - info.mi_id.nid = hdr->src_nid; - info.mi_id.pid = hdr->src_pid; - info.mi_opc = LNET_MD_OP_GET; - info.mi_portal = hdr->msg.get.ptl_index; - info.mi_rlength = hdr->msg.get.sink_length; - info.mi_roffset = hdr->msg.get.src_offset; - info.mi_mbits = hdr->msg.get.match_bits; - - rc = lnet_ptl_match_md(&info, msg); - if (rc == LNET_MATCHMD_DROP) { - CNETERR("Dropping GET from %s portal %d match %llu offset %d length %d\n", - libcfs_id2str(info.mi_id), info.mi_portal, - info.mi_mbits, info.mi_roffset, info.mi_rlength); - return -ENOENT; /* -ve: OK but no match */ - } - - LASSERT(rc == LNET_MATCHMD_OK); - - lnet_build_msg_event(msg, LNET_EVENT_GET); - - reply_wmd = hdr->msg.get.return_wmd; - - lnet_prep_send(msg, LNET_MSG_REPLY, info.mi_id, - msg->msg_offset, msg->msg_wanted); - - msg->msg_hdr.msg.reply.dst_wmd = reply_wmd; - - if (rdma_get) { - /* The LND completes the REPLY from her recv procedure */ - lnet_ni_recv(ni, msg->msg_private, msg, 0, - msg->msg_offset, msg->msg_len, msg->msg_len); - return 0; - } - - lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); - msg->msg_receiving = 0; - - rc = lnet_send(ni->ni_nid, msg, LNET_NID_ANY); - if (rc < 0) { - /* didn't get as far as lnet_ni_send() */ - CERROR("%s: Unable to send REPLY for GET from %s: %d\n", - libcfs_nid2str(ni->ni_nid), - libcfs_id2str(info.mi_id), 
rc); - - lnet_finalize(ni, msg, rc); - } - - return 0; -} - -static int -lnet_parse_reply(struct lnet_ni *ni, struct lnet_msg *msg) -{ - void *private = msg->msg_private; - struct lnet_hdr *hdr = &msg->msg_hdr; - struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int rlength; - int mlength; - int cpt; - - cpt = lnet_cpt_of_cookie(hdr->msg.reply.dst_wmd.wh_object_cookie); - lnet_res_lock(cpt); - - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; - - /* NB handles only looked up by creator (no flips) */ - md = lnet_wire_handle2md(&hdr->msg.reply.dst_wmd); - if (!md || !md->md_threshold || md->md_me) { - CNETERR("%s: Dropping REPLY from %s for %s MD %#llx.%#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - !md ? "invalid" : "inactive", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie); - if (md && md->md_me) - CERROR("REPLY MD also attached to portal %d\n", - md->md_me->me_portal); - - lnet_res_unlock(cpt); - return -ENOENT; /* -ve: OK but no match */ - } - - LASSERT(!md->md_offset); - - rlength = hdr->payload_length; - mlength = min_t(uint, rlength, md->md_length); - - if (mlength < rlength && - !(md->md_options & LNET_MD_TRUNCATE)) { - CNETERR("%s: Dropping REPLY from %s length %d for MD %#llx would overflow (%d)\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - rlength, hdr->msg.reply.dst_wmd.wh_object_cookie, - mlength); - lnet_res_unlock(cpt); - return -ENOENT; /* -ve: OK but no match */ - } - - CDEBUG(D_NET, "%s: Reply from %s of length %d/%d into md %#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - mlength, rlength, hdr->msg.reply.dst_wmd.wh_object_cookie); - - lnet_msg_attach_md(msg, md, 0, mlength); - - if (mlength) - lnet_setpayloadbuffer(msg); - - lnet_res_unlock(cpt); - - lnet_build_msg_event(msg, LNET_EVENT_REPLY); - - lnet_ni_recv(ni, private, msg, 0, 0, mlength, rlength); - return 0; -} - -static int -lnet_parse_ack(struct lnet_ni *ni, struct lnet_msg *msg) -{ - struct lnet_hdr *hdr = &msg->msg_hdr; - struct lnet_process_id src = {0}; - struct lnet_libmd *md; - int cpt; - - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; - - /* Convert ack fields to host byte order */ - le64_to_cpus(&hdr->msg.ack.match_bits); - le32_to_cpus(&hdr->msg.ack.mlength); - - cpt = lnet_cpt_of_cookie(hdr->msg.ack.dst_wmd.wh_object_cookie); - lnet_res_lock(cpt); - - /* NB handles only looked up by creator (no flips) */ - md = lnet_wire_handle2md(&hdr->msg.ack.dst_wmd); - if (!md || !md->md_threshold || md->md_me) { - /* Don't moan; this is expected */ - CDEBUG(D_NET, - "%s: Dropping ACK from %s to %s MD %#llx.%#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - !md ? "invalid" : "inactive", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie); - if (md && md->md_me) - CERROR("Source MD also attached to portal %d\n", - md->md_me->me_portal); - - lnet_res_unlock(cpt); - return -ENOENT; /* -ve! 
*/ - } - - CDEBUG(D_NET, "%s: ACK from %s into md %#llx\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(src), - hdr->msg.ack.dst_wmd.wh_object_cookie); - - lnet_msg_attach_md(msg, md, 0, 0); - - lnet_res_unlock(cpt); - - lnet_build_msg_event(msg, LNET_EVENT_ACK); - - lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, 0, msg->msg_len); - return 0; -} - -/** - * \retval LNET_CREDIT_OK If \a msg is forwarded - * \retval LNET_CREDIT_WAIT If \a msg is blocked because w/o buffer - * \retval -ve error code - */ -int -lnet_parse_forward_locked(struct lnet_ni *ni, struct lnet_msg *msg) -{ - int rc = 0; - - if (!the_lnet.ln_routing) - return -ECANCELED; - - if (msg->msg_rxpeer->lp_rtrcredits <= 0 || - lnet_msg2bufpool(msg)->rbp_credits <= 0) { - if (!ni->ni_lnd->lnd_eager_recv) { - msg->msg_rx_ready_delay = 1; - } else { - lnet_net_unlock(msg->msg_rx_cpt); - rc = lnet_ni_eager_recv(ni, msg); - lnet_net_lock(msg->msg_rx_cpt); - } - } - - if (!rc) - rc = lnet_post_routed_recv_locked(msg, 0); - return rc; -} - -int -lnet_parse_local(struct lnet_ni *ni, struct lnet_msg *msg) -{ - int rc; - - switch (msg->msg_type) { - case LNET_MSG_ACK: - rc = lnet_parse_ack(ni, msg); - break; - case LNET_MSG_PUT: - rc = lnet_parse_put(ni, msg); - break; - case LNET_MSG_GET: - rc = lnet_parse_get(ni, msg, msg->msg_rdma_get); - break; - case LNET_MSG_REPLY: - rc = lnet_parse_reply(ni, msg); - break; - default: /* prevent an unused label if !kernel */ - LASSERT(0); - return -EPROTO; - } - - LASSERT(!rc || rc == -ENOENT); - return rc; -} - -char * -lnet_msgtyp2str(int type) -{ - switch (type) { - case LNET_MSG_ACK: - return "ACK"; - case LNET_MSG_PUT: - return "PUT"; - case LNET_MSG_GET: - return "GET"; - case LNET_MSG_REPLY: - return "REPLY"; - case LNET_MSG_HELLO: - return "HELLO"; - default: - return ""; - } -} - -void -lnet_print_hdr(struct lnet_hdr *hdr) -{ - struct lnet_process_id src = {0}; - struct lnet_process_id dst = {0}; - char *type_str = lnet_msgtyp2str(hdr->type); - - src.nid = hdr->src_nid; - src.pid = hdr->src_pid; - - dst.nid = hdr->dest_nid; - dst.pid = hdr->dest_pid; - - CWARN("P3 Header at %p of type %s\n", hdr, type_str); - CWARN(" From %s\n", libcfs_id2str(src)); - CWARN(" To %s\n", libcfs_id2str(dst)); - - switch (hdr->type) { - default: - break; - - case LNET_MSG_PUT: - CWARN(" Ptl index %d, ack md %#llx.%#llx, match bits %llu\n", - hdr->msg.put.ptl_index, - hdr->msg.put.ack_wmd.wh_interface_cookie, - hdr->msg.put.ack_wmd.wh_object_cookie, - hdr->msg.put.match_bits); - CWARN(" Length %d, offset %d, hdr data %#llx\n", - hdr->payload_length, hdr->msg.put.offset, - hdr->msg.put.hdr_data); - break; - - case LNET_MSG_GET: - CWARN(" Ptl index %d, return md %#llx.%#llx, match bits %llu\n", - hdr->msg.get.ptl_index, - hdr->msg.get.return_wmd.wh_interface_cookie, - hdr->msg.get.return_wmd.wh_object_cookie, - hdr->msg.get.match_bits); - CWARN(" Length %d, src offset %d\n", - hdr->msg.get.sink_length, - hdr->msg.get.src_offset); - break; - - case LNET_MSG_ACK: - CWARN(" dst md %#llx.%#llx, manipulated length %d\n", - hdr->msg.ack.dst_wmd.wh_interface_cookie, - hdr->msg.ack.dst_wmd.wh_object_cookie, - hdr->msg.ack.mlength); - break; - - case LNET_MSG_REPLY: - CWARN(" dst md %#llx.%#llx, length %d\n", - hdr->msg.reply.dst_wmd.wh_interface_cookie, - hdr->msg.reply.dst_wmd.wh_object_cookie, - hdr->payload_length); - } -} - -int -lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, - void *private, int rdma_req) -{ - int rc = 0; - int cpt; - int for_me; - struct lnet_msg *msg; - lnet_pid_t 
dest_pid; - lnet_nid_t dest_nid; - lnet_nid_t src_nid; - __u32 payload_length; - __u32 type; - - LASSERT(!in_interrupt()); - - type = le32_to_cpu(hdr->type); - src_nid = le64_to_cpu(hdr->src_nid); - dest_nid = le64_to_cpu(hdr->dest_nid); - dest_pid = le32_to_cpu(hdr->dest_pid); - payload_length = le32_to_cpu(hdr->payload_length); - - for_me = (ni->ni_nid == dest_nid); - cpt = lnet_cpt_of_nid(from_nid); - - switch (type) { - case LNET_MSG_ACK: - case LNET_MSG_GET: - if (payload_length > 0) { - CERROR("%s, src %s: bad %s payload %d (0 expected)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), - lnet_msgtyp2str(type), payload_length); - return -EPROTO; - } - break; - - case LNET_MSG_PUT: - case LNET_MSG_REPLY: - if (payload_length > - (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { - CERROR("%s, src %s: bad %s payload %d (%d max expected)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), - lnet_msgtyp2str(type), - payload_length, - for_me ? LNET_MAX_PAYLOAD : LNET_MTU); - return -EPROTO; - } - break; - - default: - CERROR("%s, src %s: Bad message type 0x%x\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), type); - return -EPROTO; - } - - if (the_lnet.ln_routing && - ni->ni_last_alive != ktime_get_real_seconds()) { - /* NB: so far here is the only place to set NI status to "up */ - lnet_ni_lock(ni); - ni->ni_last_alive = ktime_get_real_seconds(); - if (ni->ni_status && - ni->ni_status->ns_status == LNET_NI_STATUS_DOWN) - ni->ni_status->ns_status = LNET_NI_STATUS_UP; - lnet_ni_unlock(ni); - } - - /* - * Regard a bad destination NID as a protocol error. Senders should - * know what they're doing; if they don't they're misconfigured, buggy - * or malicious so we chop them off at the knees :) - */ - if (!for_me) { - if (LNET_NIDNET(dest_nid) == LNET_NIDNET(ni->ni_nid)) { - /* should have gone direct */ - CERROR("%s, src %s: Bad dest nid %s (should have been sent direct)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid)); - return -EPROTO; - } - - if (lnet_islocalnid(dest_nid)) { - /* - * dest is another local NI; sender should have used - * this node's NID on its own network - */ - CERROR("%s, src %s: Bad dest nid %s (it's my nid but on a different network)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid)); - return -EPROTO; - } - - if (rdma_req && type == LNET_MSG_GET) { - CERROR("%s, src %s: Bad optimized GET for %s (final destination must be me)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid)); - return -EPROTO; - } - - if (!the_lnet.ln_routing) { - CERROR("%s, src %s: Dropping message for %s (routing not enabled)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid)); - goto drop; - } - } - - /* - * Message looks OK; we're not going to return an error, so we MUST - * call back lnd_recv() come what may... - */ - if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ - fail_peer(src_nid, 0)) { /* shall we now? 
*/ - CERROR("%s, src %s: Dropping %s to simulate failure\n", - libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type)); - goto drop; - } - - if (!list_empty(&the_lnet.ln_drop_rules) && - lnet_drop_rule_match(hdr)) { - CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate silent message loss\n", - libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), - libcfs_nid2str(dest_nid), lnet_msgtyp2str(type)); - goto drop; - } - - msg = kzalloc(sizeof(*msg), GFP_NOFS); - if (!msg) { - CERROR("%s, src %s: Dropping %s (out of memory)\n", - libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type)); - goto drop; - } - - /* msg zeroed by kzalloc() - * i.e. flags all clear, pointers NULL etc - */ - msg->msg_type = type; - msg->msg_private = private; - msg->msg_receiving = 1; - msg->msg_rdma_get = rdma_req; - msg->msg_wanted = payload_length; - msg->msg_len = payload_length; - msg->msg_offset = 0; - msg->msg_hdr = *hdr; - /* for building message event */ - msg->msg_from = from_nid; - if (!for_me) { - msg->msg_target.pid = dest_pid; - msg->msg_target.nid = dest_nid; - msg->msg_routing = 1; - - } else { - /* convert common msg->hdr fields to host byteorder */ - msg->msg_hdr.type = type; - msg->msg_hdr.src_nid = src_nid; - le32_to_cpus(&msg->msg_hdr.src_pid); - msg->msg_hdr.dest_nid = dest_nid; - msg->msg_hdr.dest_pid = dest_pid; - msg->msg_hdr.payload_length = payload_length; - } - - lnet_net_lock(cpt); - rc = lnet_nid2peer_locked(&msg->msg_rxpeer, from_nid, cpt); - if (rc) { - lnet_net_unlock(cpt); - CERROR("%s, src %s: Dropping %s (error %d looking up sender)\n", - libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), - lnet_msgtyp2str(type), rc); - kfree(msg); - if (rc == -ESHUTDOWN) - /* We are shutting down. Don't do anything more */ - return 0; - goto drop; - } - - if (lnet_isrouter(msg->msg_rxpeer)) { - lnet_peer_set_alive(msg->msg_rxpeer); - if (avoid_asym_router_failure && - LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { - /* received a remote message from router, update - * remote NI status on this router. - * NB: multi-hop routed message will be ignored. 
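/*
 * The wire-header sanity rules lnet_parse() applies above, restated as
 * a single predicate (sketch only): ACK and GET carry no payload, while
 * PUT and REPLY are bounded by LNET_MAX_PAYLOAD when addressed to this
 * node and by LNET_MTU when being routed.
 */
static bool payload_length_ok(__u32 type, __u32 len, bool for_me)
{
	switch (type) {
	case LNET_MSG_ACK:
	case LNET_MSG_GET:
		return !len;
	case LNET_MSG_PUT:
	case LNET_MSG_REPLY:
		return len <= (__u32)(for_me ? LNET_MAX_PAYLOAD : LNET_MTU);
	default:
		return false;	/* unknown type: protocol error */
	}
}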
- */ - lnet_router_ni_update_locked(msg->msg_rxpeer, - LNET_NIDNET(src_nid)); - } - } - - lnet_msg_commit(msg, cpt); - - /* message delay simulation */ - if (unlikely(!list_empty(&the_lnet.ln_delay_rules) && - lnet_delay_rule_match_locked(hdr, msg))) { - lnet_net_unlock(cpt); - return 0; - } - - if (!for_me) { - rc = lnet_parse_forward_locked(ni, msg); - lnet_net_unlock(cpt); - - if (rc < 0) - goto free_drop; - - if (rc == LNET_CREDIT_OK) { - lnet_ni_recv(ni, msg->msg_private, msg, 0, - 0, payload_length, payload_length); - } - return 0; - } - - lnet_net_unlock(cpt); - - rc = lnet_parse_local(ni, msg); - if (rc) - goto free_drop; - return 0; - - free_drop: - LASSERT(!msg->msg_md); - lnet_finalize(ni, msg, rc); - - drop: - lnet_drop_message(ni, cpt, private, payload_length); - return 0; -} -EXPORT_SYMBOL(lnet_parse); - -void -lnet_drop_delayed_msg_list(struct list_head *head, char *reason) -{ - while (!list_empty(head)) { - struct lnet_process_id id = {0}; - struct lnet_msg *msg; - - msg = list_entry(head->next, struct lnet_msg, msg_list); - list_del(&msg->msg_list); - - id.nid = msg->msg_hdr.src_nid; - id.pid = msg->msg_hdr.src_pid; - - LASSERT(!msg->msg_md); - LASSERT(msg->msg_rx_delayed); - LASSERT(msg->msg_rxpeer); - LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); - - CWARN("Dropping delayed PUT from %s portal %d match %llu offset %d length %d: %s\n", - libcfs_id2str(id), - msg->msg_hdr.msg.put.ptl_index, - msg->msg_hdr.msg.put.match_bits, - msg->msg_hdr.msg.put.offset, - msg->msg_hdr.payload_length, reason); - - /* - * NB I can't drop msg's ref on msg_rxpeer until after I've - * called lnet_drop_message(), so I just hang onto msg as well - * until that's done - */ - lnet_drop_message(msg->msg_rxpeer->lp_ni, - msg->msg_rxpeer->lp_cpt, - msg->msg_private, msg->msg_len); - /* - * NB: message will not generate event because w/o attached MD, - * but we still should give error code so lnet_msg_decommit() - * can skip counters operations and other checks. - */ - lnet_finalize(msg->msg_rxpeer->lp_ni, msg, -ENOENT); - } -} - -void -lnet_recv_delayed_msg_list(struct list_head *head) -{ - while (!list_empty(head)) { - struct lnet_msg *msg; - struct lnet_process_id id; - - msg = list_entry(head->next, struct lnet_msg, msg_list); - list_del(&msg->msg_list); - - /* - * md won't disappear under me, since each msg - * holds a ref on it - */ - id.nid = msg->msg_hdr.src_nid; - id.pid = msg->msg_hdr.src_pid; - - LASSERT(msg->msg_rx_delayed); - LASSERT(msg->msg_md); - LASSERT(msg->msg_rxpeer); - LASSERT(msg->msg_hdr.type == LNET_MSG_PUT); - - CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n", - libcfs_id2str(id), msg->msg_hdr.msg.put.ptl_index, - msg->msg_hdr.msg.put.match_bits, - msg->msg_hdr.msg.put.offset, - msg->msg_hdr.payload_length); - - lnet_recv_put(msg->msg_rxpeer->lp_ni, msg); - } -} - -/** - * Initiate an asynchronous PUT operation. - * - * There are several events associated with a PUT: completion of the send on - * the initiator node (LNET_EVENT_SEND), and when the send completes - * successfully, the receipt of an acknowledgment (LNET_EVENT_ACK) indicating - * that the operation was accepted by the target. The event LNET_EVENT_PUT is - * used at the target node to indicate the completion of incoming data - * delivery. - * - * The local events will be logged in the EQ associated with the MD pointed to - * by \a mdh handle. Using a MD without an associated EQ results in these - * events being discarded. 
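/*
 * A minimal active-side sketch pairing LNetMDBind() with LNetPut(),
 * the counterpart of the passive-side sketch shown earlier. The portal
 * number and match bits are hypothetical and must agree with the
 * receiver's ME; none of this is taken from the deleted sources.
 */
static int demo_put(lnet_nid_t self, struct lnet_handle_eq eqh,
		    struct lnet_process_id target, void *buf,
		    unsigned int len)
{
	struct lnet_handle_md mdh;
	struct lnet_md umd;
	int rc;

	memset(&umd, 0, sizeof(umd));
	umd.start = buf;
	umd.length = len;
	umd.threshold = 2;	/* expect one SEND plus one ACK event */
	umd.options = 0;	/* GET|PUT op bits are illegal on active MDs */
	umd.eq_handle = eqh;

	rc = LNetMDBind(umd, LNET_UNLINK, &mdh);
	if (rc)
		return rc;

	return LNetPut(self, mdh, LNET_ACK_REQ, target,
		       4 /* portal */, 0x17ULL /* match bits */, 0, 0);
}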
In this case, the caller must have another - * mechanism (e.g., a higher level protocol) for determining when it is safe - * to modify the memory region associated with the MD. - * - * Note that LNet does not guarantee the order of LNET_EVENT_SEND and - * LNET_EVENT_ACK, though intuitively ACK should happen after SEND. - * - * \param self Indicates the NID of a local interface through which to send - * the PUT request. Use LNET_NID_ANY to let LNet choose one by itself. - * \param mdh A handle for the MD that describes the memory to be sent. The MD - * must be "free floating" (See LNetMDBind()). - * \param ack Controls whether an acknowledgment is requested. - * Acknowledgments are only sent when they are requested by the initiating - * process and the target MD enables them. - * \param target A process identifier for the target process. - * \param portal The index in the \a target's portal table. - * \param match_bits The match bits to use for MD selection at the target - * process. - * \param offset The offset into the target MD (only used when the target - * MD has the LNET_MD_MANAGE_REMOTE option set). - * \param hdr_data 64 bits of user data that can be included in the message - * header. This data is written to an event queue entry at the target if an - * EQ is present on the matching MD. - * - * \retval 0 Success, and only in this case events will be generated - * and logged to EQ (if it exists). - * \retval -EIO Simulated failure. - * \retval -ENOMEM Memory allocation failure. - * \retval -ENOENT Invalid MD object. - * - * \see lnet_event::hdr_data and lnet_event_kind. - */ -int -LNetPut(lnet_nid_t self, struct lnet_handle_md mdh, enum lnet_ack_req ack, - struct lnet_process_id target, unsigned int portal, - __u64 match_bits, unsigned int offset, - __u64 hdr_data) -{ - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; - - LASSERT(the_lnet.ln_refcount > 0); - - if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ - fail_peer(target.nid, 1)) { /* shall we now? */ - CERROR("Dropping PUT to %s: simulated failure\n", - libcfs_id2str(target)); - return -EIO; - } - - msg = kzalloc(sizeof(*msg), GFP_NOFS); - if (!msg) { - CERROR("Dropping PUT to %s: ENOMEM on struct lnet_msg\n", - libcfs_id2str(target)); - return -ENOMEM; - } - msg->msg_vmflush = !!(current->flags & PF_MEMALLOC); - - cpt = lnet_cpt_of_cookie(mdh.cookie); - lnet_res_lock(cpt); - - md = lnet_handle2md(&mdh); - if (!md || !md->md_threshold || md->md_me) { - CERROR("Dropping PUT (%llu:%d:%s): MD (%d) invalid\n", - match_bits, portal, libcfs_id2str(target), - !md ? 
-1 : md->md_threshold); - if (md && md->md_me) - CERROR("Source MD also attached to portal %d\n", - md->md_me->me_portal); - lnet_res_unlock(cpt); - - kfree(msg); - return -ENOENT; - } - - CDEBUG(D_NET, "%s -> %s\n", __func__, libcfs_id2str(target)); - - lnet_msg_attach_md(msg, md, 0, 0); - - lnet_prep_send(msg, LNET_MSG_PUT, target, 0, md->md_length); - - msg->msg_hdr.msg.put.match_bits = cpu_to_le64(match_bits); - msg->msg_hdr.msg.put.ptl_index = cpu_to_le32(portal); - msg->msg_hdr.msg.put.offset = cpu_to_le32(offset); - msg->msg_hdr.msg.put.hdr_data = hdr_data; - - /* NB handles only looked up by creator (no flips) */ - if (ack == LNET_ACK_REQ) { - msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = - the_lnet.ln_interface_cookie; - msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = - md->md_lh.lh_cookie; - } else { - msg->msg_hdr.msg.put.ack_wmd.wh_interface_cookie = - LNET_WIRE_HANDLE_COOKIE_NONE; - msg->msg_hdr.msg.put.ack_wmd.wh_object_cookie = - LNET_WIRE_HANDLE_COOKIE_NONE; - } - - lnet_res_unlock(cpt); - - lnet_build_msg_event(msg, LNET_EVENT_SEND); - - rc = lnet_send(self, msg, LNET_NID_ANY); - if (rc) { - CNETERR("Error sending PUT to %s: %d\n", - libcfs_id2str(target), rc); - lnet_finalize(NULL, msg, rc); - } - - /* completion will be signalled by an event */ - return 0; -} -EXPORT_SYMBOL(LNetPut); - -struct lnet_msg * -lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) -{ - /* - * The LND can DMA direct to the GET md (i.e. no REPLY msg). This - * returns a msg for the LND to pass to lnet_finalize() when the sink - * data has been received. - * - * CAVEAT EMPTOR: 'getmsg' is the original GET, which is freed when - * lnet_finalize() is called on it, so the LND must call this first - */ - struct lnet_msg *msg = kzalloc(sizeof(*msg), GFP_NOFS); - struct lnet_libmd *getmd = getmsg->msg_md; - struct lnet_process_id peer_id = getmsg->msg_target; - int cpt; - - LASSERT(!getmsg->msg_target_is_router); - LASSERT(!getmsg->msg_routing); - - if (!msg) { - CERROR("%s: Dropping REPLY from %s: can't allocate msg\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id)); - goto drop; - } - - cpt = lnet_cpt_of_cookie(getmd->md_lh.lh_cookie); - lnet_res_lock(cpt); - - LASSERT(getmd->md_refcount > 0); - - if (!getmd->md_threshold) { - CERROR("%s: Dropping REPLY from %s for inactive MD %p\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), - getmd); - lnet_res_unlock(cpt); - goto drop; - } - - LASSERT(!getmd->md_offset); - - CDEBUG(D_NET, "%s: Reply from %s md %p\n", - libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); - - /* setup information for lnet_build_msg_event */ - msg->msg_from = peer_id.nid; - msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ - msg->msg_hdr.src_nid = peer_id.nid; - msg->msg_hdr.payload_length = getmd->md_length; - msg->msg_receiving = 1; /* required by lnet_msg_attach_md */ - - lnet_msg_attach_md(msg, getmd, getmd->md_offset, getmd->md_length); - lnet_res_unlock(cpt); - - cpt = lnet_cpt_of_nid(peer_id.nid); - - lnet_net_lock(cpt); - lnet_msg_commit(msg, cpt); - lnet_net_unlock(cpt); - - lnet_build_msg_event(msg, LNET_EVENT_REPLY); - - return msg; - - drop: - cpt = lnet_cpt_of_nid(peer_id.nid); - - lnet_net_lock(cpt); - the_lnet.ln_counters[cpt]->drop_count++; - the_lnet.ln_counters[cpt]->drop_length += getmd->md_length; - lnet_net_unlock(cpt); - - kfree(msg); - - return NULL; -} -EXPORT_SYMBOL(lnet_create_reply_msg); - -void -lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *reply, - unsigned int len) -{ - /* - 
* Set the REPLY length, now the RDMA that elides the REPLY message has - * completed and I know it. - */ - LASSERT(reply); - LASSERT(reply->msg_type == LNET_MSG_GET); - LASSERT(reply->msg_ev.type == LNET_EVENT_REPLY); - - /* - * NB I trusted my peer to RDMA. If she tells me she's written beyond - * the end of my buffer, I might as well be dead. - */ - LASSERT(len <= reply->msg_ev.mlength); - - reply->msg_ev.mlength = len; -} -EXPORT_SYMBOL(lnet_set_reply_msg_len); - -/** - * Initiate an asynchronous GET operation. - * - * On the initiator node, an LNET_EVENT_SEND is logged when the GET request - * is sent, and an LNET_EVENT_REPLY is logged when the data returned from - * the target node in the REPLY has been written to local MD. - * - * On the target node, an LNET_EVENT_GET is logged when the GET request - * arrives and is accepted into a MD. - * - * \param self,target,portal,match_bits,offset See the discussion in LNetPut(). - * \param mdh A handle for the MD that describes the memory into which the - * requested data will be received. The MD must be "free floating" - * (See LNetMDBind()). - * - * \retval 0 Success, and only in this case events will be generated - * and logged to EQ (if it exists) of the MD. - * \retval -EIO Simulated failure. - * \retval -ENOMEM Memory allocation failure. - * \retval -ENOENT Invalid MD object. - */ -int -LNetGet(lnet_nid_t self, struct lnet_handle_md mdh, - struct lnet_process_id target, unsigned int portal, - __u64 match_bits, unsigned int offset) -{ - struct lnet_msg *msg; - struct lnet_libmd *md; - int cpt; - int rc; - - LASSERT(the_lnet.ln_refcount > 0); - - if (!list_empty(&the_lnet.ln_test_peers) && /* normally we don't */ - fail_peer(target.nid, 1)) { /* shall we now? */ - CERROR("Dropping GET to %s: simulated failure\n", - libcfs_id2str(target)); - return -EIO; - } - - msg = kzalloc(sizeof(*msg), GFP_NOFS); - if (!msg) { - CERROR("Dropping GET to %s: ENOMEM on struct lnet_msg\n", - libcfs_id2str(target)); - return -ENOMEM; - } - - cpt = lnet_cpt_of_cookie(mdh.cookie); - lnet_res_lock(cpt); - - md = lnet_handle2md(&mdh); - if (!md || !md->md_threshold || md->md_me) { - CERROR("Dropping GET (%llu:%d:%s): MD (%d) invalid\n", - match_bits, portal, libcfs_id2str(target), - !md ? -1 : md->md_threshold); - if (md && md->md_me) - CERROR("REPLY MD also attached to portal %d\n", - md->md_me->me_portal); - - lnet_res_unlock(cpt); - - kfree(msg); - return -ENOENT; - } - - CDEBUG(D_NET, "%s -> %s\n", __func__, libcfs_id2str(target)); - - lnet_msg_attach_md(msg, md, 0, 0); - - lnet_prep_send(msg, LNET_MSG_GET, target, 0, 0); - - msg->msg_hdr.msg.get.match_bits = cpu_to_le64(match_bits); - msg->msg_hdr.msg.get.ptl_index = cpu_to_le32(portal); - msg->msg_hdr.msg.get.src_offset = cpu_to_le32(offset); - msg->msg_hdr.msg.get.sink_length = cpu_to_le32(md->md_length); - - /* NB handles only looked up by creator (no flips) */ - msg->msg_hdr.msg.get.return_wmd.wh_interface_cookie = - the_lnet.ln_interface_cookie; - msg->msg_hdr.msg.get.return_wmd.wh_object_cookie = - md->md_lh.lh_cookie; - - lnet_res_unlock(cpt); - - lnet_build_msg_event(msg, LNET_EVENT_SEND); - - rc = lnet_send(self, msg, LNET_NID_ANY); - if (rc < 0) { - CNETERR("Error sending GET to %s: %d\n", - libcfs_id2str(target), rc); - lnet_finalize(NULL, msg, rc); - } - - /* completion will be signalled by an event */ - return 0; -} -EXPORT_SYMBOL(LNetGet); - -/** - * Calculate distance to node at \a dstnid. - * - * \param dstnid Target NID. 
- * \param srcnidp If not NULL, NID of the local interface to reach \a dstnid - * is saved here. - * \param orderp If not NULL, order of the route to reach \a dstnid is saved - * here. - * - * \retval 0 If \a dstnid belongs to a local interface, and reserved option - * local_nid_dist_zero is set, which is the default. - * \retval positives Distance to target NID, i.e. number of hops plus one. - * \retval -EHOSTUNREACH If \a dstnid is not reachable. - */ -int -LNetDist(lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) -{ - struct list_head *e; - struct lnet_ni *ni; - struct lnet_remotenet *rnet; - __u32 dstnet = LNET_NIDNET(dstnid); - int hops; - int cpt; - __u32 order = 2; - struct list_head *rn_list; - - /* - * if !local_nid_dist_zero, I don't return a distance of 0 ever - * (when lustre sees a distance of 0, it substitutes 0@lo), so I - * keep order 0 free for 0@lo and order 1 free for a local NID - * match - */ - LASSERT(the_lnet.ln_refcount > 0); - - cpt = lnet_net_lock_current(); - - list_for_each(e, &the_lnet.ln_nis) { - ni = list_entry(e, struct lnet_ni, ni_list); - - if (ni->ni_nid == dstnid) { - if (srcnidp) - *srcnidp = dstnid; - if (orderp) { - if (LNET_NETTYP(LNET_NIDNET(dstnid)) == LOLND) - *orderp = 0; - else - *orderp = 1; - } - lnet_net_unlock(cpt); - - return local_nid_dist_zero ? 0 : 1; - } - - if (LNET_NIDNET(ni->ni_nid) == dstnet) { - /* - * Check if ni was originally created in - * current net namespace. - * If not, assign order above 0xffff0000, - * to make this ni not a priority. - */ - if (!net_eq(ni->ni_net_ns, current->nsproxy->net_ns)) - order += 0xffff0000; - - if (srcnidp) - *srcnidp = ni->ni_nid; - if (orderp) - *orderp = order; - lnet_net_unlock(cpt); - return 1; - } - - order++; - } - - rn_list = lnet_net2rnethash(dstnet); - list_for_each(e, rn_list) { - rnet = list_entry(e, struct lnet_remotenet, lrn_list); - - if (rnet->lrn_net == dstnet) { - struct lnet_route *route; - struct lnet_route *shortest = NULL; - __u32 shortest_hops = LNET_UNDEFINED_HOPS; - __u32 route_hops; - - LASSERT(!list_empty(&rnet->lrn_routes)); - - list_for_each_entry(route, &rnet->lrn_routes, - lr_list) { - route_hops = route->lr_hops; - if (route_hops == LNET_UNDEFINED_HOPS) - route_hops = 1; - if (!shortest || - route_hops < shortest_hops) { - shortest = route; - shortest_hops = route_hops; - } - } - - LASSERT(shortest); - hops = shortest_hops; - if (srcnidp) - *srcnidp = shortest->lr_gateway->lp_ni->ni_nid; - if (orderp) - *orderp = order; - lnet_net_unlock(cpt); - return hops + 1; - } - order++; - } - - lnet_net_unlock(cpt); - return -EHOSTUNREACH; -} -EXPORT_SYMBOL(LNetDist); diff --git a/drivers/staging/lustre/lnet/lnet/lib-msg.c b/drivers/staging/lustre/lnet/lnet/lib-msg.c deleted file mode 100644 index 0091273c04b921ce1349fc89d51aec70fd617ead..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-msg.c +++ /dev/null @@ -1,625 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/lnet/lib-msg.c - * - * Message decoding, parsing and finalizing routines - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include - -void -lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev) -{ - memset(ev, 0, sizeof(*ev)); - - ev->status = 0; - ev->unlinked = 1; - ev->type = LNET_EVENT_UNLINK; - lnet_md_deconstruct(md, &ev->md); - lnet_md2handle(&ev->md_handle, md); -} - -/* - * Don't need any lock, must be called after lnet_commit_md - */ -void -lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type) -{ - struct lnet_hdr *hdr = &msg->msg_hdr; - struct lnet_event *ev = &msg->msg_ev; - - LASSERT(!msg->msg_routing); - - ev->type = ev_type; - - if (ev_type == LNET_EVENT_SEND) { - /* event for active message */ - ev->target.nid = le64_to_cpu(hdr->dest_nid); - ev->target.pid = le32_to_cpu(hdr->dest_pid); - ev->initiator.nid = LNET_NID_ANY; - ev->initiator.pid = the_lnet.ln_pid; - ev->sender = LNET_NID_ANY; - } else { - /* event for passive message */ - ev->target.pid = hdr->dest_pid; - ev->target.nid = hdr->dest_nid; - ev->initiator.pid = hdr->src_pid; - ev->initiator.nid = hdr->src_nid; - ev->rlength = hdr->payload_length; - ev->sender = msg->msg_from; - ev->mlength = msg->msg_wanted; - ev->offset = msg->msg_offset; - } - - switch (ev_type) { - default: - LBUG(); - - case LNET_EVENT_PUT: /* passive PUT */ - ev->pt_index = hdr->msg.put.ptl_index; - ev->match_bits = hdr->msg.put.match_bits; - ev->hdr_data = hdr->msg.put.hdr_data; - return; - - case LNET_EVENT_GET: /* passive GET */ - ev->pt_index = hdr->msg.get.ptl_index; - ev->match_bits = hdr->msg.get.match_bits; - ev->hdr_data = 0; - return; - - case LNET_EVENT_ACK: /* ACK */ - ev->match_bits = hdr->msg.ack.match_bits; - ev->mlength = hdr->msg.ack.mlength; - return; - - case LNET_EVENT_REPLY: /* REPLY */ - return; - - case LNET_EVENT_SEND: /* active message */ - if (msg->msg_type == LNET_MSG_PUT) { - ev->pt_index = le32_to_cpu(hdr->msg.put.ptl_index); - ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits); - ev->offset = le32_to_cpu(hdr->msg.put.offset); - ev->mlength = - ev->rlength = le32_to_cpu(hdr->payload_length); - ev->hdr_data = le64_to_cpu(hdr->msg.put.hdr_data); - - } else { - LASSERT(msg->msg_type == LNET_MSG_GET); - ev->pt_index = le32_to_cpu(hdr->msg.get.ptl_index); - ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits); - ev->mlength = - ev->rlength = le32_to_cpu(hdr->msg.get.sink_length); - ev->offset = le32_to_cpu(hdr->msg.get.src_offset); - ev->hdr_data = 0; - } - return; - } -} - -void -lnet_msg_commit(struct lnet_msg *msg, int cpt) -{ - struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt]; - struct lnet_counters *counters = the_lnet.ln_counters[cpt]; - - /* routed message can be committed for both receiving and sending */ - LASSERT(!msg->msg_tx_committed); - - if (msg->msg_sending) { - LASSERT(!msg->msg_receiving); - - msg->msg_tx_cpt = cpt; - 
msg->msg_tx_committed = 1; - if (msg->msg_rx_committed) { /* routed message REPLY */ - LASSERT(msg->msg_onactivelist); - return; - } - } else { - LASSERT(!msg->msg_sending); - msg->msg_rx_cpt = cpt; - msg->msg_rx_committed = 1; - } - - LASSERT(!msg->msg_onactivelist); - msg->msg_onactivelist = 1; - list_add(&msg->msg_activelist, &container->msc_active); - - counters->msgs_alloc++; - if (counters->msgs_alloc > counters->msgs_max) - counters->msgs_max = counters->msgs_alloc; -} - -static void -lnet_msg_decommit_tx(struct lnet_msg *msg, int status) -{ - struct lnet_counters *counters; - struct lnet_event *ev = &msg->msg_ev; - - LASSERT(msg->msg_tx_committed); - if (status) - goto out; - - counters = the_lnet.ln_counters[msg->msg_tx_cpt]; - switch (ev->type) { - default: /* routed message */ - LASSERT(msg->msg_routing); - LASSERT(msg->msg_rx_committed); - LASSERT(!ev->type); - - counters->route_length += msg->msg_len; - counters->route_count++; - goto out; - - case LNET_EVENT_PUT: - /* should have been decommitted */ - LASSERT(!msg->msg_rx_committed); - /* overwritten while sending ACK */ - LASSERT(msg->msg_type == LNET_MSG_ACK); - msg->msg_type = LNET_MSG_PUT; /* fix type */ - break; - - case LNET_EVENT_SEND: - LASSERT(!msg->msg_rx_committed); - if (msg->msg_type == LNET_MSG_PUT) - counters->send_length += msg->msg_len; - break; - - case LNET_EVENT_GET: - LASSERT(msg->msg_rx_committed); - /* - * overwritten while sending reply, we should never be - * here for optimized GET - */ - LASSERT(msg->msg_type == LNET_MSG_REPLY); - msg->msg_type = LNET_MSG_GET; /* fix type */ - break; - } - - counters->send_count++; - out: - lnet_return_tx_credits_locked(msg); - msg->msg_tx_committed = 0; -} - -static void -lnet_msg_decommit_rx(struct lnet_msg *msg, int status) -{ - struct lnet_counters *counters; - struct lnet_event *ev = &msg->msg_ev; - - LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */ - LASSERT(msg->msg_rx_committed); - - if (status) - goto out; - - counters = the_lnet.ln_counters[msg->msg_rx_cpt]; - switch (ev->type) { - default: - LASSERT(!ev->type); - LASSERT(msg->msg_routing); - goto out; - - case LNET_EVENT_ACK: - LASSERT(msg->msg_type == LNET_MSG_ACK); - break; - - case LNET_EVENT_GET: - /* - * type is "REPLY" if it's an optimized GET on passive side, - * because optimized GET will never be committed for sending, - * so message type wouldn't be changed back to "GET" by - * lnet_msg_decommit_tx(), see details in lnet_parse_get() - */ - LASSERT(msg->msg_type == LNET_MSG_REPLY || - msg->msg_type == LNET_MSG_GET); - counters->send_length += msg->msg_wanted; - break; - - case LNET_EVENT_PUT: - LASSERT(msg->msg_type == LNET_MSG_PUT); - break; - - case LNET_EVENT_REPLY: - /* - * type is "GET" if it's an optimized GET on active side, - * see details in lnet_create_reply_msg() - */ - LASSERT(msg->msg_type == LNET_MSG_GET || - msg->msg_type == LNET_MSG_REPLY); - break; - } - - counters->recv_count++; - if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY) - counters->recv_length += msg->msg_wanted; - - out: - lnet_return_rx_credits_locked(msg); - msg->msg_rx_committed = 0; -} - -void -lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status) -{ - int cpt2 = cpt; - - LASSERT(msg->msg_tx_committed || msg->msg_rx_committed); - LASSERT(msg->msg_onactivelist); - - if (msg->msg_tx_committed) { /* always decommit for sending first */ - LASSERT(cpt == msg->msg_tx_cpt); - lnet_msg_decommit_tx(msg, status); - } - - if (msg->msg_rx_committed) { - /* forwarding msg committed for 
both receiving and sending */ - if (cpt != msg->msg_rx_cpt) { - lnet_net_unlock(cpt); - cpt2 = msg->msg_rx_cpt; - lnet_net_lock(cpt2); - } - lnet_msg_decommit_rx(msg, status); - } - - list_del(&msg->msg_activelist); - msg->msg_onactivelist = 0; - - the_lnet.ln_counters[cpt2]->msgs_alloc--; - - if (cpt2 != cpt) { - lnet_net_unlock(cpt2); - lnet_net_lock(cpt); - } -} - -void -lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md, - unsigned int offset, unsigned int mlen) -{ - /* NB: @offset and @len are only useful for receiving */ - /* - * Here, we attach the MD on lnet_msg and mark it busy and - * decrementing its threshold. Come what may, the lnet_msg "owns" - * the MD until a call to lnet_msg_detach_md or lnet_finalize() - * signals completion. - */ - LASSERT(!msg->msg_routing); - - msg->msg_md = md; - if (msg->msg_receiving) { /* committed for receiving */ - msg->msg_offset = offset; - msg->msg_wanted = mlen; - } - - md->md_refcount++; - if (md->md_threshold != LNET_MD_THRESH_INF) { - LASSERT(md->md_threshold > 0); - md->md_threshold--; - } - - /* build umd in event */ - lnet_md2handle(&msg->msg_ev.md_handle, md); - lnet_md_deconstruct(md, &msg->msg_ev.md); -} - -void -lnet_msg_detach_md(struct lnet_msg *msg, int status) -{ - struct lnet_libmd *md = msg->msg_md; - int unlink; - - /* Now it's safe to drop my caller's ref */ - md->md_refcount--; - LASSERT(md->md_refcount >= 0); - - unlink = lnet_md_unlinkable(md); - if (md->md_eq) { - msg->msg_ev.status = status; - msg->msg_ev.unlinked = unlink; - lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev); - } - - if (unlink) - lnet_md_unlink(md); - - msg->msg_md = NULL; -} - -static int -lnet_complete_msg_locked(struct lnet_msg *msg, int cpt) -{ - struct lnet_handle_wire ack_wmd; - int rc; - int status = msg->msg_ev.status; - - LASSERT(msg->msg_onactivelist); - - if (!status && msg->msg_ack) { - /* Only send an ACK if the PUT completed successfully */ - - lnet_msg_decommit(msg, cpt, 0); - - msg->msg_ack = 0; - lnet_net_unlock(cpt); - - LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); - LASSERT(!msg->msg_routing); - - ack_wmd = msg->msg_hdr.msg.put.ack_wmd; - - lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); - - msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; - msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; - msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); - - /* - * NB: we probably want to use NID of msg::msg_from as 3rd - * parameter (router NID) if it's routed message - */ - rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY); - - lnet_net_lock(cpt); - /* - * NB: message is committed for sending, we should return - * on success because LND will finalize this message later. - * - * Also, there is possibility that message is committed for - * sending and also failed before delivering to LND, - * i.e: ENOMEM, in that case we can't fall through either - * because CPT for sending can be different with CPT for - * receiving, so we should return back to lnet_finalize() - * to make sure we are locking the correct partition. - */ - return rc; - - } else if (!status && /* OK so far */ - (msg->msg_routing && !msg->msg_sending)) { - /* not forwarded */ - LASSERT(!msg->msg_receiving); /* called back recv already */ - lnet_net_unlock(cpt); - - rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY); - - lnet_net_lock(cpt); - /* - * NB: message is committed for sending, we should return - * on success because LND will finalize this message later. 
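
(An aside on this contract: an LND finalizes every message it has accepted by calling lnet_finalize() exactly once, which is what makes returning early on success correct here. A hypothetical LND transmit-completion handler, sketched only to show the shape of that call — my_lnd_tx_done() and its arguments are invented, only lnet_finalize() is real:

	static void my_lnd_tx_done(struct lnet_ni *ni, struct lnet_msg *lntmsg,
				   int status)
	{
		/* hand the message back to LNet, which decommits it and
		 * delivers the corresponding event; 0 status means success
		 */
		lnet_finalize(ni, lntmsg, status);
	}

)
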
- * - * Also, there is possibility that message is committed for - * sending and also failed before delivering to LND, - * i.e: ENOMEM, in that case we can't fall through either: - * - The rule is message must decommit for sending first if - * the it's committed for both sending and receiving - * - CPT for sending can be different with CPT for receiving, - * so we should return back to lnet_finalize() to make - * sure we are locking the correct partition. - */ - return rc; - } - - lnet_msg_decommit(msg, cpt, status); - kfree(msg); - return 0; -} - -void -lnet_finalize(struct lnet_ni *ni, struct lnet_msg *msg, int status) -{ - struct lnet_msg_container *container; - int my_slot; - int cpt; - int rc; - int i; - - LASSERT(!in_interrupt()); - - if (!msg) - return; - - msg->msg_ev.status = status; - - if (msg->msg_md) { - cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie); - - lnet_res_lock(cpt); - lnet_msg_detach_md(msg, status); - lnet_res_unlock(cpt); - } - - again: - rc = 0; - if (!msg->msg_tx_committed && !msg->msg_rx_committed) { - /* not committed to network yet */ - LASSERT(!msg->msg_onactivelist); - kfree(msg); - return; - } - - /* - * NB: routed message can be committed for both receiving and sending, - * we should finalize in LIFO order and keep counters correct. - * (finalize sending first then finalize receiving) - */ - cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt; - lnet_net_lock(cpt); - - container = the_lnet.ln_msg_containers[cpt]; - list_add_tail(&msg->msg_list, &container->msc_finalizing); - - /* - * Recursion breaker. Don't complete the message here if I am (or - * enough other threads are) already completing messages - */ - my_slot = -1; - for (i = 0; i < container->msc_nfinalizers; i++) { - if (container->msc_finalizers[i] == current) - break; - - if (my_slot < 0 && !container->msc_finalizers[i]) - my_slot = i; - } - - if (i < container->msc_nfinalizers || my_slot < 0) { - lnet_net_unlock(cpt); - return; - } - - container->msc_finalizers[my_slot] = current; - - while (!list_empty(&container->msc_finalizing)) { - msg = list_entry(container->msc_finalizing.next, - struct lnet_msg, msg_list); - - list_del(&msg->msg_list); - - /* - * NB drops and regains the lnet lock if it actually does - * anything, so my finalizing friends can chomp along too - */ - rc = lnet_complete_msg_locked(msg, cpt); - if (rc) - break; - } - - if (unlikely(!list_empty(&the_lnet.ln_delay_rules))) { - lnet_net_unlock(cpt); - lnet_delay_rule_check(); - lnet_net_lock(cpt); - } - - container->msc_finalizers[my_slot] = NULL; - lnet_net_unlock(cpt); - - if (rc) - goto again; -} -EXPORT_SYMBOL(lnet_finalize); - -void -lnet_msg_container_cleanup(struct lnet_msg_container *container) -{ - int count = 0; - - if (!container->msc_init) - return; - - while (!list_empty(&container->msc_active)) { - struct lnet_msg *msg; - - msg = list_entry(container->msc_active.next, - struct lnet_msg, msg_activelist); - LASSERT(msg->msg_onactivelist); - msg->msg_onactivelist = 0; - list_del(&msg->msg_activelist); - kfree(msg); - count++; - } - - if (count > 0) - CERROR("%d active msg on exit\n", count); - - kvfree(container->msc_finalizers); - container->msc_finalizers = NULL; - container->msc_init = 0; -} - -int -lnet_msg_container_setup(struct lnet_msg_container *container, int cpt) -{ - container->msc_init = 1; - - INIT_LIST_HEAD(&container->msc_active); - INIT_LIST_HEAD(&container->msc_finalizing); - - /* number of CPUs */ - container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt); - - 
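
(The allocation on the next lines open-codes the count * size multiplication. A hedged alternative, assuming array_size() from <linux/overflow.h> exists in the target tree, would saturate on overflow instead, with otherwise identical behavior:

	container->msc_finalizers =
		kvzalloc_cpt(array_size(container->msc_nfinalizers,
					sizeof(*container->msc_finalizers)),
			     GFP_KERNEL, cpt);

)
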
container->msc_finalizers = kvzalloc_cpt(container->msc_nfinalizers * - sizeof(*container->msc_finalizers), - GFP_KERNEL, cpt); - - if (!container->msc_finalizers) { - CERROR("Failed to allocate message finalizers\n"); - lnet_msg_container_cleanup(container); - return -ENOMEM; - } - - return 0; -} - -void -lnet_msg_containers_destroy(void) -{ - struct lnet_msg_container *container; - int i; - - if (!the_lnet.ln_msg_containers) - return; - - cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) - lnet_msg_container_cleanup(container); - - cfs_percpt_free(the_lnet.ln_msg_containers); - the_lnet.ln_msg_containers = NULL; -} - -int -lnet_msg_containers_create(void) -{ - struct lnet_msg_container *container; - int rc; - int i; - - the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*container)); - - if (!the_lnet.ln_msg_containers) { - CERROR("Failed to allocate cpu-partition data for network\n"); - return -ENOMEM; - } - - cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) { - rc = lnet_msg_container_setup(container, i); - if (rc) { - lnet_msg_containers_destroy(); - return rc; - } - } - - return 0; -} diff --git a/drivers/staging/lustre/lnet/lnet/lib-ptl.c b/drivers/staging/lustre/lnet/lnet/lib-ptl.c deleted file mode 100644 index fc47379c5938370626db4a77215f533e899e04c8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-ptl.c +++ /dev/null @@ -1,987 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/lnet/lib-ptl.c - * - * portal & match routines - * - * Author: liang@whamcloud.com - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include - -/* NB: add /proc interfaces in upcoming patches */ -int portal_rotor = LNET_PTL_ROTOR_HASH_RT; -module_param(portal_rotor, int, 0644); -MODULE_PARM_DESC(portal_rotor, "redirect PUTs to different cpu-partitions"); - -static int -lnet_ptl_match_type(unsigned int index, struct lnet_process_id match_id, - __u64 mbits, __u64 ignore_bits) -{ - struct lnet_portal *ptl = the_lnet.ln_portals[index]; - int unique; - - unique = !ignore_bits && - match_id.nid != LNET_NID_ANY && - match_id.pid != LNET_PID_ANY; - - LASSERT(!lnet_ptl_is_unique(ptl) || !lnet_ptl_is_wildcard(ptl)); - - /* prefer to check w/o any lock */ - if (likely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) - goto match; - - /* unset, new portal */ - lnet_ptl_lock(ptl); - /* check again with lock */ - if (unlikely(lnet_ptl_is_unique(ptl) || lnet_ptl_is_wildcard(ptl))) { - lnet_ptl_unlock(ptl); - goto match; - } - - /* still not set */ - if (unique) - lnet_ptl_setopt(ptl, LNET_PTL_MATCH_UNIQUE); - else - lnet_ptl_setopt(ptl, LNET_PTL_MATCH_WILDCARD); - - lnet_ptl_unlock(ptl); - - return 1; - - match: - if ((lnet_ptl_is_unique(ptl) && !unique) || - (lnet_ptl_is_wildcard(ptl) && unique)) - return 0; - return 1; -} - -static void -lnet_ptl_enable_mt(struct lnet_portal *ptl, int cpt) -{ - struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; - int i; - - /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ - LASSERT(lnet_ptl_is_wildcard(ptl)); - - mtable->mt_enabled = 1; - - ptl->ptl_mt_maps[ptl->ptl_mt_nmaps] = cpt; - for (i = ptl->ptl_mt_nmaps - 1; i >= 0; i--) { - LASSERT(ptl->ptl_mt_maps[i] != cpt); - if (ptl->ptl_mt_maps[i] < cpt) - break; - - /* swap to order */ - ptl->ptl_mt_maps[i + 1] = ptl->ptl_mt_maps[i]; - ptl->ptl_mt_maps[i] = cpt; - } - - ptl->ptl_mt_nmaps++; -} - -static void -lnet_ptl_disable_mt(struct lnet_portal *ptl, int cpt) -{ - struct lnet_match_table *mtable = ptl->ptl_mtables[cpt]; - int i; - - /* with hold of both lnet_res_lock(cpt) and lnet_ptl_lock */ - LASSERT(lnet_ptl_is_wildcard(ptl)); - - if (LNET_CPT_NUMBER == 1) - return; /* never disable the only match-table */ - - mtable->mt_enabled = 0; - - LASSERT(ptl->ptl_mt_nmaps > 0 && - ptl->ptl_mt_nmaps <= LNET_CPT_NUMBER); - - /* remove it from mt_maps */ - ptl->ptl_mt_nmaps--; - for (i = 0; i < ptl->ptl_mt_nmaps; i++) { - if (ptl->ptl_mt_maps[i] >= cpt) /* overwrite it */ - ptl->ptl_mt_maps[i] = ptl->ptl_mt_maps[i + 1]; - } -} - -static int -lnet_try_match_md(struct lnet_libmd *md, - struct lnet_match_info *info, struct lnet_msg *msg) -{ - /* - * ALWAYS called holding the lnet_res_lock, and can't lnet_res_unlock; - * lnet_match_blocked_msg() relies on this to avoid races - */ - unsigned int offset; - unsigned int mlength; - struct lnet_me *me = md->md_me; - - /* MD exhausted */ - if (lnet_md_exhausted(md)) - return LNET_MATCHMD_NONE | LNET_MATCHMD_EXHAUSTED; - - /* mismatched MD op */ - if (!(md->md_options & info->mi_opc)) - return LNET_MATCHMD_NONE; - - /* mismatched ME nid/pid? */ - if (me->me_match_id.nid != LNET_NID_ANY && - me->me_match_id.nid != info->mi_id.nid) - return LNET_MATCHMD_NONE; - - if (me->me_match_id.pid != LNET_PID_ANY && - me->me_match_id.pid != info->mi_id.pid) - return LNET_MATCHMD_NONE; - - /* mismatched ME matchbits? */ - if ((me->me_match_bits ^ info->mi_mbits) & ~me->me_ignore_bits) - return LNET_MATCHMD_NONE; - - /* Hurrah! This _is_ a match; check it out... 
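
(To make the ignore-bits test above concrete, with hypothetical values: an ME posted with match_bits 0x1234 and ignore_bits 0xff accepts incoming match bits 0x12aa, since (0x1234 ^ 0x12aa) & ~0xff == 0, but rejects 0x13aa, where the masked XOR leaves 0x0100.)
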
*/ - - if (!(md->md_options & LNET_MD_MANAGE_REMOTE)) - offset = md->md_offset; - else - offset = info->mi_roffset; - - if (md->md_options & LNET_MD_MAX_SIZE) { - mlength = md->md_max_size; - LASSERT(md->md_offset + mlength <= md->md_length); - } else { - mlength = md->md_length - offset; - } - - if (info->mi_rlength <= mlength) { /* fits in allowed space */ - mlength = info->mi_rlength; - } else if (!(md->md_options & LNET_MD_TRUNCATE)) { - /* this packet _really_ is too big */ - CERROR("Matching packet from %s, match %llu length %d too big: %d left, %d allowed\n", - libcfs_id2str(info->mi_id), info->mi_mbits, - info->mi_rlength, md->md_length - offset, mlength); - - return LNET_MATCHMD_DROP; - } - - /* Commit to this ME/MD */ - CDEBUG(D_NET, "Incoming %s index %x from %s of length %d/%d into md %#llx [%d] + %d\n", - (info->mi_opc == LNET_MD_OP_PUT) ? "put" : "get", - info->mi_portal, libcfs_id2str(info->mi_id), mlength, - info->mi_rlength, md->md_lh.lh_cookie, md->md_niov, offset); - - lnet_msg_attach_md(msg, md, offset, mlength); - md->md_offset = offset + mlength; - - if (!lnet_md_exhausted(md)) - return LNET_MATCHMD_OK; - - /* - * Auto-unlink NOW, so the ME gets unlinked if required. - * We bumped md->md_refcount above so the MD just gets flagged - * for unlink when it is finalized. - */ - if (md->md_flags & LNET_MD_FLAG_AUTO_UNLINK) - lnet_md_unlink(md); - - return LNET_MATCHMD_OK | LNET_MATCHMD_EXHAUSTED; -} - -static struct lnet_match_table * -lnet_match2mt(struct lnet_portal *ptl, struct lnet_process_id id, __u64 mbits) -{ - if (LNET_CPT_NUMBER == 1) - return ptl->ptl_mtables[0]; /* the only one */ - - /* if it's a unique portal, return match-table hashed by NID */ - return lnet_ptl_is_unique(ptl) ? - ptl->ptl_mtables[lnet_cpt_of_nid(id.nid)] : NULL; -} - -struct lnet_match_table * -lnet_mt_of_attach(unsigned int index, struct lnet_process_id id, - __u64 mbits, __u64 ignore_bits, enum lnet_ins_pos pos) -{ - struct lnet_portal *ptl; - struct lnet_match_table *mtable; - - /* NB: called w/o lock */ - LASSERT(index < the_lnet.ln_nportals); - - if (!lnet_ptl_match_type(index, id, mbits, ignore_bits)) - return NULL; - - ptl = the_lnet.ln_portals[index]; - - mtable = lnet_match2mt(ptl, id, mbits); - if (mtable) /* unique portal or only one match-table */ - return mtable; - - /* it's a wildcard portal */ - switch (pos) { - default: - return NULL; - case LNET_INS_BEFORE: - case LNET_INS_AFTER: - /* - * posted by no affinity thread, always hash to specific - * match-table to avoid buffer stealing which is heavy - */ - return ptl->ptl_mtables[ptl->ptl_index % LNET_CPT_NUMBER]; - case LNET_INS_LOCAL: - /* posted by cpu-affinity thread */ - return ptl->ptl_mtables[lnet_cpt_current()]; - } -} - -static struct lnet_match_table * -lnet_mt_of_match(struct lnet_match_info *info, struct lnet_msg *msg) -{ - struct lnet_match_table *mtable; - struct lnet_portal *ptl; - unsigned int nmaps; - unsigned int rotor; - unsigned int cpt; - bool routed; - - /* NB: called w/o lock */ - LASSERT(info->mi_portal < the_lnet.ln_nportals); - ptl = the_lnet.ln_portals[info->mi_portal]; - - LASSERT(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)); - - mtable = lnet_match2mt(ptl, info->mi_id, info->mi_mbits); - if (mtable) - return mtable; - - /* it's a wildcard portal */ - routed = LNET_NIDNET(msg->msg_hdr.src_nid) != - LNET_NIDNET(msg->msg_hdr.dest_nid); - - if (portal_rotor == LNET_PTL_ROTOR_OFF || - (portal_rotor != LNET_PTL_ROTOR_ON && !routed)) { - cpt = lnet_cpt_current(); - if 
(ptl->ptl_mtables[cpt]->mt_enabled) - return ptl->ptl_mtables[cpt]; - } - - rotor = ptl->ptl_rotor++; /* get round-robin factor */ - if (portal_rotor == LNET_PTL_ROTOR_HASH_RT && routed) - cpt = lnet_cpt_of_nid(msg->msg_hdr.src_nid); - else - cpt = rotor % LNET_CPT_NUMBER; - - if (!ptl->ptl_mtables[cpt]->mt_enabled) { - /* is there any active entry for this portal? */ - nmaps = ptl->ptl_mt_nmaps; - /* map to an active mtable to avoid heavy "stealing" */ - if (nmaps) { - /* - * NB: there is possibility that ptl_mt_maps is being - * changed because we are not under protection of - * lnet_ptl_lock, but it shouldn't hurt anything - */ - cpt = ptl->ptl_mt_maps[rotor % nmaps]; - } - } - - return ptl->ptl_mtables[cpt]; -} - -static int -lnet_mt_test_exhausted(struct lnet_match_table *mtable, int pos) -{ - __u64 *bmap; - int i; - - if (!lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) - return 0; - - if (pos < 0) { /* check all bits */ - for (i = 0; i < LNET_MT_EXHAUSTED_BMAP; i++) { - if (mtable->mt_exhausted[i] != (__u64)(-1)) - return 0; - } - return 1; - } - - LASSERT(pos <= LNET_MT_HASH_IGNORE); - /* mtable::mt_mhash[pos] is marked as exhausted or not */ - bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; - pos &= (1 << LNET_MT_BITS_U64) - 1; - - return (*bmap & BIT(pos)); -} - -static void -lnet_mt_set_exhausted(struct lnet_match_table *mtable, int pos, int exhausted) -{ - __u64 *bmap; - - LASSERT(lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])); - LASSERT(pos <= LNET_MT_HASH_IGNORE); - - /* set mtable::mt_mhash[pos] as exhausted/non-exhausted */ - bmap = &mtable->mt_exhausted[pos >> LNET_MT_BITS_U64]; - pos &= (1 << LNET_MT_BITS_U64) - 1; - - if (!exhausted) - *bmap &= ~(1ULL << pos); - else - *bmap |= 1ULL << pos; -} - -struct list_head * -lnet_mt_match_head(struct lnet_match_table *mtable, - struct lnet_process_id id, __u64 mbits) -{ - struct lnet_portal *ptl = the_lnet.ln_portals[mtable->mt_portal]; - unsigned long hash = mbits; - - if (!lnet_ptl_is_wildcard(ptl)) { - hash += id.nid + id.pid; - - LASSERT(lnet_ptl_is_unique(ptl)); - hash = hash_long(hash, LNET_MT_HASH_BITS); - } - return &mtable->mt_mhash[hash & LNET_MT_HASH_MASK]; -} - -int -lnet_mt_match_md(struct lnet_match_table *mtable, - struct lnet_match_info *info, struct lnet_msg *msg) -{ - struct list_head *head; - struct lnet_me *me; - struct lnet_me *tmp; - int exhausted = 0; - int rc; - - /* any ME with ignore bits? 
*/ - if (!list_empty(&mtable->mt_mhash[LNET_MT_HASH_IGNORE])) - head = &mtable->mt_mhash[LNET_MT_HASH_IGNORE]; - else - head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); - again: - /* NB: only wildcard portal needs to return LNET_MATCHMD_EXHAUSTED */ - if (lnet_ptl_is_wildcard(the_lnet.ln_portals[mtable->mt_portal])) - exhausted = LNET_MATCHMD_EXHAUSTED; - - list_for_each_entry_safe(me, tmp, head, me_list) { - /* ME attached but MD not attached yet */ - if (!me->me_md) - continue; - - LASSERT(me == me->me_md->md_me); - - rc = lnet_try_match_md(me->me_md, info, msg); - if (!(rc & LNET_MATCHMD_EXHAUSTED)) - exhausted = 0; /* mlist is not empty */ - - if (rc & LNET_MATCHMD_FINISH) { - /* - * don't return EXHAUSTED bit because we don't know - * whether the mlist is empty or not - */ - return rc & ~LNET_MATCHMD_EXHAUSTED; - } - } - - if (exhausted == LNET_MATCHMD_EXHAUSTED) { /* @head is exhausted */ - lnet_mt_set_exhausted(mtable, head - mtable->mt_mhash, 1); - if (!lnet_mt_test_exhausted(mtable, -1)) - exhausted = 0; - } - - if (!exhausted && head == &mtable->mt_mhash[LNET_MT_HASH_IGNORE]) { - head = lnet_mt_match_head(mtable, info->mi_id, info->mi_mbits); - goto again; /* re-check MEs w/o ignore-bits */ - } - - if (info->mi_opc == LNET_MD_OP_GET || - !lnet_ptl_is_lazy(the_lnet.ln_portals[info->mi_portal])) - return exhausted | LNET_MATCHMD_DROP; - - return exhausted | LNET_MATCHMD_NONE; -} - -static int -lnet_ptl_match_early(struct lnet_portal *ptl, struct lnet_msg *msg) -{ - int rc; - - /* - * message arrived before any buffer posting on this portal, - * simply delay or drop this message - */ - if (likely(lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl))) - return 0; - - lnet_ptl_lock(ptl); - /* check it again with hold of lock */ - if (lnet_ptl_is_wildcard(ptl) || lnet_ptl_is_unique(ptl)) { - lnet_ptl_unlock(ptl); - return 0; - } - - if (lnet_ptl_is_lazy(ptl)) { - if (msg->msg_rx_ready_delay) { - msg->msg_rx_delayed = 1; - list_add_tail(&msg->msg_list, - &ptl->ptl_msg_delayed); - } - rc = LNET_MATCHMD_NONE; - } else { - rc = LNET_MATCHMD_DROP; - } - - lnet_ptl_unlock(ptl); - return rc; -} - -static int -lnet_ptl_match_delay(struct lnet_portal *ptl, - struct lnet_match_info *info, struct lnet_msg *msg) -{ - int first = ptl->ptl_mt_maps[0]; /* read w/o lock */ - int rc = 0; - int i; - - /** - * Steal buffer from other CPTs, and delay msg if nothing to - * steal. This function is more expensive than a regular - * match, but we don't expect it can happen a lot. The return - * code contains one of LNET_MATCHMD_OK, LNET_MATCHMD_DROP, or - * LNET_MATCHMD_NONE. - */ - LASSERT(lnet_ptl_is_wildcard(ptl)); - - for (i = 0; i < LNET_CPT_NUMBER; i++) { - struct lnet_match_table *mtable; - int cpt; - - cpt = (first + i) % LNET_CPT_NUMBER; - mtable = ptl->ptl_mtables[cpt]; - if (i && i != LNET_CPT_NUMBER - 1 && !mtable->mt_enabled) - continue; - - lnet_res_lock(cpt); - lnet_ptl_lock(ptl); - - if (!i) { - /* The first try, add to stealing list. */ - list_add_tail(&msg->msg_list, - &ptl->ptl_msg_stealing); - } - - if (!list_empty(&msg->msg_list)) { - /* On stealing list. */ - rc = lnet_mt_match_md(mtable, info, msg); - - if ((rc & LNET_MATCHMD_EXHAUSTED) && - mtable->mt_enabled) - lnet_ptl_disable_mt(ptl, cpt); - - if (rc & LNET_MATCHMD_FINISH) { - /* Match found, remove from stealing list. 
*/ - list_del_init(&msg->msg_list); - } else if (i == LNET_CPT_NUMBER - 1 || /* (1) */ - !ptl->ptl_mt_nmaps || /* (2) */ - (ptl->ptl_mt_nmaps == 1 && /* (3) */ - ptl->ptl_mt_maps[0] == cpt)) { - /** - * No match found, and this is either - * (1) the last cpt to check, or - * (2) there is no active cpt, or - * (3) this is the only active cpt. - * There is nothing to steal: delay or - * drop the message. - */ - list_del_init(&msg->msg_list); - - if (lnet_ptl_is_lazy(ptl)) { - msg->msg_rx_delayed = 1; - list_add_tail(&msg->msg_list, - &ptl->ptl_msg_delayed); - rc = LNET_MATCHMD_NONE; - } else { - rc = LNET_MATCHMD_DROP; - } - } else { - /* Do another iteration. */ - rc = 0; - } - } else { - /** - * No longer on stealing list: another thread - * matched the message in lnet_ptl_attach_md(). - * We are now expected to handle the message. - */ - rc = !msg->msg_md ? - LNET_MATCHMD_DROP : LNET_MATCHMD_OK; - } - - lnet_ptl_unlock(ptl); - lnet_res_unlock(cpt); - - /** - * Note that test (1) above ensures that we always - * exit the loop through this break statement. - * - * LNET_MATCHMD_NONE means msg was added to the - * delayed queue, and we may no longer reference it - * after lnet_ptl_unlock() and lnet_res_unlock(). - */ - if (rc & (LNET_MATCHMD_FINISH | LNET_MATCHMD_NONE)) - break; - } - - return rc; -} - -int -lnet_ptl_match_md(struct lnet_match_info *info, struct lnet_msg *msg) -{ - struct lnet_match_table *mtable; - struct lnet_portal *ptl; - int rc; - - CDEBUG(D_NET, "Request from %s of length %d into portal %d MB=%#llx\n", - libcfs_id2str(info->mi_id), info->mi_rlength, info->mi_portal, - info->mi_mbits); - - if (info->mi_portal >= the_lnet.ln_nportals) { - CERROR("Invalid portal %d not in [0-%d]\n", - info->mi_portal, the_lnet.ln_nportals); - return LNET_MATCHMD_DROP; - } - - ptl = the_lnet.ln_portals[info->mi_portal]; - rc = lnet_ptl_match_early(ptl, msg); - if (rc) /* matched or delayed early message */ - return rc; - - mtable = lnet_mt_of_match(info, msg); - lnet_res_lock(mtable->mt_cpt); - - if (the_lnet.ln_shutdown) { - rc = LNET_MATCHMD_DROP; - goto out1; - } - - rc = lnet_mt_match_md(mtable, info, msg); - if ((rc & LNET_MATCHMD_EXHAUSTED) && mtable->mt_enabled) { - lnet_ptl_lock(ptl); - lnet_ptl_disable_mt(ptl, mtable->mt_cpt); - lnet_ptl_unlock(ptl); - } - - if (rc & LNET_MATCHMD_FINISH) /* matched or dropping */ - goto out1; - - if (!msg->msg_rx_ready_delay) - goto out1; - - LASSERT(lnet_ptl_is_lazy(ptl)); - LASSERT(!msg->msg_rx_delayed); - - /* NB: we don't expect "delay" can happen a lot */ - if (lnet_ptl_is_unique(ptl) || LNET_CPT_NUMBER == 1) { - lnet_ptl_lock(ptl); - - msg->msg_rx_delayed = 1; - list_add_tail(&msg->msg_list, &ptl->ptl_msg_delayed); - - lnet_ptl_unlock(ptl); - lnet_res_unlock(mtable->mt_cpt); - rc = LNET_MATCHMD_NONE; - } else { - lnet_res_unlock(mtable->mt_cpt); - rc = lnet_ptl_match_delay(ptl, info, msg); - } - - /* LNET_MATCHMD_NONE means msg was added to the delay queue */ - if (rc & LNET_MATCHMD_NONE) { - CDEBUG(D_NET, - "Delaying %s from %s ptl %d MB %#llx off %d len %d\n", - info->mi_opc == LNET_MD_OP_PUT ? 
"PUT" : "GET", - libcfs_id2str(info->mi_id), info->mi_portal, - info->mi_mbits, info->mi_roffset, info->mi_rlength); - } - goto out0; - out1: - lnet_res_unlock(mtable->mt_cpt); - out0: - /* EXHAUSTED bit is only meaningful for internal functions */ - return rc & ~LNET_MATCHMD_EXHAUSTED; -} - -void -lnet_ptl_detach_md(struct lnet_me *me, struct lnet_libmd *md) -{ - LASSERT(me->me_md == md && md->md_me == me); - - me->me_md = NULL; - md->md_me = NULL; -} - -/* called with lnet_res_lock held */ -void -lnet_ptl_attach_md(struct lnet_me *me, struct lnet_libmd *md, - struct list_head *matches, struct list_head *drops) -{ - struct lnet_portal *ptl = the_lnet.ln_portals[me->me_portal]; - struct lnet_match_table *mtable; - struct list_head *head; - struct lnet_msg *tmp; - struct lnet_msg *msg; - int exhausted = 0; - int cpt; - - LASSERT(!md->md_refcount); /* a brand new MD */ - - me->me_md = md; - md->md_me = me; - - cpt = lnet_cpt_of_cookie(md->md_lh.lh_cookie); - mtable = ptl->ptl_mtables[cpt]; - - if (list_empty(&ptl->ptl_msg_stealing) && - list_empty(&ptl->ptl_msg_delayed) && - !lnet_mt_test_exhausted(mtable, me->me_pos)) - return; - - lnet_ptl_lock(ptl); - head = &ptl->ptl_msg_stealing; - again: - list_for_each_entry_safe(msg, tmp, head, msg_list) { - struct lnet_match_info info; - struct lnet_hdr *hdr; - int rc; - - LASSERT(msg->msg_rx_delayed || head == &ptl->ptl_msg_stealing); - - hdr = &msg->msg_hdr; - info.mi_id.nid = hdr->src_nid; - info.mi_id.pid = hdr->src_pid; - info.mi_opc = LNET_MD_OP_PUT; - info.mi_portal = hdr->msg.put.ptl_index; - info.mi_rlength = hdr->payload_length; - info.mi_roffset = hdr->msg.put.offset; - info.mi_mbits = hdr->msg.put.match_bits; - - rc = lnet_try_match_md(md, &info, msg); - - exhausted = (rc & LNET_MATCHMD_EXHAUSTED); - if (rc & LNET_MATCHMD_NONE) { - if (exhausted) - break; - continue; - } - - /* Hurrah! 
This _is_ a match */ - LASSERT(rc & LNET_MATCHMD_FINISH); - list_del_init(&msg->msg_list); - - if (head == &ptl->ptl_msg_stealing) { - if (exhausted) - break; - /* stealing thread will handle the message */ - continue; - } - - if (rc & LNET_MATCHMD_OK) { - list_add_tail(&msg->msg_list, matches); - - CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d match %llu offset %d length %d.\n", - libcfs_id2str(info.mi_id), - info.mi_portal, info.mi_mbits, - info.mi_roffset, info.mi_rlength); - } else { - list_add_tail(&msg->msg_list, drops); - } - - if (exhausted) - break; - } - - if (!exhausted && head == &ptl->ptl_msg_stealing) { - head = &ptl->ptl_msg_delayed; - goto again; - } - - if (lnet_ptl_is_wildcard(ptl) && !exhausted) { - lnet_mt_set_exhausted(mtable, me->me_pos, 0); - if (!mtable->mt_enabled) - lnet_ptl_enable_mt(ptl, cpt); - } - - lnet_ptl_unlock(ptl); -} - -static void -lnet_ptl_cleanup(struct lnet_portal *ptl) -{ - struct lnet_match_table *mtable; - int i; - - if (!ptl->ptl_mtables) /* uninitialized portal */ - return; - - LASSERT(list_empty(&ptl->ptl_msg_delayed)); - LASSERT(list_empty(&ptl->ptl_msg_stealing)); - cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { - struct list_head *mhash; - struct lnet_me *me; - int j; - - if (!mtable->mt_mhash) /* uninitialized match-table */ - continue; - - mhash = mtable->mt_mhash; - /* cleanup ME */ - for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) { - while (!list_empty(&mhash[j])) { - me = list_entry(mhash[j].next, - struct lnet_me, me_list); - CERROR("Active ME %p on exit\n", me); - list_del(&me->me_list); - kfree(me); - } - } - /* the extra entry is for MEs with ignore bits */ - kvfree(mhash); - } - - cfs_percpt_free(ptl->ptl_mtables); - ptl->ptl_mtables = NULL; -} - -static int -lnet_ptl_setup(struct lnet_portal *ptl, int index) -{ - struct lnet_match_table *mtable; - struct list_head *mhash; - int i; - int j; - - ptl->ptl_mtables = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(struct lnet_match_table)); - if (!ptl->ptl_mtables) { - CERROR("Failed to create match table for portal %d\n", index); - return -ENOMEM; - } - - ptl->ptl_index = index; - INIT_LIST_HEAD(&ptl->ptl_msg_delayed); - INIT_LIST_HEAD(&ptl->ptl_msg_stealing); - spin_lock_init(&ptl->ptl_lock); - cfs_percpt_for_each(mtable, i, ptl->ptl_mtables) { - /* the extra entry is for MEs with ignore bits */ - mhash = kvzalloc_cpt(sizeof(*mhash) * (LNET_MT_HASH_SIZE + 1), - GFP_KERNEL, i); - if (!mhash) { - CERROR("Failed to create match hash for portal %d\n", - index); - goto failed; - } - - memset(&mtable->mt_exhausted[0], -1, - sizeof(mtable->mt_exhausted[0]) * - LNET_MT_EXHAUSTED_BMAP); - mtable->mt_mhash = mhash; - for (j = 0; j < LNET_MT_HASH_SIZE + 1; j++) - INIT_LIST_HEAD(&mhash[j]); - - mtable->mt_portal = index; - mtable->mt_cpt = i; - } - - return 0; - failed: - lnet_ptl_cleanup(ptl); - return -ENOMEM; -} - -void -lnet_portals_destroy(void) -{ - int i; - - if (!the_lnet.ln_portals) - return; - - for (i = 0; i < the_lnet.ln_nportals; i++) - lnet_ptl_cleanup(the_lnet.ln_portals[i]); - - cfs_array_free(the_lnet.ln_portals); - the_lnet.ln_portals = NULL; - the_lnet.ln_nportals = 0; -} - -int -lnet_portals_create(void) -{ - int size; - int i; - - size = offsetof(struct lnet_portal, ptl_mt_maps[LNET_CPT_NUMBER]); - - the_lnet.ln_portals = cfs_array_alloc(MAX_PORTALS, size); - if (!the_lnet.ln_portals) { - CERROR("Failed to allocate portals table\n"); - return -ENOMEM; - } - the_lnet.ln_nportals = MAX_PORTALS; - - for (i = 0; i < the_lnet.ln_nportals; i++) { - if 
(lnet_ptl_setup(the_lnet.ln_portals[i], i)) { - lnet_portals_destroy(); - return -ENOMEM; - } - } - - return 0; -} - -/** - * Turn on the lazy portal attribute. Use with caution! - * - * This portal attribute only affects incoming PUT requests to the portal, - * and is off by default. By default, if there's no matching MD for an - * incoming PUT request, it is simply dropped. With the lazy attribute on, - * such requests are queued indefinitely until either a matching MD is - * posted to the portal or the lazy attribute is turned off. - * - * It would prevent dropped requests, however it should be regarded as the - * last line of defense - i.e. users must keep a close watch on active - * buffers on a lazy portal and once it becomes too low post more buffers as - * soon as possible. This is because delayed requests usually have detrimental - * effects on underlying network connections. A few delayed requests often - * suffice to bring an underlying connection to a complete halt, due to flow - * control mechanisms. - * - * There's also a DOS attack risk. If users don't post match-all MDs on a - * lazy portal, a malicious peer can easily stop a service by sending some - * PUT requests with match bits that won't match any MD. A routed server is - * especially vulnerable since the connections to its neighbor routers are - * shared among all clients. - * - * \param portal Index of the portal to enable the lazy attribute on. - * - * \retval 0 On success. - * \retval -EINVAL If \a portal is not a valid index. - */ -int -LNetSetLazyPortal(int portal) -{ - struct lnet_portal *ptl; - - if (portal < 0 || portal >= the_lnet.ln_nportals) - return -EINVAL; - - CDEBUG(D_NET, "Setting portal %d lazy\n", portal); - ptl = the_lnet.ln_portals[portal]; - - lnet_res_lock(LNET_LOCK_EX); - lnet_ptl_lock(ptl); - - lnet_ptl_setopt(ptl, LNET_PTL_LAZY); - - lnet_ptl_unlock(ptl); - lnet_res_unlock(LNET_LOCK_EX); - - return 0; -} -EXPORT_SYMBOL(LNetSetLazyPortal); - -int -lnet_clear_lazy_portal(struct lnet_ni *ni, int portal, char *reason) -{ - struct lnet_portal *ptl; - LIST_HEAD(zombies); - - if (portal < 0 || portal >= the_lnet.ln_nportals) - return -EINVAL; - - ptl = the_lnet.ln_portals[portal]; - - lnet_res_lock(LNET_LOCK_EX); - lnet_ptl_lock(ptl); - - if (!lnet_ptl_is_lazy(ptl)) { - lnet_ptl_unlock(ptl); - lnet_res_unlock(LNET_LOCK_EX); - return 0; - } - - if (ni) { - struct lnet_msg *msg, *tmp; - - /* grab all messages which are on the NI passed in */ - list_for_each_entry_safe(msg, tmp, &ptl->ptl_msg_delayed, - msg_list) { - if (msg->msg_rxpeer->lp_ni == ni) - list_move(&msg->msg_list, &zombies); - } - } else { - if (the_lnet.ln_shutdown) - CWARN("Active lazy portal %d on exit\n", portal); - else - CDEBUG(D_NET, "clearing portal %d lazy\n", portal); - - /* grab all the blocked messages atomically */ - list_splice_init(&ptl->ptl_msg_delayed, &zombies); - - lnet_ptl_unsetopt(ptl, LNET_PTL_LAZY); - } - - lnet_ptl_unlock(ptl); - lnet_res_unlock(LNET_LOCK_EX); - - lnet_drop_delayed_msg_list(&zombies, reason); - - return 0; -} - -/** - * Turn off the lazy portal attribute. Delayed requests on the portal, - * if any, will be all dropped when this function returns. - * - * \param portal Index of the portal to disable the lazy attribute on. - * - * \retval 0 On success. - * \retval -EINVAL If \a portal is not a valid index. 
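
(Taken together with LNetSetLazyPortal() above, the intended life cycle is: enable once at service startup, keep match-all MDs posted, disable at shutdown. A sketch, with MY_SRV_PORTAL a hypothetical portal index:

	rc = LNetSetLazyPortal(MY_SRV_PORTAL);	 /* queue PUTs that match no MD */
	/* ... service runs, buffers kept posted ... */
	rc = LNetClearLazyPortal(MY_SRV_PORTAL); /* drop whatever is still queued */

)
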
- */ -int -LNetClearLazyPortal(int portal) -{ - return lnet_clear_lazy_portal(NULL, portal, - "Clearing lazy portal attr"); -} -EXPORT_SYMBOL(LNetClearLazyPortal); diff --git a/drivers/staging/lustre/lnet/lnet/lib-socket.c b/drivers/staging/lustre/lnet/lnet/lib-socket.c deleted file mode 100644 index 9b61260155f2a36cb9d4b72615eb0cd85e8f52f9..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lib-socket.c +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. - */ -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include -#include -#include -/* For sys_open & sys_close */ -#include -#include - -#include - -static int -kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) -{ - mm_segment_t oldfs = get_fs(); - int err; - - set_fs(KERNEL_DS); - err = filp->f_op->unlocked_ioctl(filp, cmd, arg); - set_fs(oldfs); - - return err; -} - -static int -lnet_sock_ioctl(int cmd, unsigned long arg) -{ - struct file *sock_filp; - struct socket *sock; - int rc; - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - if (rc) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - - sock_filp = sock_alloc_file(sock, 0, NULL); - if (IS_ERR(sock_filp)) - return PTR_ERR(sock_filp); - - rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg); - - fput(sock_filp); - return rc; -} - -int -lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) -{ - struct ifreq ifr; - int nob; - int rc; - __be32 val; - - nob = strnlen(name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - CERROR("Interface name %s too long\n", name); - return -EINVAL; - } - - BUILD_BUG_ON(sizeof(ifr.ifr_name) < IFNAMSIZ); - - if (strlen(name) > sizeof(ifr.ifr_name) - 1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); - if (rc) { - CERROR("Can't get flags for interface %s\n", name); - return rc; - } - - if (!(ifr.ifr_flags & IFF_UP)) { - CDEBUG(D_NET, "Interface %s down\n", name); - *up = 0; - *ip = *mask = 0; - return 0; - } - *up = 1; - - if (strlen(name) > sizeof(ifr.ifr_name) - 1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); - if (rc) { - CERROR("Can't get IP address for interface %s\n", name); - return rc; - } - - val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; - *ip = 
ntohl(val); - - if (strlen(name) > sizeof(ifr.ifr_name) - 1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); - if (rc) { - CERROR("Can't get netmask for interface %s\n", name); - return rc; - } - - val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; - *mask = ntohl(val); - - return 0; -} -EXPORT_SYMBOL(lnet_ipif_query); - -int -lnet_ipif_enumerate(char ***namesp) -{ - /* Allocate and fill in 'names', returning # interfaces/error */ - char **names; - int toobig; - int nalloc; - int nfound; - struct ifreq *ifr; - struct ifconf ifc; - int rc; - int nob; - int i; - - nalloc = 16; /* first guess at max interfaces */ - toobig = 0; - for (;;) { - if (nalloc * sizeof(*ifr) > PAGE_SIZE) { - toobig = 1; - nalloc = PAGE_SIZE / sizeof(*ifr); - CWARN("Too many interfaces: only enumerating first %d\n", - nalloc); - } - - ifr = kzalloc(nalloc * sizeof(*ifr), GFP_KERNEL); - if (!ifr) { - CERROR("ENOMEM enumerating up to %d interfaces\n", - nalloc); - rc = -ENOMEM; - goto out0; - } - - ifc.ifc_buf = (char *)ifr; - ifc.ifc_len = nalloc * sizeof(*ifr); - - rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); - if (rc < 0) { - CERROR("Error %d enumerating interfaces\n", rc); - goto out1; - } - - LASSERT(!rc); - - nfound = ifc.ifc_len / sizeof(*ifr); - LASSERT(nfound <= nalloc); - - if (nfound < nalloc || toobig) - break; - - kfree(ifr); - nalloc *= 2; - } - - if (!nfound) - goto out1; - - names = kzalloc(nfound * sizeof(*names), GFP_KERNEL); - if (!names) { - rc = -ENOMEM; - goto out1; - } - - for (i = 0; i < nfound; i++) { - nob = strnlen(ifr[i].ifr_name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - /* no space for terminating NULL */ - CERROR("interface name %.*s too long (%d max)\n", - nob, ifr[i].ifr_name, IFNAMSIZ); - rc = -ENAMETOOLONG; - goto out2; - } - - names[i] = kmalloc(IFNAMSIZ, GFP_KERNEL); - if (!names[i]) { - rc = -ENOMEM; - goto out2; - } - - memcpy(names[i], ifr[i].ifr_name, nob); - names[i][nob] = 0; - } - - *namesp = names; - rc = nfound; - -out2: - if (rc < 0) - lnet_ipif_free_enumeration(names, nfound); -out1: - kfree(ifr); -out0: - return rc; -} -EXPORT_SYMBOL(lnet_ipif_enumerate); - -void -lnet_ipif_free_enumeration(char **names, int n) -{ - int i; - - LASSERT(n > 0); - - for (i = 0; i < n && names[i]; i++) - kfree(names[i]); - - kfree(names); -} -EXPORT_SYMBOL(lnet_ipif_free_enumeration); - -int -lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) -{ - int rc; - long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); - unsigned long then; - struct timeval tv; - struct kvec iov = { .iov_base = buffer, .iov_len = nob }; - struct msghdr msg = {NULL,}; - - LASSERT(nob > 0); - /* - * Caller may pass a zero timeout if she thinks the socket buffer is - * empty enough to take the whole message immediately - */ - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iov, 1, nob); - for (;;) { - msg.msg_flags = !timeout ? 
MSG_DONTWAIT : 0; - if (timeout) { - /* Set send timeout to remaining time */ - jiffies_to_timeval(jiffies_left, &tv); - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof(tv)); - if (rc) { - CERROR("Can't set socket send timeout %ld.%06d: %d\n", - (long)tv.tv_sec, (int)tv.tv_usec, rc); - return rc; - } - } - - then = jiffies; - rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); - jiffies_left -= jiffies - then; - - if (rc < 0) - return rc; - - if (!rc) { - CERROR("Unexpected zero rc\n"); - return -ECONNABORTED; - } - - if (!msg_data_left(&msg)) - break; - - if (jiffies_left <= 0) - return -EAGAIN; - } - return 0; -} -EXPORT_SYMBOL(lnet_sock_write); - -int -lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) -{ - int rc; - long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); - unsigned long then; - struct timeval tv; - struct kvec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_flags = 0 - }; - - LASSERT(nob > 0); - LASSERT(jiffies_left > 0); - - iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, nob); - - for (;;) { - /* Set receive timeout to remaining time */ - jiffies_to_timeval(jiffies_left, &tv); - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - if (rc) { - CERROR("Can't set socket recv timeout %ld.%06d: %d\n", - (long)tv.tv_sec, (int)tv.tv_usec, rc); - return rc; - } - - then = jiffies; - rc = sock_recvmsg(sock, &msg, 0); - jiffies_left -= jiffies - then; - - if (rc < 0) - return rc; - - if (!rc) - return -ECONNRESET; - - if (!msg_data_left(&msg)) - return 0; - - if (jiffies_left <= 0) - return -ETIMEDOUT; - } -} -EXPORT_SYMBOL(lnet_sock_read); - -static int -lnet_sock_create(struct socket **sockp, int *fatal, __u32 local_ip, - int local_port) -{ - struct sockaddr_in locaddr; - struct socket *sock; - int rc; - int option; - - /* All errors are fatal except bind failure if the port is in use */ - *fatal = 1; - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - *sockp = sock; - if (rc) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - - option = 1; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof(option)); - if (rc) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; - } - - if (local_ip || local_port) { - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(local_port); - if (!local_ip) - locaddr.sin_addr.s_addr = htonl(INADDR_ANY); - else - locaddr.sin_addr.s_addr = htonl(local_ip); - - rc = kernel_bind(sock, (struct sockaddr *)&locaddr, - sizeof(locaddr)); - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", local_port); - *fatal = 0; - goto failed; - } - if (rc) { - CERROR("Error trying to bind to port %d: %d\n", - local_port, rc); - goto failed; - } - } - return 0; - -failed: - sock_release(sock); - return rc; -} - -int -lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize) -{ - int option; - int rc; - - if (txbufsize) { - option = txbufsize; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - if (rc) { - CERROR("Can't set send buffer %d: %d\n", - option, rc); - return rc; - } - } - - if (rxbufsize) { - option = rxbufsize; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof(option)); - if (rc) { - CERROR("Can't set receive buffer %d: %d\n", - option, rc); - return rc; - } - } - return 0; -} -EXPORT_SYMBOL(lnet_sock_setbuf); - -int 
-lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) -{ - struct sockaddr_in sin; - int rc; - - if (remote) - rc = kernel_getpeername(sock, (struct sockaddr *)&sin); - else - rc = kernel_getsockname(sock, (struct sockaddr *)&sin); - if (rc < 0) { - CERROR("Error %d getting sock %s IP/port\n", - rc, remote ? "peer" : "local"); - return rc; - } - - if (ip) - *ip = ntohl(sin.sin_addr.s_addr); - - if (port) - *port = ntohs(sin.sin_port); - - return 0; -} -EXPORT_SYMBOL(lnet_sock_getaddr); - -int -lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) -{ - if (txbufsize) - *txbufsize = sock->sk->sk_sndbuf; - - if (rxbufsize) - *rxbufsize = sock->sk->sk_rcvbuf; - - return 0; -} -EXPORT_SYMBOL(lnet_sock_getbuf); - -int -lnet_sock_listen(struct socket **sockp, __u32 local_ip, int local_port, - int backlog) -{ - int fatal; - int rc; - - rc = lnet_sock_create(sockp, &fatal, local_ip, local_port); - if (rc) { - if (!fatal) - CERROR("Can't create socket: port %d already in use\n", - local_port); - return rc; - } - - rc = kernel_listen(*sockp, backlog); - if (!rc) - return 0; - - CERROR("Can't set listen backlog %d: %d\n", backlog, rc); - sock_release(*sockp); - return rc; -} - -int -lnet_sock_accept(struct socket **newsockp, struct socket *sock) -{ - wait_queue_entry_t wait; - struct socket *newsock; - int rc; - - /* - * XXX this should add a ref to sock->ops->owner, if - * TCP could be a module - */ - rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); - if (rc) { - CERROR("Can't allocate socket\n"); - return rc; - } - - newsock->ops = sock->ops; - - rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); - if (rc == -EAGAIN) { - /* Nothing ready, so wait for activity */ - init_waitqueue_entry(&wait, current); - add_wait_queue(sk_sleep(sock->sk), &wait); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - remove_wait_queue(sk_sleep(sock->sk), &wait); - rc = sock->ops->accept(sock, newsock, O_NONBLOCK, false); - } - - if (rc) - goto failed; - - *newsockp = newsock; - return 0; - -failed: - sock_release(newsock); - return rc; -} - -int -lnet_sock_connect(struct socket **sockp, int *fatal, __u32 local_ip, - int local_port, __u32 peer_ip, int peer_port) -{ - struct sockaddr_in srvaddr; - int rc; - - rc = lnet_sock_create(sockp, fatal, local_ip, local_port); - if (rc) - return rc; - - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(peer_port); - srvaddr.sin_addr.s_addr = htonl(peer_ip); - - rc = kernel_connect(*sockp, (struct sockaddr *)&srvaddr, - sizeof(srvaddr), 0); - if (!rc) - return 0; - - /* - * EADDRNOTAVAIL probably means we're already connected to the same - * peer/port on the same local port on a differently typed - * connection. Let our caller retry with a different local - * port... - */ - *fatal = !(rc == -EADDRNOTAVAIL); - - CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, - "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc, - &local_ip, local_port, &peer_ip, peer_port); - - sock_release(*sockp); - return rc; -} diff --git a/drivers/staging/lustre/lnet/lnet/lo.c b/drivers/staging/lustre/lnet/lnet/lo.c deleted file mode 100644 index 7456b989e451be100314d5a9ec89912147a6f0e8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/lo.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include - -static int -lolnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg) -{ - LASSERT(!lntmsg->msg_routing); - LASSERT(!lntmsg->msg_target_is_router); - - return lnet_parse(ni, &lntmsg->msg_hdr, ni->ni_nid, lntmsg, 0); -} - -static int -lolnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg, - int delayed, struct iov_iter *to, unsigned int rlen) -{ - struct lnet_msg *sendmsg = private; - - if (lntmsg) { /* not discarding */ - if (sendmsg->msg_iov) - lnet_copy_iov2iter(to, - sendmsg->msg_niov, - sendmsg->msg_iov, - sendmsg->msg_offset, - iov_iter_count(to)); - else - lnet_copy_kiov2iter(to, - sendmsg->msg_niov, - sendmsg->msg_kiov, - sendmsg->msg_offset, - iov_iter_count(to)); - - lnet_finalize(ni, lntmsg, 0); - } - - lnet_finalize(ni, sendmsg, 0); - return 0; -} - -static int lolnd_instanced; - -static void -lolnd_shutdown(struct lnet_ni *ni) -{ - CDEBUG(D_NET, "shutdown\n"); - LASSERT(lolnd_instanced); - - lolnd_instanced = 0; -} - -static int -lolnd_startup(struct lnet_ni *ni) -{ - LASSERT(ni->ni_lnd == &the_lolnd); - LASSERT(!lolnd_instanced); - lolnd_instanced = 1; - - return 0; -} - -struct lnet_lnd the_lolnd = { - /* .lnd_list = */ {&the_lolnd.lnd_list, &the_lolnd.lnd_list}, - /* .lnd_refcount = */ 0, - /* .lnd_type = */ LOLND, - /* .lnd_startup = */ lolnd_startup, - /* .lnd_shutdown = */ lolnd_shutdown, - /* .lnt_ctl = */ NULL, - /* .lnd_send = */ lolnd_send, - /* .lnd_recv = */ lolnd_recv, - /* .lnd_eager_recv = */ NULL, - /* .lnd_notify = */ NULL, - /* .lnd_accept = */ NULL -}; diff --git a/drivers/staging/lustre/lnet/lnet/module.c b/drivers/staging/lustre/lnet/lnet/module.c deleted file mode 100644 index 9d06664f0c1736306e4790bf032d70807e7499da..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/module.c +++ /dev/null @@ -1,239 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include - -static int config_on_load; -module_param(config_on_load, int, 0444); -MODULE_PARM_DESC(config_on_load, "configure network at module load"); - -static struct mutex lnet_config_mutex; - -static int -lnet_configure(void *arg) -{ - /* 'arg' only there so I can be passed to cfs_create_thread() */ - int rc = 0; - - mutex_lock(&lnet_config_mutex); - - if (!the_lnet.ln_niinit_self) { - rc = try_module_get(THIS_MODULE); - - if (rc != 1) - goto out; - - rc = LNetNIInit(LNET_PID_LUSTRE); - if (rc >= 0) { - the_lnet.ln_niinit_self = 1; - rc = 0; - } else { - module_put(THIS_MODULE); - } - } - -out: - mutex_unlock(&lnet_config_mutex); - return rc; -} - -static int -lnet_unconfigure(void) -{ - int refcount; - - mutex_lock(&lnet_config_mutex); - - if (the_lnet.ln_niinit_self) { - the_lnet.ln_niinit_self = 0; - LNetNIFini(); - module_put(THIS_MODULE); - } - - mutex_lock(&the_lnet.ln_api_mutex); - refcount = the_lnet.ln_refcount; - mutex_unlock(&the_lnet.ln_api_mutex); - - mutex_unlock(&lnet_config_mutex); - return !refcount ? 0 : -EBUSY; -} - -static int -lnet_dyn_configure(struct libcfs_ioctl_hdr *hdr) -{ - struct lnet_ioctl_config_data *conf = - (struct lnet_ioctl_config_data *)hdr; - int rc; - - if (conf->cfg_hdr.ioc_len < sizeof(*conf)) - return -EINVAL; - - mutex_lock(&lnet_config_mutex); - if (!the_lnet.ln_niinit_self) { - rc = -EINVAL; - goto out_unlock; - } - rc = lnet_dyn_add_ni(LNET_PID_LUSTRE, conf); -out_unlock: - mutex_unlock(&lnet_config_mutex); - - return rc; -} - -static int -lnet_dyn_unconfigure(struct libcfs_ioctl_hdr *hdr) -{ - struct lnet_ioctl_config_data *conf = - (struct lnet_ioctl_config_data *)hdr; - int rc; - - if (conf->cfg_hdr.ioc_len < sizeof(*conf)) - return -EINVAL; - - mutex_lock(&lnet_config_mutex); - if (!the_lnet.ln_niinit_self) { - rc = -EINVAL; - goto out_unlock; - } - rc = lnet_dyn_del_ni(conf->cfg_net); -out_unlock: - mutex_unlock(&lnet_config_mutex); - - return rc; -} - -static int -lnet_ioctl(struct notifier_block *nb, - unsigned long cmd, void *vdata) -{ - int rc; - struct libcfs_ioctl_hdr *hdr = vdata; - - switch (cmd) { - case IOC_LIBCFS_CONFIGURE: { - struct libcfs_ioctl_data *data = - (struct libcfs_ioctl_data *)hdr; - - if (data->ioc_hdr.ioc_len < sizeof(*data)) { - rc = -EINVAL; - } else { - the_lnet.ln_nis_from_mod_params = data->ioc_flags; - rc = lnet_configure(NULL); - } - break; - } - - case IOC_LIBCFS_UNCONFIGURE: - rc = lnet_unconfigure(); - break; - - case IOC_LIBCFS_ADD_NET: - rc = lnet_dyn_configure(hdr); - break; - - case IOC_LIBCFS_DEL_NET: - rc = lnet_dyn_unconfigure(hdr); - break; - - default: - /* - * Passing LNET_PID_ANY only gives me a ref if the net is up - * already; I'll need it to ensure the net can't go down while - * I'm called into it - */ - rc = LNetNIInit(LNET_PID_ANY); - if (rc >= 0) { - rc = LNetCtl(cmd, hdr); - LNetNIFini(); - } - break; - } - return notifier_from_ioctl_errno(rc); -} - -static struct notifier_block lnet_ioctl_handler = { - .notifier_call = lnet_ioctl, -}; - -static int 
__init lnet_init(void) -{ - int rc; - - mutex_init(&lnet_config_mutex); - - rc = libcfs_setup(); - if (rc) - return rc; - - rc = lnet_lib_init(); - if (rc) { - CERROR("lnet_lib_init: error %d\n", rc); - return rc; - } - - rc = blocking_notifier_chain_register(&libcfs_ioctl_list, - &lnet_ioctl_handler); - LASSERT(!rc); - - if (config_on_load) { - /* - * Have to schedule a separate thread to avoid deadlocking - * in modload - */ - (void)kthread_run(lnet_configure, NULL, "lnet_initd"); - } - - return 0; -} - -static void __exit lnet_exit(void) -{ - int rc; - - rc = blocking_notifier_chain_unregister(&libcfs_ioctl_list, - &lnet_ioctl_handler); - LASSERT(!rc); - - lnet_lib_exit(); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Networking layer"); -MODULE_VERSION(LNET_VERSION); -MODULE_LICENSE("GPL"); - -module_init(lnet_init); -module_exit(lnet_exit); diff --git a/drivers/staging/lustre/lnet/lnet/net_fault.c b/drivers/staging/lustre/lnet/lnet/net_fault.c deleted file mode 100644 index 0066394b0bb0c36c810e864fdb6d70e80231aca6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/net_fault.c +++ /dev/null @@ -1,1023 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2014, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Seagate, Inc. 
- *
- * lnet/lnet/net_fault.c
- *
- * Lustre network fault simulation
- *
- * Author: liang.zhen@intel.com
- */
-
-#define DEBUG_SUBSYSTEM S_LNET
-
-#include
-#include
-
-#define LNET_MSG_MASK		(LNET_PUT_BIT | LNET_ACK_BIT | \
-				 LNET_GET_BIT | LNET_REPLY_BIT)
-
-struct lnet_drop_rule {
-	/** link chain on the_lnet.ln_drop_rules */
-	struct list_head dr_link;
-	/** attributes of this rule */
-	struct lnet_fault_attr dr_attr;
-	/** lock to protect \a dr_drop_at and \a dr_stat */
-	spinlock_t dr_lock;
-	/**
-	 * the message sequence to drop, which means the message is dropped
-	 * when dr_stat.fs_count == dr_drop_at
-	 */
-	unsigned long dr_drop_at;
-	/**
-	 * seconds to drop the next message, it's exclusive with dr_drop_at
-	 */
-	unsigned long dr_drop_time;
-	/** baseline to calculate dr_drop_time */
-	unsigned long dr_time_base;
-	/** statistic of dropped messages */
-	struct lnet_fault_stat dr_stat;
-};
-
-static bool
-lnet_fault_nid_match(lnet_nid_t nid, lnet_nid_t msg_nid)
-{
-	if (nid == msg_nid || nid == LNET_NID_ANY)
-		return true;
-
-	if (LNET_NIDNET(nid) != LNET_NIDNET(msg_nid))
-		return false;
-
-	/* 255.255.255.255@net is wildcard for all addresses in a network */
-	return LNET_NIDADDR(nid) == LNET_NIDADDR(LNET_NID_ANY);
-}
-
-static bool
-lnet_fault_attr_match(struct lnet_fault_attr *attr, lnet_nid_t src,
-		      lnet_nid_t dst, unsigned int type, unsigned int portal)
-{
-	if (!lnet_fault_nid_match(attr->fa_src, src) ||
-	    !lnet_fault_nid_match(attr->fa_dst, dst))
-		return false;
-
-	if (!(attr->fa_msg_mask & (1 << type)))
-		return false;
-
-	/**
-	 * NB: ACK and REPLY have no portal, but they should have been
-	 * rejected by the message mask
-	 */
-	if (attr->fa_ptl_mask && /* has portal filter */
-	    !(attr->fa_ptl_mask & (1ULL << portal)))
-		return false;
-
-	return true;
-}
-
-static int
-lnet_fault_attr_validate(struct lnet_fault_attr *attr)
-{
-	if (!attr->fa_msg_mask)
-		attr->fa_msg_mask = LNET_MSG_MASK; /* all message types */
-
-	if (!attr->fa_ptl_mask) /* no portal filter */
-		return 0;
-
-	/* NB: only PUT and GET can be filtered if a portal filter has been set */
-	attr->fa_msg_mask &= LNET_GET_BIT | LNET_PUT_BIT;
-	if (!attr->fa_msg_mask) {
-		CDEBUG(D_NET, "can't find valid message type bits %x\n",
-		       attr->fa_msg_mask);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static void
-lnet_fault_stat_inc(struct lnet_fault_stat *stat, unsigned int type)
-{
-	/* NB: fs_counter is NOT updated by this function */
-	switch (type) {
-	case LNET_MSG_PUT:
-		stat->fs_put++;
-		return;
-	case LNET_MSG_ACK:
-		stat->fs_ack++;
-		return;
-	case LNET_MSG_GET:
-		stat->fs_get++;
-		return;
-	case LNET_MSG_REPLY:
-		stat->fs_reply++;
-		return;
-	}
-}
-
-/**
- * LNet message drop simulation
- */
-
-/**
- * Add a new drop rule to LNet.
- * There is no check for duplicate drop rules; all rules will be checked for
- * each incoming message.
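- *
- * Illustrative usage (example added for clarity; the values are
- * hypothetical, not taken from the original code; per
- * lnet_fault_nid_match() above, LNET_NID_ANY wildcards a NID):
- *
- *	struct lnet_fault_attr attr = {
- *		.fa_src		= LNET_NID_ANY,	/* match any source NID */
- *		.fa_dst		= LNET_NID_ANY,	/* match any destination NID */
- *		.u.drop.da_rate	= 8,	/* drop ~1 of every 8 matching messages */
- *	};
- *
- *	rc = lnet_drop_rule_add(&attr);	/* 0 on success, -errno on failure */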
- */
-static int
-lnet_drop_rule_add(struct lnet_fault_attr *attr)
-{
-	struct lnet_drop_rule *rule;
-
-	if (attr->u.drop.da_rate && attr->u.drop.da_interval) {
-		CDEBUG(D_NET, "please provide either drop rate or drop interval, but not both at the same time %d/%d\n",
-		       attr->u.drop.da_rate, attr->u.drop.da_interval);
-		return -EINVAL;
-	}
-
-	if (lnet_fault_attr_validate(attr))
-		return -EINVAL;
-
-	rule = kzalloc(sizeof(*rule), GFP_NOFS);
-	if (!rule)
-		return -ENOMEM;
-
-	spin_lock_init(&rule->dr_lock);
-
-	rule->dr_attr = *attr;
-	if (attr->u.drop.da_interval) {
-		rule->dr_time_base = jiffies + attr->u.drop.da_interval * HZ;
-		rule->dr_drop_time = jiffies +
-			prandom_u32_max(attr->u.drop.da_interval) * HZ;
-	} else {
-		rule->dr_drop_at = prandom_u32_max(attr->u.drop.da_rate);
-	}
-
-	lnet_net_lock(LNET_LOCK_EX);
-	list_add(&rule->dr_link, &the_lnet.ln_drop_rules);
-	lnet_net_unlock(LNET_LOCK_EX);
-
-	CDEBUG(D_NET, "Added drop rule: src %s, dst %s, rate %d, interval %d\n",
-	       libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst),
-	       attr->u.drop.da_rate, attr->u.drop.da_interval);
-	return 0;
-}
-
-/**
- * Remove matched drop rules from lnet; all rules that can match \a src and
- * \a dst will be removed.
- * If \a src is zero, then all rules that have \a dst as destination will be removed.
- * If \a dst is zero, then all rules that have \a src as source will be removed.
- * If both of them are zero, all rules will be removed.
- */
-static int
-lnet_drop_rule_del(lnet_nid_t src, lnet_nid_t dst)
-{
-	struct lnet_drop_rule *rule;
-	struct lnet_drop_rule *tmp;
-	struct list_head zombies;
-	int n = 0;
-
-	INIT_LIST_HEAD(&zombies);
-
-	lnet_net_lock(LNET_LOCK_EX);
-	list_for_each_entry_safe(rule, tmp, &the_lnet.ln_drop_rules, dr_link) {
-		if (rule->dr_attr.fa_src != src && src)
-			continue;
-
-		if (rule->dr_attr.fa_dst != dst && dst)
-			continue;
-
-		list_move(&rule->dr_link, &zombies);
-	}
-	lnet_net_unlock(LNET_LOCK_EX);
-
-	list_for_each_entry_safe(rule, tmp, &zombies, dr_link) {
-		CDEBUG(D_NET, "Remove drop rule: src %s->dst: %s (1/%d, %d)\n",
-		       libcfs_nid2str(rule->dr_attr.fa_src),
-		       libcfs_nid2str(rule->dr_attr.fa_dst),
-		       rule->dr_attr.u.drop.da_rate,
-		       rule->dr_attr.u.drop.da_interval);
-
-		list_del(&rule->dr_link);
-		kfree(rule);
-		n++;
-	}
-
-	return n;
-}
-
-/**
- * List the drop rule at position \a pos
- */
-static int
-lnet_drop_rule_list(int pos, struct lnet_fault_attr *attr,
-		    struct lnet_fault_stat *stat)
-{
-	struct lnet_drop_rule *rule;
-	int cpt;
-	int i = 0;
-	int rc = -ENOENT;
-
-	cpt = lnet_net_lock_current();
-	list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
-		if (i++ < pos)
-			continue;
-
-		spin_lock(&rule->dr_lock);
-		*attr = rule->dr_attr;
-		*stat = rule->dr_stat;
-		spin_unlock(&rule->dr_lock);
-		rc = 0;
-		break;
-	}
-
-	lnet_net_unlock(cpt);
-	return rc;
-}
-
-/**
- * reset counters for all drop rules
- */
-static void
-lnet_drop_rule_reset(void)
-{
-	struct lnet_drop_rule *rule;
-	int cpt;
-
-	cpt = lnet_net_lock_current();
-
-	list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
-		struct lnet_fault_attr *attr = &rule->dr_attr;
-
-		spin_lock(&rule->dr_lock);
-
-		memset(&rule->dr_stat, 0, sizeof(rule->dr_stat));
-		if (attr->u.drop.da_rate) {
-			rule->dr_drop_at = prandom_u32_max(attr->u.drop.da_rate);
-		} else {
-			rule->dr_drop_time = jiffies +
-				prandom_u32_max(attr->u.drop.da_interval) * HZ;
-			rule->dr_time_base = jiffies + attr->u.drop.da_interval * HZ;
-		}
-		spin_unlock(&rule->dr_lock);
-	}
-
-	lnet_net_unlock(cpt);
-}
-
-/**
- * check source/destination NID, portal, message type and drop rate;
- * decide whether this message should be dropped or not
- */
-static bool
-drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src,
-		lnet_nid_t dst, unsigned int type, unsigned int portal)
-{
-	struct lnet_fault_attr *attr = &rule->dr_attr;
-	bool drop;
-
-	if (!lnet_fault_attr_match(attr, src, dst, type, portal))
-		return false;
-
-	/* match this rule, check drop rate now */
-	spin_lock(&rule->dr_lock);
-	if (rule->dr_drop_time) { /* time based drop */
-		unsigned long now = jiffies;
-
-		rule->dr_stat.fs_count++;
-		drop = time_after_eq(now, rule->dr_drop_time);
-		if (drop) {
-			if (time_after(now, rule->dr_time_base))
-				rule->dr_time_base = now;
-
-			rule->dr_drop_time = rule->dr_time_base +
-				prandom_u32_max(attr->u.drop.da_interval) * HZ;
-			rule->dr_time_base += attr->u.drop.da_interval * HZ;
-
-			CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n",
-			       libcfs_nid2str(attr->fa_src),
-			       libcfs_nid2str(attr->fa_dst),
-			       rule->dr_drop_time);
-		}
-
-	} else { /* rate based drop */
-		drop = rule->dr_stat.fs_count++ == rule->dr_drop_at;
-
-		if (!do_div(rule->dr_stat.fs_count, attr->u.drop.da_rate)) {
-			rule->dr_drop_at = rule->dr_stat.fs_count +
-				prandom_u32_max(attr->u.drop.da_rate);
-			CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n",
-			       libcfs_nid2str(attr->fa_src),
-			       libcfs_nid2str(attr->fa_dst), rule->dr_drop_at);
-		}
-	}
-
-	if (drop) { /* drop this message, update counters */
-		lnet_fault_stat_inc(&rule->dr_stat, type);
-		rule->dr_stat.u.drop.ds_dropped++;
-	}
-
-	spin_unlock(&rule->dr_lock);
-	return drop;
-}
-
-/**
- * Check if a message from \a src to \a dst can match any existing drop rule
- */
-bool
-lnet_drop_rule_match(struct lnet_hdr *hdr)
-{
-	struct lnet_drop_rule *rule;
-	lnet_nid_t src = le64_to_cpu(hdr->src_nid);
-	lnet_nid_t dst = le64_to_cpu(hdr->dest_nid);
-	unsigned int typ = le32_to_cpu(hdr->type);
-	unsigned int ptl = -1;
-	bool drop = false;
-	int cpt;
-
-	/**
-	 * NB: if a portal is specified, then only PUT and GET will be
-	 * filtered by drop rules
-	 */
-	if (typ == LNET_MSG_PUT)
-		ptl = le32_to_cpu(hdr->msg.put.ptl_index);
-	else if (typ == LNET_MSG_GET)
-		ptl = le32_to_cpu(hdr->msg.get.ptl_index);
-
-	cpt = lnet_net_lock_current();
-	list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
-		drop = drop_rule_match(rule, src, dst, typ, ptl);
-		if (drop)
-			break;
-	}
-
-	lnet_net_unlock(cpt);
-	return drop;
-}
-
-/**
- * LNet Delay Simulation
- */
-/** timestamp (seconds) to send the delayed message */
-#define msg_delay_send msg_ev.hdr_data
-
-struct lnet_delay_rule {
-	/** link chain on the_lnet.ln_delay_rules */
-	struct list_head dl_link;
-	/** link chain on delay_dd.dd_sched_rules */
-	struct list_head dl_sched_link;
-	/** attributes of this rule */
-	struct lnet_fault_attr dl_attr;
-	/** lock to protect the members below */
-	spinlock_t dl_lock;
-	/** refcount of delay rule */
-	atomic_t dl_refcount;
-	/**
-	 * the message sequence to delay, which means the message is delayed
-	 * when dl_stat.fs_count == dl_delay_at
-	 */
-	unsigned long dl_delay_at;
-	/**
-	 * seconds to delay the next message, it's exclusive with dl_delay_at
-	 */
-	unsigned long dl_delay_time;
-	/** baseline to calculate dl_delay_time */
-	unsigned long dl_time_base;
-	/** jiffies to send the next delayed message */
-	unsigned long dl_msg_send;
-	/** delayed message list */
-	struct list_head dl_msg_list;
-	/** statistic of delayed messages */
-	struct lnet_fault_stat dl_stat;
-	/** timer to wakeup delay_daemon */
-	struct timer_list dl_timer;
-};
-
-struct delay_daemon_data {
-	/** serialise rule add/remove */
-	struct mutex dd_mutex;
-	/** protect rules on \a dd_sched_rules */
-	spinlock_t dd_lock;
-	/** scheduled delay rules (by timer) */
-	struct list_head dd_sched_rules;
-	/** daemon thread sleeps here */
-	wait_queue_head_t dd_waitq;
-	/** controller (lctl command) waits here */
-	wait_queue_head_t dd_ctl_waitq;
-	/** daemon is running */
-	unsigned int dd_running;
-	/** daemon stopped */
-	unsigned int dd_stopped;
-};
-
-static struct delay_daemon_data delay_dd;
-
-static unsigned long
-round_timeout(unsigned long timeout)
-{
-	return (unsigned int)rounddown(timeout, HZ) + HZ;
-}
-
-static void
-delay_rule_decref(struct lnet_delay_rule *rule)
-{
-	if (atomic_dec_and_test(&rule->dl_refcount)) {
-		LASSERT(list_empty(&rule->dl_sched_link));
-		LASSERT(list_empty(&rule->dl_msg_list));
-		LASSERT(list_empty(&rule->dl_link));
-
-		kfree(rule);
-	}
-}
-
-/**
- * check source/destination NID, portal, message type and delay rate;
- * decide whether this message should be delayed or not
- */
-static bool
-delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src,
-		 lnet_nid_t dst, unsigned int type, unsigned int portal,
-		 struct lnet_msg *msg)
-{
-	struct lnet_fault_attr *attr = &rule->dl_attr;
-	bool delay;
-
-	if (!lnet_fault_attr_match(attr, src, dst, type, portal))
-		return false;
-
-	/* match this rule, check delay rate now */
-	spin_lock(&rule->dl_lock);
-	if (rule->dl_delay_time) { /* time based delay */
-		unsigned long now = jiffies;
-
-		rule->dl_stat.fs_count++;
-		delay = time_after_eq(now, rule->dl_delay_time);
-		if (delay) {
-			if (time_after(now, rule->dl_time_base))
-				rule->dl_time_base = now;
-
-			rule->dl_delay_time = rule->dl_time_base +
-				prandom_u32_max(attr->u.delay.la_interval) * HZ;
-			rule->dl_time_base += attr->u.delay.la_interval * HZ;
-
-			CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n",
-			       libcfs_nid2str(attr->fa_src),
-			       libcfs_nid2str(attr->fa_dst),
-			       rule->dl_delay_time);
-		}
-
-	} else { /* rate based delay */
-		delay = rule->dl_stat.fs_count++ == rule->dl_delay_at;
-		/* generate the next random rate sequence */
-		if (!do_div(rule->dl_stat.fs_count, attr->u.delay.la_rate)) {
-			rule->dl_delay_at = rule->dl_stat.fs_count +
-				prandom_u32_max(attr->u.delay.la_rate);
-			CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n",
-			       libcfs_nid2str(attr->fa_src),
-			       libcfs_nid2str(attr->fa_dst), rule->dl_delay_at);
-		}
-	}
-
-	if (!delay) {
-		spin_unlock(&rule->dl_lock);
-		return false;
-	}
-
-	/* delay this message, update counters */
-	lnet_fault_stat_inc(&rule->dl_stat, type);
-	rule->dl_stat.u.delay.ls_delayed++;
-
-	list_add_tail(&msg->msg_list, &rule->dl_msg_list);
-	msg->msg_delay_send = round_timeout(
-			jiffies + attr->u.delay.la_latency * HZ);
-	if (rule->dl_msg_send == -1) {
-		rule->dl_msg_send = msg->msg_delay_send;
-		mod_timer(&rule->dl_timer, rule->dl_msg_send);
-	}
-
-	spin_unlock(&rule->dl_lock);
-	return true;
-}
-
-/**
- * check if \a msg can match any delay rule; receiving of this message
- * will be delayed if there is a match.
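- *
- * Illustrative semantics (example added for clarity; the values are
- * hypothetical): a matching rule created with
- *
- *	.u.delay.la_rate    = 10,	/* delay ~1 of every 10 messages */
- *	.u.delay.la_latency = 5,	/* hold each delayed message ~5s */
- *
- * queues roughly one in ten matching messages on its dl_msg_list, and the
- * delay daemon delivers each of them about five seconds later, when
- * dl_timer fires.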
- */ -bool -lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg) -{ - struct lnet_delay_rule *rule; - lnet_nid_t src = le64_to_cpu(hdr->src_nid); - lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); - unsigned int typ = le32_to_cpu(hdr->type); - unsigned int ptl = -1; - - /* NB: called with hold of lnet_net_lock */ - - /** - * NB: if Portal is specified, then only PUT and GET will be - * filtered by delay rule - */ - if (typ == LNET_MSG_PUT) - ptl = le32_to_cpu(hdr->msg.put.ptl_index); - else if (typ == LNET_MSG_GET) - ptl = le32_to_cpu(hdr->msg.get.ptl_index); - - list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { - if (delay_rule_match(rule, src, dst, typ, ptl, msg)) - return true; - } - - return false; -} - -/** check out delayed messages for send */ -static void -delayed_msg_check(struct lnet_delay_rule *rule, bool all, - struct list_head *msg_list) -{ - struct lnet_msg *msg; - struct lnet_msg *tmp; - unsigned long now = jiffies; - - if (!all && rule->dl_msg_send > now) - return; - - spin_lock(&rule->dl_lock); - list_for_each_entry_safe(msg, tmp, &rule->dl_msg_list, msg_list) { - if (!all && msg->msg_delay_send > now) - break; - - msg->msg_delay_send = 0; - list_move_tail(&msg->msg_list, msg_list); - } - - if (list_empty(&rule->dl_msg_list)) { - del_timer(&rule->dl_timer); - rule->dl_msg_send = -1; - - } else if (!list_empty(msg_list)) { - /* - * dequeued some timedout messages, update timer for the - * next delayed message on rule - */ - msg = list_entry(rule->dl_msg_list.next, - struct lnet_msg, msg_list); - rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); - } - spin_unlock(&rule->dl_lock); -} - -static void -delayed_msg_process(struct list_head *msg_list, bool drop) -{ - struct lnet_msg *msg; - - while (!list_empty(msg_list)) { - struct lnet_ni *ni; - int cpt; - int rc; - - msg = list_entry(msg_list->next, struct lnet_msg, msg_list); - LASSERT(msg->msg_rxpeer); - - ni = msg->msg_rxpeer->lp_ni; - cpt = msg->msg_rx_cpt; - - list_del_init(&msg->msg_list); - if (drop) { - rc = -ECANCELED; - - } else if (!msg->msg_routing) { - rc = lnet_parse_local(ni, msg); - if (!rc) - continue; - - } else { - lnet_net_lock(cpt); - rc = lnet_parse_forward_locked(ni, msg); - lnet_net_unlock(cpt); - - switch (rc) { - case LNET_CREDIT_OK: - lnet_ni_recv(ni, msg->msg_private, msg, 0, - 0, msg->msg_len, msg->msg_len); - /* fall through */ - case LNET_CREDIT_WAIT: - continue; - default: /* failures */ - break; - } - } - - lnet_drop_message(ni, cpt, msg->msg_private, msg->msg_len); - lnet_finalize(ni, msg, rc); - } -} - -/** - * Process delayed messages for scheduled rules - * This function can either be called by delay_rule_daemon, or by lnet_finalise - */ -void -lnet_delay_rule_check(void) -{ - struct lnet_delay_rule *rule; - struct list_head msgs; - - INIT_LIST_HEAD(&msgs); - while (1) { - if (list_empty(&delay_dd.dd_sched_rules)) - break; - - spin_lock_bh(&delay_dd.dd_lock); - if (list_empty(&delay_dd.dd_sched_rules)) { - spin_unlock_bh(&delay_dd.dd_lock); - break; - } - - rule = list_entry(delay_dd.dd_sched_rules.next, - struct lnet_delay_rule, dl_sched_link); - list_del_init(&rule->dl_sched_link); - spin_unlock_bh(&delay_dd.dd_lock); - - delayed_msg_check(rule, false, &msgs); - delay_rule_decref(rule); /* -1 for delay_dd.dd_sched_rules */ - } - - if (!list_empty(&msgs)) - delayed_msg_process(&msgs, false); -} - -/** daemon thread to handle delayed messages */ -static int -lnet_delay_rule_daemon(void *arg) -{ - delay_dd.dd_running = 1; - 
wake_up(&delay_dd.dd_ctl_waitq);
-
-	while (delay_dd.dd_running) {
-		wait_event_interruptible(delay_dd.dd_waitq,
-					 !delay_dd.dd_running ||
-					 !list_empty(&delay_dd.dd_sched_rules));
-		lnet_delay_rule_check();
-	}
-
-	/* in case more rules have been enqueued after my last check */
-	lnet_delay_rule_check();
-	delay_dd.dd_stopped = 1;
-	wake_up(&delay_dd.dd_ctl_waitq);
-
-	return 0;
-}
-
-static void
-delay_timer_cb(struct timer_list *t)
-{
-	struct lnet_delay_rule *rule = from_timer(rule, t, dl_timer);
-
-	spin_lock_bh(&delay_dd.dd_lock);
-	if (list_empty(&rule->dl_sched_link) && delay_dd.dd_running) {
-		atomic_inc(&rule->dl_refcount);
-		list_add_tail(&rule->dl_sched_link, &delay_dd.dd_sched_rules);
-		wake_up(&delay_dd.dd_waitq);
-	}
-	spin_unlock_bh(&delay_dd.dd_lock);
-}
-
-/**
- * Add a new delay rule to LNet.
- * There is no check for duplicate delay rules; all rules will be checked for
- * each incoming message.
- */
-int
-lnet_delay_rule_add(struct lnet_fault_attr *attr)
-{
-	struct lnet_delay_rule *rule;
-	int rc = 0;
-
-	if (attr->u.delay.la_rate && attr->u.delay.la_interval) {
-		CDEBUG(D_NET, "please provide either delay rate or delay interval, but not both at the same time %d/%d\n",
-		       attr->u.delay.la_rate, attr->u.delay.la_interval);
-		return -EINVAL;
-	}
-
-	if (!attr->u.delay.la_latency) {
-		CDEBUG(D_NET, "delay latency cannot be zero\n");
-		return -EINVAL;
-	}
-
-	if (lnet_fault_attr_validate(attr))
-		return -EINVAL;
-
-	rule = kzalloc(sizeof(*rule), GFP_NOFS);
-	if (!rule)
-		return -ENOMEM;
-
-	mutex_lock(&delay_dd.dd_mutex);
-	if (!delay_dd.dd_running) {
-		struct task_struct *task;
-
-		/**
-		 * NB: although LND threads will process delayed messages
-		 * in lnet_finalize, there is no guarantee that LND
-		 * threads will be woken up if no other message needs to
-		 * be handled.
-		 * Only one daemon thread; performance is not the concern
-		 * of this simulation module.
-		 */
-		task = kthread_run(lnet_delay_rule_daemon, NULL, "lnet_dd");
-		if (IS_ERR(task)) {
-			rc = PTR_ERR(task);
-			goto failed;
-		}
-		wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_running);
-	}
-
-	timer_setup(&rule->dl_timer, delay_timer_cb, 0);
-
-	spin_lock_init(&rule->dl_lock);
-	INIT_LIST_HEAD(&rule->dl_msg_list);
-	INIT_LIST_HEAD(&rule->dl_sched_link);
-
-	rule->dl_attr = *attr;
-	if (attr->u.delay.la_interval) {
-		rule->dl_time_base = jiffies + attr->u.delay.la_interval * HZ;
-		rule->dl_delay_time = jiffies +
-			prandom_u32_max(attr->u.delay.la_interval) * HZ;
-	} else {
-		rule->dl_delay_at = prandom_u32_max(attr->u.delay.la_rate);
-	}
-
-	rule->dl_msg_send = -1;
-
-	lnet_net_lock(LNET_LOCK_EX);
-	atomic_set(&rule->dl_refcount, 1);
-	list_add(&rule->dl_link, &the_lnet.ln_delay_rules);
-	lnet_net_unlock(LNET_LOCK_EX);
-
-	CDEBUG(D_NET, "Added delay rule: src %s, dst %s, rate %d\n",
-	       libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst),
-	       attr->u.delay.la_rate);
-
-	mutex_unlock(&delay_dd.dd_mutex);
-	return 0;
-failed:
-	mutex_unlock(&delay_dd.dd_mutex);
-	kfree(rule);
-	return rc;
-}
-
-/**
- * Remove matched delay rules from lnet. If \a shutdown is true or both \a src
- * and \a dst are zero, all rules will be removed; otherwise only matched rules
- * will be removed.
- * If \a src is zero, then all rules that have \a dst as destination will be removed.
- * If \a dst is zero, then all rules that have \a src as source will be removed.
- *
- * When a delay rule is removed, all delayed messages of this rule will be
- * processed immediately.
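- *
- * For example (added for clarity): lnet_delay_rule_del(0, 0, false)
- * removes every delay rule, while lnet_delay_rule_del(src, 0, false)
- * removes only the rules whose source NID equals \a src.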
- */ -int -lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown) -{ - struct lnet_delay_rule *rule; - struct lnet_delay_rule *tmp; - struct list_head rule_list; - struct list_head msg_list; - int n = 0; - bool cleanup; - - INIT_LIST_HEAD(&rule_list); - INIT_LIST_HEAD(&msg_list); - - if (shutdown) { - src = 0; - dst = 0; - } - - mutex_lock(&delay_dd.dd_mutex); - lnet_net_lock(LNET_LOCK_EX); - - list_for_each_entry_safe(rule, tmp, &the_lnet.ln_delay_rules, dl_link) { - if (rule->dl_attr.fa_src != src && src) - continue; - - if (rule->dl_attr.fa_dst != dst && dst) - continue; - - CDEBUG(D_NET, "Remove delay rule: src %s->dst: %s (1/%d, %d)\n", - libcfs_nid2str(rule->dl_attr.fa_src), - libcfs_nid2str(rule->dl_attr.fa_dst), - rule->dl_attr.u.delay.la_rate, - rule->dl_attr.u.delay.la_interval); - /* refcount is taken over by rule_list */ - list_move(&rule->dl_link, &rule_list); - } - - /* check if we need to shutdown delay_daemon */ - cleanup = list_empty(&the_lnet.ln_delay_rules) && - !list_empty(&rule_list); - lnet_net_unlock(LNET_LOCK_EX); - - list_for_each_entry_safe(rule, tmp, &rule_list, dl_link) { - list_del_init(&rule->dl_link); - - del_timer_sync(&rule->dl_timer); - delayed_msg_check(rule, true, &msg_list); - delay_rule_decref(rule); /* -1 for the_lnet.ln_delay_rules */ - n++; - } - - if (cleanup) { /* no more delay rule, shutdown delay_daemon */ - LASSERT(delay_dd.dd_running); - delay_dd.dd_running = 0; - wake_up(&delay_dd.dd_waitq); - - while (!delay_dd.dd_stopped) - wait_event(delay_dd.dd_ctl_waitq, delay_dd.dd_stopped); - } - mutex_unlock(&delay_dd.dd_mutex); - - if (!list_empty(&msg_list)) - delayed_msg_process(&msg_list, shutdown); - - return n; -} - -/** - * List Delay Rule at position of \a pos - */ -int -lnet_delay_rule_list(int pos, struct lnet_fault_attr *attr, - struct lnet_fault_stat *stat) -{ - struct lnet_delay_rule *rule; - int cpt; - int i = 0; - int rc = -ENOENT; - - cpt = lnet_net_lock_current(); - list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { - if (i++ < pos) - continue; - - spin_lock(&rule->dl_lock); - *attr = rule->dl_attr; - *stat = rule->dl_stat; - spin_unlock(&rule->dl_lock); - rc = 0; - break; - } - - lnet_net_unlock(cpt); - return rc; -} - -/** - * reset counters for all Delay Rules - */ -void -lnet_delay_rule_reset(void) -{ - struct lnet_delay_rule *rule; - int cpt; - - cpt = lnet_net_lock_current(); - - list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) { - struct lnet_fault_attr *attr = &rule->dl_attr; - - spin_lock(&rule->dl_lock); - - memset(&rule->dl_stat, 0, sizeof(rule->dl_stat)); - if (attr->u.delay.la_rate) { - rule->dl_delay_at = prandom_u32_max(attr->u.delay.la_rate); - } else { - rule->dl_delay_time = - jiffies + prandom_u32_max( - attr->u.delay.la_interval) * HZ; - rule->dl_time_base = jiffies + attr->u.delay.la_interval * HZ; - } - spin_unlock(&rule->dl_lock); - } - - lnet_net_unlock(cpt); -} - -int -lnet_fault_ctl(int opc, struct libcfs_ioctl_data *data) -{ - struct lnet_fault_attr *attr; - struct lnet_fault_stat *stat; - - attr = (struct lnet_fault_attr *)data->ioc_inlbuf1; - - switch (opc) { - default: - return -EINVAL; - - case LNET_CTL_DROP_ADD: - if (!attr) - return -EINVAL; - - return lnet_drop_rule_add(attr); - - case LNET_CTL_DROP_DEL: - if (!attr) - return -EINVAL; - - data->ioc_count = lnet_drop_rule_del(attr->fa_src, - attr->fa_dst); - return 0; - - case LNET_CTL_DROP_RESET: - lnet_drop_rule_reset(); - return 0; - - case LNET_CTL_DROP_LIST: - stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; 
- if (!attr || !stat) - return -EINVAL; - - return lnet_drop_rule_list(data->ioc_count, attr, stat); - - case LNET_CTL_DELAY_ADD: - if (!attr) - return -EINVAL; - - return lnet_delay_rule_add(attr); - - case LNET_CTL_DELAY_DEL: - if (!attr) - return -EINVAL; - - data->ioc_count = lnet_delay_rule_del(attr->fa_src, - attr->fa_dst, false); - return 0; - - case LNET_CTL_DELAY_RESET: - lnet_delay_rule_reset(); - return 0; - - case LNET_CTL_DELAY_LIST: - stat = (struct lnet_fault_stat *)data->ioc_inlbuf2; - if (!attr || !stat) - return -EINVAL; - - return lnet_delay_rule_list(data->ioc_count, attr, stat); - } -} - -int -lnet_fault_init(void) -{ - BUILD_BUG_ON(LNET_PUT_BIT != 1 << LNET_MSG_PUT); - BUILD_BUG_ON(LNET_ACK_BIT != 1 << LNET_MSG_ACK); - BUILD_BUG_ON(LNET_GET_BIT != 1 << LNET_MSG_GET); - BUILD_BUG_ON(LNET_REPLY_BIT != 1 << LNET_MSG_REPLY); - - mutex_init(&delay_dd.dd_mutex); - spin_lock_init(&delay_dd.dd_lock); - init_waitqueue_head(&delay_dd.dd_waitq); - init_waitqueue_head(&delay_dd.dd_ctl_waitq); - INIT_LIST_HEAD(&delay_dd.dd_sched_rules); - - return 0; -} - -void -lnet_fault_fini(void) -{ - lnet_drop_rule_del(0, 0); - lnet_delay_rule_del(0, 0, true); - - LASSERT(list_empty(&the_lnet.ln_drop_rules)); - LASSERT(list_empty(&the_lnet.ln_delay_rules)); - LASSERT(list_empty(&delay_dd.dd_sched_rules)); -} diff --git a/drivers/staging/lustre/lnet/lnet/nidstrings.c b/drivers/staging/lustre/lnet/lnet/nidstrings.c deleted file mode 100644 index 0f6c3fa16c65d4a7baa3cf3149a913033ce3d9a8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/nidstrings.c +++ /dev/null @@ -1,1261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/lnet/nidstrings.c - * - * Author: Phil Schwan - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include -#include -#include -#include - -/* max value for numeric network address */ -#define MAX_NUMERIC_VALUE 0xffffffff - -#define IPSTRING_LENGTH 16 - -/* CAVEAT VENDITOR! Keep the canonical string representation of nets/nids - * consistent in all conversion functions. Some code fragments are copied - * around for the sake of clarity... - */ - -/* CAVEAT EMPTOR! Racey temporary buffer allocation! - * Choose the number of nidstrings to support the MAXIMUM expected number of - * concurrent users. If there are more, the returned string will be volatile. 
- * NB this number must allow for a process to be descheduled for a timeslice
- * between getting its string and using it.
- */
-
-static char libcfs_nidstrings[LNET_NIDSTR_COUNT][LNET_NIDSTR_SIZE];
-static int libcfs_nidstring_idx;
-
-static DEFINE_SPINLOCK(libcfs_nidstring_lock);
-
-static struct netstrfns *libcfs_namenum2netstrfns(const char *name);
-
-char *
-libcfs_next_nidstring(void)
-{
-	char *str;
-	unsigned long flags;
-
-	spin_lock_irqsave(&libcfs_nidstring_lock, flags);
-
-	str = libcfs_nidstrings[libcfs_nidstring_idx++];
-	if (libcfs_nidstring_idx == ARRAY_SIZE(libcfs_nidstrings))
-		libcfs_nidstring_idx = 0;
-
-	spin_unlock_irqrestore(&libcfs_nidstring_lock, flags);
-	return str;
-}
-EXPORT_SYMBOL(libcfs_next_nidstring);
-
-/**
- * Nid range list syntax.
- * \verbatim
- *
- * <nidlist>       :== <nidrange> [ ' ' <nidrange> ]
- * <nidrange>      :== <addrrange> '@' <net>
- * <addrrange>     :== '*' |
- *                     <ipaddr_range> |
- *                     <cfs_expr_list>
- * <ipaddr_range>  :== <cfs_expr_list>.<cfs_expr_list>.<cfs_expr_list>.
- *                     <cfs_expr_list>
- * <cfs_expr_list> :== <number> |
- *                     <expr_list>
- * <expr_list>     :== '[' <range_expr> [ ',' <range_expr>] ']'
- * <range_expr>    :== <number> |
- *                     <number> '-' <number> |
- *                     <number> '-' <number> '/' <number>
- * <net>           :== <netname> | <netname><number>
- * <netname>       :== "lo" | "tcp" | "o2ib" | "cib" | "openib" | "iib" |
- *                     "vib" | "ra" | "elan" | "mx" | "ptl"
- * \endverbatim
- */
-
-/**
- * Structure to represent \<nidrange\> token of the syntax.
- *
- * One of these is created for each \<net\> parsed.
- */
-struct nidrange {
-	/**
-	 * Link to the list of these structures which is built on nid range
-	 * list parsing.
-	 */
-	struct list_head nr_link;
-	/**
-	 * List head for addrrange::ar_link.
-	 */
-	struct list_head nr_addrranges;
-	/**
-	 * Flag indicating that *@<net> is found.
-	 */
-	int nr_all;
-	/**
-	 * Pointer to corresponding element of libcfs_netstrfns.
-	 */
-	struct netstrfns *nr_netstrfns;
-	/**
-	 * Number of network. E.g. 5 if \<net\> is "elan5".
-	 */
-	int nr_netnum;
-};
-
-/**
- * Structure to represent \<addrrange\> token of the syntax.
- */
-struct addrrange {
-	/**
-	 * Link to nidrange::nr_addrranges.
-	 */
-	struct list_head ar_link;
-	/**
-	 * List head for cfs_expr_list::el_list.
-	 */
-	struct list_head ar_numaddr_ranges;
-};
-
-/**
- * Parses \<addrrange\> token of the syntax.
- *
- * Allocates struct addrrange and links it to \a nidrange via
- * (nidrange::nr_addrranges)
- *
- * \retval 0 if \a src parses to '*' | \<ipaddr_range\> | \<cfs_expr_list\>
- * \retval -errno otherwise
- */
-static int
-parse_addrange(const struct cfs_lstr *src, struct nidrange *nidrange)
-{
-	struct addrrange *addrrange;
-
-	if (src->ls_len == 1 && src->ls_str[0] == '*') {
-		nidrange->nr_all = 1;
-		return 0;
-	}
-
-	addrrange = kzalloc(sizeof(struct addrrange), GFP_NOFS);
-	if (!addrrange)
-		return -ENOMEM;
-	list_add_tail(&addrrange->ar_link, &nidrange->nr_addrranges);
-	INIT_LIST_HEAD(&addrrange->ar_numaddr_ranges);
-
-	return nidrange->nr_netstrfns->nf_parse_addrlist(src->ls_str,
-						src->ls_len,
-						&addrrange->ar_numaddr_ranges);
-}
-
-/**
- * Finds or creates struct nidrange.
- *
- * Checks if \a src is a valid network name, looks for a corresponding
- * nidrange on the list of nidranges (\a nidlist), and creates a new struct
- * nidrange if it is not found.
- *
- * \retval pointer to struct nidrange matching network specified via \a src
- * \retval NULL if \a src does not match any network
- */
-static struct nidrange *
-add_nidrange(const struct cfs_lstr *src,
-	     struct list_head *nidlist)
-{
-	struct netstrfns *nf;
-	struct nidrange *nr;
-	int endlen;
-	unsigned int netnum;
-
-	if (src->ls_len >= LNET_NIDSTR_SIZE)
-		return NULL;
-
-	nf = libcfs_namenum2netstrfns(src->ls_str);
-	if (!nf)
-		return NULL;
-	endlen = src->ls_len - strlen(nf->nf_name);
-	if (!endlen)
-		/* network name only, e.g. "elan" or "tcp" */
-		netnum = 0;
-	else {
-		/*
-		 * e.g. "elan25" or "tcp23"; refuse to parse if the
-		 * network name is not followed by a decimal or
-		 * hexadecimal number
-		 */
-		if (!cfs_str2num_check(src->ls_str + strlen(nf->nf_name),
-				       endlen, &netnum, 0, MAX_NUMERIC_VALUE))
-			return NULL;
-	}
-
-	list_for_each_entry(nr, nidlist, nr_link) {
-		if (nr->nr_netstrfns != nf)
-			continue;
-		if (nr->nr_netnum != netnum)
-			continue;
-		return nr;
-	}
-
-	nr = kzalloc(sizeof(struct nidrange), GFP_NOFS);
-	if (!nr)
-		return NULL;
-	list_add_tail(&nr->nr_link, nidlist);
-	INIT_LIST_HEAD(&nr->nr_addrranges);
-	nr->nr_netstrfns = nf;
-	nr->nr_all = 0;
-	nr->nr_netnum = netnum;
-
-	return nr;
-}
-
-/**
- * Parses \<nidrange\> token of the syntax.
- *
- * \retval 1 if \a src parses to \<addrrange\> '@' \<net\>
- * \retval 0 otherwise
- */
-static int
-parse_nidrange(struct cfs_lstr *src, struct list_head *nidlist)
-{
-	struct cfs_lstr addrrange;
-	struct cfs_lstr net;
-	struct nidrange *nr;
-
-	if (!cfs_gettok(src, '@', &addrrange))
-		goto failed;
-
-	if (!cfs_gettok(src, '@', &net) || src->ls_str)
-		goto failed;
-
-	nr = add_nidrange(&net, nidlist);
-	if (!nr)
-		goto failed;
-
-	if (parse_addrange(&addrrange, nr))
-		goto failed;
-
-	return 1;
-failed:
-	return 0;
-}
-
-/**
- * Frees addrrange structures of \a list.
- *
- * For each struct addrrange structure found on \a list it frees
- * the cfs_expr_list list attached to it and frees the addrrange itself.
- *
- * \retval none
- */
-static void
-free_addrranges(struct list_head *list)
-{
-	while (!list_empty(list)) {
-		struct addrrange *ar;
-
-		ar = list_entry(list->next, struct addrrange, ar_link);
-
-		cfs_expr_list_free_list(&ar->ar_numaddr_ranges);
-		list_del(&ar->ar_link);
-		kfree(ar);
-	}
-}
-
-/**
- * Frees nidrange structures of \a list.
- *
- * For each struct nidrange structure found on \a list it frees
- * the addrrange list attached to it and frees the nidrange itself.
- *
- * \retval none
- */
-void
-cfs_free_nidlist(struct list_head *list)
-{
-	struct list_head *pos, *next;
-	struct nidrange *nr;
-
-	list_for_each_safe(pos, next, list) {
-		nr = list_entry(pos, struct nidrange, nr_link);
-		free_addrranges(&nr->nr_addrranges);
-		list_del(pos);
-		kfree(nr);
-	}
-}
-EXPORT_SYMBOL(cfs_free_nidlist);
-
-/**
- * Parses a nid range list.
- *
- * Parses, with rigorous syntax and overflow checking, \a str into
- * \<nidrange\> [ ' ' \<nidrange\> ], compiles \a str into a set of
- * structures and links those structures to \a nidlist. The resulting
- * list can be used to match a NID against the set of NIDs defined by \a
- * str.
- * \see cfs_match_nid
- *
- * \retval 1 on success
- * \retval 0 otherwise
- */
-int
-cfs_parse_nidlist(char *str, int len, struct list_head *nidlist)
-{
-	struct cfs_lstr src;
-	struct cfs_lstr res;
-	int rc;
-
-	src.ls_str = str;
-	src.ls_len = len;
-	INIT_LIST_HEAD(nidlist);
-	while (src.ls_str) {
-		rc = cfs_gettok(&src, ' ', &res);
-		if (!rc) {
-			cfs_free_nidlist(nidlist);
-			return 0;
-		}
-		rc = parse_nidrange(&res, nidlist);
-		if (!rc) {
-			cfs_free_nidlist(nidlist);
-			return 0;
-		}
-	}
-	return 1;
-}
-EXPORT_SYMBOL(cfs_parse_nidlist);
-
-/**
- * Matches a nid (\a nid) against the compiled list of nidranges (\a nidlist).
- * For example (added for clarity): with \a nidlist compiled by
- * cfs_parse_nidlist() from the string "192.168.0.[2-10]@tcp", this
- * function returns 1 for the nid 192.168.0.4@tcp and 0 for 192.168.1.4@tcp.
- *
- * \see cfs_parse_nidlist()
- *
- * \retval 1 on match
- * \retval 0 otherwise
- */
-int cfs_match_nid(lnet_nid_t nid, struct list_head *nidlist)
-{
-	struct nidrange *nr;
-	struct addrrange *ar;
-
-	list_for_each_entry(nr, nidlist, nr_link) {
-		if (nr->nr_netstrfns->nf_type != LNET_NETTYP(LNET_NIDNET(nid)))
-			continue;
-		if (nr->nr_netnum != LNET_NETNUM(LNET_NIDNET(nid)))
-			continue;
-		if (nr->nr_all)
-			return 1;
-		list_for_each_entry(ar, &nr->nr_addrranges, ar_link)
-			if (nr->nr_netstrfns->nf_match_addr(LNET_NIDADDR(nid),
-							    &ar->ar_numaddr_ranges))
-				return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL(cfs_match_nid);
-
-/**
- * Print the network part of the nidrange \a nr into the specified \a buffer.
- *
- * \retval number of characters written
- */
-static int
-cfs_print_network(char *buffer, int count, struct nidrange *nr)
-{
-	struct netstrfns *nf = nr->nr_netstrfns;
-
-	if (!nr->nr_netnum)
-		return scnprintf(buffer, count, "@%s", nf->nf_name);
-	else
-		return scnprintf(buffer, count, "@%s%u",
-				 nf->nf_name, nr->nr_netnum);
-}
-
-/**
- * Print a list of addrranges (\a addrranges) into the specified \a buffer.
- * At most \a count characters can be printed into \a buffer.
- *
- * \retval number of characters written
- */
-static int
-cfs_print_addrranges(char *buffer, int count, struct list_head *addrranges,
-		     struct nidrange *nr)
-{
-	int i = 0;
-	struct addrrange *ar;
-	struct netstrfns *nf = nr->nr_netstrfns;
-
-	list_for_each_entry(ar, addrranges, ar_link) {
-		if (i)
-			i += scnprintf(buffer + i, count - i, " ");
-		i += nf->nf_print_addrlist(buffer + i, count - i,
-					   &ar->ar_numaddr_ranges);
-		i += cfs_print_network(buffer + i, count - i, nr);
-	}
-	return i;
-}
-
-/**
- * Print a list of nidranges (\a nidlist) into the specified \a buffer.
- * At most \a count characters can be printed into \a buffer.
- * Nidranges are separated by a space character.
- * - * \retval number of characters written - */ -int cfs_print_nidlist(char *buffer, int count, struct list_head *nidlist) -{ - int i = 0; - struct nidrange *nr; - - if (count <= 0) - return 0; - - list_for_each_entry(nr, nidlist, nr_link) { - if (i) - i += scnprintf(buffer + i, count - i, " "); - - if (nr->nr_all) { - LASSERT(list_empty(&nr->nr_addrranges)); - i += scnprintf(buffer + i, count - i, "*"); - i += cfs_print_network(buffer + i, count - i, nr); - } else { - i += cfs_print_addrranges(buffer + i, count - i, - &nr->nr_addrranges, nr); - } - } - return i; -} -EXPORT_SYMBOL(cfs_print_nidlist); - -/** - * Determines minimum and maximum addresses for a single - * numeric address range - * - * \param ar - * \param min_nid - * \param max_nid - */ -static void cfs_ip_ar_min_max(struct addrrange *ar, __u32 *min_nid, - __u32 *max_nid) -{ - struct cfs_expr_list *el; - struct cfs_range_expr *re; - __u32 tmp_ip_addr = 0; - unsigned int min_ip[4] = {0}; - unsigned int max_ip[4] = {0}; - int re_count = 0; - - list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { - list_for_each_entry(re, &el->el_exprs, re_link) { - min_ip[re_count] = re->re_lo; - max_ip[re_count] = re->re_hi; - re_count++; - } - } - - tmp_ip_addr = ((min_ip[0] << 24) | (min_ip[1] << 16) | - (min_ip[2] << 8) | min_ip[3]); - - if (min_nid) - *min_nid = tmp_ip_addr; - - tmp_ip_addr = ((max_ip[0] << 24) | (max_ip[1] << 16) | - (max_ip[2] << 8) | max_ip[3]); - - if (max_nid) - *max_nid = tmp_ip_addr; -} - -/** - * Determines minimum and maximum addresses for a single - * numeric address range - * - * \param ar - * \param min_nid - * \param max_nid - */ -static void cfs_num_ar_min_max(struct addrrange *ar, __u32 *min_nid, - __u32 *max_nid) -{ - struct cfs_expr_list *el; - struct cfs_range_expr *re; - unsigned int min_addr = 0; - unsigned int max_addr = 0; - - list_for_each_entry(el, &ar->ar_numaddr_ranges, el_link) { - list_for_each_entry(re, &el->el_exprs, re_link) { - if (re->re_lo < min_addr || !min_addr) - min_addr = re->re_lo; - if (re->re_hi > max_addr) - max_addr = re->re_hi; - } - } - - if (min_nid) - *min_nid = min_addr; - if (max_nid) - *max_nid = max_addr; -} - -/** - * Determines whether an expression list in an nidrange contains exactly - * one contiguous address range. Calls the correct netstrfns for the LND - * - * \param *nidlist - * - * \retval true if contiguous - * \retval false if not contiguous - */ -bool cfs_nidrange_is_contiguous(struct list_head *nidlist) -{ - struct nidrange *nr; - struct netstrfns *nf = NULL; - char *lndname = NULL; - int netnum = -1; - - list_for_each_entry(nr, nidlist, nr_link) { - nf = nr->nr_netstrfns; - if (!lndname) - lndname = nf->nf_name; - if (netnum == -1) - netnum = nr->nr_netnum; - - if (strcmp(lndname, nf->nf_name) || - netnum != nr->nr_netnum) - return false; - } - - if (!nf) - return false; - - if (!nf->nf_is_contiguous(nidlist)) - return false; - - return true; -} -EXPORT_SYMBOL(cfs_nidrange_is_contiguous); - -/** - * Determines whether an expression list in an num nidrange contains exactly - * one contiguous address range. 
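- *
- * For example (illustrative, added for clarity): the list "[1-5]" is
- * contiguous, while "[1-3,7-9]" (a gap) or "[1-9/2]" (a stride) is not.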
- * - * \param *nidlist - * - * \retval true if contiguous - * \retval false if not contiguous - */ -static bool cfs_num_is_contiguous(struct list_head *nidlist) -{ - struct nidrange *nr; - struct addrrange *ar; - struct cfs_expr_list *el; - struct cfs_range_expr *re; - int last_hi = 0; - __u32 last_end_nid = 0; - __u32 current_start_nid = 0; - __u32 current_end_nid = 0; - - list_for_each_entry(nr, nidlist, nr_link) { - list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { - cfs_num_ar_min_max(ar, ¤t_start_nid, - ¤t_end_nid); - if (last_end_nid && - (current_start_nid - last_end_nid != 1)) - return false; - last_end_nid = current_end_nid; - list_for_each_entry(el, &ar->ar_numaddr_ranges, - el_link) { - list_for_each_entry(re, &el->el_exprs, - re_link) { - if (re->re_stride > 1) - return false; - else if (last_hi && - re->re_hi - last_hi != 1) - return false; - last_hi = re->re_hi; - } - } - } - } - - return true; -} - -/** - * Determines whether an expression list in an ip nidrange contains exactly - * one contiguous address range. - * - * \param *nidlist - * - * \retval true if contiguous - * \retval false if not contiguous - */ -static bool cfs_ip_is_contiguous(struct list_head *nidlist) -{ - struct nidrange *nr; - struct addrrange *ar; - struct cfs_expr_list *el; - struct cfs_range_expr *re; - int expr_count; - int last_hi = 255; - int last_diff = 0; - __u32 last_end_nid = 0; - __u32 current_start_nid = 0; - __u32 current_end_nid = 0; - - list_for_each_entry(nr, nidlist, nr_link) { - list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { - last_hi = 255; - last_diff = 0; - cfs_ip_ar_min_max(ar, ¤t_start_nid, - ¤t_end_nid); - if (last_end_nid && - (current_start_nid - last_end_nid != 1)) - return false; - last_end_nid = current_end_nid; - list_for_each_entry(el, &ar->ar_numaddr_ranges, - el_link) { - expr_count = 0; - list_for_each_entry(re, &el->el_exprs, - re_link) { - expr_count++; - if (re->re_stride > 1 || - (last_diff > 0 && last_hi != 255) || - (last_diff > 0 && last_hi == 255 && - re->re_lo > 0)) - return false; - last_hi = re->re_hi; - last_diff = re->re_hi - re->re_lo; - } - } - } - } - - return true; -} - -/** - * Takes a linked list of nidrange expressions, determines the minimum - * and maximum nid and creates appropriate nid structures - * - * \param *nidlist - * \param *min_nid - * \param *max_nid - */ -void cfs_nidrange_find_min_max(struct list_head *nidlist, char *min_nid, - char *max_nid, size_t nidstr_length) -{ - struct nidrange *nr; - struct netstrfns *nf = NULL; - int netnum = -1; - __u32 min_addr; - __u32 max_addr; - char *lndname = NULL; - char min_addr_str[IPSTRING_LENGTH]; - char max_addr_str[IPSTRING_LENGTH]; - - list_for_each_entry(nr, nidlist, nr_link) { - nf = nr->nr_netstrfns; - lndname = nf->nf_name; - if (netnum == -1) - netnum = nr->nr_netnum; - - nf->nf_min_max(nidlist, &min_addr, &max_addr); - } - nf->nf_addr2str(min_addr, min_addr_str, sizeof(min_addr_str)); - nf->nf_addr2str(max_addr, max_addr_str, sizeof(max_addr_str)); - - snprintf(min_nid, nidstr_length, "%s@%s%d", min_addr_str, lndname, - netnum); - snprintf(max_nid, nidstr_length, "%s@%s%d", max_addr_str, lndname, - netnum); -} -EXPORT_SYMBOL(cfs_nidrange_find_min_max); - -/** - * Determines the min and max NID values for num LNDs - * - * \param *nidlist - * \param *min_nid - * \param *max_nid - */ -static void cfs_num_min_max(struct list_head *nidlist, __u32 *min_nid, - __u32 *max_nid) -{ - struct nidrange *nr; - struct addrrange *ar; - unsigned int tmp_min_addr = 0; - unsigned int 
tmp_max_addr = 0; - unsigned int min_addr = 0; - unsigned int max_addr = 0; - - list_for_each_entry(nr, nidlist, nr_link) { - list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { - cfs_num_ar_min_max(ar, &tmp_min_addr, - &tmp_max_addr); - if (tmp_min_addr < min_addr || !min_addr) - min_addr = tmp_min_addr; - if (tmp_max_addr > max_addr) - max_addr = tmp_max_addr; - } - } - *max_nid = max_addr; - *min_nid = min_addr; -} - -/** - * Takes an nidlist and determines the minimum and maximum - * ip addresses. - * - * \param *nidlist - * \param *min_nid - * \param *max_nid - */ -static void cfs_ip_min_max(struct list_head *nidlist, __u32 *min_nid, - __u32 *max_nid) -{ - struct nidrange *nr; - struct addrrange *ar; - __u32 tmp_min_ip_addr = 0; - __u32 tmp_max_ip_addr = 0; - __u32 min_ip_addr = 0; - __u32 max_ip_addr = 0; - - list_for_each_entry(nr, nidlist, nr_link) { - list_for_each_entry(ar, &nr->nr_addrranges, ar_link) { - cfs_ip_ar_min_max(ar, &tmp_min_ip_addr, - &tmp_max_ip_addr); - if (tmp_min_ip_addr < min_ip_addr || !min_ip_addr) - min_ip_addr = tmp_min_ip_addr; - if (tmp_max_ip_addr > max_ip_addr) - max_ip_addr = tmp_max_ip_addr; - } - } - - if (min_nid) - *min_nid = min_ip_addr; - if (max_nid) - *max_nid = max_ip_addr; -} - -static int -libcfs_lo_str2addr(const char *str, int nob, __u32 *addr) -{ - *addr = 0; - return 1; -} - -static void -libcfs_ip_addr2str(__u32 addr, char *str, size_t size) -{ - snprintf(str, size, "%u.%u.%u.%u", - (addr >> 24) & 0xff, (addr >> 16) & 0xff, - (addr >> 8) & 0xff, addr & 0xff); -} - -/* - * CAVEAT EMPTOR XscanfX - * I use "%n" at the end of a sscanf format to detect trailing junk. However - * sscanf may return immediately if it sees the terminating '0' in a string, so - * I initialise the %n variable to the expected length. If sscanf sets it; - * fine, if it doesn't, then the scan ended at the end of the string, which is - * fine too :) - */ -static int -libcfs_ip_str2addr(const char *str, int nob, __u32 *addr) -{ - unsigned int a; - unsigned int b; - unsigned int c; - unsigned int d; - int n = nob; /* XscanfX */ - - /* numeric IP? */ - if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 && - n == nob && - !(a & ~0xff) && !(b & ~0xff) && - !(c & ~0xff) && !(d & ~0xff)) { - *addr = ((a << 24) | (b << 16) | (c << 8) | d); - return 1; - } - - return 0; -} - -/* Used by lnet/config.c so it can't be static */ -int -cfs_ip_addr_parse(char *str, int len, struct list_head *list) -{ - struct cfs_expr_list *el; - struct cfs_lstr src; - int rc; - int i; - - src.ls_str = str; - src.ls_len = len; - i = 0; - - while (src.ls_str) { - struct cfs_lstr res; - - if (!cfs_gettok(&src, '.', &res)) { - rc = -EINVAL; - goto out; - } - - rc = cfs_expr_list_parse(res.ls_str, res.ls_len, 0, 255, &el); - if (rc) - goto out; - - list_add_tail(&el->el_link, list); - i++; - } - - if (i == 4) - return 0; - - rc = -EINVAL; -out: - cfs_expr_list_free_list(list); - - return rc; -} - -static int -libcfs_ip_addr_range_print(char *buffer, int count, struct list_head *list) -{ - int i = 0, j = 0; - struct cfs_expr_list *el; - - list_for_each_entry(el, list, el_link) { - LASSERT(j++ < 4); - if (i) - i += scnprintf(buffer + i, count - i, "."); - i += cfs_expr_list_print(buffer + i, count - i, el); - } - return i; -} - -/** - * Matches address (\a addr) against address set encoded in \a list.
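The "%n" trick described in the CAVEAT EMPTOR comment above can be exercised on its own. A minimal user-space sketch (not driver code) of the same trailing-junk detection that libcfs_ip_str2addr performs:

	#include <stdio.h>

	static int parse_quad(const char *str, int nob, unsigned int *out)
	{
		unsigned int a, b, c, d;
		int n = nob;	/* preset: sscanf may stop before "%n" */

		if (sscanf(str, "%u.%u.%u.%u%n", &a, &b, &c, &d, &n) >= 4 &&
		    n == nob &&
		    !(a & ~0xffU) && !(b & ~0xffU) &&
		    !(c & ~0xffU) && !(d & ~0xffU)) {
			*out = (a << 24) | (b << 16) | (c << 8) | d;
			return 1;
		}
		return 0;
	}

	int main(void)
	{
		unsigned int addr;

		printf("%d\n", parse_quad("10.0.0.1", 8, &addr));	/* 1 */
		printf("%d\n", parse_quad("10.0.0.1x", 9, &addr));	/* 0 */
		return 0;
	}

Only when the whole buffer is consumed does n equal nob, so "10.0.0.1x" is rejected even though four numbers parsed cleanly.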
- * - * \retval 1 if \a addr matches - * \retval 0 otherwise - */ -int -cfs_ip_addr_match(__u32 addr, struct list_head *list) -{ - struct cfs_expr_list *el; - int i = 0; - - list_for_each_entry_reverse(el, list, el_link) { - if (!cfs_expr_list_match(addr & 0xff, el)) - return 0; - addr >>= 8; - i++; - } - - return i == 4; -} - -static void -libcfs_decnum_addr2str(__u32 addr, char *str, size_t size) -{ - snprintf(str, size, "%u", addr); -} - -static int -libcfs_num_str2addr(const char *str, int nob, __u32 *addr) -{ - int n; - - n = nob; - if (sscanf(str, "0x%x%n", addr, &n) >= 1 && n == nob) - return 1; - - n = nob; - if (sscanf(str, "0X%x%n", addr, &n) >= 1 && n == nob) - return 1; - - n = nob; - if (sscanf(str, "%u%n", addr, &n) >= 1 && n == nob) - return 1; - - return 0; -} - -/** - * Nf_parse_addrlist method for networks using numeric addresses. - * - * Examples of such networks are gm and elan. - * - * \retval 0 if \a str parsed to numeric address - * \retval errno otherwise - */ -static int -libcfs_num_parse(char *str, int len, struct list_head *list) -{ - struct cfs_expr_list *el; - int rc; - - rc = cfs_expr_list_parse(str, len, 0, MAX_NUMERIC_VALUE, &el); - if (!rc) - list_add_tail(&el->el_link, list); - - return rc; -} - -static int -libcfs_num_addr_range_print(char *buffer, int count, struct list_head *list) -{ - int i = 0, j = 0; - struct cfs_expr_list *el; - - list_for_each_entry(el, list, el_link) { - LASSERT(j++ < 1); - i += cfs_expr_list_print(buffer + i, count - i, el); - } - return i; -} - -/* - * Nf_match_addr method for networks using numeric addresses - * - * \retval 1 on match - * \retval 0 otherwise - */ -static int -libcfs_num_match(__u32 addr, struct list_head *numaddr) -{ - struct cfs_expr_list *el; - - LASSERT(!list_empty(numaddr)); - el = list_entry(numaddr->next, struct cfs_expr_list, el_link); - - return cfs_expr_list_match(addr, el); -} - -static struct netstrfns libcfs_netstrfns[] = { - { .nf_type = LOLND, - .nf_name = "lo", - .nf_modname = "klolnd", - .nf_addr2str = libcfs_decnum_addr2str, - .nf_str2addr = libcfs_lo_str2addr, - .nf_parse_addrlist = libcfs_num_parse, - .nf_print_addrlist = libcfs_num_addr_range_print, - .nf_match_addr = libcfs_num_match, - .nf_is_contiguous = cfs_num_is_contiguous, - .nf_min_max = cfs_num_min_max }, - { .nf_type = SOCKLND, - .nf_name = "tcp", - .nf_modname = "ksocklnd", - .nf_addr2str = libcfs_ip_addr2str, - .nf_str2addr = libcfs_ip_str2addr, - .nf_parse_addrlist = cfs_ip_addr_parse, - .nf_print_addrlist = libcfs_ip_addr_range_print, - .nf_match_addr = cfs_ip_addr_match, - .nf_is_contiguous = cfs_ip_is_contiguous, - .nf_min_max = cfs_ip_min_max }, - { .nf_type = O2IBLND, - .nf_name = "o2ib", - .nf_modname = "ko2iblnd", - .nf_addr2str = libcfs_ip_addr2str, - .nf_str2addr = libcfs_ip_str2addr, - .nf_parse_addrlist = cfs_ip_addr_parse, - .nf_print_addrlist = libcfs_ip_addr_range_print, - .nf_match_addr = cfs_ip_addr_match, - .nf_is_contiguous = cfs_ip_is_contiguous, - .nf_min_max = cfs_ip_min_max }, - { .nf_type = GNILND, - .nf_name = "gni", - .nf_modname = "kgnilnd", - .nf_addr2str = libcfs_decnum_addr2str, - .nf_str2addr = libcfs_num_str2addr, - .nf_parse_addrlist = libcfs_num_parse, - .nf_print_addrlist = libcfs_num_addr_range_print, - .nf_match_addr = libcfs_num_match, - .nf_is_contiguous = cfs_num_is_contiguous, - .nf_min_max = cfs_num_min_max }, - { .nf_type = GNIIPLND, - .nf_name = "gip", - .nf_modname = "kgnilnd", - .nf_addr2str = libcfs_ip_addr2str, - .nf_str2addr = libcfs_ip_str2addr, - .nf_parse_addrlist = 
cfs_ip_addr_parse, - .nf_print_addrlist = libcfs_ip_addr_range_print, - .nf_match_addr = cfs_ip_addr_match, - .nf_is_contiguous = cfs_ip_is_contiguous, - .nf_min_max = cfs_ip_min_max }, -}; - -static const size_t libcfs_nnetstrfns = ARRAY_SIZE(libcfs_netstrfns); - -static struct netstrfns * -libcfs_lnd2netstrfns(__u32 lnd) -{ - int i; - - for (i = 0; i < libcfs_nnetstrfns; i++) - if (lnd == libcfs_netstrfns[i].nf_type) - return &libcfs_netstrfns[i]; - - return NULL; -} - -static struct netstrfns * -libcfs_namenum2netstrfns(const char *name) -{ - struct netstrfns *nf; - int i; - - for (i = 0; i < libcfs_nnetstrfns; i++) { - nf = &libcfs_netstrfns[i]; - if (!strncmp(name, nf->nf_name, strlen(nf->nf_name))) - return nf; - } - return NULL; -} - -static struct netstrfns * -libcfs_name2netstrfns(const char *name) -{ - int i; - - for (i = 0; i < libcfs_nnetstrfns; i++) - if (!strcmp(libcfs_netstrfns[i].nf_name, name)) - return &libcfs_netstrfns[i]; - - return NULL; -} - -int -libcfs_isknown_lnd(__u32 lnd) -{ - return !!libcfs_lnd2netstrfns(lnd); -} -EXPORT_SYMBOL(libcfs_isknown_lnd); - -char * -libcfs_lnd2modname(__u32 lnd) -{ - struct netstrfns *nf = libcfs_lnd2netstrfns(lnd); - - return nf ? nf->nf_modname : NULL; -} -EXPORT_SYMBOL(libcfs_lnd2modname); - -int -libcfs_str2lnd(const char *str) -{ - struct netstrfns *nf = libcfs_name2netstrfns(str); - - if (nf) - return nf->nf_type; - - return -ENXIO; -} -EXPORT_SYMBOL(libcfs_str2lnd); - -char * -libcfs_lnd2str_r(__u32 lnd, char *buf, size_t buf_size) -{ - struct netstrfns *nf; - - nf = libcfs_lnd2netstrfns(lnd); - if (!nf) - snprintf(buf, buf_size, "?%u?", lnd); - else - snprintf(buf, buf_size, "%s", nf->nf_name); - - return buf; -} -EXPORT_SYMBOL(libcfs_lnd2str_r); - -char * -libcfs_net2str_r(__u32 net, char *buf, size_t buf_size) -{ - __u32 nnum = LNET_NETNUM(net); - __u32 lnd = LNET_NETTYP(net); - struct netstrfns *nf; - - nf = libcfs_lnd2netstrfns(lnd); - if (!nf) - snprintf(buf, buf_size, "<%u:%u>", lnd, nnum); - else if (!nnum) - snprintf(buf, buf_size, "%s", nf->nf_name); - else - snprintf(buf, buf_size, "%s%u", nf->nf_name, nnum); - - return buf; -} -EXPORT_SYMBOL(libcfs_net2str_r); - -char * -libcfs_nid2str_r(lnet_nid_t nid, char *buf, size_t buf_size) -{ - __u32 addr = LNET_NIDADDR(nid); - __u32 net = LNET_NIDNET(nid); - __u32 nnum = LNET_NETNUM(net); - __u32 lnd = LNET_NETTYP(net); - struct netstrfns *nf; - - if (nid == LNET_NID_ANY) { - strncpy(buf, "<?>", buf_size); - buf[buf_size - 1] = '\0'; - return buf; - } - - nf = libcfs_lnd2netstrfns(lnd); - if (!nf) { - snprintf(buf, buf_size, "%x@<%u:%u>", addr, lnd, nnum); - } else { - size_t addr_len; - - nf->nf_addr2str(addr, buf, buf_size); - addr_len = strlen(buf); - if (!nnum) - snprintf(buf + addr_len, buf_size - addr_len, "@%s", - nf->nf_name); - else - snprintf(buf + addr_len, buf_size - addr_len, "@%s%u", - nf->nf_name, nnum); - } - - return buf; -} -EXPORT_SYMBOL(libcfs_nid2str_r); - -static struct netstrfns * -libcfs_str2net_internal(const char *str, __u32 *net) -{ - struct netstrfns *nf = NULL; - int nob; - unsigned int netnum; - int i; - - for (i = 0; i < libcfs_nnetstrfns; i++) { - nf = &libcfs_netstrfns[i]; - if (!strncmp(str, nf->nf_name, strlen(nf->nf_name))) - break; - } - - if (i == libcfs_nnetstrfns) - return NULL; - - nob = strlen(nf->nf_name); - - if (strlen(str) == (unsigned int)nob) { - netnum = 0; - } else { - if (nf->nf_type == LOLND) /* net number not allowed */ - return NULL; - - str += nob; - i = strlen(str); - if (sscanf(str, "%u%n", &netnum, &i) < 1 || - i !=
(int)strlen(str)) - return NULL; - } - - *net = LNET_MKNET(nf->nf_type, netnum); - return nf; -} - -__u32 -libcfs_str2net(const char *str) -{ - __u32 net; - - if (libcfs_str2net_internal(str, &net)) - return net; - - return LNET_NIDNET(LNET_NID_ANY); -} -EXPORT_SYMBOL(libcfs_str2net); - -lnet_nid_t -libcfs_str2nid(const char *str) -{ - const char *sep = strchr(str, '@'); - struct netstrfns *nf; - __u32 net; - __u32 addr; - - if (sep) { - nf = libcfs_str2net_internal(sep + 1, &net); - if (!nf) - return LNET_NID_ANY; - } else { - sep = str + strlen(str); - net = LNET_MKNET(SOCKLND, 0); - nf = libcfs_lnd2netstrfns(SOCKLND); - LASSERT(nf); - } - - if (!nf->nf_str2addr(str, (int)(sep - str), &addr)) - return LNET_NID_ANY; - - return LNET_MKNID(net, addr); -} -EXPORT_SYMBOL(libcfs_str2nid); - -char * -libcfs_id2str(struct lnet_process_id id) -{ - char *str = libcfs_next_nidstring(); - - if (id.pid == LNET_PID_ANY) { - snprintf(str, LNET_NIDSTR_SIZE, - "LNET_PID_ANY-%s", libcfs_nid2str(id.nid)); - return str; - } - - snprintf(str, LNET_NIDSTR_SIZE, "%s%u-%s", - id.pid & LNET_PID_USERFLAG ? "U" : "", - id.pid & ~LNET_PID_USERFLAG, libcfs_nid2str(id.nid)); - return str; -} -EXPORT_SYMBOL(libcfs_id2str); - -int -libcfs_str2anynid(lnet_nid_t *nidp, const char *str) -{ - if (!strcmp(str, "*")) { - *nidp = LNET_NID_ANY; - return 1; - } - - *nidp = libcfs_str2nid(str); - return *nidp != LNET_NID_ANY; -} -EXPORT_SYMBOL(libcfs_str2anynid); diff --git a/drivers/staging/lustre/lnet/lnet/peer.c b/drivers/staging/lustre/lnet/lnet/peer.c deleted file mode 100644 index 58294149f7b21b23c073e2867ce38b9bba186376..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/peer.c +++ /dev/null @@ -1,456 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
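For reference, the NID grammar handled by libcfs_str2nid above is an address, an optional '@', an LND name, and an optional network number, e.g. "192.168.0.10@tcp1"; with no '@' part the code falls back to tcp network 0. A standalone sketch (not driver code) of that split:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *nid = "192.168.0.10@tcp1";
		const char *sep = strchr(nid, '@');	/* as libcfs_str2nid does */

		if (sep)
			printf("address \"%.*s\", network \"%s\"\n",
			       (int)(sep - nid), nid, sep + 1);
		else
			printf("address \"%s\", network \"tcp0\" (default)\n", nid);
		return 0;
	}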
- * - * lnet/lnet/peer.c - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include - -int -lnet_peer_tables_create(void) -{ - struct lnet_peer_table *ptable; - struct list_head *hash; - int i; - int j; - - the_lnet.ln_peer_tables = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(*ptable)); - if (!the_lnet.ln_peer_tables) { - CERROR("Failed to allocate cpu-partition peer tables\n"); - return -ENOMEM; - } - - cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { - INIT_LIST_HEAD(&ptable->pt_deathrow); - - hash = kvmalloc_cpt(LNET_PEER_HASH_SIZE * sizeof(*hash), - GFP_KERNEL, i); - if (!hash) { - CERROR("Failed to create peer hash table\n"); - lnet_peer_tables_destroy(); - return -ENOMEM; - } - - for (j = 0; j < LNET_PEER_HASH_SIZE; j++) - INIT_LIST_HEAD(&hash[j]); - ptable->pt_hash = hash; /* sign of initialization */ - } - - return 0; -} - -void -lnet_peer_tables_destroy(void) -{ - struct lnet_peer_table *ptable; - struct list_head *hash; - int i; - int j; - - if (!the_lnet.ln_peer_tables) - return; - - cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { - hash = ptable->pt_hash; - if (!hash) /* not initialized */ - break; - - LASSERT(list_empty(&ptable->pt_deathrow)); - - ptable->pt_hash = NULL; - for (j = 0; j < LNET_PEER_HASH_SIZE; j++) - LASSERT(list_empty(&hash[j])); - - kvfree(hash); - } - - cfs_percpt_free(the_lnet.ln_peer_tables); - the_lnet.ln_peer_tables = NULL; -} - -static void -lnet_peer_table_cleanup_locked(struct lnet_ni *ni, - struct lnet_peer_table *ptable) -{ - int i; - struct lnet_peer *lp; - struct lnet_peer *tmp; - - for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { - list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i], - lp_hashlist) { - if (ni && ni != lp->lp_ni) - continue; - list_del_init(&lp->lp_hashlist); - /* Lose hash table's ref */ - ptable->pt_zombies++; - lnet_peer_decref_locked(lp); - } - } -} - -static void -lnet_peer_table_deathrow_wait_locked(struct lnet_peer_table *ptable, - int cpt_locked) -{ - int i; - - for (i = 3; ptable->pt_zombies; i++) { - lnet_net_unlock(cpt_locked); - - if (is_power_of_2(i)) { - CDEBUG(D_WARNING, - "Waiting for %d zombies on peer table\n", - ptable->pt_zombies); - } - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ >> 1); - lnet_net_lock(cpt_locked); - } -} - -static void -lnet_peer_table_del_rtrs_locked(struct lnet_ni *ni, - struct lnet_peer_table *ptable, - int cpt_locked) -{ - struct lnet_peer *lp; - struct lnet_peer *tmp; - lnet_nid_t lp_nid; - int i; - - for (i = 0; i < LNET_PEER_HASH_SIZE; i++) { - list_for_each_entry_safe(lp, tmp, &ptable->pt_hash[i], - lp_hashlist) { - if (ni != lp->lp_ni) - continue; - - if (!lp->lp_rtr_refcount) - continue; - - lp_nid = lp->lp_nid; - - lnet_net_unlock(cpt_locked); - lnet_del_route(LNET_NIDNET(LNET_NID_ANY), lp_nid); - lnet_net_lock(cpt_locked); - } - } -} - -void -lnet_peer_tables_cleanup(struct lnet_ni *ni) -{ - struct lnet_peer_table *ptable; - struct list_head deathrow; - struct lnet_peer *lp; - struct lnet_peer *temp; - int i; - - INIT_LIST_HEAD(&deathrow); - - LASSERT(the_lnet.ln_shutdown || ni); - /* - * If just deleting the peers for a NI, get rid of any routes these - * peers are gateways for. - */ - cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { - lnet_net_lock(i); - lnet_peer_table_del_rtrs_locked(ni, ptable, i); - lnet_net_unlock(i); - } - - /* - * Start the process of moving the applicable peers to - * deathrow. 
- */ - cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { - lnet_net_lock(i); - lnet_peer_table_cleanup_locked(ni, ptable); - lnet_net_unlock(i); - } - - /* Cleanup all entries on deathrow. */ - cfs_percpt_for_each(ptable, i, the_lnet.ln_peer_tables) { - lnet_net_lock(i); - lnet_peer_table_deathrow_wait_locked(ptable, i); - list_splice_init(&ptable->pt_deathrow, &deathrow); - lnet_net_unlock(i); - } - - list_for_each_entry_safe(lp, temp, &deathrow, lp_hashlist) { - list_del(&lp->lp_hashlist); - kfree(lp); - } -} - -void -lnet_destroy_peer_locked(struct lnet_peer *lp) -{ - struct lnet_peer_table *ptable; - - LASSERT(!lp->lp_refcount); - LASSERT(!lp->lp_rtr_refcount); - LASSERT(list_empty(&lp->lp_txq)); - LASSERT(list_empty(&lp->lp_hashlist)); - LASSERT(!lp->lp_txqnob); - - ptable = the_lnet.ln_peer_tables[lp->lp_cpt]; - LASSERT(ptable->pt_number > 0); - ptable->pt_number--; - - lnet_ni_decref_locked(lp->lp_ni, lp->lp_cpt); - lp->lp_ni = NULL; - - list_add(&lp->lp_hashlist, &ptable->pt_deathrow); - LASSERT(ptable->pt_zombies > 0); - ptable->pt_zombies--; -} - -struct lnet_peer * -lnet_find_peer_locked(struct lnet_peer_table *ptable, lnet_nid_t nid) -{ - struct list_head *peers; - struct lnet_peer *lp; - - LASSERT(!the_lnet.ln_shutdown); - - peers = &ptable->pt_hash[lnet_nid2peerhash(nid)]; - list_for_each_entry(lp, peers, lp_hashlist) { - if (lp->lp_nid == nid) { - lnet_peer_addref_locked(lp); - return lp; - } - } - - return NULL; -} - -int -lnet_nid2peer_locked(struct lnet_peer **lpp, lnet_nid_t nid, int cpt) -{ - struct lnet_peer_table *ptable; - struct lnet_peer *lp = NULL; - struct lnet_peer *lp2; - int cpt2; - int rc = 0; - - *lpp = NULL; - if (the_lnet.ln_shutdown) /* it's shutting down */ - return -ESHUTDOWN; - - /* cpt can be LNET_LOCK_EX if it's called from router functions */ - cpt2 = cpt != LNET_LOCK_EX ? cpt : lnet_cpt_of_nid_locked(nid); - - ptable = the_lnet.ln_peer_tables[cpt2]; - lp = lnet_find_peer_locked(ptable, nid); - if (lp) { - *lpp = lp; - return 0; - } - - if (!list_empty(&ptable->pt_deathrow)) { - lp = list_entry(ptable->pt_deathrow.next, - struct lnet_peer, lp_hashlist); - list_del(&lp->lp_hashlist); - } - - /* - * take extra refcount in case another thread has shutdown LNet - * and destroyed locks and peer-table before I finish the allocation - */ - ptable->pt_number++; - lnet_net_unlock(cpt); - - if (lp) - memset(lp, 0, sizeof(*lp)); - else - lp = kzalloc_cpt(sizeof(*lp), GFP_NOFS, cpt2); - - if (!lp) { - rc = -ENOMEM; - lnet_net_lock(cpt); - goto out; - } - - INIT_LIST_HEAD(&lp->lp_txq); - INIT_LIST_HEAD(&lp->lp_rtrq); - INIT_LIST_HEAD(&lp->lp_routes); - - lp->lp_notify = 0; - lp->lp_notifylnd = 0; - lp->lp_notifying = 0; - lp->lp_alive_count = 0; - lp->lp_timestamp = 0; - lp->lp_alive = !lnet_peers_start_down(); /* 1 bit!! 
*/ - lp->lp_last_alive = jiffies; /* assumes alive */ - lp->lp_last_query = 0; /* haven't asked NI yet */ - lp->lp_ping_timestamp = 0; - lp->lp_ping_feats = LNET_PING_FEAT_INVAL; - lp->lp_nid = nid; - lp->lp_cpt = cpt2; - lp->lp_refcount = 2; /* 1 for caller; 1 for hash */ - lp->lp_rtr_refcount = 0; - - lnet_net_lock(cpt); - - if (the_lnet.ln_shutdown) { - rc = -ESHUTDOWN; - goto out; - } - - lp2 = lnet_find_peer_locked(ptable, nid); - if (lp2) { - *lpp = lp2; - goto out; - } - - lp->lp_ni = lnet_net2ni_locked(LNET_NIDNET(nid), cpt2); - if (!lp->lp_ni) { - rc = -EHOSTUNREACH; - goto out; - } - - lp->lp_txcredits = lp->lp_ni->ni_peertxcredits; - lp->lp_mintxcredits = lp->lp_ni->ni_peertxcredits; - lp->lp_rtrcredits = lnet_peer_buffer_credits(lp->lp_ni); - lp->lp_minrtrcredits = lnet_peer_buffer_credits(lp->lp_ni); - - list_add_tail(&lp->lp_hashlist, - &ptable->pt_hash[lnet_nid2peerhash(nid)]); - ptable->pt_version++; - *lpp = lp; - - return 0; -out: - if (lp) - list_add(&lp->lp_hashlist, &ptable->pt_deathrow); - ptable->pt_number--; - return rc; -} - -void -lnet_debug_peer(lnet_nid_t nid) -{ - char *aliveness = "NA"; - struct lnet_peer *lp; - int rc; - int cpt; - - cpt = lnet_cpt_of_nid(nid); - lnet_net_lock(cpt); - - rc = lnet_nid2peer_locked(&lp, nid, cpt); - if (rc) { - lnet_net_unlock(cpt); - CDEBUG(D_WARNING, "No peer %s\n", libcfs_nid2str(nid)); - return; - } - - if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) - aliveness = lp->lp_alive ? "up" : "down"; - - CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", - libcfs_nid2str(lp->lp_nid), lp->lp_refcount, - aliveness, lp->lp_ni->ni_peertxcredits, - lp->lp_rtrcredits, lp->lp_minrtrcredits, - lp->lp_txcredits, lp->lp_mintxcredits, lp->lp_txqnob); - - lnet_peer_decref_locked(lp); - - lnet_net_unlock(cpt); -} - -int -lnet_get_peer_info(__u32 peer_index, __u64 *nid, - char aliveness[LNET_MAX_STR_LEN], - __u32 *cpt_iter, __u32 *refcount, - __u32 *ni_peer_tx_credits, __u32 *peer_tx_credits, - __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credits, - __u32 *peer_tx_qnob) -{ - struct lnet_peer_table *peer_table; - struct lnet_peer *lp; - bool found = false; - int lncpt, j; - - /* get the number of CPTs */ - lncpt = cfs_percpt_number(the_lnet.ln_peer_tables); - - /* - * if the cpt number to be examined is >= the number of cpts in - * the system then indicate that there are no more cpts to examin - */ - if (*cpt_iter >= lncpt) - return -ENOENT; - - /* get the current table */ - peer_table = the_lnet.ln_peer_tables[*cpt_iter]; - /* if the ptable is NULL then there are no more cpts to examine */ - if (!peer_table) - return -ENOENT; - - lnet_net_lock(*cpt_iter); - - for (j = 0; j < LNET_PEER_HASH_SIZE && !found; j++) { - struct list_head *peers = &peer_table->pt_hash[j]; - - list_for_each_entry(lp, peers, lp_hashlist) { - if (peer_index-- > 0) - continue; - - snprintf(aliveness, LNET_MAX_STR_LEN, "NA"); - if (lnet_isrouter(lp) || - lnet_peer_aliveness_enabled(lp)) - snprintf(aliveness, LNET_MAX_STR_LEN, - lp->lp_alive ? "up" : "down"); - - *nid = lp->lp_nid; - *refcount = lp->lp_refcount; - *ni_peer_tx_credits = lp->lp_ni->ni_peertxcredits; - *peer_tx_credits = lp->lp_txcredits; - *peer_rtr_credits = lp->lp_rtrcredits; - *peer_min_rtr_credits = lp->lp_mintxcredits; - *peer_tx_qnob = lp->lp_txqnob; - - found = true; - } - } - lnet_net_unlock(*cpt_iter); - - *cpt_iter = lncpt; - - return found ? 
0 : -ENOENT; -} diff --git a/drivers/staging/lustre/lnet/lnet/router.c b/drivers/staging/lustre/lnet/lnet/router.c deleted file mode 100644 index 6267d5e4bbd6fbdf37dd2b4edd954c9dd2bea9e8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/router.c +++ /dev/null @@ -1,1799 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2015, Intel Corporation. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include -#include - -#define LNET_NRB_TINY_MIN 512 /* min value for each CPT */ -#define LNET_NRB_TINY (LNET_NRB_TINY_MIN * 4) -#define LNET_NRB_SMALL_MIN 4096 /* min value for each CPT */ -#define LNET_NRB_SMALL (LNET_NRB_SMALL_MIN * 4) -#define LNET_NRB_SMALL_PAGES 1 -#define LNET_NRB_LARGE_MIN 256 /* min value for each CPT */ -#define LNET_NRB_LARGE (LNET_NRB_LARGE_MIN * 4) -#define LNET_NRB_LARGE_PAGES ((LNET_MTU + PAGE_SIZE - 1) >> \ - PAGE_SHIFT) - -static char *forwarding = ""; -module_param(forwarding, charp, 0444); -MODULE_PARM_DESC(forwarding, "Explicitly enable/disable forwarding between networks"); - -static int tiny_router_buffers; -module_param(tiny_router_buffers, int, 0444); -MODULE_PARM_DESC(tiny_router_buffers, "# of 0 payload messages to buffer in the router"); -static int small_router_buffers; -module_param(small_router_buffers, int, 0444); -MODULE_PARM_DESC(small_router_buffers, "# of small (1 page) messages to buffer in the router"); -static int large_router_buffers; -module_param(large_router_buffers, int, 0444); -MODULE_PARM_DESC(large_router_buffers, "# of large messages to buffer in the router"); -static int peer_buffer_credits; -module_param(peer_buffer_credits, int, 0444); -MODULE_PARM_DESC(peer_buffer_credits, "# router buffer credits per peer"); - -static int auto_down = 1; -module_param(auto_down, int, 0444); -MODULE_PARM_DESC(auto_down, "Automatically mark peers down on comms error"); - -int -lnet_peer_buffer_credits(struct lnet_ni *ni) -{ - /* NI option overrides LNet default */ - if (ni->ni_peerrtrcredits > 0) - return ni->ni_peerrtrcredits; - if (peer_buffer_credits > 0) - return peer_buffer_credits; - - /* - * As an approximation, allow this peer the same number of router - * buffers as it is allowed outstanding sends - */ - return ni->ni_peertxcredits; -} - -/* forward ref's */ -static int lnet_router_checker(void *); - -static int check_routers_before_use; -module_param(check_routers_before_use, int, 0444); -MODULE_PARM_DESC(check_routers_before_use, "Assume routers are down and ping them before use"); - -int avoid_asym_router_failure = 1; -module_param(avoid_asym_router_failure, int, 0644); -MODULE_PARM_DESC(avoid_asym_router_failure, "Avoid asymmetrical router failures (0 to disable)"); - -static int dead_router_check_interval = 60; -module_param(dead_router_check_interval, int, 0644); -MODULE_PARM_DESC(dead_router_check_interval, "Seconds between dead router health checks (<= 0 to disable)"); - -static int 
live_router_check_interval = 60; -module_param(live_router_check_interval, int, 0644); -MODULE_PARM_DESC(live_router_check_interval, "Seconds between live router health checks (<= 0 to disable)"); - -static int router_ping_timeout = 50; -module_param(router_ping_timeout, int, 0644); -MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); - -int -lnet_peers_start_down(void) -{ - return check_routers_before_use; -} - -void -lnet_notify_locked(struct lnet_peer *lp, int notifylnd, int alive, - unsigned long when) -{ - if (time_before(when, lp->lp_timestamp)) { /* out of date information */ - CDEBUG(D_NET, "Out of date\n"); - return; - } - - lp->lp_timestamp = when; /* update timestamp */ - lp->lp_ping_deadline = 0; /* disable ping timeout */ - - if (lp->lp_alive_count && /* got old news */ - (!lp->lp_alive) == (!alive)) { /* new date for old news */ - CDEBUG(D_NET, "Old news\n"); - return; - } - - /* Flag that notification is outstanding */ - - lp->lp_alive_count++; - lp->lp_alive = !(!alive); /* 1 bit! */ - lp->lp_notify = 1; - lp->lp_notifylnd |= notifylnd; - if (lp->lp_alive) - lp->lp_ping_feats = LNET_PING_FEAT_INVAL; /* reset */ - - CDEBUG(D_NET, "set %s %d\n", libcfs_nid2str(lp->lp_nid), alive); -} - -static void -lnet_ni_notify_locked(struct lnet_ni *ni, struct lnet_peer *lp) -{ - int alive; - int notifylnd; - - /* - * Notify only in 1 thread at any time to ensure ordered notification. - * NB individual events can be missed; the only guarantee is that you - * always get the most recent news - */ - if (lp->lp_notifying || !ni) - return; - - lp->lp_notifying = 1; - - while (lp->lp_notify) { - alive = lp->lp_alive; - notifylnd = lp->lp_notifylnd; - - lp->lp_notifylnd = 0; - lp->lp_notify = 0; - - if (notifylnd && ni->ni_lnd->lnd_notify) { - lnet_net_unlock(lp->lp_cpt); - - /* - * A new notification could happen now; I'll handle it - * when control returns to me - */ - ni->ni_lnd->lnd_notify(ni, lp->lp_nid, alive); - - lnet_net_lock(lp->lp_cpt); - } - } - - lp->lp_notifying = 0; -} - -static void -lnet_rtr_addref_locked(struct lnet_peer *lp) -{ - LASSERT(lp->lp_refcount > 0); - LASSERT(lp->lp_rtr_refcount >= 0); - - /* lnet_net_lock must be exclusively locked */ - lp->lp_rtr_refcount++; - if (lp->lp_rtr_refcount == 1) { - struct list_head *pos; - - /* a simple insertion sort */ - list_for_each_prev(pos, &the_lnet.ln_routers) { - struct lnet_peer *rtr; - - rtr = list_entry(pos, struct lnet_peer, lp_rtr_list); - if (rtr->lp_nid < lp->lp_nid) - break; - } - - list_add(&lp->lp_rtr_list, pos); - /* addref for the_lnet.ln_routers */ - lnet_peer_addref_locked(lp); - the_lnet.ln_routers_version++; - } -} - -static void -lnet_rtr_decref_locked(struct lnet_peer *lp) -{ - LASSERT(lp->lp_refcount > 0); - LASSERT(lp->lp_rtr_refcount > 0); - - /* lnet_net_lock must be exclusively locked */ - lp->lp_rtr_refcount--; - if (!lp->lp_rtr_refcount) { - LASSERT(list_empty(&lp->lp_routes)); - - if (lp->lp_rcd) { - list_add(&lp->lp_rcd->rcd_list, - &the_lnet.ln_rcd_deathrow); - lp->lp_rcd = NULL; - } - - list_del(&lp->lp_rtr_list); - /* decref for the_lnet.ln_routers */ - lnet_peer_decref_locked(lp); - the_lnet.ln_routers_version++; - } -} - -struct lnet_remotenet * -lnet_find_net_locked(__u32 net) -{ - struct lnet_remotenet *rnet; - struct list_head *rn_list; - - LASSERT(!the_lnet.ln_shutdown); - - rn_list = lnet_net2rnethash(net); - list_for_each_entry(rnet, rn_list, lrn_list) { - if (rnet->lrn_net == net) - return rnet; - } - return NULL; -} - -static void 
lnet_shuffle_seed(void) -{ - static int seeded; - struct lnet_ni *ni; - - if (seeded) - return; - - /* - * Nodes with small feet have little entropy - * the NID for this node gives the most entropy in the low bits - */ - list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { - __u32 lnd_type, seed; - - lnd_type = LNET_NETTYP(LNET_NIDNET(ni->ni_nid)); - if (lnd_type != LOLND) { - seed = (LNET_NIDADDR(ni->ni_nid) | lnd_type); - add_device_randomness(&seed, sizeof(seed)); - } - } - - seeded = 1; -} - -/* NB expects LNET_LOCK held */ -static void -lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route) -{ - unsigned int len = 0; - unsigned int offset = 0; - struct list_head *e; - - lnet_shuffle_seed(); - - list_for_each(e, &rnet->lrn_routes) { - len++; - } - - /* len+1 positions to add a new entry */ - offset = prandom_u32_max(len + 1); - list_for_each(e, &rnet->lrn_routes) { - if (!offset) - break; - offset--; - } - list_add(&route->lr_list, e); - list_add(&route->lr_gwlist, &route->lr_gateway->lp_routes); - - the_lnet.ln_remote_nets_version++; - lnet_rtr_addref_locked(route->lr_gateway); -} - -int -lnet_add_route(__u32 net, __u32 hops, lnet_nid_t gateway, - unsigned int priority) -{ - struct list_head *e; - struct lnet_remotenet *rnet; - struct lnet_remotenet *rnet2; - struct lnet_route *route; - struct lnet_ni *ni; - int add_route; - int rc; - - CDEBUG(D_NET, "Add route: net %s hops %d priority %u gw %s\n", - libcfs_net2str(net), hops, priority, libcfs_nid2str(gateway)); - - if (gateway == LNET_NID_ANY || - LNET_NETTYP(LNET_NIDNET(gateway)) == LOLND || - net == LNET_NIDNET(LNET_NID_ANY) || - LNET_NETTYP(net) == LOLND || - LNET_NIDNET(gateway) == net || - (hops != LNET_UNDEFINED_HOPS && (hops < 1 || hops > 255))) - return -EINVAL; - - if (lnet_islocalnet(net)) /* it's a local network */ - return -EEXIST; - - /* Assume net, route, all new */ - route = kzalloc(sizeof(*route), GFP_NOFS); - rnet = kzalloc(sizeof(*rnet), GFP_NOFS); - if (!route || !rnet) { - CERROR("Out of memory creating route %s %d %s\n", - libcfs_net2str(net), hops, libcfs_nid2str(gateway)); - kfree(route); - kfree(rnet); - return -ENOMEM; - } - - INIT_LIST_HEAD(&rnet->lrn_routes); - rnet->lrn_net = net; - route->lr_hops = hops; - route->lr_net = net; - route->lr_priority = priority; - - lnet_net_lock(LNET_LOCK_EX); - - rc = lnet_nid2peer_locked(&route->lr_gateway, gateway, LNET_LOCK_EX); - if (rc) { - lnet_net_unlock(LNET_LOCK_EX); - - kfree(route); - kfree(rnet); - - if (rc == -EHOSTUNREACH) /* gateway is not on a local net */ - return rc; /* ignore the route entry */ - CERROR("Error %d creating route %s %d %s\n", rc, - libcfs_net2str(net), hops, - libcfs_nid2str(gateway)); - return rc; - } - - LASSERT(!the_lnet.ln_shutdown); - - rnet2 = lnet_find_net_locked(net); - if (!rnet2) { - /* new network */ - list_add_tail(&rnet->lrn_list, lnet_net2rnethash(net)); - rnet2 = rnet; - } - - /* Search for a duplicate route (it's a NOOP if it is) */ - add_route = 1; - list_for_each(e, &rnet2->lrn_routes) { - struct lnet_route *route2; - - route2 = list_entry(e, struct lnet_route, lr_list); - if (route2->lr_gateway == route->lr_gateway) { - add_route = 0; - break; - } - - /* our lookups must be true */ - LASSERT(route2->lr_gateway->lp_nid != gateway); - } - - if (add_route) { - lnet_peer_addref_locked(route->lr_gateway); /* +1 for notify */ - lnet_add_route_to_rnet(rnet2, route); - - ni = route->lr_gateway->lp_ni; - lnet_net_unlock(LNET_LOCK_EX); - - /* XXX Assume alive */ - if (ni->ni_lnd->lnd_notify) - 
ni->ni_lnd->lnd_notify(ni, gateway, 1); - - lnet_net_lock(LNET_LOCK_EX); - } - - /* -1 for notify or !add_route */ - lnet_peer_decref_locked(route->lr_gateway); - lnet_net_unlock(LNET_LOCK_EX); - rc = 0; - - if (!add_route) { - rc = -EEXIST; - kfree(route); - } - - if (rnet != rnet2) - kfree(rnet); - - /* indicate to startup the router checker if configured */ - wake_up(&the_lnet.ln_rc_waitq); - - return rc; -} - -int -lnet_check_routes(void) -{ - struct lnet_remotenet *rnet; - struct lnet_route *route; - struct lnet_route *route2; - struct list_head *e1; - struct list_head *e2; - int cpt; - struct list_head *rn_list; - int i; - - cpt = lnet_net_lock_current(); - - for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { - rn_list = &the_lnet.ln_remote_nets_hash[i]; - list_for_each(e1, rn_list) { - rnet = list_entry(e1, struct lnet_remotenet, lrn_list); - - route2 = NULL; - list_for_each(e2, &rnet->lrn_routes) { - lnet_nid_t nid1; - lnet_nid_t nid2; - int net; - - route = list_entry(e2, struct lnet_route, lr_list); - - if (!route2) { - route2 = route; - continue; - } - - if (route->lr_gateway->lp_ni == - route2->lr_gateway->lp_ni) - continue; - - nid1 = route->lr_gateway->lp_nid; - nid2 = route2->lr_gateway->lp_nid; - net = rnet->lrn_net; - - lnet_net_unlock(cpt); - - CERROR("Routes to %s via %s and %s not supported\n", - libcfs_net2str(net), - libcfs_nid2str(nid1), - libcfs_nid2str(nid2)); - return -EINVAL; - } - } - } - - lnet_net_unlock(cpt); - return 0; -} - -int -lnet_del_route(__u32 net, lnet_nid_t gw_nid) -{ - struct lnet_peer *gateway; - struct lnet_remotenet *rnet; - struct lnet_route *route; - struct list_head *e1; - struct list_head *e2; - int rc = -ENOENT; - struct list_head *rn_list; - int idx = 0; - - CDEBUG(D_NET, "Del route: net %s : gw %s\n", - libcfs_net2str(net), libcfs_nid2str(gw_nid)); - - /* - * NB Caller may specify either all routes via the given gateway - * or a specific route entry actual NIDs) - */ - lnet_net_lock(LNET_LOCK_EX); - if (net == LNET_NIDNET(LNET_NID_ANY)) - rn_list = &the_lnet.ln_remote_nets_hash[0]; - else - rn_list = lnet_net2rnethash(net); - - again: - list_for_each(e1, rn_list) { - rnet = list_entry(e1, struct lnet_remotenet, lrn_list); - - if (!(net == LNET_NIDNET(LNET_NID_ANY) || - net == rnet->lrn_net)) - continue; - - list_for_each(e2, &rnet->lrn_routes) { - route = list_entry(e2, struct lnet_route, lr_list); - - gateway = route->lr_gateway; - if (!(gw_nid == LNET_NID_ANY || - gw_nid == gateway->lp_nid)) - continue; - - list_del(&route->lr_list); - list_del(&route->lr_gwlist); - the_lnet.ln_remote_nets_version++; - - if (list_empty(&rnet->lrn_routes)) - list_del(&rnet->lrn_list); - else - rnet = NULL; - - lnet_rtr_decref_locked(gateway); - lnet_peer_decref_locked(gateway); - - lnet_net_unlock(LNET_LOCK_EX); - - kfree(route); - kfree(rnet); - - rc = 0; - lnet_net_lock(LNET_LOCK_EX); - goto again; - } - } - - if (net == LNET_NIDNET(LNET_NID_ANY) && - ++idx < LNET_REMOTE_NETS_HASH_SIZE) { - rn_list = &the_lnet.ln_remote_nets_hash[idx]; - goto again; - } - lnet_net_unlock(LNET_LOCK_EX); - - return rc; -} - -void -lnet_destroy_routes(void) -{ - lnet_del_route(LNET_NIDNET(LNET_NID_ANY), LNET_NID_ANY); -} - -int lnet_get_rtr_pool_cfg(int idx, struct lnet_ioctl_pool_cfg *pool_cfg) -{ - int i, rc = -ENOENT, j; - - if (!the_lnet.ln_rtrpools) - return rc; - - for (i = 0; i < LNET_NRBPOOLS; i++) { - struct lnet_rtrbufpool *rbp; - - lnet_net_lock(LNET_LOCK_EX); - cfs_percpt_for_each(rbp, j, the_lnet.ln_rtrpools) { - if (i++ != idx) - continue; - - 
pool_cfg->pl_pools[i].pl_npages = rbp[i].rbp_npages; - pool_cfg->pl_pools[i].pl_nbuffers = rbp[i].rbp_nbuffers; - pool_cfg->pl_pools[i].pl_credits = rbp[i].rbp_credits; - pool_cfg->pl_pools[i].pl_mincredits = rbp[i].rbp_mincredits; - rc = 0; - break; - } - lnet_net_unlock(LNET_LOCK_EX); - } - - lnet_net_lock(LNET_LOCK_EX); - pool_cfg->pl_routing = the_lnet.ln_routing; - lnet_net_unlock(LNET_LOCK_EX); - - return rc; -} - -int -lnet_get_route(int idx, __u32 *net, __u32 *hops, - lnet_nid_t *gateway, __u32 *alive, __u32 *priority) -{ - struct list_head *e1; - struct list_head *e2; - struct lnet_remotenet *rnet; - struct lnet_route *route; - int cpt; - int i; - struct list_head *rn_list; - - cpt = lnet_net_lock_current(); - - for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE; i++) { - rn_list = &the_lnet.ln_remote_nets_hash[i]; - list_for_each(e1, rn_list) { - rnet = list_entry(e1, struct lnet_remotenet, lrn_list); - - list_for_each(e2, &rnet->lrn_routes) { - route = list_entry(e2, struct lnet_route, - lr_list); - - if (!idx--) { - *net = rnet->lrn_net; - *hops = route->lr_hops; - *priority = route->lr_priority; - *gateway = route->lr_gateway->lp_nid; - *alive = lnet_is_route_alive(route); - lnet_net_unlock(cpt); - return 0; - } - } - } - } - - lnet_net_unlock(cpt); - return -ENOENT; -} - -void -lnet_swap_pinginfo(struct lnet_ping_info *info) -{ - int i; - struct lnet_ni_status *stat; - - __swab32s(&info->pi_magic); - __swab32s(&info->pi_features); - __swab32s(&info->pi_pid); - __swab32s(&info->pi_nnis); - for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { - stat = &info->pi_ni[i]; - __swab64s(&stat->ns_nid); - __swab32s(&stat->ns_status); - } -} - -/** - * parse router-checker pinginfo, record number of down NIs for remote - * networks on that router. - */ -static void -lnet_parse_rc_info(struct lnet_rc_data *rcd) -{ - struct lnet_ping_info *info = rcd->rcd_pinginfo; - struct lnet_peer *gw = rcd->rcd_gateway; - struct lnet_route *rte; - - if (!gw->lp_alive) - return; - - if (info->pi_magic == __swab32(LNET_PROTO_PING_MAGIC)) - lnet_swap_pinginfo(info); - - /* NB always racing with network! 
*/ - if (info->pi_magic != LNET_PROTO_PING_MAGIC) { - CDEBUG(D_NET, "%s: Unexpected magic %08x\n", - libcfs_nid2str(gw->lp_nid), info->pi_magic); - gw->lp_ping_feats = LNET_PING_FEAT_INVAL; - return; - } - - gw->lp_ping_feats = info->pi_features; - if (!(gw->lp_ping_feats & LNET_PING_FEAT_MASK)) { - CDEBUG(D_NET, "%s: Unexpected features 0x%x\n", - libcfs_nid2str(gw->lp_nid), gw->lp_ping_feats); - return; /* nothing I can understand */ - } - - if (!(gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS)) - return; /* can't carry NI status info */ - - list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) { - int down = 0; - int up = 0; - int i; - - if (gw->lp_ping_feats & LNET_PING_FEAT_RTE_DISABLED) { - rte->lr_downis = 1; - continue; - } - - for (i = 0; i < info->pi_nnis && i < LNET_MAX_RTR_NIS; i++) { - struct lnet_ni_status *stat = &info->pi_ni[i]; - lnet_nid_t nid = stat->ns_nid; - - if (nid == LNET_NID_ANY) { - CDEBUG(D_NET, "%s: unexpected LNET_NID_ANY\n", - libcfs_nid2str(gw->lp_nid)); - gw->lp_ping_feats = LNET_PING_FEAT_INVAL; - return; - } - - if (LNET_NETTYP(LNET_NIDNET(nid)) == LOLND) - continue; - - if (stat->ns_status == LNET_NI_STATUS_DOWN) { - down++; - continue; - } - - if (stat->ns_status == LNET_NI_STATUS_UP) { - if (LNET_NIDNET(nid) == rte->lr_net) { - up = 1; - break; - } - continue; - } - - CDEBUG(D_NET, "%s: Unexpected status 0x%x\n", - libcfs_nid2str(gw->lp_nid), stat->ns_status); - gw->lp_ping_feats = LNET_PING_FEAT_INVAL; - return; - } - - if (up) { /* ignore downed NIs if NI for dest network is up */ - rte->lr_downis = 0; - continue; - } - /** - * if @down is zero and this route is single-hop, it means - * we can't find NI for target network - */ - if (!down && rte->lr_hops == 1) - down = 1; - - rte->lr_downis = down; - } -} - -static void -lnet_router_checker_event(struct lnet_event *event) -{ - struct lnet_rc_data *rcd = event->md.user_ptr; - struct lnet_peer *lp; - - LASSERT(rcd); - - if (event->unlinked) { - LNetInvalidateMDHandle(&rcd->rcd_mdh); - return; - } - - LASSERT(event->type == LNET_EVENT_SEND || - event->type == LNET_EVENT_REPLY); - - lp = rcd->rcd_gateway; - LASSERT(lp); - - /* - * NB: it's called with holding lnet_res_lock, we have a few - * places need to hold both locks at the same time, please take - * care of lock ordering - */ - lnet_net_lock(lp->lp_cpt); - if (!lnet_isrouter(lp) || lp->lp_rcd != rcd) { - /* ignore if no longer a router or rcd is replaced */ - goto out; - } - - if (event->type == LNET_EVENT_SEND) { - lp->lp_ping_notsent = 0; - if (!event->status) - goto out; - } - - /* LNET_EVENT_REPLY */ - /* - * A successful REPLY means the router is up. If _any_ comms - * to the router fail I assume it's down (this will happen if - * we ping alive routers to try to detect router death before - * apps get burned). - */ - lnet_notify_locked(lp, 1, !event->status, jiffies); - - /* - * The router checker will wake up very shortly and do the - * actual notification. - * XXX If 'lp' stops being a router before then, it will still - * have the notification pending!!! 
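The pi_magic test in lnet_parse_rc_info above doubles as endianness detection: a magic that arrives byte-reversed identifies a peer of opposite endianness, and every field of the reply is then swabbed. A standalone sketch of that check (0x70696e67 is a stand-in value here; the driver compares against LNET_PROTO_PING_MAGIC):

	#include <stdio.h>
	#include <stdint.h>

	#define PING_MAGIC 0x70696e67U	/* stand-in magic value */

	static uint32_t swab32(uint32_t v)
	{
		return (v >> 24) | ((v >> 8) & 0xff00U) |
		       ((v << 8) & 0xff0000U) | (v << 24);
	}

	int main(void)
	{
		uint32_t wire = swab32(PING_MAGIC);	/* as sent by a foreign peer */

		if (wire == swab32(PING_MAGIC))
			printf("foreign endianness: swab every field\n");
		else if (wire == PING_MAGIC)
			printf("same endianness: use fields as-is\n");
		else
			printf("not a ping reply\n");
		return 0;
	}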
- */ - if (avoid_asym_router_failure && !event->status) - lnet_parse_rc_info(rcd); - - out: - lnet_net_unlock(lp->lp_cpt); -} - -static void -lnet_wait_known_routerstate(void) -{ - struct lnet_peer *rtr; - struct list_head *entry; - int all_known; - - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - - for (;;) { - int cpt = lnet_net_lock_current(); - - all_known = 1; - list_for_each(entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, struct lnet_peer, lp_rtr_list); - - if (!rtr->lp_alive_count) { - all_known = 0; - break; - } - } - - lnet_net_unlock(cpt); - - if (all_known) - return; - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - } -} - -void -lnet_router_ni_update_locked(struct lnet_peer *gw, __u32 net) -{ - struct lnet_route *rte; - - if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS)) { - list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) { - if (rte->lr_net == net) { - rte->lr_downis = 0; - break; - } - } - } -} - -static void -lnet_update_ni_status_locked(void) -{ - struct lnet_ni *ni; - time64_t now; - int timeout; - - LASSERT(the_lnet.ln_routing); - - timeout = router_ping_timeout + - max(live_router_check_interval, dead_router_check_interval); - - now = ktime_get_real_seconds(); - list_for_each_entry(ni, &the_lnet.ln_nis, ni_list) { - if (ni->ni_lnd->lnd_type == LOLND) - continue; - - if (now < ni->ni_last_alive + timeout) - continue; - - lnet_ni_lock(ni); - /* re-check with lock */ - if (now < ni->ni_last_alive + timeout) { - lnet_ni_unlock(ni); - continue; - } - - LASSERT(ni->ni_status); - - if (ni->ni_status->ns_status != LNET_NI_STATUS_DOWN) { - CDEBUG(D_NET, "NI(%s:%d) status changed to down\n", - libcfs_nid2str(ni->ni_nid), timeout); - /* - * NB: so far, this is the only place to set - * NI status to "down" - */ - ni->ni_status->ns_status = LNET_NI_STATUS_DOWN; - } - lnet_ni_unlock(ni); - } -} - -static void -lnet_destroy_rc_data(struct lnet_rc_data *rcd) -{ - LASSERT(list_empty(&rcd->rcd_list)); - /* detached from network */ - LASSERT(LNetMDHandleIsInvalid(rcd->rcd_mdh)); - - if (rcd->rcd_gateway) { - int cpt = rcd->rcd_gateway->lp_cpt; - - lnet_net_lock(cpt); - lnet_peer_decref_locked(rcd->rcd_gateway); - lnet_net_unlock(cpt); - } - - kfree(rcd->rcd_pinginfo); - - kfree(rcd); -} - -static struct lnet_rc_data * -lnet_create_rc_data_locked(struct lnet_peer *gateway) -{ - struct lnet_rc_data *rcd = NULL; - struct lnet_ping_info *pi; - struct lnet_md md; - int rc; - int i; - - lnet_net_unlock(gateway->lp_cpt); - - rcd = kzalloc(sizeof(*rcd), GFP_NOFS); - if (!rcd) - goto out; - - LNetInvalidateMDHandle(&rcd->rcd_mdh); - INIT_LIST_HEAD(&rcd->rcd_list); - - pi = kzalloc(LNET_PINGINFO_SIZE, GFP_NOFS); - if (!pi) - goto out; - - for (i = 0; i < LNET_MAX_RTR_NIS; i++) { - pi->pi_ni[i].ns_nid = LNET_NID_ANY; - pi->pi_ni[i].ns_status = LNET_NI_STATUS_INVALID; - } - rcd->rcd_pinginfo = pi; - - md.start = pi; - md.user_ptr = rcd; - md.length = LNET_PINGINFO_SIZE; - md.threshold = LNET_MD_THRESH_INF; - md.options = LNET_MD_TRUNCATE; - md.eq_handle = the_lnet.ln_rc_eqh; - - LASSERT(!LNetEQHandleIsInvalid(the_lnet.ln_rc_eqh)); - rc = LNetMDBind(md, LNET_UNLINK, &rcd->rcd_mdh); - if (rc < 0) { - CERROR("Can't bind MD: %d\n", rc); - goto out; - } - LASSERT(!rc); - - lnet_net_lock(gateway->lp_cpt); - /* router table changed or someone has created rcd for this gateway */ - if (!lnet_isrouter(gateway) || gateway->lp_rcd) { - lnet_net_unlock(gateway->lp_cpt); - goto out; - } - - lnet_peer_addref_locked(gateway); - rcd->rcd_gateway = gateway; - 
gateway->lp_rcd = rcd; - gateway->lp_ping_notsent = 0; - - return rcd; - - out: - if (rcd) { - if (!LNetMDHandleIsInvalid(rcd->rcd_mdh)) { - rc = LNetMDUnlink(rcd->rcd_mdh); - LASSERT(!rc); - } - lnet_destroy_rc_data(rcd); - } - - lnet_net_lock(gateway->lp_cpt); - return gateway->lp_rcd; -} - -static int -lnet_router_check_interval(struct lnet_peer *rtr) -{ - int secs; - - secs = rtr->lp_alive ? live_router_check_interval : - dead_router_check_interval; - if (secs < 0) - secs = 0; - - return secs; -} - -static void -lnet_ping_router_locked(struct lnet_peer *rtr) -{ - struct lnet_rc_data *rcd = NULL; - unsigned long now = jiffies; - int secs; - - lnet_peer_addref_locked(rtr); - - if (rtr->lp_ping_deadline && /* ping timed out? */ - time_after(now, rtr->lp_ping_deadline)) - lnet_notify_locked(rtr, 1, 0, now); - - /* Run any outstanding notifications */ - lnet_ni_notify_locked(rtr->lp_ni, rtr); - - if (!lnet_isrouter(rtr) || - the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { - /* router table changed or router checker is shutting down */ - lnet_peer_decref_locked(rtr); - return; - } - - rcd = rtr->lp_rcd ? - rtr->lp_rcd : lnet_create_rc_data_locked(rtr); - - if (!rcd) - return; - - secs = lnet_router_check_interval(rtr); - - CDEBUG(D_NET, - "rtr %s %d: deadline %lu ping_notsent %d alive %d alive_count %d lp_ping_timestamp %lu\n", - libcfs_nid2str(rtr->lp_nid), secs, - rtr->lp_ping_deadline, rtr->lp_ping_notsent, - rtr->lp_alive, rtr->lp_alive_count, rtr->lp_ping_timestamp); - - if (secs && !rtr->lp_ping_notsent && - time_after(now, rtr->lp_ping_timestamp + secs * HZ)) { - int rc; - struct lnet_process_id id; - struct lnet_handle_md mdh; - - id.nid = rtr->lp_nid; - id.pid = LNET_PID_LUSTRE; - CDEBUG(D_NET, "Check: %s\n", libcfs_id2str(id)); - - rtr->lp_ping_notsent = 1; - rtr->lp_ping_timestamp = now; - - mdh = rcd->rcd_mdh; - - if (!rtr->lp_ping_deadline) { - rtr->lp_ping_deadline = - jiffies + router_ping_timeout * HZ; - } - - lnet_net_unlock(rtr->lp_cpt); - - rc = LNetGet(LNET_NID_ANY, mdh, id, LNET_RESERVED_PORTAL, - LNET_PROTO_PING_MATCHBITS, 0); - - lnet_net_lock(rtr->lp_cpt); - if (rc) - rtr->lp_ping_notsent = 0; /* no event pending */ - } - - lnet_peer_decref_locked(rtr); -} - -int -lnet_router_checker_start(void) -{ - struct task_struct *task; - int rc; - int eqsz = 0; - - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); - - if (check_routers_before_use && - dead_router_check_interval <= 0) { - LCONSOLE_ERROR_MSG(0x10a, "'dead_router_check_interval' must be set if 'check_routers_before_use' is set\n"); - return -EINVAL; - } - - init_completion(&the_lnet.ln_rc_signal); - - rc = LNetEQAlloc(0, lnet_router_checker_event, &the_lnet.ln_rc_eqh); - if (rc) { - CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); - return -ENOMEM; - } - - the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; - task = kthread_run(lnet_router_checker, NULL, "router_checker"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("Can't start router checker thread: %d\n", rc); - /* block until event callback signals exit */ - wait_for_completion(&the_lnet.ln_rc_signal); - rc = LNetEQFree(the_lnet.ln_rc_eqh); - LASSERT(!rc); - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - return -ENOMEM; - } - - if (check_routers_before_use) { - /* - * Note that a helpful side-effect of pinging all known routers - * at startup is that it makes them drop stale connections they - * may have to a previous instance of me. 
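The pacing logic in lnet_ping_router_locked above hinges on the kernel's wraparound-safe time_after() comparison: a router is pinged again only once "now" has moved past the last ping plus the check interval. A user-space sketch of the same check (HZ and the jiffies counter are simplified stand-ins for the kernel's):

	#include <stdio.h>

	#define HZ 100
	/* wraparound-safe comparison, same idea as the kernel macro */
	#define time_after(a, b) ((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long now = 1000;	/* pretend jiffies */
		unsigned long last_ping = 550;
		int secs = 4;			/* check interval, seconds */

		if (time_after(now, last_ping + secs * HZ))
			printf("interval expired: send ping\n");
		else
			printf("too soon: skip\n");
		return 0;
	}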
- */ - lnet_wait_known_routerstate(); - } - - return 0; -} - -void -lnet_router_checker_stop(void) -{ - int rc; - - if (the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN) - return; - - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); - the_lnet.ln_rc_state = LNET_RC_STATE_STOPPING; - /* wakeup the RC thread if it's sleeping */ - wake_up(&the_lnet.ln_rc_waitq); - - /* block until event callback signals exit */ - wait_for_completion(&the_lnet.ln_rc_signal); - LASSERT(the_lnet.ln_rc_state == LNET_RC_STATE_SHUTDOWN); - - rc = LNetEQFree(the_lnet.ln_rc_eqh); - LASSERT(!rc); -} - -static void -lnet_prune_rc_data(int wait_unlink) -{ - struct lnet_rc_data *rcd; - struct lnet_rc_data *tmp; - struct lnet_peer *lp; - struct list_head head; - int i = 2; - - if (likely(the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING && - list_empty(&the_lnet.ln_rcd_deathrow) && - list_empty(&the_lnet.ln_rcd_zombie))) - return; - - INIT_LIST_HEAD(&head); - - lnet_net_lock(LNET_LOCK_EX); - - if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) { - /* router checker is stopping, prune all */ - list_for_each_entry(lp, &the_lnet.ln_routers, - lp_rtr_list) { - if (!lp->lp_rcd) - continue; - - LASSERT(list_empty(&lp->lp_rcd->rcd_list)); - list_add(&lp->lp_rcd->rcd_list, - &the_lnet.ln_rcd_deathrow); - lp->lp_rcd = NULL; - } - } - - /* unlink all RCDs on deathrow list */ - list_splice_init(&the_lnet.ln_rcd_deathrow, &head); - - if (!list_empty(&head)) { - lnet_net_unlock(LNET_LOCK_EX); - - list_for_each_entry(rcd, &head, rcd_list) - LNetMDUnlink(rcd->rcd_mdh); - - lnet_net_lock(LNET_LOCK_EX); - } - - list_splice_init(&head, &the_lnet.ln_rcd_zombie); - - /* release all zombie RCDs */ - while (!list_empty(&the_lnet.ln_rcd_zombie)) { - list_for_each_entry_safe(rcd, tmp, &the_lnet.ln_rcd_zombie, - rcd_list) { - if (LNetMDHandleIsInvalid(rcd->rcd_mdh)) - list_move(&rcd->rcd_list, &head); - } - - wait_unlink = wait_unlink && - !list_empty(&the_lnet.ln_rcd_zombie); - - lnet_net_unlock(LNET_LOCK_EX); - - while (!list_empty(&head)) { - rcd = list_entry(head.next, - struct lnet_rc_data, rcd_list); - list_del_init(&rcd->rcd_list); - lnet_destroy_rc_data(rcd); - } - - if (!wait_unlink) - return; - - i++; - CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, - "Waiting for rc buffers to unlink\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ / 4); - - lnet_net_lock(LNET_LOCK_EX); - } - - lnet_net_unlock(LNET_LOCK_EX); -} - -/* - * This function is called to check if the RC should block indefinitely. - * It's called from lnet_router_checker() as well as being passed to - * wait_event_interruptible() to avoid the lost wake_up problem. 
- * - * When it's called from wait_event_interruptible() it is necessary to - * also not sleep if the rc state is not running to avoid a deadlock - * when the system is shutting down - */ -static inline bool -lnet_router_checker_active(void) -{ - if (the_lnet.ln_rc_state != LNET_RC_STATE_RUNNING) - return true; - - /* - * Router Checker thread needs to run when routing is enabled in - * order to call lnet_update_ni_status_locked() - */ - if (the_lnet.ln_routing) - return true; - - return !list_empty(&the_lnet.ln_routers) && - (live_router_check_interval > 0 || - dead_router_check_interval > 0); -} - -static int -lnet_router_checker(void *arg) -{ - struct lnet_peer *rtr; - struct list_head *entry; - - while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { - __u64 version; - int cpt; - int cpt2; - - cpt = lnet_net_lock_current(); -rescan: - version = the_lnet.ln_routers_version; - - list_for_each(entry, &the_lnet.ln_routers) { - rtr = list_entry(entry, struct lnet_peer, lp_rtr_list); - - cpt2 = lnet_cpt_of_nid_locked(rtr->lp_nid); - if (cpt != cpt2) { - lnet_net_unlock(cpt); - cpt = cpt2; - lnet_net_lock(cpt); - /* the routers list has changed */ - if (version != the_lnet.ln_routers_version) - goto rescan; - } - - lnet_ping_router_locked(rtr); - - /* NB dropped lock */ - if (version != the_lnet.ln_routers_version) { - /* the routers list has changed */ - goto rescan; - } - } - - if (the_lnet.ln_routing) - lnet_update_ni_status_locked(); - - lnet_net_unlock(cpt); - - lnet_prune_rc_data(0); /* don't wait for UNLINK */ - - /* - * Call schedule_timeout() here always adds 1 to load average - * because kernel counts # active tasks as nr_running - * + nr_uninterruptible. - */ - /* - * if there are any routes then wakeup every second. If - * there are no routes then sleep indefinitely until woken - * up by a user adding a route - */ - if (!lnet_router_checker_active()) - wait_event_interruptible(the_lnet.ln_rc_waitq, - lnet_router_checker_active()); - else - wait_event_interruptible_timeout(the_lnet.ln_rc_waitq, - false, - HZ); - } - - lnet_prune_rc_data(1); /* wait for UNLINK */ - - the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; - complete(&the_lnet.ln_rc_signal); - /* The unlink event callback will signal final completion */ - return 0; -} - -void -lnet_destroy_rtrbuf(struct lnet_rtrbuf *rb, int npages) -{ - while (--npages >= 0) - __free_page(rb->rb_kiov[npages].bv_page); - - kfree(rb); -} - -static struct lnet_rtrbuf * -lnet_new_rtrbuf(struct lnet_rtrbufpool *rbp, int cpt) -{ - int npages = rbp->rbp_npages; - int sz = offsetof(struct lnet_rtrbuf, rb_kiov[npages]); - struct page *page; - struct lnet_rtrbuf *rb; - int i; - - rb = kzalloc_cpt(sz, GFP_NOFS, cpt); - if (!rb) - return NULL; - - rb->rb_pool = rbp; - - for (i = 0; i < npages; i++) { - page = alloc_pages_node( - cfs_cpt_spread_node(lnet_cpt_table(), cpt), - GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { - while (--i >= 0) - __free_page(rb->rb_kiov[i].bv_page); - - kfree(rb); - return NULL; - } - - rb->rb_kiov[i].bv_len = PAGE_SIZE; - rb->rb_kiov[i].bv_offset = 0; - rb->rb_kiov[i].bv_page = page; - } - - return rb; -} - -static void -lnet_rtrpool_free_bufs(struct lnet_rtrbufpool *rbp, int cpt) -{ - int npages = rbp->rbp_npages; - struct list_head tmp; - struct lnet_rtrbuf *rb; - struct lnet_rtrbuf *temp; - - if (!rbp->rbp_nbuffers) /* not initialized or already freed */ - return; - - INIT_LIST_HEAD(&tmp); - - lnet_net_lock(cpt); - lnet_drop_routed_msgs_locked(&rbp->rbp_msgs, cpt); - list_splice_init(&rbp->rbp_bufs, &tmp); - 
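lnet_new_rtrbuf above shows the standard partial-allocation unwind: when the i-th page fails, exactly the i pages already obtained are released before bailing out. A user-space sketch of the idiom (malloc stands in for alloc_pages_node):

	#include <stdlib.h>

	static void **alloc_pages_array(int npages)
	{
		void **pages = calloc(npages, sizeof(*pages));
		int i;

		if (!pages)
			return NULL;

		for (i = 0; i < npages; i++) {
			pages[i] = malloc(4096);
			if (!pages[i]) {
				while (--i >= 0)	/* unwind what we got */
					free(pages[i]);
				free(pages);
				return NULL;
			}
		}
		return pages;
	}

	int main(void)
	{
		void **pages = alloc_pages_array(4);
		int i;

		if (pages) {
			for (i = 0; i < 4; i++)
				free(pages[i]);
			free(pages);
		}
		return 0;
	}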
rbp->rbp_req_nbuffers = 0; - rbp->rbp_nbuffers = 0; - rbp->rbp_credits = 0; - rbp->rbp_mincredits = 0; - lnet_net_unlock(cpt); - - /* Free buffers on the free list. */ - list_for_each_entry_safe(rb, temp, &tmp, rb_list) { - list_del(&rb->rb_list); - lnet_destroy_rtrbuf(rb, npages); - } -} - -static int -lnet_rtrpool_adjust_bufs(struct lnet_rtrbufpool *rbp, int nbufs, int cpt) -{ - struct list_head rb_list; - struct lnet_rtrbuf *rb; - int num_rb; - int num_buffers = 0; - int old_req_nbufs; - int npages = rbp->rbp_npages; - - lnet_net_lock(cpt); - /* - * If we are called for less buffers than already in the pool, we - * just lower the req_nbuffers number and excess buffers will be - * thrown away as they are returned to the free list. Credits - * then get adjusted as well. - * If we already have enough buffers allocated to serve the - * increase requested, then we can treat that the same way as we - * do the decrease. - */ - num_rb = nbufs - rbp->rbp_nbuffers; - if (nbufs <= rbp->rbp_req_nbuffers || num_rb <= 0) { - rbp->rbp_req_nbuffers = nbufs; - lnet_net_unlock(cpt); - return 0; - } - /* - * store the older value of rbp_req_nbuffers and then set it to - * the new request to prevent lnet_return_rx_credits_locked() from - * freeing buffers that we need to keep around - */ - old_req_nbufs = rbp->rbp_req_nbuffers; - rbp->rbp_req_nbuffers = nbufs; - lnet_net_unlock(cpt); - - INIT_LIST_HEAD(&rb_list); - - /* - * allocate the buffers on a local list first. If all buffers are - * allocated successfully then join this list to the rbp buffer - * list. If not then free all allocated buffers. - */ - while (num_rb-- > 0) { - rb = lnet_new_rtrbuf(rbp, cpt); - if (!rb) { - CERROR("Failed to allocate %d route bufs of %d pages\n", - nbufs, npages); - - lnet_net_lock(cpt); - rbp->rbp_req_nbuffers = old_req_nbufs; - lnet_net_unlock(cpt); - - goto failed; - } - - list_add(&rb->rb_list, &rb_list); - num_buffers++; - } - - lnet_net_lock(cpt); - - list_splice_tail(&rb_list, &rbp->rbp_bufs); - rbp->rbp_nbuffers += num_buffers; - rbp->rbp_credits += num_buffers; - rbp->rbp_mincredits = rbp->rbp_credits; - /* - * We need to schedule blocked msg using the newly - * added buffers. 
- */ - while (!list_empty(&rbp->rbp_bufs) && - !list_empty(&rbp->rbp_msgs)) - lnet_schedule_blocked_locked(rbp); - - lnet_net_unlock(cpt); - - return 0; - -failed: - while (!list_empty(&rb_list)) { - rb = list_entry(rb_list.next, struct lnet_rtrbuf, rb_list); - list_del(&rb->rb_list); - lnet_destroy_rtrbuf(rb, npages); - } - - return -ENOMEM; -} - -static void -lnet_rtrpool_init(struct lnet_rtrbufpool *rbp, int npages) -{ - INIT_LIST_HEAD(&rbp->rbp_msgs); - INIT_LIST_HEAD(&rbp->rbp_bufs); - - rbp->rbp_npages = npages; - rbp->rbp_credits = 0; - rbp->rbp_mincredits = 0; -} - -void -lnet_rtrpools_free(int keep_pools) -{ - struct lnet_rtrbufpool *rtrp; - int i; - - if (!the_lnet.ln_rtrpools) /* uninitialized or freed */ - return; - - cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { - lnet_rtrpool_free_bufs(&rtrp[LNET_TINY_BUF_IDX], i); - lnet_rtrpool_free_bufs(&rtrp[LNET_SMALL_BUF_IDX], i); - lnet_rtrpool_free_bufs(&rtrp[LNET_LARGE_BUF_IDX], i); - } - - if (!keep_pools) { - cfs_percpt_free(the_lnet.ln_rtrpools); - the_lnet.ln_rtrpools = NULL; - } -} - -static int -lnet_nrb_tiny_calculate(void) -{ - int nrbs = LNET_NRB_TINY; - - if (tiny_router_buffers < 0) { - LCONSOLE_ERROR_MSG(0x10c, - "tiny_router_buffers=%d invalid when routing enabled\n", - tiny_router_buffers); - return -EINVAL; - } - - if (tiny_router_buffers > 0) - nrbs = tiny_router_buffers; - - nrbs /= LNET_CPT_NUMBER; - return max(nrbs, LNET_NRB_TINY_MIN); -} - -static int -lnet_nrb_small_calculate(void) -{ - int nrbs = LNET_NRB_SMALL; - - if (small_router_buffers < 0) { - LCONSOLE_ERROR_MSG(0x10c, - "small_router_buffers=%d invalid when routing enabled\n", - small_router_buffers); - return -EINVAL; - } - - if (small_router_buffers > 0) - nrbs = small_router_buffers; - - nrbs /= LNET_CPT_NUMBER; - return max(nrbs, LNET_NRB_SMALL_MIN); -} - -static int -lnet_nrb_large_calculate(void) -{ - int nrbs = LNET_NRB_LARGE; - - if (large_router_buffers < 0) { - LCONSOLE_ERROR_MSG(0x10c, - "large_router_buffers=%d invalid when routing enabled\n", - large_router_buffers); - return -EINVAL; - } - - if (large_router_buffers > 0) - nrbs = large_router_buffers; - - nrbs /= LNET_CPT_NUMBER; - return max(nrbs, LNET_NRB_LARGE_MIN); -} - -int -lnet_rtrpools_alloc(int im_a_router) -{ - struct lnet_rtrbufpool *rtrp; - int nrb_tiny; - int nrb_small; - int nrb_large; - int rc; - int i; - - if (!strcmp(forwarding, "")) { - /* not set either way */ - if (!im_a_router) - return 0; - } else if (!strcmp(forwarding, "disabled")) { - /* explicitly disabled */ - return 0; - } else if (!strcmp(forwarding, "enabled")) { - /* explicitly enabled */ - } else { - LCONSOLE_ERROR_MSG(0x10b, "'forwarding' not set to either 'enabled' or 'disabled'\n"); - return -EINVAL; - } - - nrb_tiny = lnet_nrb_tiny_calculate(); - if (nrb_tiny < 0) - return -EINVAL; - - nrb_small = lnet_nrb_small_calculate(); - if (nrb_small < 0) - return -EINVAL; - - nrb_large = lnet_nrb_large_calculate(); - if (nrb_large < 0) - return -EINVAL; - - the_lnet.ln_rtrpools = cfs_percpt_alloc(lnet_cpt_table(), - LNET_NRBPOOLS * - sizeof(struct lnet_rtrbufpool)); - if (!the_lnet.ln_rtrpools) { - LCONSOLE_ERROR_MSG(0x10c, - "Failed to initialize router buffe pool\n"); - return -ENOMEM; - } - - cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { - lnet_rtrpool_init(&rtrp[LNET_TINY_BUF_IDX], 0); - rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], - nrb_tiny, i); - if (rc) - goto failed; - - lnet_rtrpool_init(&rtrp[LNET_SMALL_BUF_IDX], - LNET_NRB_SMALL_PAGES); - rc = 
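[Editor's note] The three lnet_nrb_*_calculate() helpers above share one rule: the module parameter (or its built-in default) is a node-wide budget that gets divided evenly across CPU partitions and then clamped to a per-partition floor. A compilable sketch with illustrative numbers (the real defaults are LNET_NRB_TINY/SMALL/LARGE and their *_MIN floors):

#include <stdio.h>

/* Split a node-wide buffer budget across `ncpts` partitions, falling
 * back to `def` when the tunable is unset (0) and enforcing a
 * per-partition minimum. Returns <0 for a negative tunable, which the
 * callers above turn into -EINVAL. */
static int nrb_calculate(int tunable, int def, int min, int ncpts)
{
    int nrbs;

    if (tunable < 0)
        return -1;

    nrbs = tunable > 0 ? tunable : def;
    nrbs /= ncpts;                     /* per-partition share */
    return nrbs > min ? nrbs : min;    /* clamp to the floor */
}

int main(void)
{
    /* e.g. small_router_buffers=16384 on a 4-partition node */
    printf("%d per CPT\n", nrb_calculate(16384, 8192, 4096, 4));
    return 0;
}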
lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], - nrb_small, i); - if (rc) - goto failed; - - lnet_rtrpool_init(&rtrp[LNET_LARGE_BUF_IDX], - LNET_NRB_LARGE_PAGES); - rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], - nrb_large, i); - if (rc) - goto failed; - } - - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_routing = 1; - lnet_net_unlock(LNET_LOCK_EX); - - return 0; - - failed: - lnet_rtrpools_free(0); - return rc; -} - -static int -lnet_rtrpools_adjust_helper(int tiny, int small, int large) -{ - int nrb = 0; - int rc = 0; - int i; - struct lnet_rtrbufpool *rtrp; - - /* - * If the provided values for each buffer pool are different than the - * configured values, we need to take action. - */ - if (tiny >= 0) { - tiny_router_buffers = tiny; - nrb = lnet_nrb_tiny_calculate(); - cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { - rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_TINY_BUF_IDX], - nrb, i); - if (rc) - return rc; - } - } - if (small >= 0) { - small_router_buffers = small; - nrb = lnet_nrb_small_calculate(); - cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { - rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_SMALL_BUF_IDX], - nrb, i); - if (rc) - return rc; - } - } - if (large >= 0) { - large_router_buffers = large; - nrb = lnet_nrb_large_calculate(); - cfs_percpt_for_each(rtrp, i, the_lnet.ln_rtrpools) { - rc = lnet_rtrpool_adjust_bufs(&rtrp[LNET_LARGE_BUF_IDX], - nrb, i); - if (rc) - return rc; - } - } - - return 0; -} - -int -lnet_rtrpools_adjust(int tiny, int small, int large) -{ - /* - * this function doesn't revert the changes if adding new buffers - * failed. It's up to the user space caller to revert the - * changes. - */ - if (!the_lnet.ln_routing) - return 0; - - return lnet_rtrpools_adjust_helper(tiny, small, large); -} - -int -lnet_rtrpools_enable(void) -{ - int rc = 0; - - if (the_lnet.ln_routing) - return 0; - - if (!the_lnet.ln_rtrpools) - /* - * If routing is turned off, and we have never - * initialized the pools before, just call the - * standard buffer pool allocation routine as - * if we are just configuring this for the first - * time. - */ - rc = lnet_rtrpools_alloc(1); - else - rc = lnet_rtrpools_adjust_helper(0, 0, 0); - if (rc) - return rc; - - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_routing = 1; - - the_lnet.ln_ping_info->pi_features &= ~LNET_PING_FEAT_RTE_DISABLED; - lnet_net_unlock(LNET_LOCK_EX); - - return rc; -} - -void -lnet_rtrpools_disable(void) -{ - if (!the_lnet.ln_routing) - return; - - lnet_net_lock(LNET_LOCK_EX); - the_lnet.ln_routing = 0; - the_lnet.ln_ping_info->pi_features |= LNET_PING_FEAT_RTE_DISABLED; - - tiny_router_buffers = 0; - small_router_buffers = 0; - large_router_buffers = 0; - lnet_net_unlock(LNET_LOCK_EX); - lnet_rtrpools_free(1); -} - -int -lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, int alive, unsigned long when) -{ - struct lnet_peer *lp = NULL; - unsigned long now = jiffies; - int cpt = lnet_cpt_of_nid(nid); - - LASSERT(!in_interrupt()); - - CDEBUG(D_NET, "%s notifying %s: %s\n", - !ni ? "userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), - alive ? "up" : "down"); - - if (ni && - LNET_NIDNET(ni->ni_nid) != LNET_NIDNET(nid)) { - CWARN("Ignoring notification of %s %s by %s (different net)\n", - libcfs_nid2str(nid), alive ? "birth" : "death", - libcfs_nid2str(ni->ni_nid)); - return -EINVAL; - } - - /* can't do predictions... */ - if (time_after(when, now)) { - CWARN("Ignoring prediction from %s of %s %s %ld seconds in the future\n", - !ni ? 
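[Editor's note] In lnet_rtrpools_adjust_helper() above, a negative value means "leave that buffer class alone", which is why lnet_rtrpools_adjust() can resize one pool class at a time and why the enable path can pass (0, 0, 0) to mean "re-apply whatever the current tunables say". A toy illustration of the sentinel convention only (the real helper also recalculates per-CPT counts):

#include <stdio.h>

/* Negative = keep the current setting; >= 0 = apply a new budget. */
static void pools_adjust(int *cur_tiny, int *cur_small, int *cur_large,
                         int tiny, int small, int large)
{
    if (tiny >= 0)
        *cur_tiny = tiny;
    if (small >= 0)
        *cur_small = small;
    if (large >= 0)
        *cur_large = large;
}

int main(void)
{
    int t = 512, s = 4096, l = 256;

    pools_adjust(&t, &s, &l, -1, 8192, -1);  /* grow only "small" */
    printf("%d %d %d\n", t, s, l);           /* -> 512 8192 256 */
    return 0;
}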
"userspace" : libcfs_nid2str(ni->ni_nid), - libcfs_nid2str(nid), alive ? "up" : "down", - (when - now) / HZ); - return -EINVAL; - } - - if (ni && !alive && /* LND telling me she's down */ - !auto_down) { /* auto-down disabled */ - CDEBUG(D_NET, "Auto-down disabled\n"); - return 0; - } - - lnet_net_lock(cpt); - - if (the_lnet.ln_shutdown) { - lnet_net_unlock(cpt); - return -ESHUTDOWN; - } - - lp = lnet_find_peer_locked(the_lnet.ln_peer_tables[cpt], nid); - if (!lp) { - /* nid not found */ - lnet_net_unlock(cpt); - CDEBUG(D_NET, "%s not found\n", libcfs_nid2str(nid)); - return 0; - } - - /* - * We can't fully trust LND on reporting exact peer last_alive - * if he notifies us about dead peer. For example ksocklnd can - * call us with when == _time_when_the_node_was_booted_ if - * no connections were successfully established - */ - if (ni && !alive && when < lp->lp_last_alive) - when = lp->lp_last_alive; - - lnet_notify_locked(lp, !ni, alive, when); - - if (ni) - lnet_ni_notify_locked(ni, lp); - - lnet_peer_decref_locked(lp); - - lnet_net_unlock(cpt); - return 0; -} -EXPORT_SYMBOL(lnet_notify); diff --git a/drivers/staging/lustre/lnet/lnet/router_proc.c b/drivers/staging/lustre/lnet/lnet/router_proc.c deleted file mode 100644 index ae4b7f5953a01b6e0cadb3b3023a3849def9396a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/lnet/router_proc.c +++ /dev/null @@ -1,907 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2012, Intel Corporation. - * - * This file is part of Portals - * http://sourceforge.net/projects/sandiaportals/ - * - * Portals is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Portals is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include - -/* - * This is really lnet_proc.c. You might need to update sanity test 215 - * if any file format is changed. 
- */ - -#define LNET_LOFFT_BITS (sizeof(loff_t) * 8) -/* - * NB: max allowed LNET_CPT_BITS is 8 on 64-bit system and 2 on 32-bit system - */ -#define LNET_PROC_CPT_BITS (LNET_CPT_BITS + 1) -/* change version, 16 bits or 8 bits */ -#define LNET_PROC_VER_BITS max_t(size_t, min_t(size_t, LNET_LOFFT_BITS, 64) / 4, 8) - -#define LNET_PROC_HASH_BITS LNET_PEER_HASH_BITS -/* - * bits for peer hash offset - * NB: we don't use the highest bit of *ppos because it's signed - */ -#define LNET_PROC_HOFF_BITS (LNET_LOFFT_BITS - \ - LNET_PROC_CPT_BITS - \ - LNET_PROC_VER_BITS - \ - LNET_PROC_HASH_BITS - 1) -/* bits for hash index + position */ -#define LNET_PROC_HPOS_BITS (LNET_PROC_HASH_BITS + LNET_PROC_HOFF_BITS) -/* bits for peer hash table + hash version */ -#define LNET_PROC_VPOS_BITS (LNET_PROC_HPOS_BITS + LNET_PROC_VER_BITS) - -#define LNET_PROC_CPT_MASK ((1ULL << LNET_PROC_CPT_BITS) - 1) -#define LNET_PROC_VER_MASK ((1ULL << LNET_PROC_VER_BITS) - 1) -#define LNET_PROC_HASH_MASK ((1ULL << LNET_PROC_HASH_BITS) - 1) -#define LNET_PROC_HOFF_MASK ((1ULL << LNET_PROC_HOFF_BITS) - 1) - -#define LNET_PROC_CPT_GET(pos) \ - (int)(((pos) >> LNET_PROC_VPOS_BITS) & LNET_PROC_CPT_MASK) - -#define LNET_PROC_VER_GET(pos) \ - (int)(((pos) >> LNET_PROC_HPOS_BITS) & LNET_PROC_VER_MASK) - -#define LNET_PROC_HASH_GET(pos) \ - (int)(((pos) >> LNET_PROC_HOFF_BITS) & LNET_PROC_HASH_MASK) - -#define LNET_PROC_HOFF_GET(pos) \ - (int)((pos) & LNET_PROC_HOFF_MASK) - -#define LNET_PROC_POS_MAKE(cpt, ver, hash, off) \ - (((((loff_t)(cpt)) & LNET_PROC_CPT_MASK) << LNET_PROC_VPOS_BITS) | \ - ((((loff_t)(ver)) & LNET_PROC_VER_MASK) << LNET_PROC_HPOS_BITS) | \ - ((((loff_t)(hash)) & LNET_PROC_HASH_MASK) << LNET_PROC_HOFF_BITS) | \ - ((off) & LNET_PROC_HOFF_MASK)) - -#define LNET_PROC_VERSION(v) ((unsigned int)((v) & LNET_PROC_VER_MASK)) - -static int __proc_lnet_stats(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - int rc; - struct lnet_counters *ctrs; - int len; - char *tmpstr; - const int tmpsiz = 256; /* 7 %u and 4 %llu */ - - if (write) { - lnet_counters_reset(); - return 0; - } - - /* read */ - - ctrs = kzalloc(sizeof(*ctrs), GFP_NOFS); - if (!ctrs) - return -ENOMEM; - - tmpstr = kmalloc(tmpsiz, GFP_KERNEL); - if (!tmpstr) { - kfree(ctrs); - return -ENOMEM; - } - - lnet_counters_get(ctrs); - - len = snprintf(tmpstr, tmpsiz, - "%u %u %u %u %u %u %u %llu %llu %llu %llu", - ctrs->msgs_alloc, ctrs->msgs_max, - ctrs->errors, - ctrs->send_count, ctrs->recv_count, - ctrs->route_count, ctrs->drop_count, - ctrs->send_length, ctrs->recv_length, - ctrs->route_length, ctrs->drop_length); - - if (pos >= min_t(int, len, strlen(tmpstr))) - rc = 0; - else - rc = cfs_trace_copyout_string(buffer, nob, - tmpstr + pos, "\n"); - - kfree(tmpstr); - kfree(ctrs); - return rc; -} - -static int proc_lnet_stats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_lnet_stats); -} - -static int proc_lnet_routes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - const int tmpsiz = 256; - char *tmpstr; - char *s; - int rc = 0; - int len; - int ver; - int off; - - BUILD_BUG_ON(sizeof(loff_t) < 4); - - off = LNET_PROC_HOFF_GET(*ppos); - ver = LNET_PROC_VER_GET(*ppos); - - LASSERT(!write); - - if (!*lenp) - return 0; - - tmpstr = kmalloc(tmpsiz, GFP_KERNEL); - if (!tmpstr) - return -ENOMEM; - - s = tmpstr; /* points to current position in tmpstr[] */ - - if (!*ppos) { - s += 
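[Editor's note] The LNET_PROC_* macros above pack an entire iteration cursor (CPU partition, list version, hash bucket, and offset within the bucket) into the single signed loff_t that /proc hands back as *ppos, which is why the sign bit is deliberately left unused. A standalone sketch of the same pack/unpack arithmetic with illustrative field widths (the real widths are derived from LNET_CPT_BITS and friends):

#include <stdio.h>
#include <stdint.h>

#define VER_BITS    16
#define HASH_BITS   9
#define OFF_BITS    32

#define OFF_MASK    ((1ULL << OFF_BITS) - 1)
#define HASH_MASK   ((1ULL << HASH_BITS) - 1)
#define VER_MASK    ((1ULL << VER_BITS) - 1)

/* 16+9+32 = 57 bits used, so the result is always a positive int64. */
static int64_t pos_make(unsigned ver, unsigned hash, uint64_t off)
{
    return (int64_t)((((uint64_t)ver & VER_MASK) << (HASH_BITS + OFF_BITS)) |
                     (((uint64_t)hash & HASH_MASK) << OFF_BITS) |
                     (off & OFF_MASK));
}

int main(void)
{
    int64_t pos = pos_make(7, 33, 100);

    printf("ver=%llu hash=%llu off=%llu\n",
           (unsigned long long)((pos >> (HASH_BITS + OFF_BITS)) & VER_MASK),
           (unsigned long long)((pos >> OFF_BITS) & HASH_MASK),
           (unsigned long long)(pos & OFF_MASK));   /* -> 7 33 100 */
    return 0;
}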
snprintf(s, tmpstr + tmpsiz - s, "Routing %s\n", - the_lnet.ln_routing ? "enabled" : "disabled"); - LASSERT(tmpstr + tmpsiz - s > 0); - - s += snprintf(s, tmpstr + tmpsiz - s, "%-8s %4s %8s %7s %s\n", - "net", "hops", "priority", "state", "router"); - LASSERT(tmpstr + tmpsiz - s > 0); - - lnet_net_lock(0); - ver = (unsigned int)the_lnet.ln_remote_nets_version; - lnet_net_unlock(0); - *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); - } else { - struct list_head *n; - struct list_head *r; - struct lnet_route *route = NULL; - struct lnet_remotenet *rnet = NULL; - int skip = off - 1; - struct list_head *rn_list; - int i; - - lnet_net_lock(0); - - if (ver != LNET_PROC_VERSION(the_lnet.ln_remote_nets_version)) { - lnet_net_unlock(0); - kfree(tmpstr); - return -ESTALE; - } - - for (i = 0; i < LNET_REMOTE_NETS_HASH_SIZE && !route; i++) { - rn_list = &the_lnet.ln_remote_nets_hash[i]; - - n = rn_list->next; - - while (n != rn_list && !route) { - rnet = list_entry(n, struct lnet_remotenet, - lrn_list); - - r = rnet->lrn_routes.next; - - while (r != &rnet->lrn_routes) { - struct lnet_route *re; - - re = list_entry(r, struct lnet_route, - lr_list); - if (!skip) { - route = re; - break; - } - - skip--; - r = r->next; - } - - n = n->next; - } - } - - if (route) { - __u32 net = rnet->lrn_net; - __u32 hops = route->lr_hops; - unsigned int priority = route->lr_priority; - lnet_nid_t nid = route->lr_gateway->lp_nid; - int alive = lnet_is_route_alive(route); - - s += snprintf(s, tmpstr + tmpsiz - s, - "%-8s %4u %8u %7s %s\n", - libcfs_net2str(net), hops, - priority, - alive ? "up" : "down", - libcfs_nid2str(nid)); - LASSERT(tmpstr + tmpsiz - s > 0); - } - - lnet_net_unlock(0); - } - - len = s - tmpstr; /* how many bytes was written */ - - if (len > *lenp) { /* linux-supplied buffer is too small */ - rc = -EINVAL; - } else if (len > 0) { /* wrote something */ - if (copy_to_user(buffer, tmpstr, len)) { - rc = -EFAULT; - } else { - off += 1; - *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); - } - } - - kfree(tmpstr); - - if (!rc) - *lenp = len; - - return rc; -} - -static int proc_lnet_routers(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int rc = 0; - char *tmpstr; - char *s; - const int tmpsiz = 256; - int len; - int ver; - int off; - - off = LNET_PROC_HOFF_GET(*ppos); - ver = LNET_PROC_VER_GET(*ppos); - - LASSERT(!write); - - if (!*lenp) - return 0; - - tmpstr = kmalloc(tmpsiz, GFP_KERNEL); - if (!tmpstr) - return -ENOMEM; - - s = tmpstr; /* points to current position in tmpstr[] */ - - if (!*ppos) { - s += snprintf(s, tmpstr + tmpsiz - s, - "%-4s %7s %9s %6s %12s %9s %8s %7s %s\n", - "ref", "rtr_ref", "alive_cnt", "state", - "last_ping", "ping_sent", "deadline", - "down_ni", "router"); - LASSERT(tmpstr + tmpsiz - s > 0); - - lnet_net_lock(0); - ver = (unsigned int)the_lnet.ln_routers_version; - lnet_net_unlock(0); - *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); - } else { - struct list_head *r; - struct lnet_peer *peer = NULL; - int skip = off - 1; - - lnet_net_lock(0); - - if (ver != LNET_PROC_VERSION(the_lnet.ln_routers_version)) { - lnet_net_unlock(0); - - kfree(tmpstr); - return -ESTALE; - } - - r = the_lnet.ln_routers.next; - - while (r != &the_lnet.ln_routers) { - struct lnet_peer *lp; - - lp = list_entry(r, struct lnet_peer, lp_rtr_list); - if (!skip) { - peer = lp; - break; - } - - skip--; - r = r->next; - } - - if (peer) { - lnet_nid_t nid = peer->lp_nid; - unsigned long now = jiffies; - unsigned long deadline = peer->lp_ping_deadline; - int nrefs = 
peer->lp_refcount; - int nrtrrefs = peer->lp_rtr_refcount; - int alive_cnt = peer->lp_alive_count; - int alive = peer->lp_alive; - int pingsent = !peer->lp_ping_notsent; - int last_ping = (now - peer->lp_ping_timestamp) / HZ; - int down_ni = 0; - struct lnet_route *rtr; - - if ((peer->lp_ping_feats & - LNET_PING_FEAT_NI_STATUS)) { - list_for_each_entry(rtr, &peer->lp_routes, - lr_gwlist) { - /* - * downis on any route should be the - * number of downis on the gateway - */ - if (rtr->lr_downis) { - down_ni = rtr->lr_downis; - break; - } - } - } - - if (!deadline) - s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12d %9d %8s %7d %s\n", - nrefs, nrtrrefs, alive_cnt, - alive ? "up" : "down", last_ping, - pingsent, "NA", down_ni, - libcfs_nid2str(nid)); - else - s += snprintf(s, tmpstr + tmpsiz - s, - "%-4d %7d %9d %6s %12d %9d %8lu %7d %s\n", - nrefs, nrtrrefs, alive_cnt, - alive ? "up" : "down", last_ping, - pingsent, - (deadline - now) / HZ, - down_ni, libcfs_nid2str(nid)); - LASSERT(tmpstr + tmpsiz - s > 0); - } - - lnet_net_unlock(0); - } - - len = s - tmpstr; /* how many bytes was written */ - - if (len > *lenp) { /* linux-supplied buffer is too small */ - rc = -EINVAL; - } else if (len > 0) { /* wrote something */ - if (copy_to_user(buffer, tmpstr, len)) { - rc = -EFAULT; - } else { - off += 1; - *ppos = LNET_PROC_POS_MAKE(0, ver, 0, off); - } - } - - kfree(tmpstr); - - if (!rc) - *lenp = len; - - return rc; -} - -static int proc_lnet_peers(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - const int tmpsiz = 256; - struct lnet_peer_table *ptable; - char *tmpstr; - char *s; - int cpt = LNET_PROC_CPT_GET(*ppos); - int ver = LNET_PROC_VER_GET(*ppos); - int hash = LNET_PROC_HASH_GET(*ppos); - int hoff = LNET_PROC_HOFF_GET(*ppos); - int rc = 0; - int len; - - BUILD_BUG_ON(LNET_PROC_HASH_BITS < LNET_PEER_HASH_BITS); - LASSERT(!write); - - if (!*lenp) - return 0; - - if (cpt >= LNET_CPT_NUMBER) { - *lenp = 0; - return 0; - } - - tmpstr = kmalloc(tmpsiz, GFP_KERNEL); - if (!tmpstr) - return -ENOMEM; - - s = tmpstr; /* points to current position in tmpstr[] */ - - if (!*ppos) { - s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4s %5s %5s %5s %5s %5s %5s %5s %s\n", - "nid", "refs", "state", "last", "max", - "rtr", "min", "tx", "min", "queue"); - LASSERT(tmpstr + tmpsiz - s > 0); - - hoff++; - } else { - struct lnet_peer *peer; - struct list_head *p; - int skip; - again: - p = NULL; - peer = NULL; - skip = hoff - 1; - - lnet_net_lock(cpt); - ptable = the_lnet.ln_peer_tables[cpt]; - if (hoff == 1) - ver = LNET_PROC_VERSION(ptable->pt_version); - - if (ver != LNET_PROC_VERSION(ptable->pt_version)) { - lnet_net_unlock(cpt); - kfree(tmpstr); - return -ESTALE; - } - - while (hash < LNET_PEER_HASH_SIZE) { - if (!p) - p = ptable->pt_hash[hash].next; - - while (p != &ptable->pt_hash[hash]) { - struct lnet_peer *lp; - - lp = list_entry(p, struct lnet_peer, - lp_hashlist); - if (!skip) { - peer = lp; - - /* - * minor optimization: start from idx+1 - * on next iteration if we've just - * drained lp_hashlist - */ - if (lp->lp_hashlist.next == - &ptable->pt_hash[hash]) { - hoff = 1; - hash++; - } else { - hoff++; - } - - break; - } - - skip--; - p = lp->lp_hashlist.next; - } - - if (peer) - break; - - p = NULL; - hoff = 1; - hash++; - } - - if (peer) { - lnet_nid_t nid = peer->lp_nid; - int nrefs = peer->lp_refcount; - int lastalive = -1; - char *aliveness = "NA"; - int maxcr = peer->lp_ni->ni_peertxcredits; - int txcr = peer->lp_txcredits; - int mintxcr = 
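[Editor's note] proc_lnet_peers() resumes iteration from a (hash bucket, 1-based offset) pair decoded out of *ppos, skipping offset-1 entries in the bucket and rolling over to the next bucket when one is exhausted; the "minor optimization" noted in the code simply pre-advances the cursor when the entry just returned was the last in its chain. A minimal resumable cursor over an array of lists (names illustrative):

#include <stdio.h>

struct entry { struct entry *next; int val; };

/* Return the entry at cursor (*hash, *off) and advance the cursor,
 * or NULL once the whole table has been walked. */
static struct entry *cursor_next(struct entry **tbl, int nbuckets,
                                 int *hash, int *off)
{
    while (*hash < nbuckets) {
        struct entry *e = tbl[*hash];
        int skip = *off - 1;

        while (e && skip--)
            e = e->next;
        if (e) {
            if (e->next) {
                (*off)++;        /* stay in this bucket */
            } else {
                *hash += 1;      /* bucket drained: pre-advance */
                *off = 1;
            }
            return e;
        }
        *hash += 1;
        *off = 1;
    }
    return NULL;
}

int main(void)
{
    struct entry b = { NULL, 2 }, a = { &b, 1 }, c = { NULL, 3 };
    struct entry *tbl[2] = { &a, &c };
    int hash = 0, off = 1;
    struct entry *e;

    while ((e = cursor_next(tbl, 2, &hash, &off)))
        printf("%d\n", e->val);   /* -> 1 2 3 */
    return 0;
}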
peer->lp_mintxcredits; - int rtrcr = peer->lp_rtrcredits; - int minrtrcr = peer->lp_minrtrcredits; - int txqnob = peer->lp_txqnob; - - if (lnet_isrouter(peer) || - lnet_peer_aliveness_enabled(peer)) - aliveness = peer->lp_alive ? "up" : "down"; - - if (lnet_peer_aliveness_enabled(peer)) { - unsigned long now = jiffies; - long delta; - - delta = now - peer->lp_last_alive; - lastalive = (delta) / HZ; - - /* No need to mess up peers contents with - * arbitrarily long integers - it suffices to - * know that lastalive is more than 10000s old - */ - if (lastalive >= 10000) - lastalive = 9999; - } - - lnet_net_unlock(cpt); - - s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %4d %5s %5d %5d %5d %5d %5d %5d %d\n", - libcfs_nid2str(nid), nrefs, aliveness, - lastalive, maxcr, rtrcr, minrtrcr, txcr, - mintxcr, txqnob); - LASSERT(tmpstr + tmpsiz - s > 0); - - } else { /* peer is NULL */ - lnet_net_unlock(cpt); - } - - if (hash == LNET_PEER_HASH_SIZE) { - cpt++; - hash = 0; - hoff = 1; - if (!peer && cpt < LNET_CPT_NUMBER) - goto again; - } - } - - len = s - tmpstr; /* how many bytes was written */ - - if (len > *lenp) { /* linux-supplied buffer is too small */ - rc = -EINVAL; - } else if (len > 0) { /* wrote something */ - if (copy_to_user(buffer, tmpstr, len)) - rc = -EFAULT; - else - *ppos = LNET_PROC_POS_MAKE(cpt, ver, hash, hoff); - } - - kfree(tmpstr); - - if (!rc) - *lenp = len; - - return rc; -} - -static int __proc_lnet_buffers(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - char *s; - char *tmpstr; - int tmpsiz; - int idx; - int len; - int rc; - int i; - - LASSERT(!write); - - /* (4 %d) * 4 * LNET_CPT_NUMBER */ - tmpsiz = 64 * (LNET_NRBPOOLS + 1) * LNET_CPT_NUMBER; - tmpstr = kvmalloc(tmpsiz, GFP_KERNEL); - if (!tmpstr) - return -ENOMEM; - - s = tmpstr; /* points to current position in tmpstr[] */ - - s += snprintf(s, tmpstr + tmpsiz - s, - "%5s %5s %7s %7s\n", - "pages", "count", "credits", "min"); - LASSERT(tmpstr + tmpsiz - s > 0); - - if (!the_lnet.ln_rtrpools) - goto out; /* I'm not a router */ - - for (idx = 0; idx < LNET_NRBPOOLS; idx++) { - struct lnet_rtrbufpool *rbp; - - lnet_net_lock(LNET_LOCK_EX); - cfs_percpt_for_each(rbp, i, the_lnet.ln_rtrpools) { - s += snprintf(s, tmpstr + tmpsiz - s, - "%5d %5d %7d %7d\n", - rbp[idx].rbp_npages, - rbp[idx].rbp_nbuffers, - rbp[idx].rbp_credits, - rbp[idx].rbp_mincredits); - LASSERT(tmpstr + tmpsiz - s > 0); - } - lnet_net_unlock(LNET_LOCK_EX); - } - - out: - len = s - tmpstr; - - if (pos >= min_t(int, len, strlen(tmpstr))) - rc = 0; - else - rc = cfs_trace_copyout_string(buffer, nob, - tmpstr + pos, NULL); - - kvfree(tmpstr); - return rc; -} - -static int proc_lnet_buffers(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_lnet_buffers); -} - -static int proc_lnet_nis(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int tmpsiz = 128 * LNET_CPT_NUMBER; - int rc = 0; - char *tmpstr; - char *s; - int len; - - LASSERT(!write); - - if (!*lenp) - return 0; - - tmpstr = kvmalloc(tmpsiz, GFP_KERNEL); - if (!tmpstr) - return -ENOMEM; - - s = tmpstr; /* points to current position in tmpstr[] */ - - if (!*ppos) { - s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %6s %5s %4s %4s %4s %5s %5s %5s\n", - "nid", "status", "alive", "refs", "peer", - "rtr", "max", "tx", "min"); - LASSERT(tmpstr + tmpsiz - s > 0); - } else { - struct list_head *n; - struct lnet_ni *ni = NULL; - int 
skip = *ppos - 1; - - lnet_net_lock(0); - - n = the_lnet.ln_nis.next; - - while (n != &the_lnet.ln_nis) { - struct lnet_ni *a_ni; - - a_ni = list_entry(n, struct lnet_ni, ni_list); - if (!skip) { - ni = a_ni; - break; - } - - skip--; - n = n->next; - } - - if (ni) { - struct lnet_tx_queue *tq; - char *stat; - time64_t now = ktime_get_real_seconds(); - int last_alive = -1; - int i; - int j; - - if (the_lnet.ln_routing) - last_alive = now - ni->ni_last_alive; - - /* @lo forever alive */ - if (ni->ni_lnd->lnd_type == LOLND) - last_alive = 0; - - lnet_ni_lock(ni); - LASSERT(ni->ni_status); - stat = (ni->ni_status->ns_status == - LNET_NI_STATUS_UP) ? "up" : "down"; - lnet_ni_unlock(ni); - - /* - * we actually output credits information for - * TX queue of each partition - */ - cfs_percpt_for_each(tq, i, ni->ni_tx_queues) { - for (j = 0; ni->ni_cpts && - j < ni->ni_ncpts; j++) { - if (i == ni->ni_cpts[j]) - break; - } - - if (j == ni->ni_ncpts) - continue; - - if (i) - lnet_net_lock(i); - - s += snprintf(s, tmpstr + tmpsiz - s, - "%-24s %6s %5d %4d %4d %4d %5d %5d %5d\n", - libcfs_nid2str(ni->ni_nid), stat, - last_alive, *ni->ni_refs[i], - ni->ni_peertxcredits, - ni->ni_peerrtrcredits, - tq->tq_credits_max, - tq->tq_credits, - tq->tq_credits_min); - if (i) - lnet_net_unlock(i); - } - LASSERT(tmpstr + tmpsiz - s > 0); - } - - lnet_net_unlock(0); - } - - len = s - tmpstr; /* how many bytes was written */ - - if (len > *lenp) { /* linux-supplied buffer is too small */ - rc = -EINVAL; - } else if (len > 0) { /* wrote something */ - if (copy_to_user(buffer, tmpstr, len)) - rc = -EFAULT; - else - *ppos += 1; - } - - kvfree(tmpstr); - - if (!rc) - *lenp = len; - - return rc; -} - -struct lnet_portal_rotors { - int pr_value; - const char *pr_name; - const char *pr_desc; -}; - -static struct lnet_portal_rotors portal_rotors[] = { - { - .pr_value = LNET_PTL_ROTOR_OFF, - .pr_name = "OFF", - .pr_desc = "Turn off message rotor for wildcard portals" - }, - { - .pr_value = LNET_PTL_ROTOR_ON, - .pr_name = "ON", - .pr_desc = "round-robin dispatch all PUT messages for wildcard portals" - }, - { - .pr_value = LNET_PTL_ROTOR_RR_RT, - .pr_name = "RR_RT", - .pr_desc = "round-robin dispatch routed PUT message for wildcard portals" - }, - { - .pr_value = LNET_PTL_ROTOR_HASH_RT, - .pr_name = "HASH_RT", - .pr_desc = "dispatch routed PUT message by hashing source NID for wildcard portals" - }, - { - .pr_value = -1, - .pr_name = NULL, - .pr_desc = NULL - }, -}; - -static int __proc_lnet_portal_rotor(void *data, int write, - loff_t pos, void __user *buffer, int nob) -{ - const int buf_len = 128; - char *buf; - char *tmp; - int rc; - int i; - - buf = kmalloc(buf_len, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (!write) { - lnet_res_lock(0); - - for (i = 0; portal_rotors[i].pr_value >= 0; i++) { - if (portal_rotors[i].pr_value == portal_rotor) - break; - } - - LASSERT(portal_rotors[i].pr_value == portal_rotor); - lnet_res_unlock(0); - - rc = snprintf(buf, buf_len, - "{\n\tportals: all\n" - "\trotor: %s\n\tdescription: %s\n}", - portal_rotors[i].pr_name, - portal_rotors[i].pr_desc); - - if (pos >= min_t(int, rc, buf_len)) { - rc = 0; - } else { - rc = cfs_trace_copyout_string(buffer, nob, - buf + pos, "\n"); - } - goto out; - } - - rc = cfs_trace_copyin_string(buf, buf_len, buffer, nob); - if (rc < 0) - goto out; - - tmp = strim(buf); - - rc = -EINVAL; - lnet_res_lock(0); - for (i = 0; portal_rotors[i].pr_name; i++) { - if (!strncasecmp(portal_rotors[i].pr_name, tmp, - strlen(portal_rotors[i].pr_name))) { - portal_rotor 
= portal_rotors[i].pr_value; - rc = 0; - break; - } - } - lnet_res_unlock(0); -out: - kfree(buf); - return rc; -} - -static int proc_lnet_portal_rotor(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) -{ - return lprocfs_call_handler(table->data, write, ppos, buffer, lenp, - __proc_lnet_portal_rotor); -} - -static struct ctl_table lnet_table[] = { - /* - * NB No .strategy entries have been provided since sysctl(8) prefers - * to go via /proc for portability. - */ - { - .procname = "stats", - .mode = 0644, - .proc_handler = &proc_lnet_stats, - }, - { - .procname = "routes", - .mode = 0444, - .proc_handler = &proc_lnet_routes, - }, - { - .procname = "routers", - .mode = 0444, - .proc_handler = &proc_lnet_routers, - }, - { - .procname = "peers", - .mode = 0444, - .proc_handler = &proc_lnet_peers, - }, - { - .procname = "buffers", - .mode = 0444, - .proc_handler = &proc_lnet_buffers, - }, - { - .procname = "nis", - .mode = 0444, - .proc_handler = &proc_lnet_nis, - }, - { - .procname = "portal_rotor", - .mode = 0644, - .proc_handler = &proc_lnet_portal_rotor, - }, - { - } -}; - -void lnet_router_debugfs_init(void) -{ - lustre_insert_debugfs(lnet_table); -} - -void lnet_router_debugfs_fini(void) -{ -} diff --git a/drivers/staging/lustre/lnet/selftest/Makefile b/drivers/staging/lustre/lnet/selftest/Makefile deleted file mode 100644 index 3ccc8966b566d96fed49aaf0d41a134acec15e3f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LNET_SELFTEST) := lnet_selftest.o - -lnet_selftest-y := console.o conrpc.o conctl.o framework.o timer.o rpc.o \ - module.o ping_test.o brw_test.o diff --git a/drivers/staging/lustre/lnet/selftest/brw_test.c b/drivers/staging/lustre/lnet/selftest/brw_test.c deleted file mode 100644 index f1ee219bc8f34e04b859783a607331457dd9353e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/brw_test.c +++ /dev/null @@ -1,526 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/selftest/brw_test.c - * - * Author: Isaac Huang - */ - -#include "selftest.h" - -static int brw_srv_workitems = SFW_TEST_WI_MAX; -module_param(brw_srv_workitems, int, 0644); -MODULE_PARM_DESC(brw_srv_workitems, "# BRW server workitems"); - -static int brw_inject_errors; -module_param(brw_inject_errors, int, 0644); -MODULE_PARM_DESC(brw_inject_errors, "# data errors to inject randomly, zero by default"); - -#define BRW_POISON 0xbeefbeefbeefbeefULL -#define BRW_MAGIC 0xeeb0eeb1eeb2eeb3ULL -#define BRW_MSIZE sizeof(u64) - -static void -brw_client_fini(struct sfw_test_instance *tsi) -{ - struct srpc_bulk *bulk; - struct sfw_test_unit *tsu; - - LASSERT(tsi->tsi_is_client); - - list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { - bulk = tsu->tsu_private; - if (!bulk) - continue; - - srpc_free_bulk(bulk); - tsu->tsu_private = NULL; - } -} - -static int -brw_client_init(struct sfw_test_instance *tsi) -{ - struct sfw_session *sn = tsi->tsi_batch->bat_session; - int flags; - int off; - int npg; - int len; - int opc; - struct srpc_bulk *bulk; - struct sfw_test_unit *tsu; - - LASSERT(sn); - LASSERT(tsi->tsi_is_client); - - if (!(sn->sn_features & LST_FEAT_BULK_LEN)) { - struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; - - opc = breq->blk_opc; - flags = breq->blk_flags; - npg = breq->blk_npg; - /* - * NB: this is not going to work for variable page size, - * but we have to keep it for compatibility - */ - len = npg * PAGE_SIZE; - off = 0; - } else { - struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; - - /* - * I should never get this step if it's unknown feature - * because make_session will reject unknown feature - */ - LASSERT(!(sn->sn_features & ~LST_FEATS_MASK)); - - opc = breq->blk_opc; - flags = breq->blk_flags; - len = breq->blk_len; - off = breq->blk_offset & ~PAGE_MASK; - npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - - if (off % BRW_MSIZE) - return -EINVAL; - - if (npg > LNET_MAX_IOV || npg <= 0) - return -EINVAL; - - if (opc != LST_BRW_READ && opc != LST_BRW_WRITE) - return -EINVAL; - - if (flags != LST_BRW_CHECK_NONE && - flags != LST_BRW_CHECK_FULL && flags != LST_BRW_CHECK_SIMPLE) - return -EINVAL; - - list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { - bulk = srpc_alloc_bulk(lnet_cpt_of_nid(tsu->tsu_dest.nid), - off, npg, len, opc == LST_BRW_READ); - if (!bulk) { - brw_client_fini(tsi); - return -ENOMEM; - } - - tsu->tsu_private = bulk; - } - - return 0; -} - -static int brw_inject_one_error(void) -{ - struct timespec64 ts; - - if (brw_inject_errors <= 0) - return 0; - - ktime_get_ts64(&ts); - - if (!((ts.tv_nsec / NSEC_PER_USEC) & 1)) - return 0; - - return brw_inject_errors--; -} - -static void -brw_fill_page(struct page *pg, int off, int len, int pattern, __u64 magic) -{ - char *addr = page_address(pg) + off; - int i; - - LASSERT(addr); - LASSERT(!(off % BRW_MSIZE) && !(len % BRW_MSIZE)); - - if (pattern == LST_BRW_CHECK_NONE) - return; - - if (magic == BRW_MAGIC) - magic += brw_inject_one_error(); - - if (pattern == LST_BRW_CHECK_SIMPLE) { - memcpy(addr, &magic, BRW_MSIZE); - if (len > BRW_MSIZE) { - addr += PAGE_SIZE - BRW_MSIZE; - memcpy(addr, &magic, BRW_MSIZE); - } - return; - } - - if (pattern == LST_BRW_CHECK_FULL) { - for (i = 0; i < len; i += BRW_MSIZE) - memcpy(addr + i, &magic, BRW_MSIZE); - return; - } - - LBUG(); -} - -static int -brw_check_page(struct page *pg, int off, int len, int pattern, __u64 magic) -{ - char *addr = page_address(pg) + off; - __u64 data = 0; /* make compiler happy */ - int i; - - LASSERT(addr); - LASSERT(!(off % 
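[Editor's note] brw_fill_page() and brw_check_page() implement two integrity levels: LST_BRW_CHECK_SIMPLE stamps an 8-byte magic at the start and end of the region (cheap), while LST_BRW_CHECK_FULL stamps every 8-byte word (thorough). The same scheme in standalone form, a sketch rather than the exact in-kernel layout:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define MSIZE   sizeof(uint64_t)

/* SIMPLE: stamp only the first and last word of the buffer. */
static void fill_simple(char *buf, size_t len, uint64_t magic)
{
    memcpy(buf, &magic, MSIZE);
    if (len > MSIZE)
        memcpy(buf + len - MSIZE, &magic, MSIZE);
}

/* FULL: verify every aligned word carries the magic. */
static int check_full(const char *buf, size_t len, uint64_t magic)
{
    size_t i;
    uint64_t got;

    for (i = 0; i + MSIZE <= len; i += MSIZE) {
        memcpy(&got, buf + i, MSIZE);
        if (got != magic)
            return -1;   /* corrupted word at offset i */
    }
    return 0;
}

int main(void)
{
    char page[4096];
    uint64_t magic = 0xeeb0eeb1eeb2eeb3ULL;  /* BRW_MAGIC above */
    size_t i;

    for (i = 0; i + MSIZE <= sizeof(page); i += MSIZE)
        memcpy(page + i, &magic, MSIZE);     /* CHECK_FULL-style fill */
    fill_simple(page, sizeof(page), magic);
    printf("%s\n", check_full(page, sizeof(page), magic) ? "bad" : "ok");
    return 0;
}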
BRW_MSIZE) && !(len % BRW_MSIZE)); - - if (pattern == LST_BRW_CHECK_NONE) - return 0; - - if (pattern == LST_BRW_CHECK_SIMPLE) { - data = *((__u64 *)addr); - if (data != magic) - goto bad_data; - - if (len > BRW_MSIZE) { - addr += PAGE_SIZE - BRW_MSIZE; - data = *((__u64 *)addr); - if (data != magic) - goto bad_data; - } - return 0; - } - - if (pattern == LST_BRW_CHECK_FULL) { - for (i = 0; i < len; i += BRW_MSIZE) { - data = *(u64 *)(addr + i); - if (data != magic) - goto bad_data; - } - return 0; - } - - LBUG(); - -bad_data: - CERROR("Bad data in page %p: %#llx, %#llx expected\n", - pg, data, magic); - return 1; -} - -static void -brw_fill_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) -{ - int i; - struct page *pg; - - for (i = 0; i < bk->bk_niov; i++) { - int off, len; - - pg = bk->bk_iovs[i].bv_page; - off = bk->bk_iovs[i].bv_offset; - len = bk->bk_iovs[i].bv_len; - brw_fill_page(pg, off, len, pattern, magic); - } -} - -static int -brw_check_bulk(struct srpc_bulk *bk, int pattern, __u64 magic) -{ - int i; - struct page *pg; - - for (i = 0; i < bk->bk_niov; i++) { - int off, len; - - pg = bk->bk_iovs[i].bv_page; - off = bk->bk_iovs[i].bv_offset; - len = bk->bk_iovs[i].bv_len; - if (brw_check_page(pg, off, len, pattern, magic)) { - CERROR("Bulk page %p (%d/%d) is corrupted!\n", - pg, i, bk->bk_niov); - return 1; - } - } - - return 0; -} - -static int -brw_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, - struct srpc_client_rpc **rpcpp) -{ - struct srpc_bulk *bulk = tsu->tsu_private; - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct srpc_client_rpc *rpc; - struct srpc_brw_reqst *req; - int flags; - int npg; - int len; - int opc; - int rc; - - LASSERT(sn); - LASSERT(bulk); - - if (!(sn->sn_features & LST_FEAT_BULK_LEN)) { - struct test_bulk_req *breq = &tsi->tsi_u.bulk_v0; - - opc = breq->blk_opc; - flags = breq->blk_flags; - npg = breq->blk_npg; - len = npg * PAGE_SIZE; - } else { - struct test_bulk_req_v1 *breq = &tsi->tsi_u.bulk_v1; - int off; - - /* - * I should never get this step if it's unknown feature - * because make_session will reject unknown feature - */ - LASSERT(!(sn->sn_features & ~LST_FEATS_MASK)); - - opc = breq->blk_opc; - flags = breq->blk_flags; - len = breq->blk_len; - off = breq->blk_offset; - npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - - rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, npg, len, &rpc); - if (rc) - return rc; - - memcpy(&rpc->crpc_bulk, bulk, offsetof(struct srpc_bulk, bk_iovs[npg])); - if (opc == LST_BRW_WRITE) - brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_MAGIC); - else - brw_fill_bulk(&rpc->crpc_bulk, flags, BRW_POISON); - - req = &rpc->crpc_reqstmsg.msg_body.brw_reqst; - req->brw_flags = flags; - req->brw_rw = opc; - req->brw_len = len; - - *rpcpp = rpc; - return 0; -} - -static void -brw_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) -{ - __u64 magic = BRW_MAGIC; - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct srpc_msg *msg = &rpc->crpc_replymsg; - struct srpc_brw_reply *reply = &msg->msg_body.brw_reply; - struct srpc_brw_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.brw_reqst; - - LASSERT(sn); - - if (rpc->crpc_status) { - CERROR("BRW RPC to %s failed with %d\n", - libcfs_id2str(rpc->crpc_dest), rpc->crpc_status); - if (!tsi->tsi_stopping) /* rpc could have been aborted */ - atomic_inc(&sn->sn_brw_errors); - return; - } - - if (msg->msg_magic 
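[Editor's note] Both the client init and prep-RPC paths size the bulk as npg = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT: round the span up to whole pages, counting the partial first page implied by a nonzero in-page offset. Worked standalone:

#include <stdio.h>

#define PAGE_SHIFT  12
#define PAGE_SIZE   (1UL << PAGE_SHIFT)

static unsigned long pages_for(unsigned long off, unsigned long len)
{
    off &= PAGE_SIZE - 1;   /* only the in-page part matters,
                             * like blk_offset & ~PAGE_MASK above */
    return (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
    printf("%lu\n", pages_for(0, 4096));   /* -> 1: exactly one page */
    printf("%lu\n", pages_for(8, 4096));   /* -> 2: offset spills into
                                            * a second page */
    printf("%lu\n", pages_for(4090, 10));  /* -> 2: straddles a boundary */
    return 0;
}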
!= SRPC_MSG_MAGIC) { - __swab64s(&magic); - __swab32s(&reply->brw_status); - } - - CDEBUG(reply->brw_status ? D_WARNING : D_NET, - "BRW RPC to %s finished with brw_status: %d\n", - libcfs_id2str(rpc->crpc_dest), reply->brw_status); - - if (reply->brw_status) { - atomic_inc(&sn->sn_brw_errors); - rpc->crpc_status = -(int)reply->brw_status; - return; - } - - if (reqst->brw_rw == LST_BRW_WRITE) - return; - - if (brw_check_bulk(&rpc->crpc_bulk, reqst->brw_flags, magic)) { - CERROR("Bulk data from %s is corrupted!\n", - libcfs_id2str(rpc->crpc_dest)); - atomic_inc(&sn->sn_brw_errors); - rpc->crpc_status = -EBADMSG; - } -} - -static void -brw_server_rpc_done(struct srpc_server_rpc *rpc) -{ - struct srpc_bulk *blk = rpc->srpc_bulk; - - if (!blk) - return; - - if (rpc->srpc_status) - CERROR("Bulk transfer %s %s has failed: %d\n", - blk->bk_sink ? "from" : "to", - libcfs_id2str(rpc->srpc_peer), rpc->srpc_status); - else - CDEBUG(D_NET, "Transferred %d pages bulk data %s %s\n", - blk->bk_niov, blk->bk_sink ? "from" : "to", - libcfs_id2str(rpc->srpc_peer)); - - sfw_free_pages(rpc); -} - -static int -brw_bulk_ready(struct srpc_server_rpc *rpc, int status) -{ - __u64 magic = BRW_MAGIC; - struct srpc_brw_reply *reply = &rpc->srpc_replymsg.msg_body.brw_reply; - struct srpc_brw_reqst *reqst; - struct srpc_msg *reqstmsg; - - LASSERT(rpc->srpc_bulk); - LASSERT(rpc->srpc_reqstbuf); - - reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - reqst = &reqstmsg->msg_body.brw_reqst; - - if (status) { - CERROR("BRW bulk %s failed for RPC from %s: %d\n", - reqst->brw_rw == LST_BRW_READ ? "READ" : "WRITE", - libcfs_id2str(rpc->srpc_peer), status); - return -EIO; - } - - if (reqst->brw_rw == LST_BRW_READ) - return 0; - - if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) - __swab64s(&magic); - - if (brw_check_bulk(rpc->srpc_bulk, reqst->brw_flags, magic)) { - CERROR("Bulk data from %s is corrupted!\n", - libcfs_id2str(rpc->srpc_peer)); - reply->brw_status = EBADMSG; - } - - return 0; -} - -static int -brw_server_handle(struct srpc_server_rpc *rpc) -{ - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - struct srpc_msg *replymsg = &rpc->srpc_replymsg; - struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_brw_reply *reply = &replymsg->msg_body.brw_reply; - struct srpc_brw_reqst *reqst = &reqstmsg->msg_body.brw_reqst; - int npg; - int rc; - - LASSERT(sv->sv_id == SRPC_SERVICE_BRW); - - if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { - LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); - - __swab32s(&reqst->brw_rw); - __swab32s(&reqst->brw_len); - __swab32s(&reqst->brw_flags); - __swab64s(&reqst->brw_rpyid); - __swab64s(&reqst->brw_bulkid); - } - LASSERT(reqstmsg->msg_type == (__u32)srpc_service2request(sv->sv_id)); - - reply->brw_status = 0; - rpc->srpc_done = brw_server_rpc_done; - - if ((reqst->brw_rw != LST_BRW_READ && reqst->brw_rw != LST_BRW_WRITE) || - (reqst->brw_flags != LST_BRW_CHECK_NONE && - reqst->brw_flags != LST_BRW_CHECK_FULL && - reqst->brw_flags != LST_BRW_CHECK_SIMPLE)) { - reply->brw_status = EINVAL; - return 0; - } - - if (reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) { - replymsg->msg_ses_feats = LST_FEATS_MASK; - reply->brw_status = EPROTO; - return 0; - } - - if (!(reqstmsg->msg_ses_feats & LST_FEAT_BULK_LEN)) { - /* compat with old version */ - if (reqst->brw_len & ~PAGE_MASK) { - reply->brw_status = EINVAL; - return 0; - } - npg = reqst->brw_len >> PAGE_SHIFT; - - } else { - npg = (reqst->brw_len + PAGE_SIZE - 1) >> PAGE_SHIFT; - } - - replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; - - if 
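[Editor's note] Selftest messages carry a known magic; if it arrives byte-swapped, the peer has the opposite endianness and the receiver swabs each field in place (the __swab64s()/__swab32s() calls above) instead of defining a canonical network byte order. A sketch using the GCC/Clang byte-swap builtins; the magic value here is illustrative, not the real SRPC_MSG_MAGIC:

#include <stdint.h>
#include <stdio.h>

#define MSG_MAGIC   0xeeb2eeb3U

struct msg { uint32_t magic; uint32_t status; uint64_t len; };

/* Returns 0 with the fields fixed up, or -1 for a garbled message. */
static int msg_fixup(struct msg *m)
{
    if (m->magic == MSG_MAGIC)
        return 0;                                /* same endianness */
    if (m->magic != __builtin_bswap32(MSG_MAGIC))
        return -1;                               /* not swabbed: corrupt */
    m->magic = __builtin_bswap32(m->magic);      /* opposite endianness: */
    m->status = __builtin_bswap32(m->status);    /* swab every field */
    m->len = __builtin_bswap64(m->len);
    return 0;
}

int main(void)
{
    struct msg m = { __builtin_bswap32(MSG_MAGIC),
                     __builtin_bswap32(7), __builtin_bswap64(4096) };

    printf("%d status=%u len=%llu\n", msg_fixup(&m), m.status,
           (unsigned long long)m.len);   /* -> 0 status=7 len=4096 */
    return 0;
}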
(!reqst->brw_len || npg > LNET_MAX_IOV) { - reply->brw_status = EINVAL; - return 0; - } - - rc = sfw_alloc_pages(rpc, rpc->srpc_scd->scd_cpt, npg, - reqst->brw_len, - reqst->brw_rw == LST_BRW_WRITE); - if (rc) - return rc; - - if (reqst->brw_rw == LST_BRW_READ) - brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_MAGIC); - else - brw_fill_bulk(rpc->srpc_bulk, reqst->brw_flags, BRW_POISON); - - return 0; -} - -struct sfw_test_client_ops brw_test_client; - -void brw_init_test_client(void) -{ - brw_test_client.tso_init = brw_client_init; - brw_test_client.tso_fini = brw_client_fini; - brw_test_client.tso_prep_rpc = brw_client_prep_rpc; - brw_test_client.tso_done_rpc = brw_client_done_rpc; -}; - -struct srpc_service brw_test_service; - -void brw_init_test_service(void) -{ - brw_test_service.sv_id = SRPC_SERVICE_BRW; - brw_test_service.sv_name = "brw_test"; - brw_test_service.sv_handler = brw_server_handle; - brw_test_service.sv_bulk_ready = brw_bulk_ready; - brw_test_service.sv_wi_total = brw_srv_workitems; -} diff --git a/drivers/staging/lustre/lnet/selftest/conctl.c b/drivers/staging/lustre/lnet/selftest/conctl.c deleted file mode 100644 index 906d82d90c0c9e41c8a62b27c52bbbebbe85793d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/conctl.c +++ /dev/null @@ -1,801 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
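[Editor's note] The BRW server handler above rejects any session feature bits it does not understand, and crucially replies with the mask it does support (LST_FEATS_MASK) so the client can tell a feature mismatch from a plain error. The negotiation in miniature (mask value illustrative):

#include <stdio.h>
#include <errno.h>

#define FEATS_MASK  0x1U    /* e.g. only LST_FEAT_BULK_LEN understood */

/* Validate a peer's advertised feature bits; on failure, report the
 * locally supported mask back so the peer can downgrade. */
static int feats_check(unsigned int peer, unsigned int *reply)
{
    if (peer & ~FEATS_MASK) {
        *reply = FEATS_MASK;
        return -EPROTO;
    }
    *reply = peer;
    return 0;
}

int main(void)
{
    unsigned int reply;

    printf("%d\n", feats_check(0x1, &reply));           /* -> 0 */
    printf("%d 0x%x\n", feats_check(0x9, &reply), reply); /* -> -EPROTO 0x1 */
    return 0;
}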
- * - * lnet/selftest/conctl.c - * - * IOC handle in kernel - * - * Author: Liang Zhen - */ - -#include -#include -#include "console.h" - -static int -lst_session_new_ioctl(struct lstio_session_new_args *args) -{ - char name[LST_NAME_SIZE + 1]; - int rc; - - if (!args->lstio_ses_idp || /* address for output sid */ - !args->lstio_ses_key || /* no key is specified */ - !args->lstio_ses_namep || /* session name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_ses_namep, - args->lstio_ses_nmlen)) { - return -EFAULT; - } - - name[args->lstio_ses_nmlen] = 0; - - rc = lstcon_session_new(name, - args->lstio_ses_key, - args->lstio_ses_feats, - args->lstio_ses_timeout, - args->lstio_ses_force, - args->lstio_ses_idp); - - return rc; -} - -static int -lst_session_end_ioctl(struct lstio_session_end_args *args) -{ - if (args->lstio_ses_key != console_session.ses_key) - return -EACCES; - - return lstcon_session_end(); -} - -static int -lst_session_info_ioctl(struct lstio_session_info_args *args) -{ - /* no checking of key */ - - if (!args->lstio_ses_idp || /* address for output sid */ - !args->lstio_ses_keyp || /* address for output key */ - !args->lstio_ses_featp || /* address for output features */ - !args->lstio_ses_ndinfo || /* address for output ndinfo */ - !args->lstio_ses_namep || /* address for output name */ - args->lstio_ses_nmlen <= 0 || - args->lstio_ses_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_session_info(args->lstio_ses_idp, - args->lstio_ses_keyp, - args->lstio_ses_featp, - args->lstio_ses_ndinfo, - args->lstio_ses_namep, - args->lstio_ses_nmlen); -} - -static int -lst_debug_ioctl(struct lstio_debug_args *args) -{ - char name[LST_NAME_SIZE + 1]; - int client = 1; - int rc; - - if (args->lstio_dbg_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_dbg_resultp) - return -EINVAL; - - if (args->lstio_dbg_namep && /* name of batch/group */ - (args->lstio_dbg_nmlen <= 0 || - args->lstio_dbg_nmlen > LST_NAME_SIZE)) - return -EINVAL; - - if (args->lstio_dbg_namep) { - - if (copy_from_user(name, args->lstio_dbg_namep, - args->lstio_dbg_nmlen)) - return -EFAULT; - - name[args->lstio_dbg_nmlen] = 0; - } - - rc = -EINVAL; - - switch (args->lstio_dbg_type) { - case LST_OPC_SESSION: - rc = lstcon_session_debug(args->lstio_dbg_timeout, - args->lstio_dbg_resultp); - break; - - case LST_OPC_BATCHSRV: - client = 0; - /* fall through */ - case LST_OPC_BATCHCLI: - if (!args->lstio_dbg_namep) - goto out; - - rc = lstcon_batch_debug(args->lstio_dbg_timeout, - name, client, args->lstio_dbg_resultp); - break; - - case LST_OPC_GROUP: - if (!args->lstio_dbg_namep) - goto out; - - rc = lstcon_group_debug(args->lstio_dbg_timeout, - name, args->lstio_dbg_resultp); - break; - - case LST_OPC_NODES: - if (args->lstio_dbg_count <= 0 || - !args->lstio_dbg_idsp) - goto out; - - rc = lstcon_nodes_debug(args->lstio_dbg_timeout, - args->lstio_dbg_count, - args->lstio_dbg_idsp, - args->lstio_dbg_resultp); - break; - - default: - break; - } - -out: - return rc; -} - -static int -lst_group_add_ioctl(struct lstio_group_add_args *args) -{ - char name[LST_NAME_SIZE + 1]; - int rc; - - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_grp_namep || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) - return -EFAULT; - - name[args->lstio_grp_nmlen] = 0; - - 
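[Editor's note] Nearly every ioctl in this file follows the same defensive recipe for a user-supplied name: reject out-of-range lengths before touching memory, copy_from_user() into a stack buffer one byte larger than the maximum, then NUL-terminate explicitly, since the user buffer is never trusted to be terminated. A user-space analogue with memcpy standing in for copy_from_user():

#include <string.h>
#include <stdio.h>

#define NAME_SIZE   32   /* analogue of LST_NAME_SIZE */

static int get_name(char *dst, const char *user_buf, int user_len)
{
    if (!user_buf || user_len <= 0 || user_len > NAME_SIZE)
        return -1;                        /* -EINVAL in the ioctls */

    memcpy(dst, user_buf, user_len);      /* copy_from_user() here */
    dst[user_len] = '\0';                 /* never trust the source */
    return 0;
}

int main(void)
{
    char name[NAME_SIZE + 1];             /* +1 leaves room for the NUL */
    char raw[4] = { 's', 'e', 's', 's' }; /* unterminated input */

    if (!get_name(name, raw, sizeof(raw)))
        printf("%s\n", name);             /* -> sess */
    return 0;
}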
rc = lstcon_group_add(name); - - return rc; -} - -static int -lst_group_del_ioctl(struct lstio_group_del_args *args) -{ - int rc; - char name[LST_NAME_SIZE + 1]; - - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_grp_namep || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) - return -EFAULT; - - name[args->lstio_grp_nmlen] = 0; - - rc = lstcon_group_del(name); - - return rc; -} - -static int -lst_group_update_ioctl(struct lstio_group_update_args *args) -{ - int rc; - char name[LST_NAME_SIZE + 1]; - - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_grp_resultp || - !args->lstio_grp_namep || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) - return -EFAULT; - - name[args->lstio_grp_nmlen] = 0; - - switch (args->lstio_grp_opc) { - case LST_GROUP_CLEAN: - rc = lstcon_group_clean(name, args->lstio_grp_args); - break; - - case LST_GROUP_REFRESH: - rc = lstcon_group_refresh(name, args->lstio_grp_resultp); - break; - - case LST_GROUP_RMND: - if (args->lstio_grp_count <= 0 || - !args->lstio_grp_idsp) { - rc = -EINVAL; - break; - } - rc = lstcon_nodes_remove(name, args->lstio_grp_count, - args->lstio_grp_idsp, - args->lstio_grp_resultp); - break; - - default: - rc = -EINVAL; - break; - } - - return rc; -} - -static int -lst_nodes_add_ioctl(struct lstio_group_nodes_args *args) -{ - unsigned int feats; - int rc; - char name[LST_NAME_SIZE + 1]; - - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_grp_idsp || /* array of ids */ - args->lstio_grp_count <= 0 || - !args->lstio_grp_resultp || - !args->lstio_grp_featp || - !args->lstio_grp_namep || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) - return -EFAULT; - - name[args->lstio_grp_nmlen] = 0; - - rc = lstcon_nodes_add(name, args->lstio_grp_count, - args->lstio_grp_idsp, &feats, - args->lstio_grp_resultp); - - if (!rc && - copy_to_user(args->lstio_grp_featp, &feats, sizeof(feats))) { - return -EINVAL; - } - - return rc; -} - -static int -lst_group_list_ioctl(struct lstio_group_list_args *args) -{ - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; - - if (args->lstio_grp_idx < 0 || - !args->lstio_grp_namep || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_group_list(args->lstio_grp_idx, - args->lstio_grp_nmlen, - args->lstio_grp_namep); -} - -static int -lst_group_info_ioctl(struct lstio_group_info_args *args) -{ - char name[LST_NAME_SIZE + 1]; - int ndent; - int index; - int rc; - - if (args->lstio_grp_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_grp_namep || - args->lstio_grp_nmlen <= 0 || - args->lstio_grp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (!args->lstio_grp_entp && /* output: group entry */ - !args->lstio_grp_dentsp) /* output: node entry */ - return -EINVAL; - - if (args->lstio_grp_dentsp) { /* have node entry */ - if (!args->lstio_grp_idxp || /* node index */ - !args->lstio_grp_ndentp) /* # of node entry */ - return -EINVAL; - - if (copy_from_user(&ndent, args->lstio_grp_ndentp, - sizeof(ndent)) || - copy_from_user(&index, args->lstio_grp_idxp, - 
sizeof(index))) - return -EFAULT; - - if (ndent <= 0 || index < 0) - return -EINVAL; - } - - if (copy_from_user(name, args->lstio_grp_namep, - args->lstio_grp_nmlen)) - return -EFAULT; - - name[args->lstio_grp_nmlen] = 0; - - rc = lstcon_group_info(name, args->lstio_grp_entp, - &index, &ndent, args->lstio_grp_dentsp); - - if (rc) - return rc; - - if (args->lstio_grp_dentsp && - (copy_to_user(args->lstio_grp_idxp, &index, sizeof(index)) || - copy_to_user(args->lstio_grp_ndentp, &ndent, sizeof(ndent)))) - return -EFAULT; - - return 0; -} - -static int -lst_batch_add_ioctl(struct lstio_batch_add_args *args) -{ - int rc; - char name[LST_NAME_SIZE + 1]; - - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_bat_namep || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_bat_namep, - args->lstio_bat_nmlen)) - return -EFAULT; - - name[args->lstio_bat_nmlen] = 0; - - rc = lstcon_batch_add(name); - - return rc; -} - -static int -lst_batch_run_ioctl(struct lstio_batch_run_args *args) -{ - int rc; - char name[LST_NAME_SIZE + 1]; - - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_bat_namep || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_bat_namep, - args->lstio_bat_nmlen)) - return -EFAULT; - - name[args->lstio_bat_nmlen] = 0; - - rc = lstcon_batch_run(name, args->lstio_bat_timeout, - args->lstio_bat_resultp); - - return rc; -} - -static int -lst_batch_stop_ioctl(struct lstio_batch_stop_args *args) -{ - int rc; - char name[LST_NAME_SIZE + 1]; - - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_bat_resultp || - !args->lstio_bat_namep || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (copy_from_user(name, args->lstio_bat_namep, - args->lstio_bat_nmlen)) - return -EFAULT; - - name[args->lstio_bat_nmlen] = 0; - - rc = lstcon_batch_stop(name, args->lstio_bat_force, - args->lstio_bat_resultp); - - return rc; -} - -static int -lst_batch_query_ioctl(struct lstio_batch_query_args *args) -{ - char name[LST_NAME_SIZE + 1]; - int rc; - - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_bat_resultp || - !args->lstio_bat_namep || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (args->lstio_bat_testidx < 0) - return -EINVAL; - - if (copy_from_user(name, args->lstio_bat_namep, - args->lstio_bat_nmlen)) - return -EFAULT; - - name[args->lstio_bat_nmlen] = 0; - - rc = lstcon_test_batch_query(name, - args->lstio_bat_testidx, - args->lstio_bat_client, - args->lstio_bat_timeout, - args->lstio_bat_resultp); - - return rc; -} - -static int -lst_batch_list_ioctl(struct lstio_batch_list_args *args) -{ - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (args->lstio_bat_idx < 0 || - !args->lstio_bat_namep || - args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - return lstcon_batch_list(args->lstio_bat_idx, - args->lstio_bat_nmlen, - args->lstio_bat_namep); -} - -static int -lst_batch_info_ioctl(struct lstio_batch_info_args *args) -{ - char name[LST_NAME_SIZE + 1]; - int rc; - int index; - int ndent; - - if (args->lstio_bat_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_bat_namep || /* batch name */ - 
args->lstio_bat_nmlen <= 0 || - args->lstio_bat_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (!args->lstio_bat_entp && /* output: batch entry */ - !args->lstio_bat_dentsp) /* output: node entry */ - return -EINVAL; - - if (args->lstio_bat_dentsp) { /* have node entry */ - if (!args->lstio_bat_idxp || /* node index */ - !args->lstio_bat_ndentp) /* # of node entry */ - return -EINVAL; - - if (copy_from_user(&index, args->lstio_bat_idxp, - sizeof(index)) || - copy_from_user(&ndent, args->lstio_bat_ndentp, - sizeof(ndent))) - return -EFAULT; - - if (ndent <= 0 || index < 0) - return -EINVAL; - } - - if (copy_from_user(name, args->lstio_bat_namep, - args->lstio_bat_nmlen)) - return -EFAULT; - - name[args->lstio_bat_nmlen] = 0; - - rc = lstcon_batch_info(name, args->lstio_bat_entp, - args->lstio_bat_server, args->lstio_bat_testidx, - &index, &ndent, args->lstio_bat_dentsp); - - if (rc) - return rc; - - if (args->lstio_bat_dentsp && - (copy_to_user(args->lstio_bat_idxp, &index, sizeof(index)) || - copy_to_user(args->lstio_bat_ndentp, &ndent, sizeof(ndent)))) - rc = -EFAULT; - - return rc; -} - -static int -lst_stat_query_ioctl(struct lstio_stat_args *args) -{ - int rc; - char name[LST_NAME_SIZE + 1]; - - /* TODO: not finished */ - if (args->lstio_sta_key != console_session.ses_key) - return -EACCES; - - if (!args->lstio_sta_resultp) - return -EINVAL; - - if (args->lstio_sta_idsp) { - if (args->lstio_sta_count <= 0) - return -EINVAL; - - rc = lstcon_nodes_stat(args->lstio_sta_count, - args->lstio_sta_idsp, - args->lstio_sta_timeout, - args->lstio_sta_resultp); - } else if (args->lstio_sta_namep) { - if (args->lstio_sta_nmlen <= 0 || - args->lstio_sta_nmlen > LST_NAME_SIZE) - return -EINVAL; - - rc = copy_from_user(name, args->lstio_sta_namep, - args->lstio_sta_nmlen); - if (!rc) - rc = lstcon_group_stat(name, args->lstio_sta_timeout, - args->lstio_sta_resultp); - else - rc = -EFAULT; - } else { - rc = -EINVAL; - } - - return rc; -} - -static int lst_test_add_ioctl(struct lstio_test_args *args) -{ - char batch_name[LST_NAME_SIZE + 1]; - char src_name[LST_NAME_SIZE + 1]; - char dst_name[LST_NAME_SIZE + 1]; - void *param = NULL; - int ret = 0; - int rc = -ENOMEM; - - if (!args->lstio_tes_resultp || - !args->lstio_tes_retp || - !args->lstio_tes_bat_name || /* no specified batch */ - args->lstio_tes_bat_nmlen <= 0 || - args->lstio_tes_bat_nmlen > LST_NAME_SIZE || - !args->lstio_tes_sgrp_name || /* no source group */ - args->lstio_tes_sgrp_nmlen <= 0 || - args->lstio_tes_sgrp_nmlen > LST_NAME_SIZE || - !args->lstio_tes_dgrp_name || /* no target group */ - args->lstio_tes_dgrp_nmlen <= 0 || - args->lstio_tes_dgrp_nmlen > LST_NAME_SIZE) - return -EINVAL; - - if (!args->lstio_tes_loop || /* negative is infinite */ - args->lstio_tes_concur <= 0 || - args->lstio_tes_dist <= 0 || - args->lstio_tes_span <= 0) - return -EINVAL; - - /* have parameter, check if parameter length is valid */ - if (args->lstio_tes_param && - (args->lstio_tes_param_len <= 0 || - args->lstio_tes_param_len > - PAGE_SIZE - sizeof(struct lstcon_test))) - return -EINVAL; - - /* Enforce zero parameter length if there's no parameter */ - if (!args->lstio_tes_param && args->lstio_tes_param_len) - return -EINVAL; - - if (args->lstio_tes_param) { - param = memdup_user(args->lstio_tes_param, - args->lstio_tes_param_len); - if (IS_ERR(param)) - return PTR_ERR(param); - } - - rc = -EFAULT; - if (copy_from_user(batch_name, args->lstio_tes_bat_name, - args->lstio_tes_bat_nmlen) || - copy_from_user(src_name, args->lstio_tes_sgrp_name, - 
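[Editor's note] lst_test_add_ioctl() handles its one variable-length input with memdup_user(), but only after pinning the length to a sane range and insisting that pointer and length agree (no pointer means zero length, and vice versa), closing the classic "kernel allocates whatever size userspace claims" hole. The shape of that check, with malloc+memcpy standing in for memdup_user():

#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#define PARAM_MAX   4096   /* analogue of the PAGE_SIZE-derived cap */

/* Duplicate an optional user parameter. Returns 0 with *out set
 * (possibly to NULL) or a negative errno. */
static int dup_param(const void *uptr, long len, void **out)
{
    void *p;

    if (!uptr) {
        if (len)
            return -EINVAL;   /* length without data */
        *out = NULL;
        return 0;
    }
    if (len <= 0 || len > PARAM_MAX)
        return -EINVAL;

    p = malloc(len);
    if (!p)
        return -ENOMEM;
    memcpy(p, uptr, len);     /* copy_from_user() in-kernel */
    *out = p;
    return 0;
}

int main(void)
{
    void *p = NULL;

    printf("%d\n", dup_param(NULL, 8, &p));   /* -> -EINVAL */
    printf("%d\n", dup_param("abc", 3, &p));  /* -> 0 */
    free(p);
    return 0;
}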
args->lstio_tes_sgrp_nmlen) || - copy_from_user(dst_name, args->lstio_tes_dgrp_name, - args->lstio_tes_dgrp_nmlen)) - goto out; - - rc = lstcon_test_add(batch_name, args->lstio_tes_type, - args->lstio_tes_loop, args->lstio_tes_concur, - args->lstio_tes_dist, args->lstio_tes_span, - src_name, dst_name, param, - args->lstio_tes_param_len, - &ret, args->lstio_tes_resultp); - - if (!rc && ret) - rc = (copy_to_user(args->lstio_tes_retp, &ret, - sizeof(ret))) ? -EFAULT : 0; -out: - kfree(param); - - return rc; -} - -int -lstcon_ioctl_entry(struct notifier_block *nb, - unsigned long cmd, void *vdata) -{ - struct libcfs_ioctl_hdr *hdr = vdata; - char *buf = NULL; - struct libcfs_ioctl_data *data; - int opc; - int rc = -EINVAL; - - if (cmd != IOC_LIBCFS_LNETST) - goto err; - - data = container_of(hdr, struct libcfs_ioctl_data, ioc_hdr); - - opc = data->ioc_u32[0]; - - if (data->ioc_plen1 > PAGE_SIZE) - goto err; - - buf = kmalloc(data->ioc_plen1, GFP_KERNEL); - rc = -ENOMEM; - if (!buf) - goto err; - - /* copy in parameter */ - rc = -EFAULT; - if (copy_from_user(buf, data->ioc_pbuf1, data->ioc_plen1)) - goto err; - - mutex_lock(&console_session.ses_mutex); - - console_session.ses_laststamp = ktime_get_real_seconds(); - - if (console_session.ses_shutdown) { - rc = -ESHUTDOWN; - goto out; - } - - if (console_session.ses_expired) - lstcon_session_end(); - - if (opc != LSTIO_SESSION_NEW && - console_session.ses_state == LST_SESSION_NONE) { - CDEBUG(D_NET, "LST no active session\n"); - rc = -ESRCH; - goto out; - } - - memset(&console_session.ses_trans_stat, 0, sizeof(struct lstcon_trans_stat)); - - switch (opc) { - case LSTIO_SESSION_NEW: - rc = lst_session_new_ioctl((struct lstio_session_new_args *)buf); - break; - case LSTIO_SESSION_END: - rc = lst_session_end_ioctl((struct lstio_session_end_args *)buf); - break; - case LSTIO_SESSION_INFO: - rc = lst_session_info_ioctl((struct lstio_session_info_args *)buf); - break; - case LSTIO_DEBUG: - rc = lst_debug_ioctl((struct lstio_debug_args *)buf); - break; - case LSTIO_GROUP_ADD: - rc = lst_group_add_ioctl((struct lstio_group_add_args *)buf); - break; - case LSTIO_GROUP_DEL: - rc = lst_group_del_ioctl((struct lstio_group_del_args *)buf); - break; - case LSTIO_GROUP_UPDATE: - rc = lst_group_update_ioctl((struct lstio_group_update_args *)buf); - break; - case LSTIO_NODES_ADD: - rc = lst_nodes_add_ioctl((struct lstio_group_nodes_args *)buf); - break; - case LSTIO_GROUP_LIST: - rc = lst_group_list_ioctl((struct lstio_group_list_args *)buf); - break; - case LSTIO_GROUP_INFO: - rc = lst_group_info_ioctl((struct lstio_group_info_args *)buf); - break; - case LSTIO_BATCH_ADD: - rc = lst_batch_add_ioctl((struct lstio_batch_add_args *)buf); - break; - case LSTIO_BATCH_START: - rc = lst_batch_run_ioctl((struct lstio_batch_run_args *)buf); - break; - case LSTIO_BATCH_STOP: - rc = lst_batch_stop_ioctl((struct lstio_batch_stop_args *)buf); - break; - case LSTIO_BATCH_QUERY: - rc = lst_batch_query_ioctl((struct lstio_batch_query_args *)buf); - break; - case LSTIO_BATCH_LIST: - rc = lst_batch_list_ioctl((struct lstio_batch_list_args *)buf); - break; - case LSTIO_BATCH_INFO: - rc = lst_batch_info_ioctl((struct lstio_batch_info_args *)buf); - break; - case LSTIO_TEST_ADD: - rc = lst_test_add_ioctl((struct lstio_test_args *)buf); - break; - case LSTIO_STAT_QUERY: - rc = lst_stat_query_ioctl((struct lstio_stat_args *)buf); - break; - default: - rc = -EINVAL; - goto out; - } - - if (copy_to_user(data->ioc_pbuf2, &console_session.ses_trans_stat, - sizeof(struct 
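lstcon_ioctl_entry() dispatches the LSTIO_* opcodes through one long switch after a common preamble (bounded kmalloc, copy_from_user, session checks). The sketch below shows an equivalent table-driven shape with hypothetical opcodes and handlers; it is an alternative presentation of the same dispatch, not what the driver actually did:

```c
#include <stdio.h>
#include <errno.h>

/* Hypothetical opcode numbering, for illustration only. */
enum { LSTIO_SESSION_NEW, LSTIO_SESSION_END, LSTIO_OPC_MAX };

typedef int (*lst_ioctl_fn)(void *buf);

static int session_new(void *buf) { (void)buf; return 0; }
static int session_end(void *buf) { (void)buf; return 0; }

static const lst_ioctl_fn lst_ioctl_table[LSTIO_OPC_MAX] = {
	[LSTIO_SESSION_NEW] = session_new,
	[LSTIO_SESSION_END] = session_end,
};

static int dispatch(unsigned int opc, void *buf)
{
	if (opc >= LSTIO_OPC_MAX || !lst_ioctl_table[opc])
		return -EINVAL;	/* unknown opcode, like the default: case */
	return lst_ioctl_table[opc](buf);
}

int main(void)
{
	printf("%d %d\n", dispatch(LSTIO_SESSION_NEW, NULL),
	       dispatch(99, NULL));	/* 0 -22 */
	return 0;
}
```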
lstcon_trans_stat))) - rc = -EFAULT; -out: - mutex_unlock(&console_session.ses_mutex); -err: - kfree(buf); - - return notifier_from_ioctl_errno(rc); -} diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.c b/drivers/staging/lustre/lnet/selftest/conrpc.c deleted file mode 100644 index 0dabade3d091088ef68bab619728bed732808084..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/conrpc.c +++ /dev/null @@ -1,1396 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/conctl.c - * - * Console framework rpcs - * - * Author: Liang Zhen - */ - -#include -#include "timer.h" -#include "conrpc.h" -#include "console.h" - -void lstcon_rpc_stat_reply(struct lstcon_rpc_trans *, struct srpc_msg *, - struct lstcon_node *, struct lstcon_trans_stat *); - -static void -lstcon_rpc_done(struct srpc_client_rpc *rpc) -{ - struct lstcon_rpc *crpc = (struct lstcon_rpc *)rpc->crpc_priv; - - LASSERT(crpc && rpc == crpc->crp_rpc); - LASSERT(crpc->crp_posted && !crpc->crp_finished); - - spin_lock(&rpc->crpc_lock); - - if (!crpc->crp_trans) { - /* - * Orphan RPC is not in any transaction, - * I'm just a poor body and nobody loves me - */ - spin_unlock(&rpc->crpc_lock); - - /* release it */ - lstcon_rpc_put(crpc); - return; - } - - /* not an orphan RPC */ - crpc->crp_finished = 1; - - if (!crpc->crp_stamp) { - /* not aborted */ - LASSERT(!crpc->crp_status); - - crpc->crp_stamp = jiffies; - crpc->crp_status = rpc->crpc_status; - } - - /* wakeup (transaction)thread if I'm the last RPC in the transaction */ - if (atomic_dec_and_test(&crpc->crp_trans->tas_remaining)) - wake_up(&crpc->crp_trans->tas_waitq); - - spin_unlock(&rpc->crpc_lock); -} - -static int -lstcon_rpc_init(struct lstcon_node *nd, int service, unsigned int feats, - int bulk_npg, int bulk_len, int embedded, - struct lstcon_rpc *crpc) -{ - crpc->crp_rpc = sfw_create_rpc(nd->nd_id, service, - feats, bulk_npg, bulk_len, - lstcon_rpc_done, (void *)crpc); - if (!crpc->crp_rpc) - return -ENOMEM; - - crpc->crp_trans = NULL; - crpc->crp_node = nd; - crpc->crp_posted = 0; - crpc->crp_finished = 0; - crpc->crp_unpacked = 0; - crpc->crp_status = 0; - crpc->crp_stamp = 0; - crpc->crp_embedded = embedded; - INIT_LIST_HEAD(&crpc->crp_link); - - atomic_inc(&console_session.ses_rpc_counter); - - return 0; -} - -static int -lstcon_rpc_prep(struct lstcon_node *nd, int service, unsigned int feats, - int bulk_npg, int bulk_len, 
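lstcon_rpc_done() above wakes the waiting transaction only when the last outstanding RPC finishes, via atomic_dec_and_test(&tas_remaining). The C11 stdatomic sketch below shows the same last-one-wakes idea; atomic_fetch_sub() returning 1 corresponds to dec_and_test() returning true:

```c
#include <stdatomic.h>
#include <stdio.h>

static atomic_int remaining = 3;	/* three RPCs in flight */

static void rpc_done(void)
{
	/* fetch_sub returns the old value; old == 1 means we hit zero */
	if (atomic_fetch_sub(&remaining, 1) == 1)
		printf("last RPC done, wake the transaction\n");
}

int main(void)
{
	rpc_done();
	rpc_done();
	rpc_done();	/* only this call prints */
	return 0;
}
```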
struct lstcon_rpc **crpcpp) -{ - struct lstcon_rpc *crpc = NULL; - int rc; - - spin_lock(&console_session.ses_rpc_lock); - - crpc = list_first_entry_or_null(&console_session.ses_rpc_freelist, - struct lstcon_rpc, crp_link); - if (crpc) - list_del_init(&crpc->crp_link); - - spin_unlock(&console_session.ses_rpc_lock); - - if (!crpc) { - crpc = kzalloc(sizeof(*crpc), GFP_NOFS); - if (!crpc) - return -ENOMEM; - } - - rc = lstcon_rpc_init(nd, service, feats, bulk_npg, bulk_len, 0, crpc); - if (!rc) { - *crpcpp = crpc; - return 0; - } - - kfree(crpc); - - return rc; -} - -void -lstcon_rpc_put(struct lstcon_rpc *crpc) -{ - struct srpc_bulk *bulk = &crpc->crp_rpc->crpc_bulk; - int i; - - LASSERT(list_empty(&crpc->crp_link)); - - for (i = 0; i < bulk->bk_niov; i++) { - if (!bulk->bk_iovs[i].bv_page) - continue; - - __free_page(bulk->bk_iovs[i].bv_page); - } - - srpc_client_rpc_decref(crpc->crp_rpc); - - if (crpc->crp_embedded) { - /* embedded RPC, don't recycle it */ - memset(crpc, 0, sizeof(*crpc)); - crpc->crp_embedded = 1; - - } else { - spin_lock(&console_session.ses_rpc_lock); - - list_add(&crpc->crp_link, - &console_session.ses_rpc_freelist); - - spin_unlock(&console_session.ses_rpc_lock); - } - - /* RPC is not alive now */ - atomic_dec(&console_session.ses_rpc_counter); -} - -static void -lstcon_rpc_post(struct lstcon_rpc *crpc) -{ - struct lstcon_rpc_trans *trans = crpc->crp_trans; - - LASSERT(trans); - - atomic_inc(&trans->tas_remaining); - crpc->crp_posted = 1; - - sfw_post_rpc(crpc->crp_rpc); -} - -static char * -lstcon_rpc_trans_name(int transop) -{ - if (transop == LST_TRANS_SESNEW) - return "SESNEW"; - - if (transop == LST_TRANS_SESEND) - return "SESEND"; - - if (transop == LST_TRANS_SESQRY) - return "SESQRY"; - - if (transop == LST_TRANS_SESPING) - return "SESPING"; - - if (transop == LST_TRANS_TSBCLIADD) - return "TSBCLIADD"; - - if (transop == LST_TRANS_TSBSRVADD) - return "TSBSRVADD"; - - if (transop == LST_TRANS_TSBRUN) - return "TSBRUN"; - - if (transop == LST_TRANS_TSBSTOP) - return "TSBSTOP"; - - if (transop == LST_TRANS_TSBCLIQRY) - return "TSBCLIQRY"; - - if (transop == LST_TRANS_TSBSRVQRY) - return "TSBSRVQRY"; - - if (transop == LST_TRANS_STATQRY) - return "STATQRY"; - - return "Unknown"; -} - -int -lstcon_rpc_trans_prep(struct list_head *translist, int transop, - struct lstcon_rpc_trans **transpp) -{ - struct lstcon_rpc_trans *trans; - - if (translist) { - list_for_each_entry(trans, translist, tas_link) { - /* - * Can't enqueue two private transaction on - * the same object - */ - if ((trans->tas_opc & transop) == LST_TRANS_PRIVATE) - return -EPERM; - } - } - - /* create a trans group */ - trans = kzalloc(sizeof(*trans), GFP_NOFS); - if (!trans) - return -ENOMEM; - - trans->tas_opc = transop; - - if (!translist) - INIT_LIST_HEAD(&trans->tas_olink); - else - list_add_tail(&trans->tas_olink, translist); - - list_add_tail(&trans->tas_link, &console_session.ses_trans_list); - - INIT_LIST_HEAD(&trans->tas_rpcs_list); - atomic_set(&trans->tas_remaining, 0); - init_waitqueue_head(&trans->tas_waitq); - - spin_lock(&console_session.ses_rpc_lock); - trans->tas_features = console_session.ses_features; - spin_unlock(&console_session.ses_rpc_lock); - - *transpp = trans; - return 0; -} - -void -lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, struct lstcon_rpc *crpc) -{ - list_add_tail(&crpc->crp_link, &trans->tas_rpcs_list); - crpc->crp_trans = trans; -} - -void -lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int error) -{ - struct srpc_client_rpc *rpc; - struct 
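lstcon_rpc_prep() and lstcon_rpc_put() recycle RPC descriptors through ses_rpc_freelist: allocation first tries to pop a cached entry under the spinlock and only then falls back to kzalloc(), while release pushes non-embedded entries back instead of freeing them. A single-threaded userspace sketch of that free-list discipline (so no lock is modeled):

```c
#include <stdlib.h>
#include <stdio.h>

struct crpc {
	struct crpc *next;	/* stands in for the crp_link list_head */
	int payload;
};

static struct crpc *freelist;

static struct crpc *crpc_get(void)
{
	struct crpc *c = freelist;

	if (c)
		freelist = c->next;		/* hot path: recycle */
	else
		c = calloc(1, sizeof(*c));	/* cold path: fresh allocation */
	return c;
}

static void crpc_put(struct crpc *c)
{
	c->next = freelist;	/* keep it for the next caller */
	freelist = c;
}

int main(void)
{
	struct crpc *a = crpc_get();

	crpc_put(a);
	printf("recycled: %s\n", crpc_get() == a ? "yes" : "no");
	return 0;
}
```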
lstcon_rpc *crpc; - struct lstcon_node *nd; - - list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { - rpc = crpc->crp_rpc; - - spin_lock(&rpc->crpc_lock); - - if (!crpc->crp_posted || /* not posted */ - crpc->crp_stamp) { /* rpc done or aborted already */ - if (!crpc->crp_stamp) { - crpc->crp_stamp = jiffies; - crpc->crp_status = -EINTR; - } - spin_unlock(&rpc->crpc_lock); - continue; - } - - crpc->crp_stamp = jiffies; - crpc->crp_status = error; - - spin_unlock(&rpc->crpc_lock); - - sfw_abort_rpc(rpc); - - if (error != -ETIMEDOUT) - continue; - - nd = crpc->crp_node; - if (time_after(nd->nd_stamp, crpc->crp_stamp)) - continue; - - nd->nd_stamp = crpc->crp_stamp; - nd->nd_state = LST_NODE_DOWN; - } -} - -static int -lstcon_rpc_trans_check(struct lstcon_rpc_trans *trans) -{ - if (console_session.ses_shutdown && - !list_empty(&trans->tas_olink)) /* Not an end session RPC */ - return 1; - - return !atomic_read(&trans->tas_remaining) ? 1 : 0; -} - -int -lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout) -{ - struct lstcon_rpc *crpc; - int rc; - - if (list_empty(&trans->tas_rpcs_list)) - return 0; - - if (timeout < LST_TRANS_MIN_TIMEOUT) - timeout = LST_TRANS_MIN_TIMEOUT; - - CDEBUG(D_NET, "Transaction %s started\n", - lstcon_rpc_trans_name(trans->tas_opc)); - - /* post all requests */ - list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { - LASSERT(!crpc->crp_posted); - - lstcon_rpc_post(crpc); - } - - mutex_unlock(&console_session.ses_mutex); - - rc = wait_event_interruptible_timeout(trans->tas_waitq, - lstcon_rpc_trans_check(trans), - timeout * HZ); - rc = (rc > 0) ? 0 : ((rc < 0) ? -EINTR : -ETIMEDOUT); - - mutex_lock(&console_session.ses_mutex); - - if (console_session.ses_shutdown) - rc = -ESHUTDOWN; - - if (rc || atomic_read(&trans->tas_remaining)) { - /* treat short timeout as canceled */ - if (rc == -ETIMEDOUT && timeout < LST_TRANS_MIN_TIMEOUT * 2) - rc = -EINTR; - - lstcon_rpc_trans_abort(trans, rc); - } - - CDEBUG(D_NET, "Transaction %s stopped: %d\n", - lstcon_rpc_trans_name(trans->tas_opc), rc); - - lstcon_rpc_trans_stat(trans, lstcon_trans_stat()); - - return rc; -} - -static int -lstcon_rpc_get_reply(struct lstcon_rpc *crpc, struct srpc_msg **msgpp) -{ - struct lstcon_node *nd = crpc->crp_node; - struct srpc_client_rpc *rpc = crpc->crp_rpc; - struct srpc_generic_reply *rep; - - LASSERT(nd && rpc); - LASSERT(crpc->crp_stamp); - - if (crpc->crp_status) { - *msgpp = NULL; - return crpc->crp_status; - } - - *msgpp = &rpc->crpc_replymsg; - if (!crpc->crp_unpacked) { - sfw_unpack_message(*msgpp); - crpc->crp_unpacked = 1; - } - - if (time_after(nd->nd_stamp, crpc->crp_stamp)) - return 0; - - nd->nd_stamp = crpc->crp_stamp; - rep = &(*msgpp)->msg_body.reply; - - if (rep->sid.ses_nid == LNET_NID_ANY) - nd->nd_state = LST_NODE_UNKNOWN; - else if (lstcon_session_match(rep->sid)) - nd->nd_state = LST_NODE_ACTIVE; - else - nd->nd_state = LST_NODE_BUSY; - - return 0; -} - -void -lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, struct lstcon_trans_stat *stat) -{ - struct lstcon_rpc *crpc; - struct srpc_msg *rep; - int error; - - LASSERT(stat); - - memset(stat, 0, sizeof(*stat)); - - list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { - lstcon_rpc_stat_total(stat, 1); - - LASSERT(crpc->crp_stamp); - - error = lstcon_rpc_get_reply(crpc, &rep); - if (error) { - lstcon_rpc_stat_failure(stat, 1); - if (!stat->trs_rpc_errno) - stat->trs_rpc_errno = -error; - - continue; - } - - lstcon_rpc_stat_success(stat, 1); - - lstcon_rpc_stat_reply(trans, rep, 
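lstcon_rpc_trans_postwait() folds the three-way result of wait_event_interruptible_timeout() (positive when the condition became true, zero on timeout, negative on signal) into 0, -ETIMEDOUT, or -EINTR. The mapping as a pure function:

```c
#include <stdio.h>
#include <errno.h>

static int fold_wait_result(long rc)
{
	if (rc > 0)
		return 0;	/* condition became true in time */
	return rc < 0 ? -EINTR : -ETIMEDOUT;
}

int main(void)
{
	printf("%d %d %d\n", fold_wait_result(5), fold_wait_result(0),
	       fold_wait_result(-1));	/* 0 -110 -4 on Linux errno values */
	return 0;
}
```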
crpc->crp_node, stat); - } - - if (trans->tas_opc == LST_TRANS_SESNEW && !stat->trs_fwk_errno) { - stat->trs_fwk_errno = - lstcon_session_feats_check(trans->tas_features); - } - - CDEBUG(D_NET, "transaction %s : success %d, failure %d, total %d, RPC error(%d), Framework error(%d)\n", - lstcon_rpc_trans_name(trans->tas_opc), - lstcon_rpc_stat_success(stat, 0), - lstcon_rpc_stat_failure(stat, 0), - lstcon_rpc_stat_total(stat, 0), - stat->trs_rpc_errno, stat->trs_fwk_errno); -} - -int -lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, - struct list_head __user *head_up, - lstcon_rpc_readent_func_t readent) -{ - struct list_head tmp; - struct list_head __user *next; - struct lstcon_rpc_ent *ent; - struct srpc_generic_reply *rep; - struct lstcon_rpc *crpc; - struct srpc_msg *msg; - struct lstcon_node *nd; - long dur; - struct timeval tv; - int error; - - LASSERT(head_up); - - next = head_up; - - list_for_each_entry(crpc, &trans->tas_rpcs_list, crp_link) { - if (copy_from_user(&tmp, next, - sizeof(struct list_head))) - return -EFAULT; - - next = tmp.next; - if (next == head_up) - return 0; - - ent = list_entry(next, struct lstcon_rpc_ent, rpe_link); - - LASSERT(crpc->crp_stamp); - - error = lstcon_rpc_get_reply(crpc, &msg); - - nd = crpc->crp_node; - - dur = (long)(crpc->crp_stamp - - (unsigned long)console_session.ses_id.ses_stamp); - jiffies_to_timeval(dur, &tv); - - if (copy_to_user(&ent->rpe_peer, &nd->nd_id, - sizeof(struct lnet_process_id)) || - copy_to_user(&ent->rpe_stamp, &tv, sizeof(tv)) || - copy_to_user(&ent->rpe_state, &nd->nd_state, - sizeof(nd->nd_state)) || - copy_to_user(&ent->rpe_rpc_errno, &error, - sizeof(error))) - return -EFAULT; - - if (error) - continue; - - /* RPC is done */ - rep = (struct srpc_generic_reply *)&msg->msg_body.reply; - - if (copy_to_user(&ent->rpe_sid, &rep->sid, sizeof(rep->sid)) || - copy_to_user(&ent->rpe_fwk_errno, &rep->status, - sizeof(rep->status))) - return -EFAULT; - - if (!readent) - continue; - - error = readent(trans->tas_opc, msg, ent); - if (error) - return error; - } - - return 0; -} - -void -lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans) -{ - struct srpc_client_rpc *rpc; - struct lstcon_rpc *crpc; - struct lstcon_rpc *tmp; - int count = 0; - - list_for_each_entry_safe(crpc, tmp, &trans->tas_rpcs_list, crp_link) { - rpc = crpc->crp_rpc; - - spin_lock(&rpc->crpc_lock); - - /* free it if not posted or finished already */ - if (!crpc->crp_posted || crpc->crp_finished) { - spin_unlock(&rpc->crpc_lock); - - list_del_init(&crpc->crp_link); - lstcon_rpc_put(crpc); - - continue; - } - - /* - * rpcs can be still not callbacked (even LNetMDUnlink is - * called) because huge timeout for inaccessible network, - * don't make user wait for them, just abandon them, they - * will be recycled in callback - */ - LASSERT(crpc->crp_status); - - crpc->crp_node = NULL; - crpc->crp_trans = NULL; - list_del_init(&crpc->crp_link); - count++; - - spin_unlock(&rpc->crpc_lock); - - atomic_dec(&trans->tas_remaining); - } - - LASSERT(!atomic_read(&trans->tas_remaining)); - - list_del(&trans->tas_link); - if (!list_empty(&trans->tas_olink)) - list_del(&trans->tas_olink); - - CDEBUG(D_NET, "Transaction %s destroyed with %d pending RPCs\n", - lstcon_rpc_trans_name(trans->tas_opc), count); - - kfree(trans); -} - -int -lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, - unsigned int feats, struct lstcon_rpc **crpc) -{ - struct srpc_mksn_reqst *msrq; - struct srpc_rmsn_reqst *rsrq; - int rc; - - switch (transop) { - case LST_TRANS_SESNEW: - rc = 
lstcon_rpc_prep(nd, SRPC_SERVICE_MAKE_SESSION, - feats, 0, 0, crpc); - if (rc) - return rc; - - msrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.mksn_reqst; - msrq->mksn_sid = console_session.ses_id; - msrq->mksn_force = console_session.ses_force; - strlcpy(msrq->mksn_name, console_session.ses_name, - sizeof(msrq->mksn_name)); - break; - - case LST_TRANS_SESEND: - rc = lstcon_rpc_prep(nd, SRPC_SERVICE_REMOVE_SESSION, - feats, 0, 0, crpc); - if (rc) - return rc; - - rsrq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.rmsn_reqst; - rsrq->rmsn_sid = console_session.ses_id; - break; - - default: - LBUG(); - } - - return 0; -} - -int -lstcon_dbgrpc_prep(struct lstcon_node *nd, unsigned int feats, - struct lstcon_rpc **crpc) -{ - struct srpc_debug_reqst *drq; - int rc; - - rc = lstcon_rpc_prep(nd, SRPC_SERVICE_DEBUG, feats, 0, 0, crpc); - if (rc) - return rc; - - drq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; - - drq->dbg_sid = console_session.ses_id; - drq->dbg_flags = 0; - - return rc; -} - -int -lstcon_batrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, - struct lstcon_tsb_hdr *tsb, struct lstcon_rpc **crpc) -{ - struct lstcon_batch *batch; - struct srpc_batch_reqst *brq; - int rc; - - rc = lstcon_rpc_prep(nd, SRPC_SERVICE_BATCH, feats, 0, 0, crpc); - if (rc) - return rc; - - brq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.bat_reqst; - - brq->bar_sid = console_session.ses_id; - brq->bar_bid = tsb->tsb_id; - brq->bar_testidx = tsb->tsb_index; - brq->bar_opc = transop == LST_TRANS_TSBRUN ? SRPC_BATCH_OPC_RUN : - (transop == LST_TRANS_TSBSTOP ? SRPC_BATCH_OPC_STOP : - SRPC_BATCH_OPC_QUERY); - - if (transop != LST_TRANS_TSBRUN && - transop != LST_TRANS_TSBSTOP) - return 0; - - LASSERT(!tsb->tsb_index); - - batch = (struct lstcon_batch *)tsb; - brq->bar_arg = batch->bat_arg; - - return 0; -} - -int -lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int feats, - struct lstcon_rpc **crpc) -{ - struct srpc_stat_reqst *srq; - int rc; - - rc = lstcon_rpc_prep(nd, SRPC_SERVICE_QUERY_STAT, feats, 0, 0, crpc); - if (rc) - return rc; - - srq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.stat_reqst; - - srq->str_sid = console_session.ses_id; - srq->str_type = 0; /* XXX remove it */ - - return 0; -} - -static struct lnet_process_id_packed * -lstcon_next_id(int idx, int nkiov, struct bio_vec *kiov) -{ - struct lnet_process_id_packed *pid; - int i; - - i = idx / SFW_ID_PER_PAGE; - - LASSERT(i < nkiov); - - pid = (struct lnet_process_id_packed *)page_address(kiov[i].bv_page); - - return &pid[idx % SFW_ID_PER_PAGE]; -} - -static int -lstcon_dstnodes_prep(struct lstcon_group *grp, int idx, - int dist, int span, int nkiov, struct bio_vec *kiov) -{ - struct lnet_process_id_packed *pid; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - int start; - int end; - int i = 0; - - LASSERT(dist >= 1); - LASSERT(span >= 1); - LASSERT(grp->grp_nnode >= 1); - - if (span > grp->grp_nnode) - return -EINVAL; - - start = ((idx / dist) * span) % grp->grp_nnode; - end = ((idx / dist) * span + span - 1) % grp->grp_nnode; - - list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { - nd = ndl->ndl_node; - if (i < start) { - i++; - continue; - } - - if (i > (end >= start ? 
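lstcon_next_id() addresses one slot in an id array that is spread across bulk pages: the page is idx / SFW_ID_PER_PAGE and the slot within it is idx % SFW_ID_PER_PAGE. A sketch with a hypothetical four-ids-per-page layout:

```c
#include <stdio.h>

#define IDS_PER_PAGE 4	/* stand-in for SFW_ID_PER_PAGE */

static int *next_id(int idx, int npages, int pages[][IDS_PER_PAGE])
{
	if (idx / IDS_PER_PAGE >= npages)
		return NULL;	/* the LASSERT() in the original */
	return &pages[idx / IDS_PER_PAGE][idx % IDS_PER_PAGE];
}

int main(void)
{
	int pages[2][IDS_PER_PAGE] = { { 0 } };

	*next_id(5, 2, pages) = 42;	/* idx 5 lands on page 1, slot 1 */
	printf("%d\n", pages[1][1]);
	return 0;
}
```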
end : grp->grp_nnode)) - break; - - pid = lstcon_next_id((i - start), nkiov, kiov); - pid->nid = nd->nd_id.nid; - pid->pid = nd->nd_id.pid; - i++; - } - - if (start <= end) /* done */ - return 0; - - list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) { - if (i > grp->grp_nnode + end) - break; - - nd = ndl->ndl_node; - pid = lstcon_next_id((i - start), nkiov, kiov); - pid->nid = nd->nd_id.nid; - pid->pid = nd->nd_id.pid; - i++; - } - - return 0; -} - -static int -lstcon_pingrpc_prep(struct lst_test_ping_param *param, struct srpc_test_reqst *req) -{ - struct test_ping_req *prq = &req->tsr_u.ping; - - prq->png_size = param->png_size; - prq->png_flags = param->png_flags; - /* TODO dest */ - return 0; -} - -static int -lstcon_bulkrpc_v0_prep(struct lst_test_bulk_param *param, - struct srpc_test_reqst *req) -{ - struct test_bulk_req *brq = &req->tsr_u.bulk_v0; - - brq->blk_opc = param->blk_opc; - brq->blk_npg = DIV_ROUND_UP(param->blk_size, PAGE_SIZE); - brq->blk_flags = param->blk_flags; - - return 0; -} - -static int -lstcon_bulkrpc_v1_prep(struct lst_test_bulk_param *param, bool is_client, - struct srpc_test_reqst *req) -{ - struct test_bulk_req_v1 *brq = &req->tsr_u.bulk_v1; - - brq->blk_opc = param->blk_opc; - brq->blk_flags = param->blk_flags; - brq->blk_len = param->blk_size; - brq->blk_offset = is_client ? param->blk_cli_off : param->blk_srv_off; - - return 0; -} - -int -lstcon_testrpc_prep(struct lstcon_node *nd, int transop, unsigned int feats, - struct lstcon_test *test, struct lstcon_rpc **crpc) -{ - struct lstcon_group *sgrp = test->tes_src_grp; - struct lstcon_group *dgrp = test->tes_dst_grp; - struct srpc_test_reqst *trq; - struct srpc_bulk *bulk; - int i; - int npg = 0; - int nob = 0; - int rc = 0; - - if (transop == LST_TRANS_TSBCLIADD) { - npg = sfw_id_pages(test->tes_span); - nob = !(feats & LST_FEAT_BULK_LEN) ? - npg * PAGE_SIZE : - sizeof(struct lnet_process_id_packed) * test->tes_span; - } - - rc = lstcon_rpc_prep(nd, SRPC_SERVICE_TEST, feats, npg, nob, crpc); - if (rc) - return rc; - - trq = &(*crpc)->crp_rpc->crpc_reqstmsg.msg_body.tes_reqst; - - if (transop == LST_TRANS_TSBSRVADD) { - int ndist = DIV_ROUND_UP(sgrp->grp_nnode, test->tes_dist); - int nspan = DIV_ROUND_UP(dgrp->grp_nnode, test->tes_span); - int nmax = DIV_ROUND_UP(ndist, nspan); - - trq->tsr_ndest = 0; - trq->tsr_loop = nmax * test->tes_dist * test->tes_concur; - } else { - bulk = &(*crpc)->crp_rpc->crpc_bulk; - - for (i = 0; i < npg; i++) { - int len; - - LASSERT(nob > 0); - - len = !(feats & LST_FEAT_BULK_LEN) ? - PAGE_SIZE : - min_t(int, nob, PAGE_SIZE); - nob -= len; - - bulk->bk_iovs[i].bv_offset = 0; - bulk->bk_iovs[i].bv_len = len; - bulk->bk_iovs[i].bv_page = alloc_page(GFP_KERNEL); - - if (!bulk->bk_iovs[i].bv_page) { - lstcon_rpc_put(*crpc); - return -ENOMEM; - } - } - - bulk->bk_sink = 0; - - LASSERT(transop == LST_TRANS_TSBCLIADD); - - rc = lstcon_dstnodes_prep(test->tes_dst_grp, - test->tes_cliidx++, - test->tes_dist, - test->tes_span, - npg, &bulk->bk_iovs[0]); - if (rc) { - lstcon_rpc_put(*crpc); - return rc; - } - - trq->tsr_ndest = test->tes_span; - trq->tsr_loop = test->tes_loop; - } - - trq->tsr_sid = console_session.ses_id; - trq->tsr_bid = test->tes_hdr.tsb_id; - trq->tsr_concur = test->tes_concur; - trq->tsr_is_client = (transop == LST_TRANS_TSBCLIADD) ? 
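The arithmetic in lstcon_dstnodes_prep() gives client idx a window of span destination nodes, advancing the window once every dist clients and wrapping modulo the group size (the second list walk above handles the wrapped tail). A runnable sketch that prints the window a few clients would get, assuming dist=2, span=3 and a five-node group:

```c
#include <stdio.h>

static void window(int idx, int dist, int span, int nnode)
{
	int start = ((idx / dist) * span) % nnode;	/* same formula as above */
	int i;

	printf("client %d:", idx);
	for (i = 0; i < span; i++)
		printf(" %d", (start + i) % nnode);	/* wraps like the two-pass walk */
	printf("\n");
}

int main(void)
{
	int idx;

	for (idx = 0; idx < 4; idx++)
		window(idx, 2, 3, 5);
	return 0;
}
```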
1 : 0; - trq->tsr_stop_onerr = !!test->tes_stop_onerr; - - switch (test->tes_type) { - case LST_TEST_PING: - trq->tsr_service = SRPC_SERVICE_PING; - rc = lstcon_pingrpc_prep((struct lst_test_ping_param *) - &test->tes_param[0], trq); - break; - - case LST_TEST_BULK: - trq->tsr_service = SRPC_SERVICE_BRW; - if (!(feats & LST_FEAT_BULK_LEN)) { - rc = lstcon_bulkrpc_v0_prep((struct lst_test_bulk_param *) - &test->tes_param[0], trq); - } else { - rc = lstcon_bulkrpc_v1_prep((struct lst_test_bulk_param *) - &test->tes_param[0], - trq->tsr_is_client, trq); - } - - break; - default: - LBUG(); - break; - } - - return rc; -} - -static int -lstcon_sesnew_stat_reply(struct lstcon_rpc_trans *trans, - struct lstcon_node *nd, struct srpc_msg *reply) -{ - struct srpc_mksn_reply *mksn_rep = &reply->msg_body.mksn_reply; - int status = mksn_rep->mksn_status; - - if (!status && - (reply->msg_ses_feats & ~LST_FEATS_MASK)) { - mksn_rep->mksn_status = EPROTO; - status = EPROTO; - } - - if (status == EPROTO) { - CNETERR("session protocol error from %s: %u\n", - libcfs_nid2str(nd->nd_id.nid), - reply->msg_ses_feats); - } - - if (status) - return status; - - if (!trans->tas_feats_updated) { - spin_lock(&console_session.ses_rpc_lock); - if (!trans->tas_feats_updated) { /* recheck with lock */ - trans->tas_feats_updated = 1; - trans->tas_features = reply->msg_ses_feats; - } - spin_unlock(&console_session.ses_rpc_lock); - } - - if (reply->msg_ses_feats != trans->tas_features) { - CNETERR("Framework features %x from %s is different with features on this transaction: %x\n", - reply->msg_ses_feats, libcfs_nid2str(nd->nd_id.nid), - trans->tas_features); - mksn_rep->mksn_status = EPROTO; - status = EPROTO; - } - - if (!status) { - /* session timeout on remote node */ - nd->nd_timeout = mksn_rep->mksn_timeout; - } - - return status; -} - -void -lstcon_rpc_stat_reply(struct lstcon_rpc_trans *trans, struct srpc_msg *msg, - struct lstcon_node *nd, struct lstcon_trans_stat *stat) -{ - struct srpc_rmsn_reply *rmsn_rep; - struct srpc_debug_reply *dbg_rep; - struct srpc_batch_reply *bat_rep; - struct srpc_test_reply *test_rep; - struct srpc_stat_reply *stat_rep; - int rc = 0; - - switch (trans->tas_opc) { - case LST_TRANS_SESNEW: - rc = lstcon_sesnew_stat_reply(trans, nd, msg); - if (!rc) { - lstcon_sesop_stat_success(stat, 1); - return; - } - - lstcon_sesop_stat_failure(stat, 1); - break; - - case LST_TRANS_SESEND: - rmsn_rep = &msg->msg_body.rmsn_reply; - /* ESRCH is not an error for end session */ - if (!rmsn_rep->rmsn_status || - rmsn_rep->rmsn_status == ESRCH) { - lstcon_sesop_stat_success(stat, 1); - return; - } - - lstcon_sesop_stat_failure(stat, 1); - rc = rmsn_rep->rmsn_status; - break; - - case LST_TRANS_SESQRY: - case LST_TRANS_SESPING: - dbg_rep = &msg->msg_body.dbg_reply; - - if (dbg_rep->dbg_status == ESRCH) { - lstcon_sesqry_stat_unknown(stat, 1); - return; - } - - if (lstcon_session_match(dbg_rep->dbg_sid)) - lstcon_sesqry_stat_active(stat, 1); - else - lstcon_sesqry_stat_busy(stat, 1); - return; - - case LST_TRANS_TSBRUN: - case LST_TRANS_TSBSTOP: - bat_rep = &msg->msg_body.bat_reply; - - if (!bat_rep->bar_status) { - lstcon_tsbop_stat_success(stat, 1); - return; - } - - if (bat_rep->bar_status == EPERM && - trans->tas_opc == LST_TRANS_TSBSTOP) { - lstcon_tsbop_stat_success(stat, 1); - return; - } - - lstcon_tsbop_stat_failure(stat, 1); - rc = bat_rep->bar_status; - break; - - case LST_TRANS_TSBCLIQRY: - case LST_TRANS_TSBSRVQRY: - bat_rep = &msg->msg_body.bat_reply; - - if (bat_rep->bar_active) - 
lstcon_tsbqry_stat_run(stat, 1); - else - lstcon_tsbqry_stat_idle(stat, 1); - - if (!bat_rep->bar_status) - return; - - lstcon_tsbqry_stat_failure(stat, 1); - rc = bat_rep->bar_status; - break; - - case LST_TRANS_TSBCLIADD: - case LST_TRANS_TSBSRVADD: - test_rep = &msg->msg_body.tes_reply; - - if (!test_rep->tsr_status) { - lstcon_tsbop_stat_success(stat, 1); - return; - } - - lstcon_tsbop_stat_failure(stat, 1); - rc = test_rep->tsr_status; - break; - - case LST_TRANS_STATQRY: - stat_rep = &msg->msg_body.stat_reply; - - if (!stat_rep->str_status) { - lstcon_statqry_stat_success(stat, 1); - return; - } - - lstcon_statqry_stat_failure(stat, 1); - rc = stat_rep->str_status; - break; - - default: - LBUG(); - } - - if (!stat->trs_fwk_errno) - stat->trs_fwk_errno = rc; -} - -int -lstcon_rpc_trans_ndlist(struct list_head *ndlist, - struct list_head *translist, int transop, - void *arg, lstcon_rpc_cond_func_t condition, - struct lstcon_rpc_trans **transpp) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - struct lstcon_rpc *rpc; - unsigned int feats; - int rc; - - /* Creating session RPG for list of nodes */ - - rc = lstcon_rpc_trans_prep(translist, transop, &trans); - if (rc) { - CERROR("Can't create transaction %d: %d\n", transop, rc); - return rc; - } - - feats = trans->tas_features; - list_for_each_entry(ndl, ndlist, ndl_link) { - rc = !condition ? 1 : - condition(transop, ndl->ndl_node, arg); - - if (!rc) - continue; - - if (rc < 0) { - CDEBUG(D_NET, "Condition error while creating RPC for transaction %d: %d\n", - transop, rc); - break; - } - - nd = ndl->ndl_node; - - switch (transop) { - case LST_TRANS_SESNEW: - case LST_TRANS_SESEND: - rc = lstcon_sesrpc_prep(nd, transop, feats, &rpc); - break; - case LST_TRANS_SESQRY: - case LST_TRANS_SESPING: - rc = lstcon_dbgrpc_prep(nd, feats, &rpc); - break; - case LST_TRANS_TSBCLIADD: - case LST_TRANS_TSBSRVADD: - rc = lstcon_testrpc_prep(nd, transop, feats, - (struct lstcon_test *)arg, - &rpc); - break; - case LST_TRANS_TSBRUN: - case LST_TRANS_TSBSTOP: - case LST_TRANS_TSBCLIQRY: - case LST_TRANS_TSBSRVQRY: - rc = lstcon_batrpc_prep(nd, transop, feats, - (struct lstcon_tsb_hdr *)arg, - &rpc); - break; - case LST_TRANS_STATQRY: - rc = lstcon_statrpc_prep(nd, feats, &rpc); - break; - default: - rc = -EINVAL; - break; - } - - if (rc) { - CERROR("Failed to create RPC for transaction %s: %d\n", - lstcon_rpc_trans_name(transop), rc); - break; - } - - lstcon_rpc_trans_addreq(trans, rpc); - } - - if (!rc) { - *transpp = trans; - return 0; - } - - lstcon_rpc_trans_destroy(trans); - - return rc; -} - -static void -lstcon_rpc_pinger(void *arg) -{ - struct stt_timer *ptimer = (struct stt_timer *)arg; - struct lstcon_rpc_trans *trans; - struct lstcon_rpc *crpc; - struct srpc_msg *rep; - struct srpc_debug_reqst *drq; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - int intv; - int count = 0; - int rc; - - /* - * RPC pinger is a special case of transaction, - * it's called by timer at 8 seconds interval. 
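The condition callback used by lstcon_rpc_trans_ndlist() follows a three-way protocol: a negative return aborts the walk with an error, zero skips the node, and a positive value includes it (a NULL callback includes everything). A sketch of the same convention over a plain array:

```c
#include <stdio.h>

typedef int (*cond_fn)(int v);

static int only_even(int v) { return v % 2 == 0; }

static int visit(const int *v, int n, cond_fn cond)
{
	int i, rc;

	for (i = 0; i < n; i++) {
		rc = cond ? cond(v[i]) : 1;	/* no callback: take everything */
		if (rc < 0)
			return rc;	/* condition error aborts the walk */
		if (!rc)
			continue;	/* skip this entry */
		printf("%d ", v[i]);
	}
	printf("\n");
	return 0;
}

int main(void)
{
	int v[] = { 1, 2, 3, 4 };

	return visit(v, 4, only_even);	/* prints "2 4" */
}
```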
- */ - mutex_lock(&console_session.ses_mutex); - - if (console_session.ses_shutdown || console_session.ses_expired) { - mutex_unlock(&console_session.ses_mutex); - return; - } - - if (!console_session.ses_expired && - ktime_get_real_seconds() - console_session.ses_laststamp > - (time64_t)console_session.ses_timeout) - console_session.ses_expired = 1; - - trans = console_session.ses_ping; - - LASSERT(trans); - - list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) { - nd = ndl->ndl_node; - - if (console_session.ses_expired) { - /* idle console, end session on all nodes */ - if (nd->nd_state != LST_NODE_ACTIVE) - continue; - - rc = lstcon_sesrpc_prep(nd, LST_TRANS_SESEND, - trans->tas_features, &crpc); - if (rc) { - CERROR("Out of memory\n"); - break; - } - - lstcon_rpc_trans_addreq(trans, crpc); - lstcon_rpc_post(crpc); - - continue; - } - - crpc = &nd->nd_ping; - - if (crpc->crp_rpc) { - LASSERT(crpc->crp_trans == trans); - LASSERT(!list_empty(&crpc->crp_link)); - - spin_lock(&crpc->crp_rpc->crpc_lock); - - LASSERT(crpc->crp_posted); - - if (!crpc->crp_finished) { - /* in flight */ - spin_unlock(&crpc->crp_rpc->crpc_lock); - continue; - } - - spin_unlock(&crpc->crp_rpc->crpc_lock); - - lstcon_rpc_get_reply(crpc, &rep); - - list_del_init(&crpc->crp_link); - - lstcon_rpc_put(crpc); - } - - if (nd->nd_state != LST_NODE_ACTIVE) - continue; - - intv = (jiffies - nd->nd_stamp) / msecs_to_jiffies(MSEC_PER_SEC); - if (intv < nd->nd_timeout / 2) - continue; - - rc = lstcon_rpc_init(nd, SRPC_SERVICE_DEBUG, - trans->tas_features, 0, 0, 1, crpc); - if (rc) { - CERROR("Out of memory\n"); - break; - } - - drq = &crpc->crp_rpc->crpc_reqstmsg.msg_body.dbg_reqst; - - drq->dbg_sid = console_session.ses_id; - drq->dbg_flags = 0; - - lstcon_rpc_trans_addreq(trans, crpc); - lstcon_rpc_post(crpc); - - count++; - } - - if (console_session.ses_expired) { - mutex_unlock(&console_session.ses_mutex); - return; - } - - CDEBUG(D_NET, "Ping %d nodes in session\n", count); - - ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; - stt_add_timer(ptimer); - - mutex_unlock(&console_session.ses_mutex); -} - -int -lstcon_rpc_pinger_start(void) -{ - struct stt_timer *ptimer; - int rc; - - LASSERT(list_empty(&console_session.ses_rpc_freelist)); - LASSERT(!atomic_read(&console_session.ses_rpc_counter)); - - rc = lstcon_rpc_trans_prep(NULL, LST_TRANS_SESPING, - &console_session.ses_ping); - if (rc) { - CERROR("Failed to create console pinger\n"); - return rc; - } - - ptimer = &console_session.ses_ping_timer; - ptimer->stt_expires = ktime_get_real_seconds() + LST_PING_INTERVAL; - - stt_add_timer(ptimer); - - return 0; -} - -void -lstcon_rpc_pinger_stop(void) -{ - LASSERT(console_session.ses_shutdown); - - stt_del_timer(&console_session.ses_ping_timer); - - lstcon_rpc_trans_abort(console_session.ses_ping, -ESHUTDOWN); - lstcon_rpc_trans_stat(console_session.ses_ping, lstcon_trans_stat()); - lstcon_rpc_trans_destroy(console_session.ses_ping); - - memset(lstcon_trans_stat(), 0, sizeof(struct lstcon_trans_stat)); - - console_session.ses_ping = NULL; -} - -void -lstcon_rpc_cleanup_wait(void) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_rpc *crpc; - struct lstcon_rpc *temp; - struct list_head *pacer; - struct list_head zlist; - - /* Called with hold of global mutex */ - - LASSERT(console_session.ses_shutdown); - - while (!list_empty(&console_session.ses_trans_list)) { - list_for_each(pacer, &console_session.ses_trans_list) { - trans = list_entry(pacer, struct lstcon_rpc_trans, - tas_link); - - 
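The pinger above is a self-re-arming timer: each run does its round of debug RPCs and then sets stt_expires to now plus LST_PING_INTERVAL before re-queueing itself with stt_add_timer(). A minimal sketch of that absolute-deadline re-arm, with a simplified timer struct:

```c
#include <stdio.h>
#include <time.h>

#define PING_INTERVAL 8	/* seconds, as LST_PING_INTERVAL */

struct timer {
	time_t expires;
	void (*func)(struct timer *);
};

static void pinger(struct timer *t)
{
	/* ... ping the session nodes ... */
	t->expires = time(NULL) + PING_INTERVAL;	/* re-arm for the next round */
}

int main(void)
{
	struct timer t = { .func = pinger };

	t.func(&t);	/* one run of the pinger */
	printf("next run in %lds\n", (long)(t.expires - time(NULL)));
	return 0;
}
```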
CDEBUG(D_NET, "Session closed, wakeup transaction %s\n", - lstcon_rpc_trans_name(trans->tas_opc)); - - wake_up(&trans->tas_waitq); - } - - mutex_unlock(&console_session.ses_mutex); - - CWARN("Session is shutting down, waiting for termination of transactions\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - - mutex_lock(&console_session.ses_mutex); - } - - spin_lock(&console_session.ses_rpc_lock); - - lst_wait_until(!atomic_read(&console_session.ses_rpc_counter), - console_session.ses_rpc_lock, - "Network is not accessible or target is down, waiting for %d console RPCs to being recycled\n", - atomic_read(&console_session.ses_rpc_counter)); - - list_add(&zlist, &console_session.ses_rpc_freelist); - list_del_init(&console_session.ses_rpc_freelist); - - spin_unlock(&console_session.ses_rpc_lock); - - list_for_each_entry_safe(crpc, temp, &zlist, crp_link) { - list_del(&crpc->crp_link); - kfree(crpc); - } -} - -int -lstcon_rpc_module_init(void) -{ - INIT_LIST_HEAD(&console_session.ses_ping_timer.stt_list); - console_session.ses_ping_timer.stt_func = lstcon_rpc_pinger; - console_session.ses_ping_timer.stt_data = &console_session.ses_ping_timer; - - console_session.ses_ping = NULL; - - spin_lock_init(&console_session.ses_rpc_lock); - atomic_set(&console_session.ses_rpc_counter, 0); - INIT_LIST_HEAD(&console_session.ses_rpc_freelist); - - return 0; -} - -void -lstcon_rpc_module_fini(void) -{ - LASSERT(list_empty(&console_session.ses_rpc_freelist)); - LASSERT(!atomic_read(&console_session.ses_rpc_counter)); -} diff --git a/drivers/staging/lustre/lnet/selftest/conrpc.h b/drivers/staging/lustre/lnet/selftest/conrpc.h deleted file mode 100644 index ce2f92d04838d79e6de9cd01913affeb254be7c3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/conrpc.h +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * /lnet/selftest/conrpc.h - * - * Console rpc - * - * Author: Liang Zhen - */ - -#ifndef __LST_CONRPC_H__ -#define __LST_CONRPC_H__ - -#include -#include -#include "rpc.h" -#include "selftest.h" - -/* Console rpc and rpc transaction */ -#define LST_TRANS_TIMEOUT 30 -#define LST_TRANS_MIN_TIMEOUT 3 - -#define LST_VALIDATE_TIMEOUT(t) min(max(t, LST_TRANS_MIN_TIMEOUT), LST_TRANS_TIMEOUT) - -#define LST_PING_INTERVAL 8 - -struct lstcon_rpc_trans; -struct lstcon_tsb_hdr; -struct lstcon_test; -struct lstcon_node; - -struct lstcon_rpc { - struct list_head crp_link; /* chain on rpc transaction */ - struct srpc_client_rpc *crp_rpc; /* client rpc */ - struct lstcon_node *crp_node; /* destination node */ - struct lstcon_rpc_trans *crp_trans; /* conrpc transaction */ - - unsigned int crp_posted:1; /* rpc is posted */ - unsigned int crp_finished:1; /* rpc is finished */ - unsigned int crp_unpacked:1; /* reply is unpacked */ - /** RPC is embedded in other structure and can't free it */ - unsigned int crp_embedded:1; - int crp_status; /* console rpc errors */ - unsigned long crp_stamp; /* replied time stamp */ -}; - -struct lstcon_rpc_trans { - struct list_head tas_olink; /* link chain on owner list */ - struct list_head tas_link; /* link chain on global list */ - int tas_opc; /* operation code of transaction */ - unsigned int tas_feats_updated; /* features mask is uptodate */ - unsigned int tas_features; /* test features mask */ - wait_queue_head_t tas_waitq; /* wait queue head */ - atomic_t tas_remaining; /* # of un-scheduled rpcs */ - struct list_head tas_rpcs_list; /* queued requests */ -}; - -#define LST_TRANS_PRIVATE 0x1000 - -#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01) -#define LST_TRANS_SESEND (LST_TRANS_PRIVATE | 0x02) -#define LST_TRANS_SESQRY 0x03 -#define LST_TRANS_SESPING 0x04 - -#define LST_TRANS_TSBCLIADD (LST_TRANS_PRIVATE | 0x11) -#define LST_TRANS_TSBSRVADD (LST_TRANS_PRIVATE | 0x12) -#define LST_TRANS_TSBRUN (LST_TRANS_PRIVATE | 0x13) -#define LST_TRANS_TSBSTOP (LST_TRANS_PRIVATE | 0x14) -#define LST_TRANS_TSBCLIQRY 0x15 -#define LST_TRANS_TSBSRVQRY 0x16 - -#define LST_TRANS_STATQRY 0x21 - -typedef int (*lstcon_rpc_cond_func_t)(int, struct lstcon_node *, void *); -typedef int (*lstcon_rpc_readent_func_t)(int, struct srpc_msg *, - struct lstcon_rpc_ent __user *); - -int lstcon_sesrpc_prep(struct lstcon_node *nd, int transop, - unsigned int version, struct lstcon_rpc **crpc); -int lstcon_dbgrpc_prep(struct lstcon_node *nd, - unsigned int version, struct lstcon_rpc **crpc); -int lstcon_batrpc_prep(struct lstcon_node *nd, int transop, - unsigned int version, struct lstcon_tsb_hdr *tsb, - struct lstcon_rpc **crpc); -int lstcon_testrpc_prep(struct lstcon_node *nd, int transop, - unsigned int version, struct lstcon_test *test, - struct lstcon_rpc **crpc); -int lstcon_statrpc_prep(struct lstcon_node *nd, unsigned int version, - struct lstcon_rpc **crpc); -void lstcon_rpc_put(struct lstcon_rpc *crpc); -int lstcon_rpc_trans_prep(struct list_head *translist, - int transop, struct lstcon_rpc_trans **transpp); -int lstcon_rpc_trans_ndlist(struct list_head *ndlist, - struct list_head *translist, int transop, - void *arg, lstcon_rpc_cond_func_t condition, - struct lstcon_rpc_trans **transpp); -void lstcon_rpc_trans_stat(struct lstcon_rpc_trans *trans, - struct lstcon_trans_stat *stat); -int lstcon_rpc_trans_interpreter(struct lstcon_rpc_trans *trans, - struct list_head __user *head_up, - lstcon_rpc_readent_func_t readent); -void lstcon_rpc_trans_abort(struct lstcon_rpc_trans *trans, int 
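In the opcode table above, bit 0x1000 (LST_TRANS_PRIVATE) marks transactions that must not be queued twice on the same owner list, while query-style opcodes such as LST_TRANS_SESQRY stay shareable. A tiny sketch of the flag test:

```c
#include <stdio.h>

#define LST_TRANS_PRIVATE 0x1000
#define LST_TRANS_SESNEW (LST_TRANS_PRIVATE | 0x01)
#define LST_TRANS_SESQRY 0x03

static int is_private(int opc)
{
	return (opc & LST_TRANS_PRIVATE) != 0;	/* bit 0x1000 set? */
}

int main(void)
{
	printf("%d %d\n", is_private(LST_TRANS_SESNEW),
	       is_private(LST_TRANS_SESQRY));	/* 1 0 */
	return 0;
}
```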
error); -void lstcon_rpc_trans_destroy(struct lstcon_rpc_trans *trans); -void lstcon_rpc_trans_addreq(struct lstcon_rpc_trans *trans, - struct lstcon_rpc *req); -int lstcon_rpc_trans_postwait(struct lstcon_rpc_trans *trans, int timeout); -int lstcon_rpc_pinger_start(void); -void lstcon_rpc_pinger_stop(void); -void lstcon_rpc_cleanup_wait(void); -int lstcon_rpc_module_init(void); -void lstcon_rpc_module_fini(void); - -#endif diff --git a/drivers/staging/lustre/lnet/selftest/console.c b/drivers/staging/lustre/lnet/selftest/console.c deleted file mode 100644 index 3c1c1b5997e015c58435474e7a7f641b365732b4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/console.c +++ /dev/null @@ -1,2104 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/selftest/conctl.c - * - * Infrastructure of LST console - * - * Author: Liang Zhen - */ - -#include -#include "console.h" -#include "conrpc.h" - -#define LST_NODE_STATE_COUNTER(nd, p) \ -do { \ - if ((nd)->nd_state == LST_NODE_ACTIVE) \ - (p)->nle_nactive++; \ - else if ((nd)->nd_state == LST_NODE_BUSY) \ - (p)->nle_nbusy++; \ - else if ((nd)->nd_state == LST_NODE_DOWN) \ - (p)->nle_ndown++; \ - else \ - (p)->nle_nunknown++; \ - (p)->nle_nnode++; \ -} while (0) - -struct lstcon_session console_session; - -static void -lstcon_node_get(struct lstcon_node *nd) -{ - LASSERT(nd->nd_ref >= 1); - - nd->nd_ref++; -} - -static int -lstcon_node_find(struct lnet_process_id id, struct lstcon_node **ndpp, - int create) -{ - struct lstcon_ndlink *ndl; - unsigned int idx = LNET_NIDADDR(id.nid) % LST_GLOBAL_HASHSIZE; - - LASSERT(id.nid != LNET_NID_ANY); - - list_for_each_entry(ndl, &console_session.ses_ndl_hash[idx], - ndl_hlink) { - if (ndl->ndl_node->nd_id.nid != id.nid || - ndl->ndl_node->nd_id.pid != id.pid) - continue; - - lstcon_node_get(ndl->ndl_node); - *ndpp = ndl->ndl_node; - return 0; - } - - if (!create) - return -ENOENT; - - *ndpp = kzalloc(sizeof(**ndpp) + sizeof(*ndl), GFP_KERNEL); - if (!*ndpp) - return -ENOMEM; - - ndl = (struct lstcon_ndlink *)(*ndpp + 1); - - ndl->ndl_node = *ndpp; - - ndl->ndl_node->nd_ref = 1; - ndl->ndl_node->nd_id = id; - ndl->ndl_node->nd_stamp = jiffies; - ndl->ndl_node->nd_state = LST_NODE_UNKNOWN; - ndl->ndl_node->nd_timeout = 0; - memset(&ndl->ndl_node->nd_ping, 0, sizeof(struct lstcon_rpc)); - - /* - * queued in global hash & list, no refcount is taken by - * global hash & list, if caller release his refcount, - * node will be released - */ - list_add_tail(&ndl->ndl_hlink, &console_session.ses_ndl_hash[idx]); - list_add_tail(&ndl->ndl_link, &console_session.ses_ndl_list); - - return 0; -} - -static void -lstcon_node_put(struct lstcon_node *nd) -{ - struct lstcon_ndlink *ndl; - - LASSERT(nd->nd_ref > 0); - - if (--nd->nd_ref > 0) - return; - - ndl = (struct lstcon_ndlink *)(nd + 1); - - LASSERT(!list_empty(&ndl->ndl_link)); - LASSERT(!list_empty(&ndl->ndl_hlink)); - - /* remove from session */ - list_del(&ndl->ndl_link); - list_del(&ndl->ndl_hlink); - - kfree(nd); -} - -static int -lstcon_ndlink_find(struct list_head *hash, struct lnet_process_id id, - struct lstcon_ndlink **ndlpp, int create) -{ - unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - int rc; - - if (id.nid == LNET_NID_ANY) - return -EINVAL; - - /* search in hash */ - list_for_each_entry(ndl, &hash[idx], ndl_hlink) { - if (ndl->ndl_node->nd_id.nid != id.nid || - ndl->ndl_node->nd_id.pid != id.pid) - continue; - - *ndlpp = ndl; - return 0; - } - - if (!create) - return -ENOENT; - - /* find or create in session hash */ - rc = lstcon_node_find(id, &nd, (create == 1) ? 
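lstcon_node_find() co-allocates the node and its session ndlink in one kzalloc(sizeof(node) + sizeof(ndlink)) and recovers the link as (node + 1); lstcon_node_put() then releases both with a single kfree(). A userspace sketch of the trick (a long-sized field keeps the trailing struct aligned in this toy version):

```c
#include <stdlib.h>
#include <stdio.h>

struct node { long id; };
struct link { struct node *n; };

int main(void)
{
	/* one allocation carries both objects, back to back */
	struct node *nd = calloc(1, sizeof(struct node) + sizeof(struct link));
	struct link *l;

	if (!nd)
		return 1;
	l = (struct link *)(nd + 1);	/* the link lives right behind the node */
	l->n = nd;
	nd->id = 7;
	printf("%ld\n", l->n->id);
	free(nd);			/* one free() releases both */
	return 0;
}
```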
1 : 0); - if (rc) - return rc; - - ndl = kzalloc(sizeof(struct lstcon_ndlink), GFP_NOFS); - if (!ndl) { - lstcon_node_put(nd); - return -ENOMEM; - } - - *ndlpp = ndl; - - ndl->ndl_node = nd; - INIT_LIST_HEAD(&ndl->ndl_link); - list_add_tail(&ndl->ndl_hlink, &hash[idx]); - - return 0; -} - -static void -lstcon_ndlink_release(struct lstcon_ndlink *ndl) -{ - LASSERT(list_empty(&ndl->ndl_link)); - LASSERT(!list_empty(&ndl->ndl_hlink)); - - list_del(&ndl->ndl_hlink); /* delete from hash */ - lstcon_node_put(ndl->ndl_node); - - kfree(ndl); -} - -static int -lstcon_group_alloc(char *name, struct lstcon_group **grpp) -{ - struct lstcon_group *grp; - int i; - - grp = kmalloc(offsetof(struct lstcon_group, - grp_ndl_hash[LST_NODE_HASHSIZE]), - GFP_KERNEL); - if (!grp) - return -ENOMEM; - - grp->grp_ref = 1; - if (name) { - if (strlen(name) > sizeof(grp->grp_name) - 1) { - kfree(grp); - return -E2BIG; - } - strncpy(grp->grp_name, name, sizeof(grp->grp_name)); - } - - INIT_LIST_HEAD(&grp->grp_link); - INIT_LIST_HEAD(&grp->grp_ndl_list); - INIT_LIST_HEAD(&grp->grp_trans_list); - - for (i = 0; i < LST_NODE_HASHSIZE; i++) - INIT_LIST_HEAD(&grp->grp_ndl_hash[i]); - - *grpp = grp; - - return 0; -} - -static void -lstcon_group_addref(struct lstcon_group *grp) -{ - grp->grp_ref++; -} - -static void lstcon_group_ndlink_release(struct lstcon_group *, - struct lstcon_ndlink *); - -static void -lstcon_group_drain(struct lstcon_group *grp, int keep) -{ - struct lstcon_ndlink *ndl; - struct lstcon_ndlink *tmp; - - list_for_each_entry_safe(ndl, tmp, &grp->grp_ndl_list, ndl_link) { - if (!(ndl->ndl_node->nd_state & keep)) - lstcon_group_ndlink_release(grp, ndl); - } -} - -static void -lstcon_group_decref(struct lstcon_group *grp) -{ - int i; - - if (--grp->grp_ref > 0) - return; - - if (!list_empty(&grp->grp_link)) - list_del(&grp->grp_link); - - lstcon_group_drain(grp, 0); - - for (i = 0; i < LST_NODE_HASHSIZE; i++) - LASSERT(list_empty(&grp->grp_ndl_hash[i])); - - kfree(grp); -} - -static int -lstcon_group_find(const char *name, struct lstcon_group **grpp) -{ - struct lstcon_group *grp; - - list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { - if (strncmp(grp->grp_name, name, LST_NAME_SIZE)) - continue; - - lstcon_group_addref(grp); /* +1 ref for caller */ - *grpp = grp; - return 0; - } - - return -ENOENT; -} - -static int -lstcon_group_ndlink_find(struct lstcon_group *grp, struct lnet_process_id id, - struct lstcon_ndlink **ndlpp, int create) -{ - int rc; - - rc = lstcon_ndlink_find(&grp->grp_ndl_hash[0], id, ndlpp, create); - if (rc) - return rc; - - if (!list_empty(&(*ndlpp)->ndl_link)) - return 0; - - list_add_tail(&(*ndlpp)->ndl_link, &grp->grp_ndl_list); - grp->grp_nnode++; - - return 0; -} - -static void -lstcon_group_ndlink_release(struct lstcon_group *grp, struct lstcon_ndlink *ndl) -{ - list_del_init(&ndl->ndl_link); - lstcon_ndlink_release(ndl); - grp->grp_nnode--; -} - -static void -lstcon_group_ndlink_move(struct lstcon_group *old, - struct lstcon_group *new, struct lstcon_ndlink *ndl) -{ - unsigned int idx = LNET_NIDADDR(ndl->ndl_node->nd_id.nid) % - LST_NODE_HASHSIZE; - - list_del(&ndl->ndl_hlink); - list_del(&ndl->ndl_link); - old->grp_nnode--; - - list_add_tail(&ndl->ndl_hlink, &new->grp_ndl_hash[idx]); - list_add_tail(&ndl->ndl_link, &new->grp_ndl_list); - new->grp_nnode++; -} - -static void -lstcon_group_move(struct lstcon_group *old, struct lstcon_group *new) -{ - struct lstcon_ndlink *ndl; - - while (!list_empty(&old->grp_ndl_list)) { - ndl = list_entry(old->grp_ndl_list.next, - 
struct lstcon_ndlink, ndl_link); - lstcon_group_ndlink_move(old, new, ndl); - } -} - -static int -lstcon_sesrpc_condition(int transop, struct lstcon_node *nd, void *arg) -{ - struct lstcon_group *grp = (struct lstcon_group *)arg; - - switch (transop) { - case LST_TRANS_SESNEW: - if (nd->nd_state == LST_NODE_ACTIVE) - return 0; - break; - - case LST_TRANS_SESEND: - if (nd->nd_state != LST_NODE_ACTIVE) - return 0; - - if (grp && nd->nd_ref > 1) - return 0; - break; - - case LST_TRANS_SESQRY: - break; - - default: - LBUG(); - } - - return 1; -} - -static int -lstcon_sesrpc_readent(int transop, struct srpc_msg *msg, - struct lstcon_rpc_ent __user *ent_up) -{ - struct srpc_debug_reply *rep; - - switch (transop) { - case LST_TRANS_SESNEW: - case LST_TRANS_SESEND: - return 0; - - case LST_TRANS_SESQRY: - rep = &msg->msg_body.dbg_reply; - - if (copy_to_user(&ent_up->rpe_priv[0], - &rep->dbg_timeout, sizeof(int)) || - copy_to_user(&ent_up->rpe_payload[0], - &rep->dbg_name, LST_NAME_SIZE)) - return -EFAULT; - - return 0; - - default: - LBUG(); - } - - return 0; -} - -static int -lstcon_group_nodes_add(struct lstcon_group *grp, - int count, struct lnet_process_id __user *ids_up, - unsigned int *featp, - struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_ndlink *ndl; - struct lstcon_group *tmp; - struct lnet_process_id id; - int i; - int rc; - - rc = lstcon_group_alloc(NULL, &tmp); - if (rc) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - for (i = 0 ; i < count; i++) { - if (copy_from_user(&id, &ids_up[i], sizeof(id))) { - rc = -EFAULT; - break; - } - - /* skip if it's in this group already */ - rc = lstcon_group_ndlink_find(grp, id, &ndl, 0); - if (!rc) - continue; - - /* add to tmp group */ - rc = lstcon_group_ndlink_find(tmp, id, &ndl, 1); - if (rc) { - CERROR("Can't create ndlink, out of memory\n"); - break; - } - } - - if (rc) { - lstcon_group_decref(tmp); - return rc; - } - - rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, - &tmp->grp_trans_list, LST_TRANS_SESNEW, - tmp, lstcon_sesrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - lstcon_group_decref(tmp); - return rc; - } - - /* post all RPCs */ - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - rc = lstcon_rpc_trans_interpreter(trans, result_up, - lstcon_sesrpc_readent); - *featp = trans->tas_features; - - /* destroy all RPGs */ - lstcon_rpc_trans_destroy(trans); - - lstcon_group_move(tmp, grp); - lstcon_group_decref(tmp); - - return rc; -} - -static int -lstcon_group_nodes_remove(struct lstcon_group *grp, - int count, struct lnet_process_id __user *ids_up, - struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_ndlink *ndl; - struct lstcon_group *tmp; - struct lnet_process_id id; - int rc; - int i; - - /* End session and remove node from the group */ - - rc = lstcon_group_alloc(NULL, &tmp); - if (rc) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - for (i = 0; i < count; i++) { - if (copy_from_user(&id, &ids_up[i], sizeof(id))) { - rc = -EFAULT; - goto error; - } - - /* move node to tmp group */ - if (!lstcon_group_ndlink_find(grp, id, &ndl, 0)) - lstcon_group_ndlink_move(grp, tmp, ndl); - } - - rc = lstcon_rpc_trans_ndlist(&tmp->grp_ndl_list, - &tmp->grp_trans_list, LST_TRANS_SESEND, - tmp, lstcon_sesrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - goto error; - } - - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - rc = lstcon_rpc_trans_interpreter(trans, 
result_up, NULL); - - lstcon_rpc_trans_destroy(trans); - /* release nodes anyway, because we can't rollback status */ - lstcon_group_decref(tmp); - - return rc; -error: - lstcon_group_move(tmp, grp); - lstcon_group_decref(tmp); - - return rc; -} - -int -lstcon_group_add(char *name) -{ - struct lstcon_group *grp; - int rc; - - rc = lstcon_group_find(name, &grp) ? 0 : -EEXIST; - if (rc) { - /* find a group with same name */ - lstcon_group_decref(grp); - return rc; - } - - rc = lstcon_group_alloc(name, &grp); - if (rc) { - CERROR("Can't allocate descriptor for group %s\n", name); - return -ENOMEM; - } - - list_add_tail(&grp->grp_link, &console_session.ses_grp_list); - - return rc; -} - -int -lstcon_nodes_add(char *name, int count, struct lnet_process_id __user *ids_up, - unsigned int *featp, struct list_head __user *result_up) -{ - struct lstcon_group *grp; - int rc; - - LASSERT(count > 0); - LASSERT(ids_up); - - rc = lstcon_group_find(name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group %s\n", name); - return rc; - } - - if (grp->grp_ref > 2) { - /* referred by other threads or test */ - CDEBUG(D_NET, "Group %s is busy\n", name); - lstcon_group_decref(grp); - - return -EBUSY; - } - - rc = lstcon_group_nodes_add(grp, count, ids_up, featp, result_up); - - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_group_del(char *name) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - int rc; - - rc = lstcon_group_find(name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group: %s\n", name); - return rc; - } - - if (grp->grp_ref > 2) { - /* referred by others threads or test */ - CDEBUG(D_NET, "Group %s is busy\n", name); - lstcon_group_decref(grp); - return -EBUSY; - } - - rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, - &grp->grp_trans_list, LST_TRANS_SESEND, - grp, lstcon_sesrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - lstcon_group_decref(grp); - return rc; - } - - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - lstcon_rpc_trans_destroy(trans); - - lstcon_group_decref(grp); - /* - * -ref for session, it's destroyed, - * status can't be rolled back, destroy group anyway - */ - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_group_clean(char *name, int args) -{ - struct lstcon_group *grp = NULL; - int rc; - - rc = lstcon_group_find(name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group %s\n", name); - return rc; - } - - if (grp->grp_ref > 2) { - /* referred by test */ - CDEBUG(D_NET, "Group %s is busy\n", name); - lstcon_group_decref(grp); - return -EBUSY; - } - - args = (LST_NODE_ACTIVE | LST_NODE_BUSY | - LST_NODE_DOWN | LST_NODE_UNKNOWN) & ~args; - - lstcon_group_drain(grp, args); - - lstcon_group_decref(grp); - /* release empty group */ - if (list_empty(&grp->grp_ndl_list)) - lstcon_group_decref(grp); - - return 0; -} - -int -lstcon_nodes_remove(char *name, int count, - struct lnet_process_id __user *ids_up, - struct list_head __user *result_up) -{ - struct lstcon_group *grp = NULL; - int rc; - - rc = lstcon_group_find(name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group: %s\n", name); - return rc; - } - - if (grp->grp_ref > 2) { - /* referred by test */ - CDEBUG(D_NET, "Group %s is busy\n", name); - lstcon_group_decref(grp); - return -EBUSY; - } - - rc = lstcon_group_nodes_remove(grp, count, ids_up, result_up); - - lstcon_group_decref(grp); - /* release empty group */ - if (list_empty(&grp->grp_ndl_list)) - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_group_refresh(char *name, 
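The group operations above treat grp_ref > 2 as busy: one reference is held by the session's group list and one by the caller's own lstcon_group_find(), so any extra reference means another user (typically a configured test) still holds the group. The check as a one-liner:

```c
#include <stdio.h>

/* ref = session list (1) + this caller's find (1); more means shared */
static int group_busy(int grp_ref)
{
	return grp_ref > 2;
}

int main(void)
{
	printf("%d %d\n", group_busy(2), group_busy(3));	/* 0 1 */
	return 0;
}
```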
struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - int rc; - - rc = lstcon_group_find(name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group: %s\n", name); - return rc; - } - - if (grp->grp_ref > 2) { - /* referred by test */ - CDEBUG(D_NET, "Group %s is busy\n", name); - lstcon_group_decref(grp); - return -EBUSY; - } - - /* re-invite all inactive nodes int the group */ - rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, - &grp->grp_trans_list, LST_TRANS_SESNEW, - grp, lstcon_sesrpc_condition, &trans); - if (rc) { - /* local error, return */ - CDEBUG(D_NET, "Can't create transaction: %d\n", rc); - lstcon_group_decref(grp); - return rc; - } - - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); - - lstcon_rpc_trans_destroy(trans); - /* -ref for me */ - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_group_list(int index, int len, char __user *name_up) -{ - struct lstcon_group *grp; - - LASSERT(index >= 0); - LASSERT(name_up); - - list_for_each_entry(grp, &console_session.ses_grp_list, grp_link) { - if (!index--) { - return copy_to_user(name_up, grp->grp_name, len) ? - -EFAULT : 0; - } - } - - return -ENOENT; -} - -static int -lstcon_nodes_getent(struct list_head *head, int *index_p, - int *count_p, struct lstcon_node_ent __user *dents_up) -{ - struct lstcon_ndlink *ndl; - struct lstcon_node *nd; - int count = 0; - int index = 0; - - LASSERT(index_p && count_p); - LASSERT(dents_up); - LASSERT(*index_p >= 0); - LASSERT(*count_p > 0); - - list_for_each_entry(ndl, head, ndl_link) { - if (index++ < *index_p) - continue; - - if (count >= *count_p) - break; - - nd = ndl->ndl_node; - if (copy_to_user(&dents_up[count].nde_id, - &nd->nd_id, sizeof(nd->nd_id)) || - copy_to_user(&dents_up[count].nde_state, - &nd->nd_state, sizeof(nd->nd_state))) - return -EFAULT; - - count++; - } - - if (index <= *index_p) - return -ENOENT; - - *count_p = count; - *index_p = index; - - return 0; -} - -int -lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gents_p, - int *index_p, int *count_p, - struct lstcon_node_ent __user *dents_up) -{ - struct lstcon_ndlist_ent *gentp; - struct lstcon_group *grp; - struct lstcon_ndlink *ndl; - int rc; - - rc = lstcon_group_find(name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group %s\n", name); - return rc; - } - - if (dents_up) { - /* verbose query */ - rc = lstcon_nodes_getent(&grp->grp_ndl_list, - index_p, count_p, dents_up); - lstcon_group_decref(grp); - - return rc; - } - - /* non-verbose query */ - gentp = kzalloc(sizeof(struct lstcon_ndlist_ent), GFP_NOFS); - if (!gentp) { - CERROR("Can't allocate ndlist_ent\n"); - lstcon_group_decref(grp); - - return -ENOMEM; - } - - list_for_each_entry(ndl, &grp->grp_ndl_list, ndl_link) - LST_NODE_STATE_COUNTER(ndl->ndl_node, gentp); - - rc = copy_to_user(gents_p, gentp, - sizeof(struct lstcon_ndlist_ent)) ? -EFAULT : 0; - - kfree(gentp); - - lstcon_group_decref(grp); - - return rc; -} - -static int -lstcon_batch_find(const char *name, struct lstcon_batch **batpp) -{ - struct lstcon_batch *bat; - - list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { - if (!strncmp(bat->bat_name, name, LST_NAME_SIZE)) { - *batpp = bat; - return 0; - } - } - - return -ENOENT; -} - -int -lstcon_batch_add(char *name) -{ - struct lstcon_batch *bat; - int i; - int rc; - - rc = !lstcon_batch_find(name, &bat) ? 
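lstcon_nodes_getent() pages through a node list with an in/out cursor: skip *index_p entries, copy at most *count_p, then write both back so the caller can resume, and return -ENOENT once the cursor runs past the end. An array-based sketch of the same contract:

```c
#include <stdio.h>
#include <errno.h>

static int getent(const int *src, int nsrc, int *index_p, int *count_p,
		  int *dst)
{
	int index = 0, count = 0, i;

	for (i = 0; i < nsrc; i++) {
		if (index++ < *index_p)
			continue;	/* still skipping to the cursor */
		if (count >= *count_p)
			break;		/* output window is full */
		dst[count++] = src[i];
	}

	if (index <= *index_p)
		return -ENOENT;		/* cursor ran past the end */

	*count_p = count;		/* how many we copied */
	*index_p = index;		/* where to resume */
	return 0;
}

int main(void)
{
	int src[] = { 10, 11, 12, 13, 14 }, dst[2];
	int index = 1, count = 2;

	if (!getent(src, 5, &index, &count, dst))
		printf("got %d entries, cursor now %d: %d %d\n",
		       count, index, dst[0], dst[1]);
	return 0;
}
```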
-EEXIST : 0; - if (rc) { - CDEBUG(D_NET, "Batch %s already exists\n", name); - return rc; - } - - bat = kzalloc(sizeof(struct lstcon_batch), GFP_NOFS); - if (!bat) { - CERROR("Can't allocate descriptor for batch %s\n", name); - return -ENOMEM; - } - - bat->bat_cli_hash = kmalloc(sizeof(struct list_head) * LST_NODE_HASHSIZE, - GFP_KERNEL); - if (!bat->bat_cli_hash) { - CERROR("Can't allocate hash for batch %s\n", name); - kfree(bat); - - return -ENOMEM; - } - - bat->bat_srv_hash = kmalloc(sizeof(struct list_head) * LST_NODE_HASHSIZE, - GFP_KERNEL); - if (!bat->bat_srv_hash) { - CERROR("Can't allocate hash for batch %s\n", name); - kfree(bat->bat_cli_hash); - kfree(bat); - - return -ENOMEM; - } - - if (strlen(name) > sizeof(bat->bat_name) - 1) { - kfree(bat->bat_srv_hash); - kfree(bat->bat_cli_hash); - kfree(bat); - return -E2BIG; - } - strncpy(bat->bat_name, name, sizeof(bat->bat_name)); - bat->bat_hdr.tsb_index = 0; - bat->bat_hdr.tsb_id.bat_id = ++console_session.ses_id_cookie; - - bat->bat_ntest = 0; - bat->bat_state = LST_BATCH_IDLE; - - INIT_LIST_HEAD(&bat->bat_cli_list); - INIT_LIST_HEAD(&bat->bat_srv_list); - INIT_LIST_HEAD(&bat->bat_test_list); - INIT_LIST_HEAD(&bat->bat_trans_list); - - for (i = 0; i < LST_NODE_HASHSIZE; i++) { - INIT_LIST_HEAD(&bat->bat_cli_hash[i]); - INIT_LIST_HEAD(&bat->bat_srv_hash[i]); - } - - list_add_tail(&bat->bat_link, &console_session.ses_bat_list); - - return rc; -} - -int -lstcon_batch_list(int index, int len, char __user *name_up) -{ - struct lstcon_batch *bat; - - LASSERT(name_up); - LASSERT(index >= 0); - - list_for_each_entry(bat, &console_session.ses_bat_list, bat_link) { - if (!index--) { - return copy_to_user(name_up, bat->bat_name, len) ? - -EFAULT : 0; - } - } - - return -ENOENT; -} - -int -lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, - int server, int testidx, int *index_p, int *ndent_p, - struct lstcon_node_ent __user *dents_up) -{ - struct lstcon_test_batch_ent *entp; - struct list_head *clilst; - struct list_head *srvlst; - struct lstcon_test *test = NULL; - struct lstcon_batch *bat; - struct lstcon_ndlink *ndl; - int rc; - - rc = lstcon_batch_find(name, &bat); - if (rc) { - CDEBUG(D_NET, "Can't find batch %s\n", name); - return -ENOENT; - } - - if (testidx > 0) { - /* query test, test index start from 1 */ - list_for_each_entry(test, &bat->bat_test_list, tes_link) { - if (testidx-- == 1) - break; - } - - if (testidx > 0) { - CDEBUG(D_NET, "Can't find specified test in batch\n"); - return -ENOENT; - } - } - - clilst = !test ? &bat->bat_cli_list : - &test->tes_src_grp->grp_ndl_list; - srvlst = !test ? &bat->bat_srv_list : - &test->tes_dst_grp->grp_ndl_list; - - if (dents_up) { - rc = lstcon_nodes_getent((server ? srvlst : clilst), - index_p, ndent_p, dents_up); - return rc; - } - - /* non-verbose query */ - entp = kzalloc(sizeof(struct lstcon_test_batch_ent), GFP_NOFS); - if (!entp) - return -ENOMEM; - - if (!test) { - entp->u.tbe_batch.bae_ntest = bat->bat_ntest; - entp->u.tbe_batch.bae_state = bat->bat_state; - } else { - entp->u.tbe_test.tse_type = test->tes_type; - entp->u.tbe_test.tse_loop = test->tes_loop; - entp->u.tbe_test.tse_concur = test->tes_concur; - } - - list_for_each_entry(ndl, clilst, ndl_link) - LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_cli_nle); - - list_for_each_entry(ndl, srvlst, ndl_link) - LST_NODE_STATE_COUNTER(ndl->ndl_node, &entp->tbe_srv_nle); - - rc = copy_to_user(ent_up, entp, - sizeof(struct lstcon_test_batch_ent)) ? 
-EFAULT : 0; - - kfree(entp); - - return rc; -} - -static int -lstcon_batrpc_condition(int transop, struct lstcon_node *nd, void *arg) -{ - switch (transop) { - case LST_TRANS_TSBRUN: - if (nd->nd_state != LST_NODE_ACTIVE) - return -ENETDOWN; - break; - - case LST_TRANS_TSBSTOP: - if (nd->nd_state != LST_NODE_ACTIVE) - return 0; - break; - - case LST_TRANS_TSBCLIQRY: - case LST_TRANS_TSBSRVQRY: - break; - } - - return 1; -} - -static int -lstcon_batch_op(struct lstcon_batch *bat, int transop, - struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - int rc; - - rc = lstcon_rpc_trans_ndlist(&bat->bat_cli_list, - &bat->bat_trans_list, transop, - bat, lstcon_batrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - return rc; - } - - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - rc = lstcon_rpc_trans_interpreter(trans, result_up, NULL); - - lstcon_rpc_trans_destroy(trans); - - return rc; -} - -int -lstcon_batch_run(char *name, int timeout, struct list_head __user *result_up) -{ - struct lstcon_batch *bat; - int rc; - - if (lstcon_batch_find(name, &bat)) { - CDEBUG(D_NET, "Can't find batch %s\n", name); - return -ENOENT; - } - - bat->bat_arg = timeout; - - rc = lstcon_batch_op(bat, LST_TRANS_TSBRUN, result_up); - - /* mark batch as running if it's started in any node */ - if (lstcon_tsbop_stat_success(lstcon_trans_stat(), 0)) - bat->bat_state = LST_BATCH_RUNNING; - - return rc; -} - -int -lstcon_batch_stop(char *name, int force, struct list_head __user *result_up) -{ - struct lstcon_batch *bat; - int rc; - - if (lstcon_batch_find(name, &bat)) { - CDEBUG(D_NET, "Can't find batch %s\n", name); - return -ENOENT; - } - - bat->bat_arg = force; - - rc = lstcon_batch_op(bat, LST_TRANS_TSBSTOP, result_up); - - /* mark batch as stopped if all RPCs finished */ - if (!lstcon_tsbop_stat_failure(lstcon_trans_stat(), 0)) - bat->bat_state = LST_BATCH_IDLE; - - return rc; -} - -static void -lstcon_batch_destroy(struct lstcon_batch *bat) -{ - struct lstcon_ndlink *ndl; - struct lstcon_test *test; - int i; - - list_del(&bat->bat_link); - - while (!list_empty(&bat->bat_test_list)) { - test = list_entry(bat->bat_test_list.next, - struct lstcon_test, tes_link); - LASSERT(list_empty(&test->tes_trans_list)); - - list_del(&test->tes_link); - - lstcon_group_decref(test->tes_src_grp); - lstcon_group_decref(test->tes_dst_grp); - - kfree(test); - } - - LASSERT(list_empty(&bat->bat_trans_list)); - - while (!list_empty(&bat->bat_cli_list)) { - ndl = list_entry(bat->bat_cli_list.next, - struct lstcon_ndlink, ndl_link); - list_del_init(&ndl->ndl_link); - - lstcon_ndlink_release(ndl); - } - - while (!list_empty(&bat->bat_srv_list)) { - ndl = list_entry(bat->bat_srv_list.next, - struct lstcon_ndlink, ndl_link); - list_del_init(&ndl->ndl_link); - - lstcon_ndlink_release(ndl); - } - - for (i = 0; i < LST_NODE_HASHSIZE; i++) { - LASSERT(list_empty(&bat->bat_cli_hash[i])); - LASSERT(list_empty(&bat->bat_srv_hash[i])); - } - - kfree(bat->bat_cli_hash); - kfree(bat->bat_srv_hash); - kfree(bat); -} - -static int -lstcon_testrpc_condition(int transop, struct lstcon_node *nd, void *arg) -{ - struct lstcon_test *test; - struct lstcon_batch *batch; - struct lstcon_ndlink *ndl; - struct list_head *hash; - struct list_head *head; - - test = (struct lstcon_test *)arg; - LASSERT(test); - - batch = test->tes_batch; - LASSERT(batch); - - if (test->tes_oneside && - transop == LST_TRANS_TSBSRVADD) - return 0; - - if (nd->nd_state != LST_NODE_ACTIVE) - return -ENETDOWN; - - if (transop 
== LST_TRANS_TSBCLIADD) { - hash = batch->bat_cli_hash; - head = &batch->bat_cli_list; - - } else { - LASSERT(transop == LST_TRANS_TSBSRVADD); - - hash = batch->bat_srv_hash; - head = &batch->bat_srv_list; - } - - LASSERT(nd->nd_id.nid != LNET_NID_ANY); - - if (lstcon_ndlink_find(hash, nd->nd_id, &ndl, 1)) - return -ENOMEM; - - if (list_empty(&ndl->ndl_link)) - list_add_tail(&ndl->ndl_link, head); - - return 1; -} - -static int -lstcon_test_nodes_add(struct lstcon_test *test, - struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - int transop; - int rc; - - LASSERT(test->tes_src_grp); - LASSERT(test->tes_dst_grp); - - transop = LST_TRANS_TSBSRVADD; - grp = test->tes_dst_grp; -again: - rc = lstcon_rpc_trans_ndlist(&grp->grp_ndl_list, - &test->tes_trans_list, transop, - test, lstcon_testrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - return rc; - } - - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - if (lstcon_trans_stat()->trs_rpc_errno || - lstcon_trans_stat()->trs_fwk_errno) { - lstcon_rpc_trans_interpreter(trans, result_up, NULL); - - lstcon_rpc_trans_destroy(trans); - /* return if any error */ - CDEBUG(D_NET, "Failed to add test %s, RPC error %d, framework error %d\n", - transop == LST_TRANS_TSBCLIADD ? "client" : "server", - lstcon_trans_stat()->trs_rpc_errno, - lstcon_trans_stat()->trs_fwk_errno); - - return rc; - } - - lstcon_rpc_trans_destroy(trans); - - if (transop == LST_TRANS_TSBCLIADD) - return rc; - - transop = LST_TRANS_TSBCLIADD; - grp = test->tes_src_grp; - test->tes_cliidx = 0; - - /* requests to test clients */ - goto again; -} - -static int -lstcon_verify_batch(const char *name, struct lstcon_batch **batch) -{ - int rc; - - rc = lstcon_batch_find(name, batch); - if (rc) { - CDEBUG(D_NET, "Can't find batch %s\n", name); - return rc; - } - - if ((*batch)->bat_state != LST_BATCH_IDLE) { - CDEBUG(D_NET, "Can't change running batch %s\n", name); - return -EINVAL; - } - - return 0; -} - -static int -lstcon_verify_group(const char *name, struct lstcon_group **grp) -{ - int rc; - struct lstcon_ndlink *ndl; - - rc = lstcon_group_find(name, grp); - if (rc) { - CDEBUG(D_NET, "can't find group %s\n", name); - return rc; - } - - list_for_each_entry(ndl, &(*grp)->grp_ndl_list, ndl_link) { - if (ndl->ndl_node->nd_state == LST_NODE_ACTIVE) - return 0; - } - - CDEBUG(D_NET, "Group %s has no ACTIVE nodes\n", name); - - return -EINVAL; -} - -int -lstcon_test_add(char *batch_name, int type, int loop, - int concur, int dist, int span, - char *src_name, char *dst_name, - void *param, int paramlen, int *retp, - struct list_head __user *result_up) -{ - struct lstcon_test *test = NULL; - int rc; - struct lstcon_group *src_grp = NULL; - struct lstcon_group *dst_grp = NULL; - struct lstcon_batch *batch = NULL; - - /* - * verify that a batch of the given name exists, and the groups - * that will be part of the batch exist and have at least one - * active node - */ - rc = lstcon_verify_batch(batch_name, &batch); - if (rc) - goto out; - - rc = lstcon_verify_group(src_name, &src_grp); - if (rc) - goto out; - - rc = lstcon_verify_group(dst_name, &dst_grp); - if (rc) - goto out; - - if (dst_grp->grp_userland) - *retp = 1; - - test = kzalloc(offsetof(struct lstcon_test, tes_param[paramlen]), - GFP_KERNEL); - if (!test) { - CERROR("Can't allocate test descriptor\n"); - rc = -ENOMEM; - - goto out; - } - - test->tes_hdr.tsb_id = batch->bat_hdr.tsb_id; - test->tes_batch = batch; - test->tes_type = type; - 
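
lstcon_test_nodes_add() above drives the same transaction path twice: once for the destination (server) group, then, after flipping `transop` and jumping back via `goto again`, once for the source (client) group, so the servers are listening before any client fires. A minimal stand-alone sketch of that two-pass shape — every name below is hypothetical, not part of the selftest API:

#include <stdio.h>

enum pass { PASS_SERVER, PASS_CLIENT };

/* stand-in for lstcon_rpc_trans_ndlist() + postwait + destroy */
static int submit_and_wait(enum pass p)
{
	printf("submitting %s-side RPCs\n",
	       p == PASS_SERVER ? "server" : "client");
	return 0;
}

static int add_test_nodes(void)
{
	enum pass p = PASS_SERVER;	/* servers must come up first */
	int rc;

again:
	rc = submit_and_wait(p);
	if (rc)
		return rc;		/* abort on a failing pass */
	if (p == PASS_CLIENT)
		return 0;		/* both passes done */
	p = PASS_CLIENT;
	goto again;			/* now request the clients */
}

int main(void)
{
	return add_test_nodes();
}
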
test->tes_oneside = 0; /* TODO */ - test->tes_loop = loop; - test->tes_concur = concur; - test->tes_stop_onerr = 1; /* TODO */ - test->tes_span = span; - test->tes_dist = dist; - test->tes_cliidx = 0; /* just used for creating RPC */ - test->tes_src_grp = src_grp; - test->tes_dst_grp = dst_grp; - INIT_LIST_HEAD(&test->tes_trans_list); - - if (param) { - test->tes_paramlen = paramlen; - memcpy(&test->tes_param[0], param, paramlen); - } - - rc = lstcon_test_nodes_add(test, result_up); - - if (rc) - goto out; - - if (lstcon_trans_stat()->trs_rpc_errno || - lstcon_trans_stat()->trs_fwk_errno) - CDEBUG(D_NET, "Failed to add test %d to batch %s\n", type, - batch_name); - - /* add to test list anyway, so user can check what's going on */ - list_add_tail(&test->tes_link, &batch->bat_test_list); - - batch->bat_ntest++; - test->tes_hdr.tsb_index = batch->bat_ntest; - - /* hold groups so nobody can change them */ - return rc; -out: - kfree(test); - - if (dst_grp) - lstcon_group_decref(dst_grp); - - if (src_grp) - lstcon_group_decref(src_grp); - - return rc; -} - -static int -lstcon_test_find(struct lstcon_batch *batch, int idx, - struct lstcon_test **testpp) -{ - struct lstcon_test *test; - - list_for_each_entry(test, &batch->bat_test_list, tes_link) { - if (idx == test->tes_hdr.tsb_index) { - *testpp = test; - return 0; - } - } - - return -ENOENT; -} - -static int -lstcon_tsbrpc_readent(int transop, struct srpc_msg *msg, - struct lstcon_rpc_ent __user *ent_up) -{ - struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; - - LASSERT(transop == LST_TRANS_TSBCLIQRY || - transop == LST_TRANS_TSBSRVQRY); - - /* positive errno, framework error code */ - if (copy_to_user(&ent_up->rpe_priv[0], &rep->bar_active, - sizeof(rep->bar_active))) - return -EFAULT; - - return 0; -} - -int -lstcon_test_batch_query(char *name, int testidx, int client, - int timeout, struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - struct list_head *translist; - struct list_head *ndlist; - struct lstcon_tsb_hdr *hdr; - struct lstcon_batch *batch; - struct lstcon_test *test = NULL; - int transop; - int rc; - - rc = lstcon_batch_find(name, &batch); - if (rc) { - CDEBUG(D_NET, "Can't find batch: %s\n", name); - return rc; - } - - if (!testidx) { - translist = &batch->bat_trans_list; - ndlist = &batch->bat_cli_list; - hdr = &batch->bat_hdr; - } else { - /* query specified test only */ - rc = lstcon_test_find(batch, testidx, &test); - if (rc) { - CDEBUG(D_NET, "Can't find test: %d\n", testidx); - return rc; - } - - translist = &test->tes_trans_list; - ndlist = &test->tes_src_grp->grp_ndl_list; - hdr = &test->tes_hdr; - } - - transop = client ? 
LST_TRANS_TSBCLIQRY : LST_TRANS_TSBSRVQRY; - - rc = lstcon_rpc_trans_ndlist(ndlist, translist, transop, hdr, - lstcon_batrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - return rc; - } - - lstcon_rpc_trans_postwait(trans, timeout); - - /* query a batch, not a test */ - if (!testidx && - !lstcon_rpc_stat_failure(lstcon_trans_stat(), 0) && - !lstcon_tsbqry_stat_run(lstcon_trans_stat(), 0)) { - /* all RPCs finished, and no active test */ - batch->bat_state = LST_BATCH_IDLE; - } - - rc = lstcon_rpc_trans_interpreter(trans, result_up, - lstcon_tsbrpc_readent); - lstcon_rpc_trans_destroy(trans); - - return rc; -} - -static int -lstcon_statrpc_readent(int transop, struct srpc_msg *msg, - struct lstcon_rpc_ent __user *ent_up) -{ - struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; - struct sfw_counters __user *sfwk_stat; - struct srpc_counters __user *srpc_stat; - struct lnet_counters __user *lnet_stat; - - if (rep->str_status) - return 0; - - sfwk_stat = (struct sfw_counters __user *)&ent_up->rpe_payload[0]; - srpc_stat = (struct srpc_counters __user *)(sfwk_stat + 1); - lnet_stat = (struct lnet_counters __user *)(srpc_stat + 1); - - if (copy_to_user(sfwk_stat, &rep->str_fw, sizeof(*sfwk_stat)) || - copy_to_user(srpc_stat, &rep->str_rpc, sizeof(*srpc_stat)) || - copy_to_user(lnet_stat, &rep->str_lnet, sizeof(*lnet_stat))) - return -EFAULT; - - return 0; -} - -static int -lstcon_ndlist_stat(struct list_head *ndlist, - int timeout, struct list_head __user *result_up) -{ - struct list_head head; - struct lstcon_rpc_trans *trans; - int rc; - - INIT_LIST_HEAD(&head); - - rc = lstcon_rpc_trans_ndlist(ndlist, &head, - LST_TRANS_STATQRY, NULL, NULL, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - return rc; - } - - lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); - - rc = lstcon_rpc_trans_interpreter(trans, result_up, - lstcon_statrpc_readent); - lstcon_rpc_trans_destroy(trans); - - return rc; -} - -int -lstcon_group_stat(char *grp_name, int timeout, - struct list_head __user *result_up) -{ - struct lstcon_group *grp; - int rc; - - rc = lstcon_group_find(grp_name, &grp); - if (rc) { - CDEBUG(D_NET, "Can't find group %s\n", grp_name); - return rc; - } - - rc = lstcon_ndlist_stat(&grp->grp_ndl_list, timeout, result_up); - - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, - int timeout, struct list_head __user *result_up) -{ - struct lstcon_ndlink *ndl; - struct lstcon_group *tmp; - struct lnet_process_id id; - int i; - int rc; - - rc = lstcon_group_alloc(NULL, &tmp); - if (rc) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - for (i = 0 ; i < count; i++) { - if (copy_from_user(&id, &ids_up[i], sizeof(id))) { - rc = -EFAULT; - break; - } - - /* add to tmp group */ - rc = lstcon_group_ndlink_find(tmp, id, &ndl, 2); - if (rc) { - CDEBUG((rc == -ENOMEM) ? 
D_ERROR : D_NET, - "Failed to find or create %s: %d\n", - libcfs_id2str(id), rc); - break; - } - } - - if (rc) { - lstcon_group_decref(tmp); - return rc; - } - - rc = lstcon_ndlist_stat(&tmp->grp_ndl_list, timeout, result_up); - - lstcon_group_decref(tmp); - - return rc; -} - -static int -lstcon_debug_ndlist(struct list_head *ndlist, - struct list_head *translist, - int timeout, struct list_head __user *result_up) -{ - struct lstcon_rpc_trans *trans; - int rc; - - rc = lstcon_rpc_trans_ndlist(ndlist, translist, LST_TRANS_SESQRY, - NULL, lstcon_sesrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - return rc; - } - - lstcon_rpc_trans_postwait(trans, LST_VALIDATE_TIMEOUT(timeout)); - - rc = lstcon_rpc_trans_interpreter(trans, result_up, - lstcon_sesrpc_readent); - lstcon_rpc_trans_destroy(trans); - - return rc; -} - -int -lstcon_session_debug(int timeout, struct list_head __user *result_up) -{ - return lstcon_debug_ndlist(&console_session.ses_ndl_list, - NULL, timeout, result_up); -} - -int -lstcon_batch_debug(int timeout, char *name, - int client, struct list_head __user *result_up) -{ - struct lstcon_batch *bat; - int rc; - - rc = lstcon_batch_find(name, &bat); - if (rc) - return -ENOENT; - - rc = lstcon_debug_ndlist(client ? &bat->bat_cli_list : - &bat->bat_srv_list, - NULL, timeout, result_up); - - return rc; -} - -int -lstcon_group_debug(int timeout, char *name, - struct list_head __user *result_up) -{ - struct lstcon_group *grp; - int rc; - - rc = lstcon_group_find(name, &grp); - if (rc) - return -ENOENT; - - rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, - timeout, result_up); - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_nodes_debug(int timeout, int count, - struct lnet_process_id __user *ids_up, - struct list_head __user *result_up) -{ - struct lnet_process_id id; - struct lstcon_ndlink *ndl; - struct lstcon_group *grp; - int i; - int rc; - - rc = lstcon_group_alloc(NULL, &grp); - if (rc) { - CDEBUG(D_NET, "Out of memory\n"); - return rc; - } - - for (i = 0; i < count; i++) { - if (copy_from_user(&id, &ids_up[i], sizeof(id))) { - rc = -EFAULT; - break; - } - - /* node is added to tmp group */ - rc = lstcon_group_ndlink_find(grp, id, &ndl, 1); - if (rc) { - CERROR("Can't create node link\n"); - break; - } - } - - if (rc) { - lstcon_group_decref(grp); - return rc; - } - - rc = lstcon_debug_ndlist(&grp->grp_ndl_list, NULL, - timeout, result_up); - - lstcon_group_decref(grp); - - return rc; -} - -int -lstcon_session_match(struct lst_sid sid) -{ - return (console_session.ses_id.ses_nid == sid.ses_nid && - console_session.ses_id.ses_stamp == sid.ses_stamp) ? 
1 : 0; -} - -static void -lstcon_new_session_id(struct lst_sid *sid) -{ - struct lnet_process_id id; - - LASSERT(console_session.ses_state == LST_SESSION_NONE); - - LNetGetId(1, &id); - sid->ses_nid = id.nid; - sid->ses_stamp = jiffies; -} - -int -lstcon_session_new(char *name, int key, unsigned int feats, - int timeout, int force, struct lst_sid __user *sid_up) -{ - int rc = 0; - int i; - - if (console_session.ses_state != LST_SESSION_NONE) { - /* session exists */ - if (!force) { - CNETERR("Session %s already exists\n", - console_session.ses_name); - return -EEXIST; - } - - rc = lstcon_session_end(); - - /* lstcon_session_end() only return local error */ - if (rc) - return rc; - } - - if (feats & ~LST_FEATS_MASK) { - CNETERR("Unknown session features %x\n", - (feats & ~LST_FEATS_MASK)); - return -EINVAL; - } - - for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) - LASSERT(list_empty(&console_session.ses_ndl_hash[i])); - - lstcon_new_session_id(&console_session.ses_id); - - console_session.ses_key = key; - console_session.ses_state = LST_SESSION_ACTIVE; - console_session.ses_force = !!force; - console_session.ses_features = feats; - console_session.ses_feats_updated = 0; - console_session.ses_timeout = (timeout <= 0) ? - LST_CONSOLE_TIMEOUT : timeout; - - if (strlen(name) > sizeof(console_session.ses_name) - 1) - return -E2BIG; - strlcpy(console_session.ses_name, name, - sizeof(console_session.ses_name)); - - rc = lstcon_batch_add(LST_DEFAULT_BATCH); - if (rc) - return rc; - - rc = lstcon_rpc_pinger_start(); - if (rc) { - struct lstcon_batch *bat = NULL; - - lstcon_batch_find(LST_DEFAULT_BATCH, &bat); - lstcon_batch_destroy(bat); - - return rc; - } - - if (!copy_to_user(sid_up, &console_session.ses_id, - sizeof(struct lst_sid))) - return rc; - - lstcon_session_end(); - - return -EFAULT; -} - -int -lstcon_session_info(struct lst_sid __user *sid_up, int __user *key_up, - unsigned __user *featp, - struct lstcon_ndlist_ent __user *ndinfo_up, - char __user *name_up, int len) -{ - struct lstcon_ndlist_ent *entp; - struct lstcon_ndlink *ndl; - int rc = 0; - - if (console_session.ses_state != LST_SESSION_ACTIVE) - return -ESRCH; - - entp = kzalloc(sizeof(*entp), GFP_NOFS); - if (!entp) - return -ENOMEM; - - list_for_each_entry(ndl, &console_session.ses_ndl_list, ndl_link) - LST_NODE_STATE_COUNTER(ndl->ndl_node, entp); - - if (copy_to_user(sid_up, &console_session.ses_id, - sizeof(*sid_up)) || - copy_to_user(key_up, &console_session.ses_key, - sizeof(*key_up)) || - copy_to_user(featp, &console_session.ses_features, - sizeof(*featp)) || - copy_to_user(ndinfo_up, entp, sizeof(*entp)) || - copy_to_user(name_up, console_session.ses_name, len)) - rc = -EFAULT; - - kfree(entp); - - return rc; -} - -int -lstcon_session_end(void) -{ - struct lstcon_rpc_trans *trans; - struct lstcon_group *grp; - struct lstcon_batch *bat; - int rc = 0; - - LASSERT(console_session.ses_state == LST_SESSION_ACTIVE); - - rc = lstcon_rpc_trans_ndlist(&console_session.ses_ndl_list, - NULL, LST_TRANS_SESEND, NULL, - lstcon_sesrpc_condition, &trans); - if (rc) { - CERROR("Can't create transaction: %d\n", rc); - return rc; - } - - console_session.ses_shutdown = 1; - - lstcon_rpc_pinger_stop(); - - lstcon_rpc_trans_postwait(trans, LST_TRANS_TIMEOUT); - - lstcon_rpc_trans_destroy(trans); - /* User can do nothing even rpc failed, so go on */ - - /* waiting for orphan rpcs to die */ - lstcon_rpc_cleanup_wait(); - - console_session.ses_id = LST_INVALID_SID; - console_session.ses_state = LST_SESSION_NONE; - console_session.ses_key = 0; - 
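
lstcon_session_end() above tears the session down in a fixed order: flag the shutdown, stop the pinger, let the transaction and orphan RPCs drain, and only then (continuing below) destroy the remaining batches and groups, so no RPC can race with the frees. A compressed sketch of that ordering, with placeholder names standing in for the real console structures:

#include <stdbool.h>

struct session {
	bool shutdown;
	/* ... pinger, transaction lists, batches, groups ... */
};

static void pinger_stop(struct session *s)     { (void)s; }
static void drain_rpcs(struct session *s)      { (void)s; }
static void free_containers(struct session *s) { (void)s; }

static void session_end(struct session *s)
{
	s->shutdown = true;	/* 1: refuse any new work */
	pinger_stop(s);		/* 2: stop the periodic pinger */
	drain_rpcs(s);		/* 3: wait for orphan RPCs to die */
	free_containers(s);	/* 4: only now free batches/groups */
}
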
console_session.ses_force = 0; - console_session.ses_feats_updated = 0; - - /* destroy all batches */ - while (!list_empty(&console_session.ses_bat_list)) { - bat = list_entry(console_session.ses_bat_list.next, - struct lstcon_batch, bat_link); - - lstcon_batch_destroy(bat); - } - - /* destroy all groups */ - while (!list_empty(&console_session.ses_grp_list)) { - grp = list_entry(console_session.ses_grp_list.next, - struct lstcon_group, grp_link); - LASSERT(grp->grp_ref == 1); - - lstcon_group_decref(grp); - } - - /* all nodes should be released */ - LASSERT(list_empty(&console_session.ses_ndl_list)); - - console_session.ses_shutdown = 0; - console_session.ses_expired = 0; - - return rc; -} - -int -lstcon_session_feats_check(unsigned int feats) -{ - int rc = 0; - - if (feats & ~LST_FEATS_MASK) { - CERROR("Can't support these features: %x\n", - (feats & ~LST_FEATS_MASK)); - return -EPROTO; - } - - spin_lock(&console_session.ses_rpc_lock); - - if (!console_session.ses_feats_updated) { - console_session.ses_feats_updated = 1; - console_session.ses_features = feats; - } - - if (console_session.ses_features != feats) - rc = -EPROTO; - - spin_unlock(&console_session.ses_rpc_lock); - - if (rc) { - CERROR("remote features %x do not match with session features %x of console\n", - feats, console_session.ses_features); - } - - return rc; -} - -static int -lstcon_acceptor_handle(struct srpc_server_rpc *rpc) -{ - struct srpc_msg *rep = &rpc->srpc_replymsg; - struct srpc_msg *req = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_join_reqst *jreq = &req->msg_body.join_reqst; - struct srpc_join_reply *jrep = &rep->msg_body.join_reply; - struct lstcon_group *grp = NULL; - struct lstcon_ndlink *ndl; - int rc = 0; - - sfw_unpack_message(req); - - mutex_lock(&console_session.ses_mutex); - - jrep->join_sid = console_session.ses_id; - - if (console_session.ses_id.ses_nid == LNET_NID_ANY) { - jrep->join_status = ESRCH; - goto out; - } - - if (lstcon_session_feats_check(req->msg_ses_feats)) { - jrep->join_status = EPROTO; - goto out; - } - - if (jreq->join_sid.ses_nid != LNET_NID_ANY && - !lstcon_session_match(jreq->join_sid)) { - jrep->join_status = EBUSY; - goto out; - } - - if (lstcon_group_find(jreq->join_group, &grp)) { - rc = lstcon_group_alloc(jreq->join_group, &grp); - if (rc) { - CERROR("Out of memory\n"); - goto out; - } - - list_add_tail(&grp->grp_link, - &console_session.ses_grp_list); - lstcon_group_addref(grp); - } - - if (grp->grp_ref > 2) { - /* Group in using */ - jrep->join_status = EBUSY; - goto out; - } - - rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 0); - if (!rc) { - jrep->join_status = EEXIST; - goto out; - } - - rc = lstcon_group_ndlink_find(grp, rpc->srpc_peer, &ndl, 1); - if (rc) { - CERROR("Out of memory\n"); - goto out; - } - - ndl->ndl_node->nd_state = LST_NODE_ACTIVE; - ndl->ndl_node->nd_timeout = console_session.ses_timeout; - - if (!grp->grp_userland) - grp->grp_userland = 1; - - strlcpy(jrep->join_session, console_session.ses_name, - sizeof(jrep->join_session)); - jrep->join_timeout = console_session.ses_timeout; - jrep->join_status = 0; - -out: - rep->msg_ses_feats = console_session.ses_features; - if (grp) - lstcon_group_decref(grp); - - mutex_unlock(&console_session.ses_mutex); - - return rc; -} - -static struct srpc_service lstcon_acceptor_service; - -static void lstcon_init_acceptor_service(void) -{ - /* initialize selftest console acceptor service table */ - lstcon_acceptor_service.sv_name = "join session"; - lstcon_acceptor_service.sv_handler = 
lstcon_acceptor_handle; - lstcon_acceptor_service.sv_id = SRPC_SERVICE_JOIN; - lstcon_acceptor_service.sv_wi_total = SFW_FRWK_WI_MAX; -} - -static struct notifier_block lstcon_ioctl_handler = { - .notifier_call = lstcon_ioctl_entry, -}; - -/* initialize console */ -int -lstcon_console_init(void) -{ - int i; - int rc; - - memset(&console_session, 0, sizeof(struct lstcon_session)); - - console_session.ses_id = LST_INVALID_SID; - console_session.ses_state = LST_SESSION_NONE; - console_session.ses_timeout = 0; - console_session.ses_force = 0; - console_session.ses_expired = 0; - console_session.ses_feats_updated = 0; - console_session.ses_features = LST_FEATS_MASK; - console_session.ses_laststamp = ktime_get_real_seconds(); - - mutex_init(&console_session.ses_mutex); - - INIT_LIST_HEAD(&console_session.ses_ndl_list); - INIT_LIST_HEAD(&console_session.ses_grp_list); - INIT_LIST_HEAD(&console_session.ses_bat_list); - INIT_LIST_HEAD(&console_session.ses_trans_list); - - console_session.ses_ndl_hash = - kmalloc(sizeof(struct list_head) * LST_GLOBAL_HASHSIZE, GFP_KERNEL); - if (!console_session.ses_ndl_hash) - return -ENOMEM; - - for (i = 0; i < LST_GLOBAL_HASHSIZE; i++) - INIT_LIST_HEAD(&console_session.ses_ndl_hash[i]); - - /* initialize acceptor service table */ - lstcon_init_acceptor_service(); - - rc = srpc_add_service(&lstcon_acceptor_service); - LASSERT(rc != -EBUSY); - if (rc) { - kfree(console_session.ses_ndl_hash); - return rc; - } - - rc = srpc_service_add_buffers(&lstcon_acceptor_service, - lstcon_acceptor_service.sv_wi_total); - if (rc) { - rc = -ENOMEM; - goto out; - } - - rc = blocking_notifier_chain_register(&libcfs_ioctl_list, - &lstcon_ioctl_handler); - - if (!rc) { - lstcon_rpc_module_init(); - return 0; - } - -out: - srpc_shutdown_service(&lstcon_acceptor_service); - srpc_remove_service(&lstcon_acceptor_service); - - kfree(console_session.ses_ndl_hash); - - srpc_wait_service_shutdown(&lstcon_acceptor_service); - - return rc; -} - -int -lstcon_console_fini(void) -{ - int i; - - blocking_notifier_chain_unregister(&libcfs_ioctl_list, - &lstcon_ioctl_handler); - - mutex_lock(&console_session.ses_mutex); - - srpc_shutdown_service(&lstcon_acceptor_service); - srpc_remove_service(&lstcon_acceptor_service); - - if (console_session.ses_state != LST_SESSION_NONE) - lstcon_session_end(); - - lstcon_rpc_module_fini(); - - mutex_unlock(&console_session.ses_mutex); - - LASSERT(list_empty(&console_session.ses_ndl_list)); - LASSERT(list_empty(&console_session.ses_grp_list)); - LASSERT(list_empty(&console_session.ses_bat_list)); - LASSERT(list_empty(&console_session.ses_trans_list)); - - for (i = 0; i < LST_NODE_HASHSIZE; i++) - LASSERT(list_empty(&console_session.ses_ndl_hash[i])); - - kfree(console_session.ses_ndl_hash); - - srpc_wait_service_shutdown(&lstcon_acceptor_service); - - return 0; -} diff --git a/drivers/staging/lustre/lnet/selftest/console.h b/drivers/staging/lustre/lnet/selftest/console.h deleted file mode 100644 index 2826205e36a14a0487282499d6c12d34727b388f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/console.h +++ /dev/null @@ -1,244 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/console.h - * - * kernel structure for LST console - * - * Author: Liang Zhen - */ - -#ifndef __LST_CONSOLE_H__ -#define __LST_CONSOLE_H__ - -#include -#include -#include "selftest.h" -#include "conrpc.h" - -/* node descriptor */ -struct lstcon_node { - struct lnet_process_id nd_id; /* id of the node */ - int nd_ref; /* reference count */ - int nd_state; /* state of the node */ - int nd_timeout; /* session timeout */ - unsigned long nd_stamp; /* timestamp of last replied RPC */ - struct lstcon_rpc nd_ping; /* ping rpc */ -}; - -/* node link descriptor */ -struct lstcon_ndlink { - struct list_head ndl_link; /* chain on list */ - struct list_head ndl_hlink; /* chain on hash */ - struct lstcon_node *ndl_node; /* pointer to node */ -}; - -/* (alias of nodes) group descriptor */ -struct lstcon_group { - struct list_head grp_link; /* chain on global group list - */ - int grp_ref; /* reference count */ - int grp_userland; /* has userland nodes */ - int grp_nnode; /* # of nodes */ - char grp_name[LST_NAME_SIZE]; /* group name */ - - struct list_head grp_trans_list; /* transaction list */ - struct list_head grp_ndl_list; /* nodes list */ - struct list_head grp_ndl_hash[0]; /* hash table for nodes */ -}; - -#define LST_BATCH_IDLE 0xB0 /* idle batch */ -#define LST_BATCH_RUNNING 0xB1 /* running batch */ - -struct lstcon_tsb_hdr { - struct lst_bid tsb_id; /* batch ID */ - int tsb_index; /* test index */ -}; - -/* (tests ) batch descriptor */ -struct lstcon_batch { - struct lstcon_tsb_hdr bat_hdr; /* test_batch header */ - struct list_head bat_link; /* chain on session's batches list */ - int bat_ntest; /* # of test */ - int bat_state; /* state of the batch */ - int bat_arg; /* parameter for run|stop, timeout - * for run, force for stop - */ - char bat_name[LST_NAME_SIZE];/* name of batch */ - - struct list_head bat_test_list; /* list head of tests (struct lstcon_test) - */ - struct list_head bat_trans_list; /* list head of transaction */ - struct list_head bat_cli_list; /* list head of client nodes - * (struct lstcon_node) - */ - struct list_head *bat_cli_hash; /* hash table of client nodes */ - struct list_head bat_srv_list; /* list head of server nodes */ - struct list_head *bat_srv_hash; /* hash table of server nodes */ -}; - -/* a single test descriptor */ -struct lstcon_test { - struct lstcon_tsb_hdr tes_hdr; /* test batch header */ - struct list_head tes_link; /* chain on batch's tests list */ - struct lstcon_batch *tes_batch; /* pointer to batch */ - - int tes_type; /* type of the test, i.e: bulk, ping */ - int tes_stop_onerr; /* stop on error */ - int tes_oneside; /* one-sided test */ - int tes_concur; /* concurrency */ - int tes_loop; /* loop count */ - int 
tes_dist; /* nodes distribution of target group */ - int tes_span; /* nodes span of target group */ - int tes_cliidx; /* client index, used for RPC creating */ - - struct list_head tes_trans_list; /* transaction list */ - struct lstcon_group *tes_src_grp; /* group run the test */ - struct lstcon_group *tes_dst_grp; /* target group */ - - int tes_paramlen; /* test parameter length */ - char tes_param[0]; /* test parameter */ -}; - -#define LST_GLOBAL_HASHSIZE 503 /* global nodes hash table size */ -#define LST_NODE_HASHSIZE 239 /* node hash table (for batch or group) */ - -#define LST_SESSION_NONE 0x0 /* no session */ -#define LST_SESSION_ACTIVE 0x1 /* working session */ - -#define LST_CONSOLE_TIMEOUT 300 /* default console timeout */ - -struct lstcon_session { - struct mutex ses_mutex; /* only 1 thread in session */ - struct lst_sid ses_id; /* global session id */ - int ses_key; /* local session key */ - int ses_state; /* state of session */ - int ses_timeout; /* timeout in seconds */ - time64_t ses_laststamp; /* last operation stamp (seconds) - */ - unsigned int ses_features; /* tests features of the session - */ - unsigned int ses_feats_updated:1; /* features are synced with - * remote test nodes - */ - unsigned int ses_force:1; /* force creating */ - unsigned int ses_shutdown:1; /* session is shutting down */ - unsigned int ses_expired:1; /* console is timedout */ - __u64 ses_id_cookie; /* batch id cookie */ - char ses_name[LST_NAME_SIZE];/* session name */ - struct lstcon_rpc_trans *ses_ping; /* session pinger */ - struct stt_timer ses_ping_timer; /* timer for pinger */ - struct lstcon_trans_stat ses_trans_stat; /* transaction stats */ - - struct list_head ses_trans_list; /* global list of transaction */ - struct list_head ses_grp_list; /* global list of groups */ - struct list_head ses_bat_list; /* global list of batches */ - struct list_head ses_ndl_list; /* global list of nodes */ - struct list_head *ses_ndl_hash; /* hash table of nodes */ - - spinlock_t ses_rpc_lock; /* serialize */ - atomic_t ses_rpc_counter; /* # of initialized RPCs */ - struct list_head ses_rpc_freelist; /* idle console rpc */ -}; /* session descriptor */ - -extern struct lstcon_session console_session; - -static inline struct lstcon_trans_stat * -lstcon_trans_stat(void) -{ - return &console_session.ses_trans_stat; -} - -static inline struct list_head * -lstcon_id2hash(struct lnet_process_id id, struct list_head *hash) -{ - unsigned int idx = LNET_NIDADDR(id.nid) % LST_NODE_HASHSIZE; - - return &hash[idx]; -} - -int lstcon_ioctl_entry(struct notifier_block *nb, - unsigned long cmd, void *vdata); -int lstcon_console_init(void); -int lstcon_console_fini(void); -int lstcon_session_match(struct lst_sid sid); -int lstcon_session_new(char *name, int key, unsigned int version, - int timeout, int flags, struct lst_sid __user *sid_up); -int lstcon_session_info(struct lst_sid __user *sid_up, int __user *key, - unsigned __user *verp, struct lstcon_ndlist_ent __user *entp, - char __user *name_up, int len); -int lstcon_session_end(void); -int lstcon_session_debug(int timeout, struct list_head __user *result_up); -int lstcon_session_feats_check(unsigned int feats); -int lstcon_batch_debug(int timeout, char *name, - int client, struct list_head __user *result_up); -int lstcon_group_debug(int timeout, char *name, - struct list_head __user *result_up); -int lstcon_nodes_debug(int timeout, int nnd, - struct lnet_process_id __user *nds_up, - struct list_head __user *result_up); -int lstcon_group_add(char *name); -int 
lstcon_group_del(char *name); -int lstcon_group_clean(char *name, int args); -int lstcon_group_refresh(char *name, struct list_head __user *result_up); -int lstcon_nodes_add(char *name, int nnd, struct lnet_process_id __user *nds_up, - unsigned int *featp, struct list_head __user *result_up); -int lstcon_nodes_remove(char *name, int nnd, - struct lnet_process_id __user *nds_up, - struct list_head __user *result_up); -int lstcon_group_info(char *name, struct lstcon_ndlist_ent __user *gent_up, - int *index_p, int *ndent_p, - struct lstcon_node_ent __user *ndents_up); -int lstcon_group_list(int idx, int len, char __user *name_up); -int lstcon_batch_add(char *name); -int lstcon_batch_run(char *name, int timeout, - struct list_head __user *result_up); -int lstcon_batch_stop(char *name, int force, - struct list_head __user *result_up); -int lstcon_test_batch_query(char *name, int testidx, - int client, int timeout, - struct list_head __user *result_up); -int lstcon_batch_del(char *name); -int lstcon_batch_list(int idx, int namelen, char __user *name_up); -int lstcon_batch_info(char *name, struct lstcon_test_batch_ent __user *ent_up, - int server, int testidx, int *index_p, - int *ndent_p, struct lstcon_node_ent __user *dents_up); -int lstcon_group_stat(char *grp_name, int timeout, - struct list_head __user *result_up); -int lstcon_nodes_stat(int count, struct lnet_process_id __user *ids_up, - int timeout, struct list_head __user *result_up); -int lstcon_test_add(char *batch_name, int type, int loop, - int concur, int dist, int span, - char *src_name, char *dst_name, - void *param, int paramlen, int *retp, - struct list_head __user *result_up); -#endif diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c deleted file mode 100644 index 741af10560ad1d8434e63b0d2c239bd91caef117..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/framework.c +++ /dev/null @@ -1,1786 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lnet/selftest/framework.c - * - * Author: Isaac Huang - * Author: Liang Zhen - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include "selftest.h" - -struct lst_sid LST_INVALID_SID = {LNET_NID_ANY, -1}; - -static int session_timeout = 100; -module_param(session_timeout, int, 0444); -MODULE_PARM_DESC(session_timeout, "test session timeout in seconds (100 by default, 0 == never)"); - -static int rpc_timeout = 64; -module_param(rpc_timeout, int, 0644); -MODULE_PARM_DESC(rpc_timeout, "rpc timeout in seconds (64 by default, 0 == never)"); - -#define sfw_unpack_id(id) \ -do { \ - __swab64s(&(id).nid); \ - __swab32s(&(id).pid); \ -} while (0) - -#define sfw_unpack_sid(sid) \ -do { \ - __swab64s(&(sid).ses_nid); \ - __swab64s(&(sid).ses_stamp); \ -} while (0) - -#define sfw_unpack_fw_counters(fc) \ -do { \ - __swab32s(&(fc).running_ms); \ - __swab32s(&(fc).active_batches); \ - __swab32s(&(fc).zombie_sessions); \ - __swab32s(&(fc).brw_errors); \ - __swab32s(&(fc).ping_errors); \ -} while (0) - -#define sfw_unpack_rpc_counters(rc) \ -do { \ - __swab32s(&(rc).errors); \ - __swab32s(&(rc).rpcs_sent); \ - __swab32s(&(rc).rpcs_rcvd); \ - __swab32s(&(rc).rpcs_dropped); \ - __swab32s(&(rc).rpcs_expired); \ - __swab64s(&(rc).bulk_get); \ - __swab64s(&(rc).bulk_put); \ -} while (0) - -#define sfw_unpack_lnet_counters(lc) \ -do { \ - __swab32s(&(lc).errors); \ - __swab32s(&(lc).msgs_max); \ - __swab32s(&(lc).msgs_alloc); \ - __swab32s(&(lc).send_count); \ - __swab32s(&(lc).recv_count); \ - __swab32s(&(lc).drop_count); \ - __swab32s(&(lc).route_count); \ - __swab64s(&(lc).send_length); \ - __swab64s(&(lc).recv_length); \ - __swab64s(&(lc).drop_length); \ - __swab64s(&(lc).route_length); \ -} while (0) - -#define sfw_test_active(t) (atomic_read(&(t)->tsi_nactive)) -#define sfw_batch_active(b) (atomic_read(&(b)->bat_nactive)) - -static struct smoketest_framework { - struct list_head fw_zombie_rpcs; /* RPCs to be recycled */ - struct list_head fw_zombie_sessions; /* stopping sessions */ - struct list_head fw_tests; /* registered test cases */ - atomic_t fw_nzombies; /* # zombie sessions */ - spinlock_t fw_lock; /* serialise */ - struct sfw_session *fw_session; /* _the_ session */ - int fw_shuttingdown; /* shutdown in progress */ - struct srpc_server_rpc *fw_active_srpc;/* running RPC */ -} sfw_data; - -/* forward ref's */ -int sfw_stop_batch(struct sfw_batch *tsb, int force); -void sfw_destroy_session(struct sfw_session *sn); - -static inline struct sfw_test_case * -sfw_find_test_case(int id) -{ - struct sfw_test_case *tsc; - - LASSERT(id <= SRPC_SERVICE_MAX_ID); - LASSERT(id > SRPC_FRAMEWORK_SERVICE_MAX_ID); - - list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { - if (tsc->tsc_srv_service->sv_id == id) - return tsc; - } - - return NULL; -} - -static int -sfw_register_test(struct srpc_service *service, - struct sfw_test_client_ops *cliops) -{ - struct sfw_test_case *tsc; - - if (sfw_find_test_case(service->sv_id)) { - CERROR("Failed to register test %s (%d)\n", - service->sv_name, service->sv_id); - return -EEXIST; - } - - tsc = kzalloc(sizeof(struct sfw_test_case), GFP_NOFS); - if (!tsc) - return -ENOMEM; - - tsc->tsc_cli_ops = cliops; - tsc->tsc_srv_service = service; - - list_add_tail(&tsc->tsc_list, &sfw_data.fw_tests); - return 0; -} - -static void -sfw_add_session_timer(void) -{ - struct sfw_session *sn = sfw_data.fw_session; - struct stt_timer *timer = &sn->sn_timer; - - LASSERT(!sfw_data.fw_shuttingdown); - - if (!sn || !sn->sn_timeout) - return; - - LASSERT(!sn->sn_timer_active); - - 
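
The sfw_unpack_* macros above normalize byte order: a peer whose message magic arrives byte-swapped has every multi-byte field swabbed in place before use. A self-contained sketch of that magic-check-then-swab pattern — the magic value and field set here are invented for illustration, not the SRPC wire format:

#include <stdint.h>

#define MSG_MAGIC 0x12345678u	/* invented wire magic */

static uint32_t swab32(uint32_t v)
{
	return ((v & 0x000000ffu) << 24) | ((v & 0x0000ff00u) << 8) |
	       ((v & 0x00ff0000u) >> 8)  | ((v & 0xff000000u) >> 24);
}

struct wire_msg {
	uint32_t magic;
	uint32_t count;		/* example multi-byte payload field */
};

static int unpack(struct wire_msg *m)
{
	if (m->magic == MSG_MAGIC)
		return 0;		/* peer has our byte order */
	if (swab32(m->magic) != MSG_MAGIC)
		return -1;		/* not a valid message at all */
	m->magic = swab32(m->magic);	/* byte-swapped peer: swab */
	m->count = swab32(m->count);	/* every multi-byte field */
	return 0;
}
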
sn->sn_timer_active = 1; - timer->stt_expires = ktime_get_real_seconds() + sn->sn_timeout; - stt_add_timer(timer); -} - -static int -sfw_del_session_timer(void) -{ - struct sfw_session *sn = sfw_data.fw_session; - - if (!sn || !sn->sn_timer_active) - return 0; - - LASSERT(sn->sn_timeout); - - if (stt_del_timer(&sn->sn_timer)) { /* timer defused */ - sn->sn_timer_active = 0; - return 0; - } - - return -EBUSY; /* racing with sfw_session_expired() */ -} - -static void -sfw_deactivate_session(void) -__must_hold(&sfw_data.fw_lock) -{ - struct sfw_session *sn = sfw_data.fw_session; - int nactive = 0; - struct sfw_batch *tsb; - struct sfw_test_case *tsc; - - if (!sn) - return; - - LASSERT(!sn->sn_timer_active); - - sfw_data.fw_session = NULL; - atomic_inc(&sfw_data.fw_nzombies); - list_add(&sn->sn_list, &sfw_data.fw_zombie_sessions); - - spin_unlock(&sfw_data.fw_lock); - - list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { - srpc_abort_service(tsc->tsc_srv_service); - } - - spin_lock(&sfw_data.fw_lock); - - list_for_each_entry(tsb, &sn->sn_batches, bat_list) { - if (sfw_batch_active(tsb)) { - nactive++; - sfw_stop_batch(tsb, 1); - } - } - - if (nactive) - return; /* wait for active batches to stop */ - - list_del_init(&sn->sn_list); - spin_unlock(&sfw_data.fw_lock); - - sfw_destroy_session(sn); - - spin_lock(&sfw_data.fw_lock); -} - -static void -sfw_session_expired(void *data) -{ - struct sfw_session *sn = data; - - spin_lock(&sfw_data.fw_lock); - - LASSERT(sn->sn_timer_active); - LASSERT(sn == sfw_data.fw_session); - - CWARN("Session expired! sid: %s-%llu, name: %s\n", - libcfs_nid2str(sn->sn_id.ses_nid), - sn->sn_id.ses_stamp, &sn->sn_name[0]); - - sn->sn_timer_active = 0; - sfw_deactivate_session(); - - spin_unlock(&sfw_data.fw_lock); -} - -static inline void -sfw_init_session(struct sfw_session *sn, struct lst_sid sid, - unsigned int features, const char *name) -{ - struct stt_timer *timer = &sn->sn_timer; - - memset(sn, 0, sizeof(struct sfw_session)); - INIT_LIST_HEAD(&sn->sn_list); - INIT_LIST_HEAD(&sn->sn_batches); - atomic_set(&sn->sn_refcount, 1); /* +1 for caller */ - atomic_set(&sn->sn_brw_errors, 0); - atomic_set(&sn->sn_ping_errors, 0); - strlcpy(&sn->sn_name[0], name, sizeof(sn->sn_name)); - - sn->sn_timer_active = 0; - sn->sn_id = sid; - sn->sn_features = features; - sn->sn_timeout = session_timeout; - sn->sn_started = jiffies; - - timer->stt_data = sn; - timer->stt_func = sfw_session_expired; - INIT_LIST_HEAD(&timer->stt_list); -} - -/* completion handler for incoming framework RPCs */ -static void -sfw_server_rpc_done(struct srpc_server_rpc *rpc) -{ - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - int status = rpc->srpc_status; - - CDEBUG(D_NET, "Incoming framework RPC done: service %s, peer %s, status %s:%d\n", - sv->sv_name, libcfs_id2str(rpc->srpc_peer), - swi_state2str(rpc->srpc_wi.swi_state), - status); - - if (rpc->srpc_bulk) - sfw_free_pages(rpc); -} - -static void -sfw_client_rpc_fini(struct srpc_client_rpc *rpc) -{ - LASSERT(!rpc->crpc_bulk.bk_niov); - LASSERT(list_empty(&rpc->crpc_list)); - LASSERT(!atomic_read(&rpc->crpc_refcount)); - - CDEBUG(D_NET, "Outgoing framework RPC done: service %d, peer %s, status %s:%d:%d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(rpc->crpc_wi.swi_state), - rpc->crpc_aborted, rpc->crpc_status); - - spin_lock(&sfw_data.fw_lock); - - /* my callers must finish all RPCs before shutting me down */ - LASSERT(!sfw_data.fw_shuttingdown); - list_add(&rpc->crpc_list, &sfw_data.fw_zombie_rpcs); - - 
spin_unlock(&sfw_data.fw_lock); -} - -static struct sfw_batch * -sfw_find_batch(struct lst_bid bid) -{ - struct sfw_session *sn = sfw_data.fw_session; - struct sfw_batch *bat; - - LASSERT(sn); - - list_for_each_entry(bat, &sn->sn_batches, bat_list) { - if (bat->bat_id.bat_id == bid.bat_id) - return bat; - } - - return NULL; -} - -static struct sfw_batch * -sfw_bid2batch(struct lst_bid bid) -{ - struct sfw_session *sn = sfw_data.fw_session; - struct sfw_batch *bat; - - LASSERT(sn); - - bat = sfw_find_batch(bid); - if (bat) - return bat; - - bat = kzalloc(sizeof(struct sfw_batch), GFP_NOFS); - if (!bat) - return NULL; - - bat->bat_error = 0; - bat->bat_session = sn; - bat->bat_id = bid; - atomic_set(&bat->bat_nactive, 0); - INIT_LIST_HEAD(&bat->bat_tests); - - list_add_tail(&bat->bat_list, &sn->sn_batches); - return bat; -} - -static int -sfw_get_stats(struct srpc_stat_reqst *request, struct srpc_stat_reply *reply) -{ - struct sfw_session *sn = sfw_data.fw_session; - struct sfw_counters *cnt = &reply->str_fw; - struct sfw_batch *bat; - - reply->str_sid = !sn ? LST_INVALID_SID : sn->sn_id; - - if (request->str_sid.ses_nid == LNET_NID_ANY) { - reply->str_status = EINVAL; - return 0; - } - - if (!sn || !sfw_sid_equal(request->str_sid, sn->sn_id)) { - reply->str_status = ESRCH; - return 0; - } - - lnet_counters_get(&reply->str_lnet); - srpc_get_counters(&reply->str_rpc); - - /* - * send over the msecs since the session was started - * with 32 bits to send, this is ~49 days - */ - cnt->running_ms = jiffies_to_msecs(jiffies - sn->sn_started); - cnt->brw_errors = atomic_read(&sn->sn_brw_errors); - cnt->ping_errors = atomic_read(&sn->sn_ping_errors); - cnt->zombie_sessions = atomic_read(&sfw_data.fw_nzombies); - - cnt->active_batches = 0; - list_for_each_entry(bat, &sn->sn_batches, bat_list) { - if (atomic_read(&bat->bat_nactive) > 0) - cnt->active_batches++; - } - - reply->str_status = 0; - return 0; -} - -int -sfw_make_session(struct srpc_mksn_reqst *request, struct srpc_mksn_reply *reply) -{ - struct sfw_session *sn = sfw_data.fw_session; - struct srpc_msg *msg = container_of(request, struct srpc_msg, - msg_body.mksn_reqst); - int cplen = 0; - - if (request->mksn_sid.ses_nid == LNET_NID_ANY) { - reply->mksn_sid = !sn ? LST_INVALID_SID : sn->sn_id; - reply->mksn_status = EINVAL; - return 0; - } - - if (sn) { - reply->mksn_status = 0; - reply->mksn_sid = sn->sn_id; - reply->mksn_timeout = sn->sn_timeout; - - if (sfw_sid_equal(request->mksn_sid, sn->sn_id)) { - atomic_inc(&sn->sn_refcount); - return 0; - } - - if (!request->mksn_force) { - reply->mksn_status = EBUSY; - cplen = strlcpy(&reply->mksn_name[0], &sn->sn_name[0], - sizeof(reply->mksn_name)); - if (cplen >= sizeof(reply->mksn_name)) - return -E2BIG; - return 0; - } - } - - /* - * reject the request if it requires unknown features - * NB: old version will always accept all features because it's not - * aware of srpc_msg::msg_ses_feats, it's a defect but it's also - * harmless because it will return zero feature to console, and it's - * console's responsibility to make sure all nodes in a session have - * same feature mask. 
- */ - if (msg->msg_ses_feats & ~LST_FEATS_MASK) { - reply->mksn_status = EPROTO; - return 0; - } - - /* brand new or create by force */ - sn = kzalloc(sizeof(struct sfw_session), GFP_NOFS); - if (!sn) { - CERROR("dropping RPC mksn under memory pressure\n"); - return -ENOMEM; - } - - sfw_init_session(sn, request->mksn_sid, - msg->msg_ses_feats, &request->mksn_name[0]); - - spin_lock(&sfw_data.fw_lock); - - sfw_deactivate_session(); - LASSERT(!sfw_data.fw_session); - sfw_data.fw_session = sn; - - spin_unlock(&sfw_data.fw_lock); - - reply->mksn_status = 0; - reply->mksn_sid = sn->sn_id; - reply->mksn_timeout = sn->sn_timeout; - return 0; -} - -static int -sfw_remove_session(struct srpc_rmsn_reqst *request, - struct srpc_rmsn_reply *reply) -{ - struct sfw_session *sn = sfw_data.fw_session; - - reply->rmsn_sid = !sn ? LST_INVALID_SID : sn->sn_id; - - if (request->rmsn_sid.ses_nid == LNET_NID_ANY) { - reply->rmsn_status = EINVAL; - return 0; - } - - if (!sn || !sfw_sid_equal(request->rmsn_sid, sn->sn_id)) { - reply->rmsn_status = !sn ? ESRCH : EBUSY; - return 0; - } - - if (!atomic_dec_and_test(&sn->sn_refcount)) { - reply->rmsn_status = 0; - return 0; - } - - spin_lock(&sfw_data.fw_lock); - sfw_deactivate_session(); - spin_unlock(&sfw_data.fw_lock); - - reply->rmsn_status = 0; - reply->rmsn_sid = LST_INVALID_SID; - LASSERT(!sfw_data.fw_session); - return 0; -} - -static int -sfw_debug_session(struct srpc_debug_reqst *request, - struct srpc_debug_reply *reply) -{ - struct sfw_session *sn = sfw_data.fw_session; - - if (!sn) { - reply->dbg_status = ESRCH; - reply->dbg_sid = LST_INVALID_SID; - return 0; - } - - reply->dbg_status = 0; - reply->dbg_sid = sn->sn_id; - reply->dbg_timeout = sn->sn_timeout; - if (strlcpy(reply->dbg_name, &sn->sn_name[0], sizeof(reply->dbg_name)) - >= sizeof(reply->dbg_name)) - return -E2BIG; - - return 0; -} - -static void -sfw_test_rpc_fini(struct srpc_client_rpc *rpc) -{ - struct sfw_test_unit *tsu = rpc->crpc_priv; - struct sfw_test_instance *tsi = tsu->tsu_instance; - - /* Called with hold of tsi->tsi_lock */ - LASSERT(list_empty(&rpc->crpc_list)); - list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); -} - -static inline int -sfw_test_buffers(struct sfw_test_instance *tsi) -{ - struct sfw_test_case *tsc; - struct srpc_service *svc; - int nbuf; - - LASSERT(tsi); - tsc = sfw_find_test_case(tsi->tsi_service); - LASSERT(tsc); - svc = tsc->tsc_srv_service; - LASSERT(svc); - - nbuf = min(svc->sv_wi_total, tsi->tsi_loop) / svc->sv_ncpts; - return max(SFW_TEST_WI_MIN, nbuf + SFW_TEST_WI_EXTRA); -} - -static int -sfw_load_test(struct sfw_test_instance *tsi) -{ - struct sfw_test_case *tsc; - struct srpc_service *svc; - int nbuf; - int rc; - - LASSERT(tsi); - tsc = sfw_find_test_case(tsi->tsi_service); - nbuf = sfw_test_buffers(tsi); - LASSERT(tsc); - svc = tsc->tsc_srv_service; - - if (tsi->tsi_is_client) { - tsi->tsi_ops = tsc->tsc_cli_ops; - return 0; - } - - rc = srpc_service_add_buffers(svc, nbuf); - if (rc) { - CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n", - svc->sv_name, nbuf, rc); - /* - * NB: this error handler is not strictly correct, because - * it may release more buffers than already allocated, - * but it doesn't matter because request portal should - * be lazy portal and will grow buffers if necessary. - */ - srpc_service_remove_buffers(svc, nbuf); - return -ENOMEM; - } - - CDEBUG(D_NET, "Reserved %d buffers for test %s\n", - nbuf * (srpc_serv_is_framework(svc) ? 
- 2 : cfs_cpt_number(cfs_cpt_tab)), svc->sv_name); - return 0; -} - -static void -sfw_unload_test(struct sfw_test_instance *tsi) -{ - struct sfw_test_case *tsc; - - LASSERT(tsi); - tsc = sfw_find_test_case(tsi->tsi_service); - LASSERT(tsc); - - if (tsi->tsi_is_client) - return; - - /* - * shrink buffers, because request portal is lazy portal - * which can grow buffers at runtime so we may leave - * some buffers behind, but never mind... - */ - srpc_service_remove_buffers(tsc->tsc_srv_service, - sfw_test_buffers(tsi)); -} - -static void -sfw_destroy_test_instance(struct sfw_test_instance *tsi) -{ - struct srpc_client_rpc *rpc; - struct sfw_test_unit *tsu; - - if (!tsi->tsi_is_client) - goto clean; - - tsi->tsi_ops->tso_fini(tsi); - - LASSERT(!tsi->tsi_stopping); - LASSERT(list_empty(&tsi->tsi_active_rpcs)); - LASSERT(!sfw_test_active(tsi)); - - while (!list_empty(&tsi->tsi_units)) { - tsu = list_entry(tsi->tsi_units.next, - struct sfw_test_unit, tsu_list); - list_del(&tsu->tsu_list); - kfree(tsu); - } - - while (!list_empty(&tsi->tsi_free_rpcs)) { - rpc = list_entry(tsi->tsi_free_rpcs.next, - struct srpc_client_rpc, crpc_list); - list_del(&rpc->crpc_list); - kfree(rpc); - } - -clean: - sfw_unload_test(tsi); - kfree(tsi); -} - -static void -sfw_destroy_batch(struct sfw_batch *tsb) -{ - struct sfw_test_instance *tsi; - - LASSERT(!sfw_batch_active(tsb)); - LASSERT(list_empty(&tsb->bat_list)); - - while (!list_empty(&tsb->bat_tests)) { - tsi = list_entry(tsb->bat_tests.next, - struct sfw_test_instance, tsi_list); - list_del_init(&tsi->tsi_list); - sfw_destroy_test_instance(tsi); - } - - kfree(tsb); -} - -void -sfw_destroy_session(struct sfw_session *sn) -{ - struct sfw_batch *batch; - - LASSERT(list_empty(&sn->sn_list)); - LASSERT(sn != sfw_data.fw_session); - - while (!list_empty(&sn->sn_batches)) { - batch = list_entry(sn->sn_batches.next, - struct sfw_batch, bat_list); - list_del_init(&batch->bat_list); - sfw_destroy_batch(batch); - } - - kfree(sn); - atomic_dec(&sfw_data.fw_nzombies); -} - -static void -sfw_unpack_addtest_req(struct srpc_msg *msg) -{ - struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; - - LASSERT(msg->msg_type == SRPC_MSG_TEST_REQST); - LASSERT(req->tsr_is_client); - - if (msg->msg_magic == SRPC_MSG_MAGIC) - return; /* no flipping needed */ - - LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); - - if (req->tsr_service == SRPC_SERVICE_BRW) { - if (!(msg->msg_ses_feats & LST_FEAT_BULK_LEN)) { - struct test_bulk_req *bulk = &req->tsr_u.bulk_v0; - - __swab32s(&bulk->blk_opc); - __swab32s(&bulk->blk_npg); - __swab32s(&bulk->blk_flags); - - } else { - struct test_bulk_req_v1 *bulk = &req->tsr_u.bulk_v1; - - __swab16s(&bulk->blk_opc); - __swab16s(&bulk->blk_flags); - __swab32s(&bulk->blk_offset); - __swab32s(&bulk->blk_len); - } - - return; - } - - if (req->tsr_service == SRPC_SERVICE_PING) { - struct test_ping_req *ping = &req->tsr_u.ping; - - __swab32s(&ping->png_size); - __swab32s(&ping->png_flags); - return; - } - - LBUG(); -} - -static int -sfw_add_test_instance(struct sfw_batch *tsb, struct srpc_server_rpc *rpc) -{ - struct srpc_msg *msg = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; - struct srpc_bulk *bk = rpc->srpc_bulk; - int ndest = req->tsr_ndest; - struct sfw_test_unit *tsu; - struct sfw_test_instance *tsi; - int i; - int rc; - - tsi = kzalloc(sizeof(*tsi), GFP_NOFS); - if (!tsi) { - CERROR("Can't allocate test instance for batch: %llu\n", - tsb->bat_id.bat_id); - return -ENOMEM; - } - - 
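
sfw_test_rpc_fini() above parks finished client RPCs on tsi_free_rpcs, and sfw_create_test_rpc() (further below) reuses a parked wrapper before allocating a new one. A minimal, lock-free sketch of that free-list reuse, with invented names in place of the srpc_client_rpc machinery:

#include <stdlib.h>

struct rpc {
	struct rpc *next;
	/* ... request/reply payload ... */
};

static struct rpc *free_rpcs;	/* parked, reusable wrappers */

static struct rpc *rpc_get(void)
{
	struct rpc *r = free_rpcs;

	if (r) {			/* reuse before allocating */
		free_rpcs = r->next;
		return r;
	}
	return calloc(1, sizeof(*r));	/* free list empty: fresh one */
}

static void rpc_put(struct rpc *r)
{
	r->next = free_rpcs;		/* park for the next caller */
	free_rpcs = r;
}
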
spin_lock_init(&tsi->tsi_lock); - atomic_set(&tsi->tsi_nactive, 0); - INIT_LIST_HEAD(&tsi->tsi_units); - INIT_LIST_HEAD(&tsi->tsi_free_rpcs); - INIT_LIST_HEAD(&tsi->tsi_active_rpcs); - - tsi->tsi_stopping = 0; - tsi->tsi_batch = tsb; - tsi->tsi_loop = req->tsr_loop; - tsi->tsi_concur = req->tsr_concur; - tsi->tsi_service = req->tsr_service; - tsi->tsi_is_client = !!(req->tsr_is_client); - tsi->tsi_stoptsu_onerr = !!(req->tsr_stop_onerr); - - rc = sfw_load_test(tsi); - if (rc) { - kfree(tsi); - return rc; - } - - LASSERT(!sfw_batch_active(tsb)); - - if (!tsi->tsi_is_client) { - /* it's test server, just add it to tsb */ - list_add_tail(&tsi->tsi_list, &tsb->bat_tests); - return 0; - } - - LASSERT(bk); - LASSERT(bk->bk_niov * SFW_ID_PER_PAGE >= (unsigned int)ndest); - LASSERT((unsigned int)bk->bk_len >= - sizeof(struct lnet_process_id_packed) * ndest); - - sfw_unpack_addtest_req(msg); - memcpy(&tsi->tsi_u, &req->tsr_u, sizeof(tsi->tsi_u)); - - for (i = 0; i < ndest; i++) { - struct lnet_process_id_packed *dests; - struct lnet_process_id_packed id; - int j; - - dests = page_address(bk->bk_iovs[i / SFW_ID_PER_PAGE].bv_page); - LASSERT(dests); /* my pages are within KVM always */ - id = dests[i % SFW_ID_PER_PAGE]; - if (msg->msg_magic != SRPC_MSG_MAGIC) - sfw_unpack_id(id); - - for (j = 0; j < tsi->tsi_concur; j++) { - tsu = kzalloc(sizeof(struct sfw_test_unit), GFP_NOFS); - if (!tsu) { - rc = -ENOMEM; - CERROR("Can't allocate tsu for %d\n", - tsi->tsi_service); - goto error; - } - - tsu->tsu_dest.nid = id.nid; - tsu->tsu_dest.pid = id.pid; - tsu->tsu_instance = tsi; - tsu->tsu_private = NULL; - list_add_tail(&tsu->tsu_list, &tsi->tsi_units); - } - } - - rc = tsi->tsi_ops->tso_init(tsi); - if (!rc) { - list_add_tail(&tsi->tsi_list, &tsb->bat_tests); - return 0; - } - -error: - LASSERT(rc); - sfw_destroy_test_instance(tsi); - return rc; -} - -static void -sfw_test_unit_done(struct sfw_test_unit *tsu) -{ - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_batch *tsb = tsi->tsi_batch; - struct sfw_session *sn = tsb->bat_session; - - LASSERT(sfw_test_active(tsi)); - - if (!atomic_dec_and_test(&tsi->tsi_nactive)) - return; - - /* the test instance is done */ - spin_lock(&tsi->tsi_lock); - - tsi->tsi_stopping = 0; - - spin_unlock(&tsi->tsi_lock); - - spin_lock(&sfw_data.fw_lock); - - if (!atomic_dec_and_test(&tsb->bat_nactive) || /* tsb still active */ - sn == sfw_data.fw_session) { /* sn also active */ - spin_unlock(&sfw_data.fw_lock); - return; - } - - LASSERT(!list_empty(&sn->sn_list)); /* I'm a zombie! 
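(i.e. a deactivated session parked on fw_zombie_sessions until its last running batch completes)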
*/ - - list_for_each_entry(tsb, &sn->sn_batches, bat_list) { - if (sfw_batch_active(tsb)) { - spin_unlock(&sfw_data.fw_lock); - return; - } - } - - list_del_init(&sn->sn_list); - spin_unlock(&sfw_data.fw_lock); - - sfw_destroy_session(sn); -} - -static void -sfw_test_rpc_done(struct srpc_client_rpc *rpc) -{ - struct sfw_test_unit *tsu = rpc->crpc_priv; - struct sfw_test_instance *tsi = tsu->tsu_instance; - int done = 0; - - tsi->tsi_ops->tso_done_rpc(tsu, rpc); - - spin_lock(&tsi->tsi_lock); - - LASSERT(sfw_test_active(tsi)); - LASSERT(!list_empty(&rpc->crpc_list)); - - list_del_init(&rpc->crpc_list); - - /* batch is stopping or loop is done or get error */ - if (tsi->tsi_stopping || !tsu->tsu_loop || - (rpc->crpc_status && tsi->tsi_stoptsu_onerr)) - done = 1; - - /* dec ref for poster */ - srpc_client_rpc_decref(rpc); - - spin_unlock(&tsi->tsi_lock); - - if (!done) { - swi_schedule_workitem(&tsu->tsu_worker); - return; - } - - sfw_test_unit_done(tsu); -} - -int -sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, - unsigned int features, int nblk, int blklen, - struct srpc_client_rpc **rpcpp) -{ - struct srpc_client_rpc *rpc = NULL; - struct sfw_test_instance *tsi = tsu->tsu_instance; - - spin_lock(&tsi->tsi_lock); - - LASSERT(sfw_test_active(tsi)); - /* pick request from buffer */ - rpc = list_first_entry_or_null(&tsi->tsi_free_rpcs, - struct srpc_client_rpc, crpc_list); - if (rpc) { - LASSERT(nblk == rpc->crpc_bulk.bk_niov); - list_del_init(&rpc->crpc_list); - } - - spin_unlock(&tsi->tsi_lock); - - if (!rpc) { - rpc = srpc_create_client_rpc(peer, tsi->tsi_service, nblk, - blklen, sfw_test_rpc_done, - sfw_test_rpc_fini, tsu); - } else { - srpc_init_client_rpc(rpc, peer, tsi->tsi_service, nblk, - blklen, sfw_test_rpc_done, - sfw_test_rpc_fini, tsu); - } - - if (!rpc) { - CERROR("Can't create rpc for test %d\n", tsi->tsi_service); - return -ENOMEM; - } - - rpc->crpc_reqstmsg.msg_ses_feats = features; - *rpcpp = rpc; - - return 0; -} - -static void -sfw_run_test(struct swi_workitem *wi) -{ - struct sfw_test_unit *tsu = container_of(wi, struct sfw_test_unit, tsu_worker); - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct srpc_client_rpc *rpc = NULL; - - if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc)) { - LASSERT(!rpc); - goto test_done; - } - - LASSERT(rpc); - - spin_lock(&tsi->tsi_lock); - - if (tsi->tsi_stopping) { - list_add(&rpc->crpc_list, &tsi->tsi_free_rpcs); - spin_unlock(&tsi->tsi_lock); - goto test_done; - } - - if (tsu->tsu_loop > 0) - tsu->tsu_loop--; - - list_add_tail(&rpc->crpc_list, &tsi->tsi_active_rpcs); - spin_unlock(&tsi->tsi_lock); - - spin_lock(&rpc->crpc_lock); - rpc->crpc_timeout = rpc_timeout; - srpc_post_rpc(rpc); - spin_unlock(&rpc->crpc_lock); - return; - -test_done: - /* - * No one can schedule me now since: - * - previous RPC, if any, has done and - * - no new RPC is initiated. - * - my batch is still active; no one can run it again now. 
- * Cancel pending schedules and prevent future schedule attempts: - */ - sfw_test_unit_done(tsu); -} - -static int -sfw_run_batch(struct sfw_batch *tsb) -{ - struct swi_workitem *wi; - struct sfw_test_unit *tsu; - struct sfw_test_instance *tsi; - - if (sfw_batch_active(tsb)) { - CDEBUG(D_NET, "Batch already active: %llu (%d)\n", - tsb->bat_id.bat_id, atomic_read(&tsb->bat_nactive)); - return 0; - } - - list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { - if (!tsi->tsi_is_client) /* skip server instances */ - continue; - - LASSERT(!tsi->tsi_stopping); - LASSERT(!sfw_test_active(tsi)); - - atomic_inc(&tsb->bat_nactive); - - list_for_each_entry(tsu, &tsi->tsi_units, tsu_list) { - atomic_inc(&tsi->tsi_nactive); - tsu->tsu_loop = tsi->tsi_loop; - wi = &tsu->tsu_worker; - swi_init_workitem(wi, sfw_run_test, - lst_test_wq[lnet_cpt_of_nid(tsu->tsu_dest.nid)]); - swi_schedule_workitem(wi); - } - } - - return 0; -} - -int -sfw_stop_batch(struct sfw_batch *tsb, int force) -{ - struct sfw_test_instance *tsi; - struct srpc_client_rpc *rpc; - - if (!sfw_batch_active(tsb)) { - CDEBUG(D_NET, "Batch %llu inactive\n", tsb->bat_id.bat_id); - return 0; - } - - list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { - spin_lock(&tsi->tsi_lock); - - if (!tsi->tsi_is_client || - !sfw_test_active(tsi) || tsi->tsi_stopping) { - spin_unlock(&tsi->tsi_lock); - continue; - } - - tsi->tsi_stopping = 1; - - if (!force) { - spin_unlock(&tsi->tsi_lock); - continue; - } - - /* abort launched rpcs in the test */ - list_for_each_entry(rpc, &tsi->tsi_active_rpcs, crpc_list) { - spin_lock(&rpc->crpc_lock); - - srpc_abort_rpc(rpc, -EINTR); - - spin_unlock(&rpc->crpc_lock); - } - - spin_unlock(&tsi->tsi_lock); - } - - return 0; -} - -static int -sfw_query_batch(struct sfw_batch *tsb, int testidx, - struct srpc_batch_reply *reply) -{ - struct sfw_test_instance *tsi; - - if (testidx < 0) - return -EINVAL; - - if (!testidx) { - reply->bar_active = atomic_read(&tsb->bat_nactive); - return 0; - } - - list_for_each_entry(tsi, &tsb->bat_tests, tsi_list) { - if (testidx-- > 1) - continue; - - reply->bar_active = atomic_read(&tsi->tsi_nactive); - return 0; - } - - return -ENOENT; -} - -void -sfw_free_pages(struct srpc_server_rpc *rpc) -{ - srpc_free_bulk(rpc->srpc_bulk); - rpc->srpc_bulk = NULL; -} - -int -sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, - int sink) -{ - LASSERT(!rpc->srpc_bulk); - LASSERT(npages > 0 && npages <= LNET_MAX_IOV); - - rpc->srpc_bulk = srpc_alloc_bulk(cpt, 0, npages, len, sink); - if (!rpc->srpc_bulk) - return -ENOMEM; - - return 0; -} - -static int -sfw_add_test(struct srpc_server_rpc *rpc) -{ - struct sfw_session *sn = sfw_data.fw_session; - struct srpc_test_reply *reply = &rpc->srpc_replymsg.msg_body.tes_reply; - struct srpc_test_reqst *request; - int rc; - struct sfw_batch *bat; - - request = &rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst; - reply->tsr_sid = !sn ? 
LST_INVALID_SID : sn->sn_id; - - if (!request->tsr_loop || - !request->tsr_concur || - request->tsr_sid.ses_nid == LNET_NID_ANY || - request->tsr_ndest > SFW_MAX_NDESTS || - (request->tsr_is_client && !request->tsr_ndest) || - request->tsr_concur > SFW_MAX_CONCUR || - request->tsr_service > SRPC_SERVICE_MAX_ID || - request->tsr_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID) { - reply->tsr_status = EINVAL; - return 0; - } - - if (!sn || !sfw_sid_equal(request->tsr_sid, sn->sn_id) || - !sfw_find_test_case(request->tsr_service)) { - reply->tsr_status = ENOENT; - return 0; - } - - bat = sfw_bid2batch(request->tsr_bid); - if (!bat) { - CERROR("dropping RPC %s from %s under memory pressure\n", - rpc->srpc_scd->scd_svc->sv_name, - libcfs_id2str(rpc->srpc_peer)); - return -ENOMEM; - } - - if (sfw_batch_active(bat)) { - reply->tsr_status = EBUSY; - return 0; - } - - if (request->tsr_is_client && !rpc->srpc_bulk) { - /* rpc will be resumed later in sfw_bulk_ready */ - int npg = sfw_id_pages(request->tsr_ndest); - int len; - - if (!(sn->sn_features & LST_FEAT_BULK_LEN)) { - len = npg * PAGE_SIZE; - - } else { - len = sizeof(struct lnet_process_id_packed) * - request->tsr_ndest; - } - - return sfw_alloc_pages(rpc, CFS_CPT_ANY, npg, len, 1); - } - - rc = sfw_add_test_instance(bat, rpc); - CDEBUG(!rc ? D_NET : D_WARNING, - "%s test: sv %d %s, loop %d, concur %d, ndest %d\n", - !rc ? "Added" : "Failed to add", request->tsr_service, - request->tsr_is_client ? "client" : "server", - request->tsr_loop, request->tsr_concur, request->tsr_ndest); - - reply->tsr_status = (rc < 0) ? -rc : rc; - return 0; -} - -static int -sfw_control_batch(struct srpc_batch_reqst *request, - struct srpc_batch_reply *reply) -{ - struct sfw_session *sn = sfw_data.fw_session; - int rc = 0; - struct sfw_batch *bat; - - reply->bar_sid = !sn ? LST_INVALID_SID : sn->sn_id; - - if (!sn || !sfw_sid_equal(request->bar_sid, sn->sn_id)) { - reply->bar_status = ESRCH; - return 0; - } - - bat = sfw_find_batch(request->bar_bid); - if (!bat) { - reply->bar_status = ENOENT; - return 0; - } - - switch (request->bar_opc) { - case SRPC_BATCH_OPC_RUN: - rc = sfw_run_batch(bat); - break; - - case SRPC_BATCH_OPC_STOP: - rc = sfw_stop_batch(bat, request->bar_arg); - break; - - case SRPC_BATCH_OPC_QUERY: - rc = sfw_query_batch(bat, request->bar_testidx, reply); - break; - - default: - return -EINVAL; /* drop it */ - } - - reply->bar_status = (rc < 0) ? 
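/* errno values are carried as positive numbers in selftest replies */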
-rc : rc; - return 0; -} - -static int -sfw_handle_server_rpc(struct srpc_server_rpc *rpc) -{ - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - struct srpc_msg *reply = &rpc->srpc_replymsg; - struct srpc_msg *request = &rpc->srpc_reqstbuf->buf_msg; - unsigned int features = LST_FEATS_MASK; - int rc = 0; - - LASSERT(!sfw_data.fw_active_srpc); - LASSERT(sv->sv_id <= SRPC_FRAMEWORK_SERVICE_MAX_ID); - - spin_lock(&sfw_data.fw_lock); - - if (sfw_data.fw_shuttingdown) { - spin_unlock(&sfw_data.fw_lock); - return -ESHUTDOWN; - } - - /* Remove timer to avoid racing with it or expiring active session */ - if (sfw_del_session_timer()) { - CERROR("dropping RPC %s from %s: racing with expiry timer\n", - sv->sv_name, libcfs_id2str(rpc->srpc_peer)); - spin_unlock(&sfw_data.fw_lock); - return -EAGAIN; - } - - sfw_data.fw_active_srpc = rpc; - spin_unlock(&sfw_data.fw_lock); - - sfw_unpack_message(request); - LASSERT(request->msg_type == srpc_service2request(sv->sv_id)); - - /* rpc module should have checked this */ - LASSERT(request->msg_version == SRPC_MSG_VERSION); - - if (sv->sv_id != SRPC_SERVICE_MAKE_SESSION && - sv->sv_id != SRPC_SERVICE_DEBUG) { - struct sfw_session *sn = sfw_data.fw_session; - - if (sn && - sn->sn_features != request->msg_ses_feats) { - CNETERR("Features of framework RPC don't match features of current session: %x/%x\n", - request->msg_ses_feats, sn->sn_features); - reply->msg_body.reply.status = EPROTO; - reply->msg_body.reply.sid = sn->sn_id; - goto out; - } - - } else if (request->msg_ses_feats & ~LST_FEATS_MASK) { - /* - * NB: at this point, old version will ignore features and - * create new session anyway, so console should be able - * to handle this - */ - reply->msg_body.reply.status = EPROTO; - goto out; - } - - switch (sv->sv_id) { - default: - LBUG(); - case SRPC_SERVICE_TEST: - rc = sfw_add_test(rpc); - break; - - case SRPC_SERVICE_BATCH: - rc = sfw_control_batch(&request->msg_body.bat_reqst, - &reply->msg_body.bat_reply); - break; - - case SRPC_SERVICE_QUERY_STAT: - rc = sfw_get_stats(&request->msg_body.stat_reqst, - &reply->msg_body.stat_reply); - break; - - case SRPC_SERVICE_DEBUG: - rc = sfw_debug_session(&request->msg_body.dbg_reqst, - &reply->msg_body.dbg_reply); - break; - - case SRPC_SERVICE_MAKE_SESSION: - rc = sfw_make_session(&request->msg_body.mksn_reqst, - &reply->msg_body.mksn_reply); - break; - - case SRPC_SERVICE_REMOVE_SESSION: - rc = sfw_remove_session(&request->msg_body.rmsn_reqst, - &reply->msg_body.rmsn_reply); - break; - } - - if (sfw_data.fw_session) - features = sfw_data.fw_session->sn_features; - out: - reply->msg_ses_feats = features; - rpc->srpc_done = sfw_server_rpc_done; - spin_lock(&sfw_data.fw_lock); - - if (!sfw_data.fw_shuttingdown) - sfw_add_session_timer(); - - sfw_data.fw_active_srpc = NULL; - spin_unlock(&sfw_data.fw_lock); - return rc; -} - -static int -sfw_bulk_ready(struct srpc_server_rpc *rpc, int status) -{ - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - int rc; - - LASSERT(rpc->srpc_bulk); - LASSERT(sv->sv_id == SRPC_SERVICE_TEST); - LASSERT(!sfw_data.fw_active_srpc); - LASSERT(rpc->srpc_reqstbuf->buf_msg.msg_body.tes_reqst.tsr_is_client); - - spin_lock(&sfw_data.fw_lock); - - if (status) { - CERROR("Bulk transfer failed for RPC: service %s, peer %s, status %d\n", - sv->sv_name, libcfs_id2str(rpc->srpc_peer), status); - spin_unlock(&sfw_data.fw_lock); - return -EIO; - } - - if (sfw_data.fw_shuttingdown) { - spin_unlock(&sfw_data.fw_lock); - return -ESHUTDOWN; - } - - if (sfw_del_session_timer()) { - CERROR("dropping 
RPC %s from %s: racing with expiry timer\n", - sv->sv_name, libcfs_id2str(rpc->srpc_peer)); - spin_unlock(&sfw_data.fw_lock); - return -EAGAIN; - } - - sfw_data.fw_active_srpc = rpc; - spin_unlock(&sfw_data.fw_lock); - - rc = sfw_add_test(rpc); - - spin_lock(&sfw_data.fw_lock); - - if (!sfw_data.fw_shuttingdown) - sfw_add_session_timer(); - - sfw_data.fw_active_srpc = NULL; - spin_unlock(&sfw_data.fw_lock); - return rc; -} - -struct srpc_client_rpc * -sfw_create_rpc(struct lnet_process_id peer, int service, - unsigned int features, int nbulkiov, int bulklen, - void (*done)(struct srpc_client_rpc *), void *priv) -{ - struct srpc_client_rpc *rpc = NULL; - - spin_lock(&sfw_data.fw_lock); - - LASSERT(!sfw_data.fw_shuttingdown); - LASSERT(service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); - - if (!nbulkiov && !list_empty(&sfw_data.fw_zombie_rpcs)) { - rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - struct srpc_client_rpc, crpc_list); - list_del(&rpc->crpc_list); - - srpc_init_client_rpc(rpc, peer, service, 0, 0, - done, sfw_client_rpc_fini, priv); - } - - spin_unlock(&sfw_data.fw_lock); - - if (!rpc) { - rpc = srpc_create_client_rpc(peer, service, - nbulkiov, bulklen, done, - nbulkiov ? NULL : - sfw_client_rpc_fini, - priv); - } - - if (rpc) /* "session" is concept in framework */ - rpc->crpc_reqstmsg.msg_ses_feats = features; - - return rpc; -} - -void -sfw_unpack_message(struct srpc_msg *msg) -{ - if (msg->msg_magic == SRPC_MSG_MAGIC) - return; /* no flipping needed */ - - /* srpc module should guarantee I wouldn't get crap */ - LASSERT(msg->msg_magic == __swab32(SRPC_MSG_MAGIC)); - - if (msg->msg_type == SRPC_MSG_STAT_REQST) { - struct srpc_stat_reqst *req = &msg->msg_body.stat_reqst; - - __swab32s(&req->str_type); - __swab64s(&req->str_rpyid); - sfw_unpack_sid(req->str_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_STAT_REPLY) { - struct srpc_stat_reply *rep = &msg->msg_body.stat_reply; - - __swab32s(&rep->str_status); - sfw_unpack_sid(rep->str_sid); - sfw_unpack_fw_counters(rep->str_fw); - sfw_unpack_rpc_counters(rep->str_rpc); - sfw_unpack_lnet_counters(rep->str_lnet); - return; - } - - if (msg->msg_type == SRPC_MSG_MKSN_REQST) { - struct srpc_mksn_reqst *req = &msg->msg_body.mksn_reqst; - - __swab64s(&req->mksn_rpyid); - __swab32s(&req->mksn_force); - sfw_unpack_sid(req->mksn_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_MKSN_REPLY) { - struct srpc_mksn_reply *rep = &msg->msg_body.mksn_reply; - - __swab32s(&rep->mksn_status); - __swab32s(&rep->mksn_timeout); - sfw_unpack_sid(rep->mksn_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_RMSN_REQST) { - struct srpc_rmsn_reqst *req = &msg->msg_body.rmsn_reqst; - - __swab64s(&req->rmsn_rpyid); - sfw_unpack_sid(req->rmsn_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_RMSN_REPLY) { - struct srpc_rmsn_reply *rep = &msg->msg_body.rmsn_reply; - - __swab32s(&rep->rmsn_status); - sfw_unpack_sid(rep->rmsn_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_DEBUG_REQST) { - struct srpc_debug_reqst *req = &msg->msg_body.dbg_reqst; - - __swab64s(&req->dbg_rpyid); - __swab32s(&req->dbg_flags); - sfw_unpack_sid(req->dbg_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_DEBUG_REPLY) { - struct srpc_debug_reply *rep = &msg->msg_body.dbg_reply; - - __swab32s(&rep->dbg_nbatch); - __swab32s(&rep->dbg_timeout); - sfw_unpack_sid(rep->dbg_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_BATCH_REQST) { - struct srpc_batch_reqst *req = &msg->msg_body.bat_reqst; - - __swab32s(&req->bar_opc); - __swab64s(&req->bar_rpyid); - 
__swab32s(&req->bar_testidx); - __swab32s(&req->bar_arg); - sfw_unpack_sid(req->bar_sid); - __swab64s(&req->bar_bid.bat_id); - return; - } - - if (msg->msg_type == SRPC_MSG_BATCH_REPLY) { - struct srpc_batch_reply *rep = &msg->msg_body.bat_reply; - - __swab32s(&rep->bar_status); - sfw_unpack_sid(rep->bar_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_TEST_REQST) { - struct srpc_test_reqst *req = &msg->msg_body.tes_reqst; - - __swab64s(&req->tsr_rpyid); - __swab64s(&req->tsr_bulkid); - __swab32s(&req->tsr_loop); - __swab32s(&req->tsr_ndest); - __swab32s(&req->tsr_concur); - __swab32s(&req->tsr_service); - sfw_unpack_sid(req->tsr_sid); - __swab64s(&req->tsr_bid.bat_id); - return; - } - - if (msg->msg_type == SRPC_MSG_TEST_REPLY) { - struct srpc_test_reply *rep = &msg->msg_body.tes_reply; - - __swab32s(&rep->tsr_status); - sfw_unpack_sid(rep->tsr_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_JOIN_REQST) { - struct srpc_join_reqst *req = &msg->msg_body.join_reqst; - - __swab64s(&req->join_rpyid); - sfw_unpack_sid(req->join_sid); - return; - } - - if (msg->msg_type == SRPC_MSG_JOIN_REPLY) { - struct srpc_join_reply *rep = &msg->msg_body.join_reply; - - __swab32s(&rep->join_status); - __swab32s(&rep->join_timeout); - sfw_unpack_sid(rep->join_sid); - return; - } - - LBUG(); -} - -void -sfw_abort_rpc(struct srpc_client_rpc *rpc) -{ - LASSERT(atomic_read(&rpc->crpc_refcount) > 0); - LASSERT(rpc->crpc_service <= SRPC_FRAMEWORK_SERVICE_MAX_ID); - - spin_lock(&rpc->crpc_lock); - srpc_abort_rpc(rpc, -EINTR); - spin_unlock(&rpc->crpc_lock); -} - -void -sfw_post_rpc(struct srpc_client_rpc *rpc) -{ - spin_lock(&rpc->crpc_lock); - - LASSERT(!rpc->crpc_closed); - LASSERT(!rpc->crpc_aborted); - LASSERT(list_empty(&rpc->crpc_list)); - LASSERT(!sfw_data.fw_shuttingdown); - - rpc->crpc_timeout = rpc_timeout; - srpc_post_rpc(rpc); - - spin_unlock(&rpc->crpc_lock); -} - -static struct srpc_service sfw_services[] = { - { - /* sv_id */ SRPC_SERVICE_DEBUG, - /* sv_name */ "debug", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_QUERY_STAT, - /* sv_name */ "query stats", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_MAKE_SESSION, - /* sv_name */ "make session", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_REMOVE_SESSION, - /* sv_name */ "remove session", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_BATCH, - /* sv_name */ "batch service", - 0 - }, - { - /* sv_id */ SRPC_SERVICE_TEST, - /* sv_name */ "test service", - 0 - }, - { - /* sv_id */ 0, - /* sv_name */ NULL, - 0 - } -}; - -int -sfw_startup(void) -{ - int i; - int rc; - int error; - struct srpc_service *sv; - struct sfw_test_case *tsc; - - if (session_timeout < 0) { - CERROR("Session timeout must be non-negative: %d\n", - session_timeout); - return -EINVAL; - } - - if (rpc_timeout < 0) { - CERROR("RPC timeout must be non-negative: %d\n", - rpc_timeout); - return -EINVAL; - } - - if (!session_timeout) - CWARN("Zero session_timeout specified - test sessions never expire.\n"); - - if (!rpc_timeout) - CWARN("Zero rpc_timeout specified - test RPC never expire.\n"); - - memset(&sfw_data, 0, sizeof(struct smoketest_framework)); - - sfw_data.fw_session = NULL; - sfw_data.fw_active_srpc = NULL; - spin_lock_init(&sfw_data.fw_lock); - atomic_set(&sfw_data.fw_nzombies, 0); - INIT_LIST_HEAD(&sfw_data.fw_tests); - INIT_LIST_HEAD(&sfw_data.fw_zombie_rpcs); - INIT_LIST_HEAD(&sfw_data.fw_zombie_sessions); - - brw_init_test_client(); - brw_init_test_service(); - rc = sfw_register_test(&brw_test_service, &brw_test_client); - LASSERT(!rc); - - ping_init_test_client(); - 
ping_init_test_service(); - rc = sfw_register_test(&ping_test_service, &ping_test_client); - LASSERT(!rc); - - error = 0; - list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { - sv = tsc->tsc_srv_service; - - rc = srpc_add_service(sv); - LASSERT(rc != -EBUSY); - if (rc) { - CWARN("Failed to add %s service: %d\n", - sv->sv_name, rc); - error = rc; - } - } - - for (i = 0; ; i++) { - sv = &sfw_services[i]; - if (!sv->sv_name) - break; - - sv->sv_bulk_ready = NULL; - sv->sv_handler = sfw_handle_server_rpc; - sv->sv_wi_total = SFW_FRWK_WI_MAX; - if (sv->sv_id == SRPC_SERVICE_TEST) - sv->sv_bulk_ready = sfw_bulk_ready; - - rc = srpc_add_service(sv); - LASSERT(rc != -EBUSY); - if (rc) { - CWARN("Failed to add %s service: %d\n", - sv->sv_name, rc); - error = rc; - } - - /* about to sfw_shutdown, no need to add buffer */ - if (error) - continue; - - rc = srpc_service_add_buffers(sv, sv->sv_wi_total); - if (rc) { - CWARN("Failed to reserve enough buffers: service %s, %d needed: %d\n", - sv->sv_name, sv->sv_wi_total, rc); - error = -ENOMEM; - } - } - - if (error) - sfw_shutdown(); - return error; -} - -void -sfw_shutdown(void) -{ - struct srpc_service *sv; - struct sfw_test_case *tsc; - int i; - - spin_lock(&sfw_data.fw_lock); - - sfw_data.fw_shuttingdown = 1; - lst_wait_until(!sfw_data.fw_active_srpc, sfw_data.fw_lock, - "waiting for active RPC to finish.\n"); - - if (sfw_del_session_timer()) - lst_wait_until(!sfw_data.fw_session, sfw_data.fw_lock, - "waiting for session timer to explode.\n"); - - sfw_deactivate_session(); - lst_wait_until(!atomic_read(&sfw_data.fw_nzombies), - sfw_data.fw_lock, - "waiting for %d zombie sessions to die.\n", - atomic_read(&sfw_data.fw_nzombies)); - - spin_unlock(&sfw_data.fw_lock); - - for (i = 0; ; i++) { - sv = &sfw_services[i]; - if (!sv->sv_name) - break; - - srpc_shutdown_service(sv); - srpc_remove_service(sv); - } - - list_for_each_entry(tsc, &sfw_data.fw_tests, tsc_list) { - sv = tsc->tsc_srv_service; - srpc_shutdown_service(sv); - srpc_remove_service(sv); - } - - while (!list_empty(&sfw_data.fw_zombie_rpcs)) { - struct srpc_client_rpc *rpc; - - rpc = list_entry(sfw_data.fw_zombie_rpcs.next, - struct srpc_client_rpc, crpc_list); - list_del(&rpc->crpc_list); - - kfree(rpc); - } - - for (i = 0; ; i++) { - sv = &sfw_services[i]; - if (!sv->sv_name) - break; - - srpc_wait_service_shutdown(sv); - } - - while (!list_empty(&sfw_data.fw_tests)) { - tsc = list_entry(sfw_data.fw_tests.next, - struct sfw_test_case, tsc_list); - - srpc_wait_service_shutdown(tsc->tsc_srv_service); - - list_del(&tsc->tsc_list); - kfree(tsc); - } -} diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c deleted file mode 100644 index 9ba65320f748a96609ae3e056dafb3724d6568fa..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/module.c +++ /dev/null @@ -1,169 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include "selftest.h" -#include "console.h" - -enum { - LST_INIT_NONE = 0, - LST_INIT_WI_SERIAL, - LST_INIT_WI_TEST, - LST_INIT_RPC, - LST_INIT_FW, - LST_INIT_CONSOLE -}; - -static int lst_init_step = LST_INIT_NONE; - -struct workqueue_struct *lst_serial_wq; -struct workqueue_struct **lst_test_wq; - -static void -lnet_selftest_exit(void) -{ - int i; - - switch (lst_init_step) { - case LST_INIT_CONSOLE: - lstcon_console_fini(); - /* fall through */ - case LST_INIT_FW: - sfw_shutdown(); - /* fall through */ - case LST_INIT_RPC: - srpc_shutdown(); - /* fall through */ - case LST_INIT_WI_TEST: - for (i = 0; - i < cfs_cpt_number(lnet_cpt_table()); i++) { - if (!lst_test_wq[i]) - continue; - destroy_workqueue(lst_test_wq[i]); - } - kvfree(lst_test_wq); - lst_test_wq = NULL; - /* fall through */ - case LST_INIT_WI_SERIAL: - destroy_workqueue(lst_serial_wq); - lst_serial_wq = NULL; - case LST_INIT_NONE: - break; - default: - LBUG(); - } -} - -static int -lnet_selftest_init(void) -{ - int nscheds; - int rc; - int i; - - rc = libcfs_setup(); - if (rc) - return rc; - - lst_serial_wq = alloc_ordered_workqueue("lst_s", 0); - if (!lst_serial_wq) { - CERROR("Failed to create serial WI scheduler for LST\n"); - return -ENOMEM; - } - lst_init_step = LST_INIT_WI_SERIAL; - - nscheds = cfs_cpt_number(lnet_cpt_table()); - lst_test_wq = kvmalloc_array(nscheds, sizeof(lst_test_wq[0]), - GFP_KERNEL | __GFP_ZERO); - if (!lst_test_wq) { - rc = -ENOMEM; - goto error; - } - - lst_init_step = LST_INIT_WI_TEST; - for (i = 0; i < nscheds; i++) { - int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); - struct workqueue_attrs attrs = {0}; - cpumask_var_t *mask = cfs_cpt_cpumask(lnet_cpt_table(), i); - - /* reserve at least one CPU for LND */ - nthrs = max(nthrs - 1, 1); - lst_test_wq[i] = alloc_workqueue("lst_t", WQ_UNBOUND, nthrs); - if (!lst_test_wq[i]) { - CWARN("Failed to create CPU partition affinity WI scheduler %d for LST\n", - i); - rc = -ENOMEM; - goto error; - } - - if (mask && alloc_cpumask_var(&attrs.cpumask, GFP_KERNEL)) { - cpumask_copy(attrs.cpumask, *mask); - apply_workqueue_attrs(lst_test_wq[i], &attrs); - free_cpumask_var(attrs.cpumask); - } - } - - rc = srpc_startup(); - if (rc) { - CERROR("LST can't startup rpc\n"); - goto error; - } - lst_init_step = LST_INIT_RPC; - - rc = sfw_startup(); - if (rc) { - CERROR("LST can't startup framework\n"); - goto error; - } - lst_init_step = LST_INIT_FW; - - rc = lstcon_console_init(); - if (rc) { - CERROR("LST can't startup console\n"); - goto error; - } - lst_init_step = LST_INIT_CONSOLE; - return 0; -error: - lnet_selftest_exit(); - return rc; -} - -MODULE_AUTHOR("OpenSFS, Inc. 
"); -MODULE_DESCRIPTION("LNet Selftest"); -MODULE_VERSION("2.7.0"); -MODULE_LICENSE("GPL"); - -module_init(lnet_selftest_init); -module_exit(lnet_selftest_exit); diff --git a/drivers/staging/lustre/lnet/selftest/ping_test.c b/drivers/staging/lustre/lnet/selftest/ping_test.c deleted file mode 100644 index f54bd630dbf86169cbea40d6a283963caedd2429..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/ping_test.c +++ /dev/null @@ -1,228 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/conctl.c - * - * Test client & Server - * - * Author: Liang Zhen - */ - -#include "selftest.h" - -#define LST_PING_TEST_MAGIC 0xbabeface - -static int ping_srv_workitems = SFW_TEST_WI_MAX; -module_param(ping_srv_workitems, int, 0644); -MODULE_PARM_DESC(ping_srv_workitems, "# PING server workitems"); - -struct lst_ping_data { - spinlock_t pnd_lock; /* serialize */ - int pnd_counter; /* sequence counter */ -}; - -static struct lst_ping_data lst_ping_data; - -static int -ping_client_init(struct sfw_test_instance *tsi) -{ - struct sfw_session *sn = tsi->tsi_batch->bat_session; - - LASSERT(tsi->tsi_is_client); - LASSERT(sn && !(sn->sn_features & ~LST_FEATS_MASK)); - - spin_lock_init(&lst_ping_data.pnd_lock); - lst_ping_data.pnd_counter = 0; - - return 0; -} - -static void -ping_client_fini(struct sfw_test_instance *tsi) -{ - struct sfw_session *sn = tsi->tsi_batch->bat_session; - int errors; - - LASSERT(sn); - LASSERT(tsi->tsi_is_client); - - errors = atomic_read(&sn->sn_ping_errors); - if (errors) - CWARN("%d pings have failed.\n", errors); - else - CDEBUG(D_NET, "Ping test finished OK.\n"); -} - -static int -ping_client_prep_rpc(struct sfw_test_unit *tsu, struct lnet_process_id dest, - struct srpc_client_rpc **rpc) -{ - struct srpc_ping_reqst *req; - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct timespec64 ts; - int rc; - - LASSERT(sn); - LASSERT(!(sn->sn_features & ~LST_FEATS_MASK)); - - rc = sfw_create_test_rpc(tsu, dest, sn->sn_features, 0, 0, rpc); - if (rc) - return rc; - - req = &(*rpc)->crpc_reqstmsg.msg_body.ping_reqst; - - req->pnr_magic = LST_PING_TEST_MAGIC; - - spin_lock(&lst_ping_data.pnd_lock); - req->pnr_seq = lst_ping_data.pnd_counter++; - spin_unlock(&lst_ping_data.pnd_lock); - - ktime_get_real_ts64(&ts); - req->pnr_time_sec = ts.tv_sec; - req->pnr_time_usec = ts.tv_nsec / NSEC_PER_USEC; - 
- return rc; -} - -static void -ping_client_done_rpc(struct sfw_test_unit *tsu, struct srpc_client_rpc *rpc) -{ - struct sfw_test_instance *tsi = tsu->tsu_instance; - struct sfw_session *sn = tsi->tsi_batch->bat_session; - struct srpc_ping_reqst *reqst = &rpc->crpc_reqstmsg.msg_body.ping_reqst; - struct srpc_ping_reply *reply = &rpc->crpc_replymsg.msg_body.ping_reply; - struct timespec64 ts; - - LASSERT(sn); - - if (rpc->crpc_status) { - if (!tsi->tsi_stopping) /* rpc could have been aborted */ - atomic_inc(&sn->sn_ping_errors); - CERROR("Unable to ping %s (%d): %d\n", - libcfs_id2str(rpc->crpc_dest), - reqst->pnr_seq, rpc->crpc_status); - return; - } - - if (rpc->crpc_replymsg.msg_magic != SRPC_MSG_MAGIC) { - __swab32s(&reply->pnr_seq); - __swab32s(&reply->pnr_magic); - __swab32s(&reply->pnr_status); - } - - if (reply->pnr_magic != LST_PING_TEST_MAGIC) { - rpc->crpc_status = -EBADMSG; - atomic_inc(&sn->sn_ping_errors); - CERROR("Bad magic %u from %s, %u expected.\n", - reply->pnr_magic, libcfs_id2str(rpc->crpc_dest), - LST_PING_TEST_MAGIC); - return; - } - - if (reply->pnr_seq != reqst->pnr_seq) { - rpc->crpc_status = -EBADMSG; - atomic_inc(&sn->sn_ping_errors); - CERROR("Bad seq %u from %s, %u expected.\n", - reply->pnr_seq, libcfs_id2str(rpc->crpc_dest), - reqst->pnr_seq); - return; - } - - ktime_get_real_ts64(&ts); - CDEBUG(D_NET, "%d reply in %u usec\n", reply->pnr_seq, - (unsigned int)((ts.tv_sec - reqst->pnr_time_sec) * 1000000 + - (ts.tv_nsec / NSEC_PER_USEC - reqst->pnr_time_usec))); -} - -static int -ping_server_handle(struct srpc_server_rpc *rpc) -{ - struct srpc_service *sv = rpc->srpc_scd->scd_svc; - struct srpc_msg *reqstmsg = &rpc->srpc_reqstbuf->buf_msg; - struct srpc_msg *replymsg = &rpc->srpc_replymsg; - struct srpc_ping_reqst *req = &reqstmsg->msg_body.ping_reqst; - struct srpc_ping_reply *rep = &rpc->srpc_replymsg.msg_body.ping_reply; - - LASSERT(sv->sv_id == SRPC_SERVICE_PING); - - if (reqstmsg->msg_magic != SRPC_MSG_MAGIC) { - LASSERT(reqstmsg->msg_magic == __swab32(SRPC_MSG_MAGIC)); - - __swab32s(&req->pnr_seq); - __swab32s(&req->pnr_magic); - __swab64s(&req->pnr_time_sec); - __swab64s(&req->pnr_time_usec); - } - LASSERT(reqstmsg->msg_type == srpc_service2request(sv->sv_id)); - - if (req->pnr_magic != LST_PING_TEST_MAGIC) { - CERROR("Unexpected magic %08x from %s\n", - req->pnr_magic, libcfs_id2str(rpc->srpc_peer)); - return -EINVAL; - } - - rep->pnr_seq = req->pnr_seq; - rep->pnr_magic = LST_PING_TEST_MAGIC; - - if (reqstmsg->msg_ses_feats & ~LST_FEATS_MASK) { - replymsg->msg_ses_feats = LST_FEATS_MASK; - rep->pnr_status = EPROTO; - return 0; - } - - replymsg->msg_ses_feats = reqstmsg->msg_ses_feats; - - CDEBUG(D_NET, "Get ping %d from %s\n", - req->pnr_seq, libcfs_id2str(rpc->srpc_peer)); - return 0; -} - -struct sfw_test_client_ops ping_test_client; - -void ping_init_test_client(void) -{ - ping_test_client.tso_init = ping_client_init; - ping_test_client.tso_fini = ping_client_fini; - ping_test_client.tso_prep_rpc = ping_client_prep_rpc; - ping_test_client.tso_done_rpc = ping_client_done_rpc; -} - -struct srpc_service ping_test_service; - -void ping_init_test_service(void) -{ - ping_test_service.sv_id = SRPC_SERVICE_PING; - ping_test_service.sv_name = "ping_test"; - ping_test_service.sv_handler = ping_server_handle; - ping_test_service.sv_wi_total = ping_srv_workitems; -} diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c deleted file mode 100644 index 
9613b0a77007c32fb834127a7e5d41978f8a7064..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/rpc.c +++ /dev/null @@ -1,1682 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/rpc.c - * - * Author: Isaac Huang - * - * 2012-05-13: Liang Zhen - * - percpt data for service to improve smp performance - * - code cleanup - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include "selftest.h" - -enum srpc_state { - SRPC_STATE_NONE, - SRPC_STATE_NI_INIT, - SRPC_STATE_EQ_INIT, - SRPC_STATE_RUNNING, - SRPC_STATE_STOPPING, -}; - -static struct smoketest_rpc { - spinlock_t rpc_glock; /* global lock */ - struct srpc_service *rpc_services[SRPC_SERVICE_MAX_ID + 1]; - struct lnet_handle_eq rpc_lnet_eq; /* _the_ LNet event queue */ - enum srpc_state rpc_state; - struct srpc_counters rpc_counters; - __u64 rpc_matchbits; /* matchbits counter */ -} srpc_data; - -static inline int -srpc_serv_portal(int svc_id) -{ - return svc_id < SRPC_FRAMEWORK_SERVICE_MAX_ID ? 
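/* framework RPCs use a request portal of their own, separate from test RPCs */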
- SRPC_FRAMEWORK_REQUEST_PORTAL : SRPC_REQUEST_PORTAL; -} - -/* forward ref's */ -void srpc_handle_rpc(struct swi_workitem *wi); - -void srpc_get_counters(struct srpc_counters *cnt) -{ - spin_lock(&srpc_data.rpc_glock); - *cnt = srpc_data.rpc_counters; - spin_unlock(&srpc_data.rpc_glock); -} - -void srpc_set_counters(const struct srpc_counters *cnt) -{ - spin_lock(&srpc_data.rpc_glock); - srpc_data.rpc_counters = *cnt; - spin_unlock(&srpc_data.rpc_glock); -} - -static int -srpc_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i, int off, - int nob) -{ - LASSERT(off < PAGE_SIZE); - LASSERT(nob > 0 && nob <= PAGE_SIZE); - - bk->bk_iovs[i].bv_offset = off; - bk->bk_iovs[i].bv_page = pg; - bk->bk_iovs[i].bv_len = nob; - return nob; -} - -void -srpc_free_bulk(struct srpc_bulk *bk) -{ - int i; - struct page *pg; - - LASSERT(bk); - - for (i = 0; i < bk->bk_niov; i++) { - pg = bk->bk_iovs[i].bv_page; - if (!pg) - break; - - __free_page(pg); - } - - kfree(bk); -} - -struct srpc_bulk * -srpc_alloc_bulk(int cpt, unsigned int bulk_off, unsigned int bulk_npg, - unsigned int bulk_len, int sink) -{ - struct srpc_bulk *bk; - int i; - - LASSERT(bulk_npg > 0 && bulk_npg <= LNET_MAX_IOV); - - bk = kzalloc_cpt(offsetof(struct srpc_bulk, bk_iovs[bulk_npg]), - GFP_KERNEL, cpt); - if (!bk) { - CERROR("Can't allocate descriptor for %d pages\n", bulk_npg); - return NULL; - } - - memset(bk, 0, offsetof(struct srpc_bulk, bk_iovs[bulk_npg])); - bk->bk_sink = sink; - bk->bk_len = bulk_len; - bk->bk_niov = bulk_npg; - - for (i = 0; i < bulk_npg; i++) { - struct page *pg; - int nob; - - pg = alloc_pages_node(cfs_cpt_spread_node(lnet_cpt_table(), cpt), - GFP_KERNEL, 0); - if (!pg) { - CERROR("Can't allocate page %d of %d\n", i, bulk_npg); - srpc_free_bulk(bk); - return NULL; - } - - nob = min_t(unsigned int, bulk_off + bulk_len, PAGE_SIZE) - - bulk_off; - srpc_add_bulk_page(bk, pg, i, bulk_off, nob); - bulk_len -= nob; - bulk_off = 0; - } - - return bk; -} - -static inline __u64 -srpc_next_id(void) -{ - __u64 id; - - spin_lock(&srpc_data.rpc_glock); - id = srpc_data.rpc_matchbits++; - spin_unlock(&srpc_data.rpc_glock); - return id; -} - -static void -srpc_init_server_rpc(struct srpc_server_rpc *rpc, - struct srpc_service_cd *scd, - struct srpc_buffer *buffer) -{ - memset(rpc, 0, sizeof(*rpc)); - swi_init_workitem(&rpc->srpc_wi, srpc_handle_rpc, - srpc_serv_is_framework(scd->scd_svc) ? 
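/* framework RPCs are serialized on the ordered workqueue */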
- lst_serial_wq : lst_test_wq[scd->scd_cpt]); - - rpc->srpc_ev.ev_fired = 1; /* no event expected now */ - - rpc->srpc_scd = scd; - rpc->srpc_reqstbuf = buffer; - rpc->srpc_peer = buffer->buf_peer; - rpc->srpc_self = buffer->buf_self; - LNetInvalidateMDHandle(&rpc->srpc_replymdh); -} - -static void -srpc_service_fini(struct srpc_service *svc) -{ - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - struct srpc_buffer *buf; - struct list_head *q; - int i; - - if (!svc->sv_cpt_data) - return; - - cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { - while (1) { - if (!list_empty(&scd->scd_buf_posted)) - q = &scd->scd_buf_posted; - else if (!list_empty(&scd->scd_buf_blocked)) - q = &scd->scd_buf_blocked; - else - break; - - while (!list_empty(q)) { - buf = list_entry(q->next, struct srpc_buffer, - buf_list); - list_del(&buf->buf_list); - kfree(buf); - } - } - - LASSERT(list_empty(&scd->scd_rpc_active)); - - while (!list_empty(&scd->scd_rpc_free)) { - rpc = list_entry(scd->scd_rpc_free.next, - struct srpc_server_rpc, - srpc_list); - list_del(&rpc->srpc_list); - kfree(rpc); - } - } - - cfs_percpt_free(svc->sv_cpt_data); - svc->sv_cpt_data = NULL; -} - -static int -srpc_service_nrpcs(struct srpc_service *svc) -{ - int nrpcs = svc->sv_wi_total / svc->sv_ncpts; - - return srpc_serv_is_framework(svc) ? - max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); -} - -void srpc_add_buffer(struct swi_workitem *wi); - -static int -srpc_service_init(struct srpc_service *svc) -{ - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int nrpcs; - int i; - int j; - - svc->sv_shuttingdown = 0; - - svc->sv_cpt_data = cfs_percpt_alloc(lnet_cpt_table(), - sizeof(**svc->sv_cpt_data)); - if (!svc->sv_cpt_data) - return -ENOMEM; - - svc->sv_ncpts = srpc_serv_is_framework(svc) ? 
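/* a framework service only ever uses one CPU partition */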
- 1 : cfs_cpt_number(lnet_cpt_table()); - nrpcs = srpc_service_nrpcs(svc); - - cfs_percpt_for_each(scd, i, svc->sv_cpt_data) { - scd->scd_cpt = i; - scd->scd_svc = svc; - spin_lock_init(&scd->scd_lock); - INIT_LIST_HEAD(&scd->scd_rpc_free); - INIT_LIST_HEAD(&scd->scd_rpc_active); - INIT_LIST_HEAD(&scd->scd_buf_posted); - INIT_LIST_HEAD(&scd->scd_buf_blocked); - - scd->scd_ev.ev_data = scd; - scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; - - /* - * NB: don't use lst_serial_wq for adding buffer, - * see details in srpc_service_add_buffers() - */ - swi_init_workitem(&scd->scd_buf_wi, - srpc_add_buffer, lst_test_wq[i]); - - if (i && srpc_serv_is_framework(svc)) { - /* - * NB: framework service only needs srpc_service_cd for - * one partition, but we allocate for all to make - * it easier to implement, it will waste a little - * memory but nobody should care about this - */ - continue; - } - - for (j = 0; j < nrpcs; j++) { - rpc = kzalloc_cpt(sizeof(*rpc), GFP_NOFS, i); - if (!rpc) { - srpc_service_fini(svc); - return -ENOMEM; - } - list_add(&rpc->srpc_list, &scd->scd_rpc_free); - } - } - - return 0; -} - -int -srpc_add_service(struct srpc_service *sv) -{ - int id = sv->sv_id; - - LASSERT(0 <= id && id <= SRPC_SERVICE_MAX_ID); - - if (srpc_service_init(sv)) - return -ENOMEM; - - spin_lock(&srpc_data.rpc_glock); - - LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); - - if (srpc_data.rpc_services[id]) { - spin_unlock(&srpc_data.rpc_glock); - goto failed; - } - - srpc_data.rpc_services[id] = sv; - spin_unlock(&srpc_data.rpc_glock); - - CDEBUG(D_NET, "Adding service: id %d, name %s\n", id, sv->sv_name); - return 0; - - failed: - srpc_service_fini(sv); - return -EBUSY; -} - -int -srpc_remove_service(struct srpc_service *sv) -{ - int id = sv->sv_id; - - spin_lock(&srpc_data.rpc_glock); - - if (srpc_data.rpc_services[id] != sv) { - spin_unlock(&srpc_data.rpc_glock); - return -ENOENT; - } - - srpc_data.rpc_services[id] = NULL; - spin_unlock(&srpc_data.rpc_glock); - return 0; -} - -static int -srpc_post_passive_rdma(int portal, int local, __u64 matchbits, void *buf, - int len, int options, struct lnet_process_id peer, - struct lnet_handle_md *mdh, struct srpc_event *ev) -{ - int rc; - struct lnet_md md; - struct lnet_handle_me meh; - - rc = LNetMEAttach(portal, peer, matchbits, 0, LNET_UNLINK, - local ? LNET_INS_LOCAL : LNET_INS_AFTER, &meh); - if (rc) { - CERROR("LNetMEAttach failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - return -ENOMEM; - } - - md.threshold = 1; - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.options = options; - md.eq_handle = srpc_data.rpc_lnet_eq; - - rc = LNetMDAttach(meh, md, LNET_UNLINK, mdh); - if (rc) { - CERROR("LNetMDAttach failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - - rc = LNetMEUnlink(meh); - LASSERT(!rc); - return -ENOMEM; - } - - CDEBUG(D_NET, "Posted passive RDMA: peer %s, portal %d, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - return 0; -} - -static int -srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, - int options, struct lnet_process_id peer, - lnet_nid_t self, struct lnet_handle_md *mdh, - struct srpc_event *ev) -{ - int rc; - struct lnet_md md; - - md.user_ptr = ev; - md.start = buf; - md.length = len; - md.eq_handle = srpc_data.rpc_lnet_eq; - md.threshold = options & LNET_MD_OP_GET ? 
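/* an active GET fires both SEND and REPLY events on this MD */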
2 : 1; - md.options = options & ~(LNET_MD_OP_PUT | LNET_MD_OP_GET); - - rc = LNetMDBind(md, LNET_UNLINK, mdh); - if (rc) { - CERROR("LNetMDBind failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - return -ENOMEM; - } - - /* - * this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. - * they're only meaningful for MDs attached to an ME (i.e. passive - * buffers... - */ - if (options & LNET_MD_OP_PUT) { - rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, - portal, matchbits, 0, 0); - } else { - LASSERT(options & LNET_MD_OP_GET); - - rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); - } - - if (rc) { - CERROR("LNet%s(%s, %d, %lld) failed: %d\n", - options & LNET_MD_OP_PUT ? "Put" : "Get", - libcfs_id2str(peer), portal, matchbits, rc); - - /* - * The forthcoming unlink event will complete this operation - * with failure, so fall through and return success here. - */ - rc = LNetMDUnlink(*mdh); - LASSERT(!rc); - } else { - CDEBUG(D_NET, "Posted active RDMA: peer %s, portal %u, matchbits %#llx\n", - libcfs_id2str(peer), portal, matchbits); - } - return 0; -} - -static int -srpc_post_passive_rqtbuf(int service, int local, void *buf, int len, - struct lnet_handle_md *mdh, struct srpc_event *ev) -{ - struct lnet_process_id any = { 0 }; - - any.nid = LNET_NID_ANY; - any.pid = LNET_PID_ANY; - - return srpc_post_passive_rdma(srpc_serv_portal(service), - local, service, buf, len, - LNET_MD_OP_PUT, any, mdh, ev); -} - -static int -srpc_service_post_buffer(struct srpc_service_cd *scd, struct srpc_buffer *buf) -__must_hold(&scd->scd_lock) -{ - struct srpc_service *sv = scd->scd_svc; - struct srpc_msg *msg = &buf->buf_msg; - int rc; - - LNetInvalidateMDHandle(&buf->buf_mdh); - list_add(&buf->buf_list, &scd->scd_buf_posted); - scd->scd_buf_nposted++; - spin_unlock(&scd->scd_lock); - - rc = srpc_post_passive_rqtbuf(sv->sv_id, - !srpc_serv_is_framework(sv), - msg, sizeof(*msg), &buf->buf_mdh, - &scd->scd_ev); - - /* - * At this point, a RPC (new or delayed) may have arrived in - * msg and its event handler has been called. 
So we must add - * buf to scd_buf_posted _before_ dropping scd_lock - */ - spin_lock(&scd->scd_lock); - - if (!rc) { - if (!sv->sv_shuttingdown) - return 0; - - spin_unlock(&scd->scd_lock); - /* - * srpc_shutdown_service might have tried to unlink me - * when my buf_mdh was still invalid - */ - LNetMDUnlink(buf->buf_mdh); - spin_lock(&scd->scd_lock); - return 0; - } - - scd->scd_buf_nposted--; - if (sv->sv_shuttingdown) - return rc; /* don't allow to change scd_buf_posted */ - - list_del(&buf->buf_list); - spin_unlock(&scd->scd_lock); - - kfree(buf); - - spin_lock(&scd->scd_lock); - return rc; -} - -void -srpc_add_buffer(struct swi_workitem *wi) -{ - struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, scd_buf_wi); - struct srpc_buffer *buf; - int rc = 0; - - /* - * it's called by workitem scheduler threads, these threads - * should have been set CPT affinity, so buffers will be posted - * on CPT local list of Portal - */ - spin_lock(&scd->scd_lock); - - while (scd->scd_buf_adjust > 0 && - !scd->scd_svc->sv_shuttingdown) { - scd->scd_buf_adjust--; /* consume it */ - scd->scd_buf_posting++; - - spin_unlock(&scd->scd_lock); - - buf = kzalloc(sizeof(*buf), GFP_NOFS); - if (!buf) { - CERROR("Failed to add new buf to service: %s\n", - scd->scd_svc->sv_name); - spin_lock(&scd->scd_lock); - rc = -ENOMEM; - break; - } - - spin_lock(&scd->scd_lock); - if (scd->scd_svc->sv_shuttingdown) { - spin_unlock(&scd->scd_lock); - kfree(buf); - - spin_lock(&scd->scd_lock); - rc = -ESHUTDOWN; - break; - } - - rc = srpc_service_post_buffer(scd, buf); - if (rc) - break; /* buf has been freed inside */ - - LASSERT(scd->scd_buf_posting > 0); - scd->scd_buf_posting--; - scd->scd_buf_total++; - scd->scd_buf_low = max(2, scd->scd_buf_total / 4); - } - - if (rc) { - scd->scd_buf_err_stamp = ktime_get_real_seconds(); - scd->scd_buf_err = rc; - - LASSERT(scd->scd_buf_posting > 0); - scd->scd_buf_posting--; - } - - spin_unlock(&scd->scd_lock); -} - -int -srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) -{ - struct srpc_service_cd *scd; - int rc = 0; - int i; - - LASSERTF(nbuffer > 0, "nbuffer must be positive: %d\n", nbuffer); - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { - spin_lock(&scd->scd_lock); - - scd->scd_buf_err = 0; - scd->scd_buf_err_stamp = 0; - scd->scd_buf_posting = 0; - scd->scd_buf_adjust = nbuffer; - /* start to post buffers */ - swi_schedule_workitem(&scd->scd_buf_wi); - spin_unlock(&scd->scd_lock); - - /* framework service only post buffer for one partition */ - if (srpc_serv_is_framework(sv)) - break; - } - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { - spin_lock(&scd->scd_lock); - /* - * NB: srpc_service_add_buffers() can be called inside - * thread context of lst_serial_wq, and we don't normally - * allow to sleep inside thread context of WI scheduler - * because it will block current scheduler thread from doing - * anything else, even worse, it could deadlock if it's - * waiting on result from another WI of the same scheduler. - * However, it's safe at here because scd_buf_wi is scheduled - * by thread in a different WI scheduler (lst_test_wq), - * so we don't have any risk of deadlock, though this could - * block all WIs pending on lst_serial_wq for a moment - * which is not good but not fatal. 
- */ - lst_wait_until(scd->scd_buf_err || - (!scd->scd_buf_adjust && - !scd->scd_buf_posting), - scd->scd_lock, "waiting for adding buffer\n"); - - if (scd->scd_buf_err && !rc) - rc = scd->scd_buf_err; - - spin_unlock(&scd->scd_lock); - } - - return rc; -} - -void -srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer) -{ - struct srpc_service_cd *scd; - int num; - int i; - - LASSERT(!sv->sv_shuttingdown); - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { - spin_lock(&scd->scd_lock); - - num = scd->scd_buf_total + scd->scd_buf_posting; - scd->scd_buf_adjust -= min(nbuffer, num); - - spin_unlock(&scd->scd_lock); - } -} - -/* returns 1 if sv has finished, otherwise 0 */ -int -srpc_finish_service(struct srpc_service *sv) -{ - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; - - LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { - swi_cancel_workitem(&scd->scd_buf_wi); - - spin_lock(&scd->scd_lock); - - if (scd->scd_buf_nposted > 0) { - CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", - scd->scd_buf_nposted); - spin_unlock(&scd->scd_lock); - return 0; - } - - if (list_empty(&scd->scd_rpc_active)) { - spin_unlock(&scd->scd_lock); - continue; - } - - rpc = list_entry(scd->scd_rpc_active.next, - struct srpc_server_rpc, srpc_list); - CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s, ev fired %d type %d status %d lnet %d\n", - rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), - swi_state2str(rpc->srpc_wi.swi_state), - rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, - rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); - spin_unlock(&scd->scd_lock); - return 0; - } - - /* no lock needed from now on */ - srpc_service_fini(sv); - return 1; -} - -/* called with sv->sv_lock held */ -static void -srpc_service_recycle_buffer(struct srpc_service_cd *scd, - struct srpc_buffer *buf) -__must_hold(&scd->scd_lock) -{ - if (!scd->scd_svc->sv_shuttingdown && scd->scd_buf_adjust >= 0) { - if (srpc_service_post_buffer(scd, buf)) { - CWARN("Failed to post %s buffer\n", - scd->scd_svc->sv_name); - } - return; - } - - /* service is shutting down, or we want to recycle some buffers */ - scd->scd_buf_total--; - - if (scd->scd_buf_adjust < 0) { - scd->scd_buf_adjust++; - if (scd->scd_buf_adjust < 0 && - !scd->scd_buf_total && !scd->scd_buf_posting) { - CDEBUG(D_INFO, - "Try to recycle %d buffers but nothing left\n", - scd->scd_buf_adjust); - scd->scd_buf_adjust = 0; - } - } - - spin_unlock(&scd->scd_lock); - kfree(buf); - spin_lock(&scd->scd_lock); -} - -void -srpc_abort_service(struct srpc_service *sv) -{ - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - int i; - - CDEBUG(D_NET, "Aborting service: id %d, name %s\n", - sv->sv_id, sv->sv_name); - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { - spin_lock(&scd->scd_lock); - - /* - * schedule in-flight RPCs to notice the abort, NB: - * racing with incoming RPCs; complete fix should make test - * RPCs carry session ID in its headers - */ - list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) { - rpc->srpc_aborted = 1; - swi_schedule_workitem(&rpc->srpc_wi); - } - - spin_unlock(&scd->scd_lock); - } -} - -void -srpc_shutdown_service(struct srpc_service *sv) -{ - struct srpc_service_cd *scd; - struct srpc_server_rpc *rpc; - struct srpc_buffer *buf; - int i; - - CDEBUG(D_NET, "Shutting down service: id %d, name %s\n", - sv->sv_id, sv->sv_name); - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) - spin_lock(&scd->scd_lock); - - sv->sv_shuttingdown = 1; /* 
i.e. no new active RPC */ - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) - spin_unlock(&scd->scd_lock); - - cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { - spin_lock(&scd->scd_lock); - - /* schedule in-flight RPCs to notice the shutdown */ - list_for_each_entry(rpc, &scd->scd_rpc_active, srpc_list) - swi_schedule_workitem(&rpc->srpc_wi); - - spin_unlock(&scd->scd_lock); - - /* - * OK to traverse scd_buf_posted without lock, since no one - * touches scd_buf_posted now - */ - list_for_each_entry(buf, &scd->scd_buf_posted, buf_list) - LNetMDUnlink(buf->buf_mdh); - } -} - -static int -srpc_send_request(struct srpc_client_rpc *rpc) -{ - struct srpc_event *ev = &rpc->crpc_reqstev; - int rc; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REQUEST_SENT; - - rc = srpc_post_active_rdma(srpc_serv_portal(rpc->crpc_service), - rpc->crpc_service, &rpc->crpc_reqstmsg, - sizeof(struct srpc_msg), LNET_MD_OP_PUT, - rpc->crpc_dest, LNET_NID_ANY, - &rpc->crpc_reqstmdh, ev); - if (rc) { - LASSERT(rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; -} - -static int -srpc_prepare_reply(struct srpc_client_rpc *rpc) -{ - struct srpc_event *ev = &rpc->crpc_replyev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.rpyid; - int rc; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_RCVD; - - *id = srpc_next_id(); - - rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &rpc->crpc_replymsg, - sizeof(struct srpc_msg), - LNET_MD_OP_PUT, rpc->crpc_dest, - &rpc->crpc_replymdh, ev); - if (rc) { - LASSERT(rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; -} - -static int -srpc_prepare_bulk(struct srpc_client_rpc *rpc) -{ - struct srpc_bulk *bk = &rpc->crpc_bulk; - struct srpc_event *ev = &rpc->crpc_bulkev; - __u64 *id = &rpc->crpc_reqstmsg.msg_body.reqst.bulkid; - int rc; - int opt; - - LASSERT(bk->bk_niov <= LNET_MAX_IOV); - - if (!bk->bk_niov) - return 0; /* nothing to do */ - - opt = bk->bk_sink ? LNET_MD_OP_PUT : LNET_MD_OP_GET; - opt |= LNET_MD_KIOV; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_BULK_REQ_RCVD; - - *id = srpc_next_id(); - - rc = srpc_post_passive_rdma(SRPC_RDMA_PORTAL, 0, *id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->crpc_dest, &bk->bk_mdh, ev); - if (rc) { - LASSERT(rc == -ENOMEM); - ev->ev_fired = 1; /* no more event expected */ - } - return rc; -} - -static int -srpc_do_bulk(struct srpc_server_rpc *rpc) -{ - struct srpc_event *ev = &rpc->srpc_ev; - struct srpc_bulk *bk = rpc->srpc_bulk; - __u64 id = rpc->srpc_reqstbuf->buf_msg.msg_body.reqst.bulkid; - int rc; - int opt; - - LASSERT(bk); - - opt = bk->bk_sink ? LNET_MD_OP_GET : LNET_MD_OP_PUT; - opt |= LNET_MD_KIOV; - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = bk->bk_sink ? SRPC_BULK_GET_RPLD : SRPC_BULK_PUT_SENT; - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, id, - &bk->bk_iovs[0], bk->bk_niov, opt, - rpc->srpc_peer, rpc->srpc_self, - &bk->bk_mdh, ev); - if (rc) - ev->ev_fired = 1; /* no more event expected */ - return rc; -} - -/* only called from srpc_handle_rpc */ -static void -srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) -{ - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - struct srpc_buffer *buffer; - - LASSERT(status || rpc->srpc_wi.swi_state == SWI_STATE_DONE); - - rpc->srpc_status = status; - - CDEBUG_LIMIT(!status ? 
D_NET : D_NETERROR, - "Server RPC %p done: service %s, peer %s, status %s:%d\n", - rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), - swi_state2str(rpc->srpc_wi.swi_state), status); - - if (status) { - spin_lock(&srpc_data.rpc_glock); - srpc_data.rpc_counters.rpcs_dropped++; - spin_unlock(&srpc_data.rpc_glock); - } - - if (rpc->srpc_done) - (*rpc->srpc_done) (rpc); - LASSERT(!rpc->srpc_bulk); - - spin_lock(&scd->scd_lock); - - if (rpc->srpc_reqstbuf) { - /* - * NB might drop sv_lock in srpc_service_recycle_buffer, but - * sv won't go away for scd_rpc_active must not be empty - */ - srpc_service_recycle_buffer(scd, rpc->srpc_reqstbuf); - rpc->srpc_reqstbuf = NULL; - } - - list_del(&rpc->srpc_list); /* from scd->scd_rpc_active */ - - /* - * No one can schedule me now since: - * - I'm not on scd_rpc_active. - * - all LNet events have been fired. - * Cancel pending schedules and prevent future schedule attempts: - */ - LASSERT(rpc->srpc_ev.ev_fired); - - if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { - buffer = list_entry(scd->scd_buf_blocked.next, - struct srpc_buffer, buf_list); - list_del(&buffer->buf_list); - - srpc_init_server_rpc(rpc, scd, buffer); - list_add_tail(&rpc->srpc_list, &scd->scd_rpc_active); - swi_schedule_workitem(&rpc->srpc_wi); - } else { - list_add(&rpc->srpc_list, &scd->scd_rpc_free); - } - - spin_unlock(&scd->scd_lock); -} - -/* handles an incoming RPC */ -void -srpc_handle_rpc(struct swi_workitem *wi) -{ - struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, srpc_wi); - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - struct srpc_event *ev = &rpc->srpc_ev; - int rc = 0; - - LASSERT(wi == &rpc->srpc_wi); - - spin_lock(&scd->scd_lock); - - if (sv->sv_shuttingdown || rpc->srpc_aborted) { - spin_unlock(&scd->scd_lock); - - if (rpc->srpc_bulk) - LNetMDUnlink(rpc->srpc_bulk->bk_mdh); - LNetMDUnlink(rpc->srpc_replymdh); - - if (ev->ev_fired) { /* no more event, OK to finish */ - srpc_server_rpc_done(rpc, -ESHUTDOWN); - } - return; - } - - spin_unlock(&scd->scd_lock); - - switch (wi->swi_state) { - default: - LBUG(); - case SWI_STATE_NEWBORN: { - struct srpc_msg *msg; - struct srpc_generic_reply *reply; - - msg = &rpc->srpc_reqstbuf->buf_msg; - reply = &rpc->srpc_replymsg.msg_body.reply; - - if (!msg->msg_magic) { - /* moaned already in srpc_lnet_ev_handler */ - srpc_server_rpc_done(rpc, EBADMSG); - return; - } - - srpc_unpack_msg_hdr(msg); - if (msg->msg_version != SRPC_MSG_VERSION) { - CWARN("Version mismatch: %u, %u expected, from %s\n", - msg->msg_version, SRPC_MSG_VERSION, - libcfs_id2str(rpc->srpc_peer)); - reply->status = EPROTO; - /* drop through and send reply */ - } else { - reply->status = 0; - rc = (*sv->sv_handler)(rpc); - LASSERT(!reply->status || !rpc->srpc_bulk); - if (rc) { - srpc_server_rpc_done(rpc, rc); - return; - } - } - - wi->swi_state = SWI_STATE_BULK_STARTED; - - if (rpc->srpc_bulk) { - rc = srpc_do_bulk(rpc); - if (!rc) - return; /* wait for bulk */ - - LASSERT(ev->ev_fired); - ev->ev_status = rc; - } - } - /* fall through */ - case SWI_STATE_BULK_STARTED: - LASSERT(!rpc->srpc_bulk || ev->ev_fired); - - if (rpc->srpc_bulk) { - rc = ev->ev_status; - - if (sv->sv_bulk_ready) - rc = (*sv->sv_bulk_ready) (rpc, rc); - - if (rc) { - srpc_server_rpc_done(rpc, rc); - return; - } - } - - wi->swi_state = SWI_STATE_REPLY_SUBMITTED; - rc = srpc_send_reply(rpc); - if (!rc) - return; /* wait for reply */ - srpc_server_rpc_done(rpc, rc); - return; - - case SWI_STATE_REPLY_SUBMITTED: - if 
(!ev->ev_fired) { - CERROR("RPC %p: bulk %p, service %d\n", - rpc, rpc->srpc_bulk, sv->sv_id); - CERROR("Event: status %d, type %d, lnet %d\n", - ev->ev_status, ev->ev_type, ev->ev_lnet); - LASSERT(ev->ev_fired); - } - - wi->swi_state = SWI_STATE_DONE; - srpc_server_rpc_done(rpc, ev->ev_status); - return; - } -} - -static void -srpc_client_rpc_expired(void *data) -{ - struct srpc_client_rpc *rpc = data; - - CWARN("Client RPC expired: service %d, peer %s, timeout %d.\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - rpc->crpc_timeout); - - spin_lock(&rpc->crpc_lock); - - rpc->crpc_timeout = 0; - srpc_abort_rpc(rpc, -ETIMEDOUT); - - spin_unlock(&rpc->crpc_lock); - - spin_lock(&srpc_data.rpc_glock); - srpc_data.rpc_counters.rpcs_expired++; - spin_unlock(&srpc_data.rpc_glock); -} - -static void -srpc_add_client_rpc_timer(struct srpc_client_rpc *rpc) -{ - struct stt_timer *timer = &rpc->crpc_timer; - - if (!rpc->crpc_timeout) - return; - - INIT_LIST_HEAD(&timer->stt_list); - timer->stt_data = rpc; - timer->stt_func = srpc_client_rpc_expired; - timer->stt_expires = ktime_get_real_seconds() + rpc->crpc_timeout; - stt_add_timer(timer); -} - -/* - * Called with rpc->crpc_lock held. - * - * Upon exit the RPC expiry timer is not queued and the handler is not - * running on any CPU. - */ -static void -srpc_del_client_rpc_timer(struct srpc_client_rpc *rpc) -{ - /* timer not planted or already exploded */ - if (!rpc->crpc_timeout) - return; - - /* timer successfully defused */ - if (stt_del_timer(&rpc->crpc_timer)) - return; - - /* timer detonated, wait for it to explode */ - while (rpc->crpc_timeout) { - spin_unlock(&rpc->crpc_lock); - - schedule(); - - spin_lock(&rpc->crpc_lock); - } -} - -static void -srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) -{ - struct swi_workitem *wi = &rpc->crpc_wi; - - LASSERT(status || wi->swi_state == SWI_STATE_DONE); - - spin_lock(&rpc->crpc_lock); - - rpc->crpc_closed = 1; - if (!rpc->crpc_status) - rpc->crpc_status = status; - - srpc_del_client_rpc_timer(rpc); - - CDEBUG_LIMIT(!status ? D_NET : D_NETERROR, - "Client RPC done: service %d, peer %s, status %s:%d:%d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(wi->swi_state), rpc->crpc_aborted, status); - - /* - * No one can schedule me now since: - * - RPC timer has been defused. - * - all LNet events have been fired. - * - crpc_closed has been set, preventing srpc_abort_rpc from - * scheduling me. 
- * Cancel pending schedules and prevent future schedule attempts: - */ - LASSERT(!srpc_event_pending(rpc)); - - spin_unlock(&rpc->crpc_lock); - - (*rpc->crpc_done)(rpc); -} - -/* sends an outgoing RPC */ -void -srpc_send_rpc(struct swi_workitem *wi) -{ - int rc = 0; - struct srpc_client_rpc *rpc; - struct srpc_msg *reply; - int do_bulk; - - LASSERT(wi); - - rpc = container_of(wi, struct srpc_client_rpc, crpc_wi); - - LASSERT(rpc); - LASSERT(wi == &rpc->crpc_wi); - - reply = &rpc->crpc_replymsg; - do_bulk = rpc->crpc_bulk.bk_niov > 0; - - spin_lock(&rpc->crpc_lock); - - if (rpc->crpc_aborted) { - spin_unlock(&rpc->crpc_lock); - goto abort; - } - - spin_unlock(&rpc->crpc_lock); - - switch (wi->swi_state) { - default: - LBUG(); - case SWI_STATE_NEWBORN: - LASSERT(!srpc_event_pending(rpc)); - - rc = srpc_prepare_reply(rpc); - if (rc) { - srpc_client_rpc_done(rpc, rc); - return; - } - - rc = srpc_prepare_bulk(rpc); - if (rc) - break; - - wi->swi_state = SWI_STATE_REQUEST_SUBMITTED; - rc = srpc_send_request(rpc); - break; - - case SWI_STATE_REQUEST_SUBMITTED: - /* - * CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any - * order; however, they're processed in a strict order: - * rqt, rpy, and bulk. - */ - if (!rpc->crpc_reqstev.ev_fired) - break; - - rc = rpc->crpc_reqstev.ev_status; - if (rc) - break; - - wi->swi_state = SWI_STATE_REQUEST_SENT; - /* perhaps more events */ - /* fall through */ - case SWI_STATE_REQUEST_SENT: { - enum srpc_msg_type type = srpc_service2reply(rpc->crpc_service); - - if (!rpc->crpc_replyev.ev_fired) - break; - - rc = rpc->crpc_replyev.ev_status; - if (rc) - break; - - srpc_unpack_msg_hdr(reply); - if (reply->msg_type != type || - (reply->msg_magic != SRPC_MSG_MAGIC && - reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CWARN("Bad message from %s: type %u (%d expected), magic %u (%d expected).\n", - libcfs_id2str(rpc->crpc_dest), - reply->msg_type, type, - reply->msg_magic, SRPC_MSG_MAGIC); - rc = -EBADMSG; - break; - } - - if (do_bulk && reply->msg_body.reply.status) { - CWARN("Remote error %d at %s, unlink bulk buffer in case peer didn't initiate bulk transfer\n", - reply->msg_body.reply.status, - libcfs_id2str(rpc->crpc_dest)); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - } - - wi->swi_state = SWI_STATE_REPLY_RECEIVED; - } - /* fall through */ - case SWI_STATE_REPLY_RECEIVED: - if (do_bulk && !rpc->crpc_bulkev.ev_fired) - break; - - rc = do_bulk ? rpc->crpc_bulkev.ev_status : 0; - - /* - * Bulk buffer was unlinked due to remote error. Clear error - * since reply buffer still contains valid data. - * NB rpc->crpc_done shouldn't look into bulk data in case of - * remote error. 
- */ - if (do_bulk && rpc->crpc_bulkev.ev_lnet == LNET_EVENT_UNLINK && - !rpc->crpc_status && reply->msg_body.reply.status) - rc = 0; - - wi->swi_state = SWI_STATE_DONE; - srpc_client_rpc_done(rpc, rc); - return; - } - - if (rc) { - spin_lock(&rpc->crpc_lock); - srpc_abort_rpc(rpc, rc); - spin_unlock(&rpc->crpc_lock); - } - -abort: - if (rpc->crpc_aborted) { - LNetMDUnlink(rpc->crpc_reqstmdh); - LNetMDUnlink(rpc->crpc_replymdh); - LNetMDUnlink(rpc->crpc_bulk.bk_mdh); - - if (!srpc_event_pending(rpc)) { - srpc_client_rpc_done(rpc, -EINTR); - return; - } - } -} - -struct srpc_client_rpc * -srpc_create_client_rpc(struct lnet_process_id peer, int service, - int nbulkiov, int bulklen, - void (*rpc_done)(struct srpc_client_rpc *), - void (*rpc_fini)(struct srpc_client_rpc *), void *priv) -{ - struct srpc_client_rpc *rpc; - - rpc = kzalloc(offsetof(struct srpc_client_rpc, - crpc_bulk.bk_iovs[nbulkiov]), GFP_KERNEL); - if (!rpc) - return NULL; - - srpc_init_client_rpc(rpc, peer, service, nbulkiov, - bulklen, rpc_done, rpc_fini, priv); - return rpc; -} - -/* called with rpc->crpc_lock held */ -void -srpc_abort_rpc(struct srpc_client_rpc *rpc, int why) -{ - LASSERT(why); - - if (rpc->crpc_aborted || /* already aborted */ - rpc->crpc_closed) /* callback imminent */ - return; - - CDEBUG(D_NET, "Aborting RPC: service %d, peer %s, state %s, why %d\n", - rpc->crpc_service, libcfs_id2str(rpc->crpc_dest), - swi_state2str(rpc->crpc_wi.swi_state), why); - - rpc->crpc_aborted = 1; - rpc->crpc_status = why; - swi_schedule_workitem(&rpc->crpc_wi); -} - -/* called with rpc->crpc_lock held */ -void -srpc_post_rpc(struct srpc_client_rpc *rpc) -{ - LASSERT(!rpc->crpc_aborted); - LASSERT(srpc_data.rpc_state == SRPC_STATE_RUNNING); - - CDEBUG(D_NET, "Posting RPC: peer %s, service %d, timeout %d\n", - libcfs_id2str(rpc->crpc_dest), rpc->crpc_service, - rpc->crpc_timeout); - - srpc_add_client_rpc_timer(rpc); - swi_schedule_workitem(&rpc->crpc_wi); -} - -int -srpc_send_reply(struct srpc_server_rpc *rpc) -{ - struct srpc_event *ev = &rpc->srpc_ev; - struct srpc_msg *msg = &rpc->srpc_replymsg; - struct srpc_buffer *buffer = rpc->srpc_reqstbuf; - struct srpc_service_cd *scd = rpc->srpc_scd; - struct srpc_service *sv = scd->scd_svc; - __u64 rpyid; - int rc; - - LASSERT(buffer); - rpyid = buffer->buf_msg.msg_body.reqst.rpyid; - - spin_lock(&scd->scd_lock); - - if (!sv->sv_shuttingdown && !srpc_serv_is_framework(sv)) { - /* - * Repost buffer before replying since test client - * might send me another RPC once it gets the reply - */ - if (srpc_service_post_buffer(scd, buffer)) - CWARN("Failed to repost %s buffer\n", sv->sv_name); - rpc->srpc_reqstbuf = NULL; - } - - spin_unlock(&scd->scd_lock); - - ev->ev_fired = 0; - ev->ev_data = rpc; - ev->ev_type = SRPC_REPLY_SENT; - - msg->msg_magic = SRPC_MSG_MAGIC; - msg->msg_version = SRPC_MSG_VERSION; - msg->msg_type = srpc_service2reply(sv->sv_id); - - rc = srpc_post_active_rdma(SRPC_RDMA_PORTAL, rpyid, msg, - sizeof(*msg), LNET_MD_OP_PUT, - rpc->srpc_peer, rpc->srpc_self, - &rpc->srpc_replymdh, ev); - if (rc) - ev->ev_fired = 1; /* no more event expected */ - return rc; -} - -/* when in kernel always called with LNET_LOCK() held, and in thread context */ -static void -srpc_lnet_ev_handler(struct lnet_event *ev) -{ - struct srpc_service_cd *scd; - struct srpc_event *rpcev = ev->md.user_ptr; - struct srpc_client_rpc *crpc; - struct srpc_server_rpc *srpc; - struct srpc_buffer *buffer; - struct srpc_service *sv; - struct srpc_msg *msg; - enum srpc_msg_type type; - - 
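	/*
	 * Every MD this module posts stores the owning srpc_event in
	 * md.user_ptr (the ev argument of the srpc_post_*_rdma helpers),
	 * so the handler below can demultiplex each LNet completion back
	 * to its client or server RPC and reschedule that RPC's workitem,
	 * letting the state machine resume in thread context.
	 */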
LASSERT(!in_interrupt()); - - if (ev->status) { - __u32 errors; - - spin_lock(&srpc_data.rpc_glock); - if (ev->status != -ECANCELED) /* cancellation is not error */ - srpc_data.rpc_counters.errors++; - errors = srpc_data.rpc_counters.errors; - spin_unlock(&srpc_data.rpc_glock); - - CNETERR("LNet event status %d type %d, RPC errors %u\n", - ev->status, ev->type, errors); - } - - rpcev->ev_lnet = ev->type; - - switch (rpcev->ev_type) { - default: - CERROR("Unknown event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG(); - case SRPC_REQUEST_SENT: - if (!ev->status && ev->type != LNET_EVENT_UNLINK) { - spin_lock(&srpc_data.rpc_glock); - srpc_data.rpc_counters.rpcs_sent++; - spin_unlock(&srpc_data.rpc_glock); - } - /* fall through */ - case SRPC_REPLY_RCVD: - case SRPC_BULK_REQ_RCVD: - crpc = rpcev->ev_data; - - if (rpcev != &crpc->crpc_reqstev && - rpcev != &crpc->crpc_replyev && - rpcev != &crpc->crpc_bulkev) { - CERROR("rpcev %p, crpc %p, reqstev %p, replyev %p, bulkev %p\n", - rpcev, crpc, &crpc->crpc_reqstev, - &crpc->crpc_replyev, &crpc->crpc_bulkev); - CERROR("Bad event: status %d, type %d, lnet %d\n", - rpcev->ev_status, rpcev->ev_type, rpcev->ev_lnet); - LBUG(); - } - - spin_lock(&crpc->crpc_lock); - - LASSERT(!rpcev->ev_fired); - rpcev->ev_fired = 1; - rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? - -EINTR : ev->status; - swi_schedule_workitem(&crpc->crpc_wi); - - spin_unlock(&crpc->crpc_lock); - break; - - case SRPC_REQUEST_RCVD: - scd = rpcev->ev_data; - sv = scd->scd_svc; - - LASSERT(rpcev == &scd->scd_ev); - - spin_lock(&scd->scd_lock); - - LASSERT(ev->unlinked); - LASSERT(ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT(ev->type != LNET_EVENT_UNLINK || - sv->sv_shuttingdown); - - buffer = container_of(ev->md.start, struct srpc_buffer, buf_msg); - buffer->buf_peer = ev->initiator; - buffer->buf_self = ev->target.nid; - - LASSERT(scd->scd_buf_nposted > 0); - scd->scd_buf_nposted--; - - if (sv->sv_shuttingdown) { - /* - * Leave buffer on scd->scd_buf_nposted since - * srpc_finish_service needs to traverse it. - */ - spin_unlock(&scd->scd_lock); - break; - } - - if (scd->scd_buf_err_stamp && - scd->scd_buf_err_stamp < ktime_get_real_seconds()) { - /* re-enable adding buffer */ - scd->scd_buf_err_stamp = 0; - scd->scd_buf_err = 0; - } - - if (!scd->scd_buf_err && /* adding buffer is enabled */ - !scd->scd_buf_adjust && - scd->scd_buf_nposted < scd->scd_buf_low) { - scd->scd_buf_adjust = max(scd->scd_buf_total / 2, - SFW_TEST_WI_MIN); - swi_schedule_workitem(&scd->scd_buf_wi); - } - - list_del(&buffer->buf_list); /* from scd->scd_buf_posted */ - msg = &buffer->buf_msg; - type = srpc_service2request(sv->sv_id); - - if (ev->status || ev->mlength != sizeof(*msg) || - (msg->msg_type != type && - msg->msg_type != __swab32(type)) || - (msg->msg_magic != SRPC_MSG_MAGIC && - msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { - CERROR("Dropping RPC (%s) from %s: status %d mlength %d type %u magic %u.\n", - sv->sv_name, libcfs_id2str(ev->initiator), - ev->status, ev->mlength, - msg->msg_type, msg->msg_magic); - - /* - * NB can't call srpc_service_recycle_buffer here since - * it may call LNetM[DE]Attach. 
The invalid magic tells - * srpc_handle_rpc to drop this RPC - */ - msg->msg_magic = 0; - } - - if (!list_empty(&scd->scd_rpc_free)) { - srpc = list_entry(scd->scd_rpc_free.next, - struct srpc_server_rpc, - srpc_list); - list_del(&srpc->srpc_list); - - srpc_init_server_rpc(srpc, scd, buffer); - list_add_tail(&srpc->srpc_list, - &scd->scd_rpc_active); - swi_schedule_workitem(&srpc->srpc_wi); - } else { - list_add_tail(&buffer->buf_list, - &scd->scd_buf_blocked); - } - - spin_unlock(&scd->scd_lock); - - spin_lock(&srpc_data.rpc_glock); - srpc_data.rpc_counters.rpcs_rcvd++; - spin_unlock(&srpc_data.rpc_glock); - break; - - case SRPC_BULK_GET_RPLD: - LASSERT(ev->type == LNET_EVENT_SEND || - ev->type == LNET_EVENT_REPLY || - ev->type == LNET_EVENT_UNLINK); - - if (!ev->unlinked) - break; /* wait for final event */ - /* fall through */ - case SRPC_BULK_PUT_SENT: - if (!ev->status && ev->type != LNET_EVENT_UNLINK) { - spin_lock(&srpc_data.rpc_glock); - - if (rpcev->ev_type == SRPC_BULK_GET_RPLD) - srpc_data.rpc_counters.bulk_get += ev->mlength; - else - srpc_data.rpc_counters.bulk_put += ev->mlength; - - spin_unlock(&srpc_data.rpc_glock); - } - /* fall through */ - case SRPC_REPLY_SENT: - srpc = rpcev->ev_data; - scd = srpc->srpc_scd; - - LASSERT(rpcev == &srpc->srpc_ev); - - spin_lock(&scd->scd_lock); - - rpcev->ev_fired = 1; - rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? - -EINTR : ev->status; - swi_schedule_workitem(&srpc->srpc_wi); - - spin_unlock(&scd->scd_lock); - break; - } -} - -int -srpc_startup(void) -{ - int rc; - - memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); - spin_lock_init(&srpc_data.rpc_glock); - - /* 1 second pause to avoid timestamp reuse */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - srpc_data.rpc_matchbits = ((__u64)ktime_get_real_seconds()) << 48; - - srpc_data.rpc_state = SRPC_STATE_NONE; - - rc = LNetNIInit(LNET_PID_LUSTRE); - if (rc < 0) { - CERROR("LNetNIInit() has failed: %d\n", rc); - return rc; - } - - srpc_data.rpc_state = SRPC_STATE_NI_INIT; - - LNetInvalidateEQHandle(&srpc_data.rpc_lnet_eq); - rc = LNetEQAlloc(0, srpc_lnet_ev_handler, &srpc_data.rpc_lnet_eq); - if (rc) { - CERROR("LNetEQAlloc() has failed: %d\n", rc); - goto bail; - } - - rc = LNetSetLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); - LASSERT(!rc); - rc = LNetSetLazyPortal(SRPC_REQUEST_PORTAL); - LASSERT(!rc); - - srpc_data.rpc_state = SRPC_STATE_EQ_INIT; - - rc = stt_startup(); - -bail: - if (rc) - srpc_shutdown(); - else - srpc_data.rpc_state = SRPC_STATE_RUNNING; - - return rc; -} - -void -srpc_shutdown(void) -{ - int i; - int rc; - int state; - - state = srpc_data.rpc_state; - srpc_data.rpc_state = SRPC_STATE_STOPPING; - - switch (state) { - default: - LBUG(); - case SRPC_STATE_RUNNING: - spin_lock(&srpc_data.rpc_glock); - - for (i = 0; i <= SRPC_SERVICE_MAX_ID; i++) { - struct srpc_service *sv = srpc_data.rpc_services[i]; - - LASSERTF(!sv, "service not empty: id %d, name %s\n", - i, sv->sv_name); - } - - spin_unlock(&srpc_data.rpc_glock); - - stt_shutdown(); - /* fall through */ - case SRPC_STATE_EQ_INIT: - rc = LNetClearLazyPortal(SRPC_FRAMEWORK_REQUEST_PORTAL); - rc = LNetClearLazyPortal(SRPC_REQUEST_PORTAL); - LASSERT(!rc); - rc = LNetEQFree(srpc_data.rpc_lnet_eq); - LASSERT(!rc); /* the EQ should have no user by now */ - /* fall through */ - case SRPC_STATE_NI_INIT: - LNetNIFini(); - } -} diff --git a/drivers/staging/lustre/lnet/selftest/rpc.h b/drivers/staging/lustre/lnet/selftest/rpc.h deleted file mode 100644 index 
465b5b534423d296a9624121e2dd49f5e1c5fe64..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/rpc.h +++ /dev/null @@ -1,295 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __SELFTEST_RPC_H__ -#define __SELFTEST_RPC_H__ - -#include - -/* - * LST wired structures - * - * XXX: *REPLY == *REQST + 1 - */ -enum srpc_msg_type { - SRPC_MSG_MKSN_REQST = 0, - SRPC_MSG_MKSN_REPLY = 1, - SRPC_MSG_RMSN_REQST = 2, - SRPC_MSG_RMSN_REPLY = 3, - SRPC_MSG_BATCH_REQST = 4, - SRPC_MSG_BATCH_REPLY = 5, - SRPC_MSG_STAT_REQST = 6, - SRPC_MSG_STAT_REPLY = 7, - SRPC_MSG_TEST_REQST = 8, - SRPC_MSG_TEST_REPLY = 9, - SRPC_MSG_DEBUG_REQST = 10, - SRPC_MSG_DEBUG_REPLY = 11, - SRPC_MSG_BRW_REQST = 12, - SRPC_MSG_BRW_REPLY = 13, - SRPC_MSG_PING_REQST = 14, - SRPC_MSG_PING_REPLY = 15, - SRPC_MSG_JOIN_REQST = 16, - SRPC_MSG_JOIN_REPLY = 17, -}; - -/* CAVEAT EMPTOR: - * All srpc_*_reqst_t's 1st field must be matchbits of reply buffer, - * and 2nd field matchbits of bulk buffer if any. - * - * All srpc_*_reply_t's 1st field must be a __u32 status, and 2nd field - * session id if needed. 
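 *
 * This convention is what lets generic code view any concrete request
 * as a struct srpc_generic_reqst: srpc_send_reply(), for instance,
 * fetches msg_body.reqst.rpyid from the request buffer without knowing
 * which request type the buffer actually holds.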
- */ -struct srpc_generic_reqst { - __u64 rpyid; /* reply buffer matchbits */ - __u64 bulkid; /* bulk buffer matchbits */ -} WIRE_ATTR; - -struct srpc_generic_reply { - __u32 status; - struct lst_sid sid; -} WIRE_ATTR; - -/* FRAMEWORK RPCs */ -struct srpc_mksn_reqst { - __u64 mksn_rpyid; /* reply buffer matchbits */ - struct lst_sid mksn_sid; /* session id */ - __u32 mksn_force; /* use brute force */ - char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR; /* make session request */ - -struct srpc_mksn_reply { - __u32 mksn_status; /* session status */ - struct lst_sid mksn_sid; /* session id */ - __u32 mksn_timeout; /* session timeout */ - char mksn_name[LST_NAME_SIZE]; -} WIRE_ATTR; /* make session reply */ - -struct srpc_rmsn_reqst { - __u64 rmsn_rpyid; /* reply buffer matchbits */ - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR; /* remove session request */ - -struct srpc_rmsn_reply { - __u32 rmsn_status; - struct lst_sid rmsn_sid; /* session id */ -} WIRE_ATTR; /* remove session reply */ - -struct srpc_join_reqst { - __u64 join_rpyid; /* reply buffer matchbits */ - struct lst_sid join_sid; /* session id to join */ - char join_group[LST_NAME_SIZE]; /* group name */ -} WIRE_ATTR; - -struct srpc_join_reply { - __u32 join_status; /* returned status */ - struct lst_sid join_sid; /* session id */ - __u32 join_timeout; /* # seconds' inactivity to - * expire - */ - char join_session[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR; - -struct srpc_debug_reqst { - __u64 dbg_rpyid; /* reply buffer matchbits */ - struct lst_sid dbg_sid; /* session id */ - __u32 dbg_flags; /* bitmap of debug */ -} WIRE_ATTR; - -struct srpc_debug_reply { - __u32 dbg_status; /* returned code */ - struct lst_sid dbg_sid; /* session id */ - __u32 dbg_timeout; /* session timeout */ - __u32 dbg_nbatch; /* # of batches in the node */ - char dbg_name[LST_NAME_SIZE]; /* session name */ -} WIRE_ATTR; - -#define SRPC_BATCH_OPC_RUN 1 -#define SRPC_BATCH_OPC_STOP 2 -#define SRPC_BATCH_OPC_QUERY 3 - -struct srpc_batch_reqst { - __u64 bar_rpyid; /* reply buffer matchbits */ - struct lst_sid bar_sid; /* session id */ - struct lst_bid bar_bid; /* batch id */ - __u32 bar_opc; /* create/start/stop batch */ - __u32 bar_testidx; /* index of test */ - __u32 bar_arg; /* parameters */ -} WIRE_ATTR; - -struct srpc_batch_reply { - __u32 bar_status; /* status of request */ - struct lst_sid bar_sid; /* session id */ - __u32 bar_active; /* # of active tests in batch/test */ - __u32 bar_time; /* remained time */ -} WIRE_ATTR; - -struct srpc_stat_reqst { - __u64 str_rpyid; /* reply buffer matchbits */ - struct lst_sid str_sid; /* session id */ - __u32 str_type; /* type of stat */ -} WIRE_ATTR; - -struct srpc_stat_reply { - __u32 str_status; - struct lst_sid str_sid; - struct sfw_counters str_fw; - struct srpc_counters str_rpc; - struct lnet_counters str_lnet; -} WIRE_ATTR; - -struct test_bulk_req { - __u32 blk_opc; /* bulk operation code */ - __u32 blk_npg; /* # of pages */ - __u32 blk_flags; /* reserved flags */ -} WIRE_ATTR; - -struct test_bulk_req_v1 { - __u16 blk_opc; /* bulk operation code */ - __u16 blk_flags; /* data check flags */ - __u32 blk_len; /* data length */ - __u32 blk_offset; /* offset */ -} WIRE_ATTR; - -struct test_ping_req { - __u32 png_size; /* size of ping message */ - __u32 png_flags; /* reserved flags */ -} WIRE_ATTR; - -struct srpc_test_reqst { - __u64 tsr_rpyid; /* reply buffer matchbits */ - __u64 tsr_bulkid; /* bulk buffer matchbits */ - struct lst_sid tsr_sid; /* session id */ - struct lst_bid tsr_bid; /* batch id */ - 
__u32 tsr_service; /* test type: bulk|ping|... */ - __u32 tsr_loop; /* test client loop count or - * # server buffers needed - */ - __u32 tsr_concur; /* concurrency of test */ - __u8 tsr_is_client; /* is test client or not */ - __u8 tsr_stop_onerr; /* stop on error */ - __u32 tsr_ndest; /* # of dest nodes */ - - union { - struct test_ping_req ping; - struct test_bulk_req bulk_v0; - struct test_bulk_req_v1 bulk_v1; - } tsr_u; -} WIRE_ATTR; - -struct srpc_test_reply { - __u32 tsr_status; /* returned code */ - struct lst_sid tsr_sid; -} WIRE_ATTR; - -/* TEST RPCs */ -struct srpc_ping_reqst { - __u64 pnr_rpyid; - __u32 pnr_magic; - __u32 pnr_seq; - __u64 pnr_time_sec; - __u64 pnr_time_usec; -} WIRE_ATTR; - -struct srpc_ping_reply { - __u32 pnr_status; - __u32 pnr_magic; - __u32 pnr_seq; -} WIRE_ATTR; - -struct srpc_brw_reqst { - __u64 brw_rpyid; /* reply buffer matchbits */ - __u64 brw_bulkid; /* bulk buffer matchbits */ - __u32 brw_rw; /* read or write */ - __u32 brw_len; /* bulk data len */ - __u32 brw_flags; /* bulk data patterns */ -} WIRE_ATTR; /* bulk r/w request */ - -struct srpc_brw_reply { - __u32 brw_status; -} WIRE_ATTR; /* bulk r/w reply */ - -#define SRPC_MSG_MAGIC 0xeeb0f00d -#define SRPC_MSG_VERSION 1 - -struct srpc_msg { - __u32 msg_magic; /* magic number */ - __u32 msg_version; /* message version number */ - __u32 msg_type; /* type of message body: srpc_msg_type */ - __u32 msg_reserved0; - __u32 msg_reserved1; - __u32 msg_ses_feats; /* test session features */ - union { - struct srpc_generic_reqst reqst; - struct srpc_generic_reply reply; - - struct srpc_mksn_reqst mksn_reqst; - struct srpc_mksn_reply mksn_reply; - struct srpc_rmsn_reqst rmsn_reqst; - struct srpc_rmsn_reply rmsn_reply; - struct srpc_debug_reqst dbg_reqst; - struct srpc_debug_reply dbg_reply; - struct srpc_batch_reqst bat_reqst; - struct srpc_batch_reply bat_reply; - struct srpc_stat_reqst stat_reqst; - struct srpc_stat_reply stat_reply; - struct srpc_test_reqst tes_reqst; - struct srpc_test_reply tes_reply; - struct srpc_join_reqst join_reqst; - struct srpc_join_reply join_reply; - - struct srpc_ping_reqst ping_reqst; - struct srpc_ping_reply ping_reply; - struct srpc_brw_reqst brw_reqst; - struct srpc_brw_reply brw_reply; - } msg_body; -} WIRE_ATTR; - -static inline void -srpc_unpack_msg_hdr(struct srpc_msg *msg) -{ - if (msg->msg_magic == SRPC_MSG_MAGIC) - return; /* no flipping needed */ - - /* - * We do not swap the magic number here as it is needed to - * determine whether the body needs to be swapped. - */ - /* __swab32s(&msg->msg_magic); */ - __swab32s(&msg->msg_type); - __swab32s(&msg->msg_version); - __swab32s(&msg->msg_ses_feats); - __swab32s(&msg->msg_reserved0); - __swab32s(&msg->msg_reserved1); -} - -#endif /* __SELFTEST_RPC_H__ */ diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h deleted file mode 100644 index 8737fa96b192d615c80811a11cd5babdea0d9cd0..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/selftest.h +++ /dev/null @@ -1,622 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/selftest.h - * - * Author: Isaac Huang - */ -#ifndef __SELFTEST_SELFTEST_H__ -#define __SELFTEST_SELFTEST_H__ - -#define LNET_ONLY - -#include -#include -#include - -#include "rpc.h" -#include "timer.h" - -#ifndef MADE_WITHOUT_COMPROMISE -#define MADE_WITHOUT_COMPROMISE -#endif - -#define SWI_STATE_NEWBORN 0 -#define SWI_STATE_REPLY_SUBMITTED 1 -#define SWI_STATE_REPLY_SENT 2 -#define SWI_STATE_REQUEST_SUBMITTED 3 -#define SWI_STATE_REQUEST_SENT 4 -#define SWI_STATE_REPLY_RECEIVED 5 -#define SWI_STATE_BULK_STARTED 6 -#define SWI_STATE_DONE 10 - -/* forward refs */ -struct srpc_service; -struct srpc_service_cd; -struct sfw_test_unit; -struct sfw_test_instance; - -/* services below SRPC_FRAMEWORK_SERVICE_MAX_ID are framework - * services, e.g. create/modify session. - */ -#define SRPC_SERVICE_DEBUG 0 -#define SRPC_SERVICE_MAKE_SESSION 1 -#define SRPC_SERVICE_REMOVE_SESSION 2 -#define SRPC_SERVICE_BATCH 3 -#define SRPC_SERVICE_TEST 4 -#define SRPC_SERVICE_QUERY_STAT 5 -#define SRPC_SERVICE_JOIN 6 -#define SRPC_FRAMEWORK_SERVICE_MAX_ID 10 -/* other services start from SRPC_FRAMEWORK_SERVICE_MAX_ID+1 */ -#define SRPC_SERVICE_BRW 11 -#define SRPC_SERVICE_PING 12 -#define SRPC_SERVICE_MAX_ID 12 - -#define SRPC_REQUEST_PORTAL 50 -/* a lazy portal for framework RPC requests */ -#define SRPC_FRAMEWORK_REQUEST_PORTAL 51 -/* all reply/bulk RDMAs go to this portal */ -#define SRPC_RDMA_PORTAL 52 - -static inline enum srpc_msg_type -srpc_service2request(int service) -{ - switch (service) { - default: - LBUG(); - case SRPC_SERVICE_DEBUG: - return SRPC_MSG_DEBUG_REQST; - - case SRPC_SERVICE_MAKE_SESSION: - return SRPC_MSG_MKSN_REQST; - - case SRPC_SERVICE_REMOVE_SESSION: - return SRPC_MSG_RMSN_REQST; - - case SRPC_SERVICE_BATCH: - return SRPC_MSG_BATCH_REQST; - - case SRPC_SERVICE_TEST: - return SRPC_MSG_TEST_REQST; - - case SRPC_SERVICE_QUERY_STAT: - return SRPC_MSG_STAT_REQST; - - case SRPC_SERVICE_BRW: - return SRPC_MSG_BRW_REQST; - - case SRPC_SERVICE_PING: - return SRPC_MSG_PING_REQST; - - case SRPC_SERVICE_JOIN: - return SRPC_MSG_JOIN_REQST; - } -} - -static inline enum srpc_msg_type -srpc_service2reply(int service) -{ - return srpc_service2request(service) + 1; -} - -enum srpc_event_type { - SRPC_BULK_REQ_RCVD = 1, /* passive bulk request(PUT sink/GET source) - * received - */ - SRPC_BULK_PUT_SENT = 2, /* active bulk PUT sent (source) */ - SRPC_BULK_GET_RPLD = 3, /* active bulk GET replied (sink) */ - SRPC_REPLY_RCVD = 4, /* incoming reply received */ - SRPC_REPLY_SENT = 5, /* outgoing reply sent */ - SRPC_REQUEST_RCVD = 6, /* incoming request received */ - SRPC_REQUEST_SENT = 7, /* outgoing request sent */ -}; - -/* RPC event */ -struct srpc_event { - enum srpc_event_type 
ev_type; /* what's up */ - enum lnet_event_kind ev_lnet; /* LNet event type */ - int ev_fired; /* LNet event fired? */ - int ev_status; /* LNet event status */ - void *ev_data; /* owning server/client RPC */ -}; - -/* bulk descriptor */ -struct srpc_bulk { - int bk_len; /* len of bulk data */ - struct lnet_handle_md bk_mdh; - int bk_sink; /* sink/source */ - int bk_niov; /* # iov in bk_iovs */ - struct bio_vec bk_iovs[0]; -}; - -/* message buffer descriptor */ -struct srpc_buffer { - struct list_head buf_list; /* chain on srpc_service::*_msgq */ - struct srpc_msg buf_msg; - struct lnet_handle_md buf_mdh; - lnet_nid_t buf_self; - struct lnet_process_id buf_peer; -}; - -struct swi_workitem; -typedef void (*swi_action_t) (struct swi_workitem *); - -struct swi_workitem { - struct workqueue_struct *swi_wq; - struct work_struct swi_work; - swi_action_t swi_action; - int swi_state; -}; - -/* server-side state of a RPC */ -struct srpc_server_rpc { - /* chain on srpc_service::*_rpcq */ - struct list_head srpc_list; - struct srpc_service_cd *srpc_scd; - struct swi_workitem srpc_wi; - struct srpc_event srpc_ev; /* bulk/reply event */ - lnet_nid_t srpc_self; - struct lnet_process_id srpc_peer; - struct srpc_msg srpc_replymsg; - struct lnet_handle_md srpc_replymdh; - struct srpc_buffer *srpc_reqstbuf; - struct srpc_bulk *srpc_bulk; - - unsigned int srpc_aborted; /* being given up */ - int srpc_status; - void (*srpc_done)(struct srpc_server_rpc *); -}; - -/* client-side state of a RPC */ -struct srpc_client_rpc { - struct list_head crpc_list; /* chain on user's lists */ - spinlock_t crpc_lock; /* serialize */ - int crpc_service; - atomic_t crpc_refcount; - int crpc_timeout; /* # seconds to wait for reply */ - struct stt_timer crpc_timer; - struct swi_workitem crpc_wi; - struct lnet_process_id crpc_dest; - - void (*crpc_done)(struct srpc_client_rpc *); - void (*crpc_fini)(struct srpc_client_rpc *); - int crpc_status; /* completion status */ - void *crpc_priv; /* caller data */ - - /* state flags */ - unsigned int crpc_aborted:1; /* being given up */ - unsigned int crpc_closed:1; /* completed */ - - /* RPC events */ - struct srpc_event crpc_bulkev; /* bulk event */ - struct srpc_event crpc_reqstev; /* request event */ - struct srpc_event crpc_replyev; /* reply event */ - - /* bulk, request(reqst), and reply exchanged on wire */ - struct srpc_msg crpc_reqstmsg; - struct srpc_msg crpc_replymsg; - struct lnet_handle_md crpc_reqstmdh; - struct lnet_handle_md crpc_replymdh; - struct srpc_bulk crpc_bulk; -}; - -#define srpc_client_rpc_size(rpc) \ -offsetof(struct srpc_client_rpc, crpc_bulk.bk_iovs[(rpc)->crpc_bulk.bk_niov]) - -#define srpc_client_rpc_addref(rpc) \ -do { \ - CDEBUG(D_NET, "RPC[%p] -> %s (%d)++\n", \ - (rpc), libcfs_id2str((rpc)->crpc_dest), \ - atomic_read(&(rpc)->crpc_refcount)); \ - LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ - atomic_inc(&(rpc)->crpc_refcount); \ -} while (0) - -#define srpc_client_rpc_decref(rpc) \ -do { \ - CDEBUG(D_NET, "RPC[%p] -> %s (%d)--\n", \ - (rpc), libcfs_id2str((rpc)->crpc_dest), \ - atomic_read(&(rpc)->crpc_refcount)); \ - LASSERT(atomic_read(&(rpc)->crpc_refcount) > 0); \ - if (atomic_dec_and_test(&(rpc)->crpc_refcount)) \ - srpc_destroy_client_rpc(rpc); \ -} while (0) - -#define srpc_event_pending(rpc) (!(rpc)->crpc_bulkev.ev_fired || \ - !(rpc)->crpc_reqstev.ev_fired || \ - !(rpc)->crpc_replyev.ev_fired) - -/* CPU partition data of srpc service */ -struct srpc_service_cd { - /** serialize */ - spinlock_t scd_lock; - /** backref to service */ - struct 
srpc_service *scd_svc; - /** event buffer */ - struct srpc_event scd_ev; - /** free RPC descriptors */ - struct list_head scd_rpc_free; - /** in-flight RPCs */ - struct list_head scd_rpc_active; - /** workitem for posting buffer */ - struct swi_workitem scd_buf_wi; - /** CPT id */ - int scd_cpt; - /** error code for scd_buf_wi */ - int scd_buf_err; - /** timestamp for scd_buf_err */ - time64_t scd_buf_err_stamp; - /** total # request buffers */ - int scd_buf_total; - /** # posted request buffers */ - int scd_buf_nposted; - /** in progress of buffer posting */ - int scd_buf_posting; - /** allocate more buffers if scd_buf_nposted < scd_buf_low */ - int scd_buf_low; - /** increase/decrease some buffers */ - int scd_buf_adjust; - /** posted message buffers */ - struct list_head scd_buf_posted; - /** blocked for RPC descriptor */ - struct list_head scd_buf_blocked; -}; - -/* number of server workitems (mini-thread) for testing service */ -#define SFW_TEST_WI_MIN 256 -#define SFW_TEST_WI_MAX 2048 -/* extra buffers for tolerating buggy peers, or unbalanced number - * of peers between partitions - */ -#define SFW_TEST_WI_EXTRA 64 - -/* number of server workitems (mini-thread) for framework service */ -#define SFW_FRWK_WI_MIN 16 -#define SFW_FRWK_WI_MAX 256 - -struct srpc_service { - int sv_id; /* service id */ - const char *sv_name; /* human readable name */ - int sv_wi_total; /* total server workitems */ - int sv_shuttingdown; - int sv_ncpts; - /* percpt data for srpc_service */ - struct srpc_service_cd **sv_cpt_data; - /* Service callbacks: - * - sv_handler: process incoming RPC request - * - sv_bulk_ready: notify bulk data - */ - int (*sv_handler)(struct srpc_server_rpc *); - int (*sv_bulk_ready)(struct srpc_server_rpc *, int); -}; - -struct sfw_session { - struct list_head sn_list; /* chain on fw_zombie_sessions */ - struct lst_sid sn_id; /* unique identifier */ - unsigned int sn_timeout; /* # seconds' inactivity to expire */ - int sn_timer_active; - unsigned int sn_features; - struct stt_timer sn_timer; - struct list_head sn_batches; /* list of batches */ - char sn_name[LST_NAME_SIZE]; - atomic_t sn_refcount; - atomic_t sn_brw_errors; - atomic_t sn_ping_errors; - unsigned long sn_started; -}; - -#define sfw_sid_equal(sid0, sid1) ((sid0).ses_nid == (sid1).ses_nid && \ - (sid0).ses_stamp == (sid1).ses_stamp) - -struct sfw_batch { - struct list_head bat_list; /* chain on sn_batches */ - struct lst_bid bat_id; /* batch id */ - int bat_error; /* error code of batch */ - struct sfw_session *bat_session; /* batch's session */ - atomic_t bat_nactive; /* # of active tests */ - struct list_head bat_tests; /* test instances */ -}; - -struct sfw_test_client_ops { - int (*tso_init)(struct sfw_test_instance *tsi); /* initialize test - * client - */ - void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test - * client - */ - int (*tso_prep_rpc)(struct sfw_test_unit *tsu, - struct lnet_process_id dest, - struct srpc_client_rpc **rpc); /* prep a tests rpc */ - void (*tso_done_rpc)(struct sfw_test_unit *tsu, - struct srpc_client_rpc *rpc); /* done a test rpc */ -}; - -struct sfw_test_instance { - struct list_head tsi_list; /* chain on batch */ - int tsi_service; /* test type */ - struct sfw_batch *tsi_batch; /* batch */ - struct sfw_test_client_ops *tsi_ops; /* test client operation - */ - - /* public parameter for all test units */ - unsigned int tsi_is_client:1; /* is test client */ - unsigned int tsi_stoptsu_onerr:1; /* stop tsu on error */ - int tsi_concur; /* concurrency */ - int tsi_loop; /* 
loop count */ - - /* status of test instance */ - spinlock_t tsi_lock; /* serialize */ - unsigned int tsi_stopping:1; /* test is stopping */ - atomic_t tsi_nactive; /* # of active test - * unit - */ - struct list_head tsi_units; /* test units */ - struct list_head tsi_free_rpcs; /* free rpcs */ - struct list_head tsi_active_rpcs; /* active rpcs */ - - union { - struct test_ping_req ping; /* ping parameter */ - struct test_bulk_req bulk_v0; /* bulk parameter */ - struct test_bulk_req_v1 bulk_v1; /* bulk v1 parameter */ - } tsi_u; -}; - -/* - * XXX: trailing (PAGE_SIZE % sizeof(struct lnet_process_id)) bytes at the end - * of pages are not used - */ -#define SFW_MAX_CONCUR LST_MAX_CONCUR -#define SFW_ID_PER_PAGE (PAGE_SIZE / sizeof(struct lnet_process_id_packed)) -#define SFW_MAX_NDESTS (LNET_MAX_IOV * SFW_ID_PER_PAGE) -#define sfw_id_pages(n) (((n) + SFW_ID_PER_PAGE - 1) / SFW_ID_PER_PAGE) - -struct sfw_test_unit { - struct list_head tsu_list; /* chain on lst_test_instance */ - struct lnet_process_id tsu_dest; /* id of dest node */ - int tsu_loop; /* loop count of the test */ - struct sfw_test_instance *tsu_instance; /* pointer to test instance */ - void *tsu_private; /* private data */ - struct swi_workitem tsu_worker; /* workitem of the test unit */ -}; - -struct sfw_test_case { - struct list_head tsc_list; /* chain on fw_tests */ - struct srpc_service *tsc_srv_service; /* test service */ - struct sfw_test_client_ops *tsc_cli_ops; /* ops of test client */ -}; - -struct srpc_client_rpc * -sfw_create_rpc(struct lnet_process_id peer, int service, - unsigned int features, int nbulkiov, int bulklen, - void (*done)(struct srpc_client_rpc *), void *priv); -int sfw_create_test_rpc(struct sfw_test_unit *tsu, - struct lnet_process_id peer, unsigned int features, - int nblk, int blklen, struct srpc_client_rpc **rpc); -void sfw_abort_rpc(struct srpc_client_rpc *rpc); -void sfw_post_rpc(struct srpc_client_rpc *rpc); -void sfw_client_rpc_done(struct srpc_client_rpc *rpc); -void sfw_unpack_message(struct srpc_msg *msg); -void sfw_free_pages(struct srpc_server_rpc *rpc); -void sfw_add_bulk_page(struct srpc_bulk *bk, struct page *pg, int i); -int sfw_alloc_pages(struct srpc_server_rpc *rpc, int cpt, int npages, int len, - int sink); -int sfw_make_session(struct srpc_mksn_reqst *request, - struct srpc_mksn_reply *reply); - -struct srpc_client_rpc * -srpc_create_client_rpc(struct lnet_process_id peer, int service, - int nbulkiov, int bulklen, - void (*rpc_done)(struct srpc_client_rpc *), - void (*rpc_fini)(struct srpc_client_rpc *), void *priv); -void srpc_post_rpc(struct srpc_client_rpc *rpc); -void srpc_abort_rpc(struct srpc_client_rpc *rpc, int why); -void srpc_free_bulk(struct srpc_bulk *bk); -struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, - unsigned int bulk_npg, unsigned int bulk_len, - int sink); -void srpc_send_rpc(struct swi_workitem *wi); -int srpc_send_reply(struct srpc_server_rpc *rpc); -int srpc_add_service(struct srpc_service *sv); -int srpc_remove_service(struct srpc_service *sv); -void srpc_shutdown_service(struct srpc_service *sv); -void srpc_abort_service(struct srpc_service *sv); -int srpc_finish_service(struct srpc_service *sv); -int srpc_service_add_buffers(struct srpc_service *sv, int nbuffer); -void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); -void srpc_get_counters(struct srpc_counters *cnt); -void srpc_set_counters(const struct srpc_counters *cnt); - -extern struct workqueue_struct *lst_serial_wq; -extern struct workqueue_struct **lst_test_wq; - 
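Taken together, the declarations above form the client RPC lifecycle: create,
post, completion callback, final decref. The following is a minimal sketch of
a caller, assuming srpc_startup() has already succeeded; my_ping_done() and
my_send_ping() are illustrative names, everything else is declared above:

/* completion callback: all events have fired and crpc_status is final */
static void my_ping_done(struct srpc_client_rpc *rpc)
{
	if (rpc->crpc_status)
		CWARN("ping to %s failed: %d\n",
		      libcfs_id2str(rpc->crpc_dest), rpc->crpc_status);

	srpc_client_rpc_decref(rpc);	/* drop the creator's reference */
}

static int my_send_ping(struct lnet_process_id peer)
{
	struct srpc_client_rpc *rpc;

	/* no bulk payload: 0 iovs, 0 bytes; a NULL fini means plain kfree */
	rpc = srpc_create_client_rpc(peer, SRPC_SERVICE_PING, 0, 0,
				     my_ping_done, NULL, NULL);
	if (!rpc)
		return -ENOMEM;

	rpc->crpc_timeout = 64;	/* seconds until srpc_client_rpc_expired */

	spin_lock(&rpc->crpc_lock);	/* srpc_post_rpc expects crpc_lock held */
	srpc_post_rpc(rpc);
	spin_unlock(&rpc->crpc_lock);
	return 0;
}

The in-tree users reach this path through sfw_create_rpc(), which layers
session and feature handling on top of the same primitives.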
-static inline int -srpc_serv_is_framework(struct srpc_service *svc) -{ - return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; -} - -static void -swi_wi_action(struct work_struct *wi) -{ - struct swi_workitem *swi; - - swi = container_of(wi, struct swi_workitem, swi_work); - - swi->swi_action(swi); -} - -static inline void -swi_init_workitem(struct swi_workitem *swi, - swi_action_t action, struct workqueue_struct *wq) -{ - swi->swi_wq = wq; - swi->swi_action = action; - swi->swi_state = SWI_STATE_NEWBORN; - INIT_WORK(&swi->swi_work, swi_wi_action); -} - -static inline void -swi_schedule_workitem(struct swi_workitem *wi) -{ - queue_work(wi->swi_wq, &wi->swi_work); -} - -static inline int -swi_cancel_workitem(struct swi_workitem *swi) -{ - return cancel_work_sync(&swi->swi_work); -} - -int sfw_startup(void); -int srpc_startup(void); -void sfw_shutdown(void); -void srpc_shutdown(void); - -static inline void -srpc_destroy_client_rpc(struct srpc_client_rpc *rpc) -{ - LASSERT(rpc); - LASSERT(!srpc_event_pending(rpc)); - LASSERT(!atomic_read(&rpc->crpc_refcount)); - - if (!rpc->crpc_fini) - kfree(rpc); - else - (*rpc->crpc_fini)(rpc); -} - -static inline void -srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, - int service, int nbulkiov, int bulklen, - void (*rpc_done)(struct srpc_client_rpc *), - void (*rpc_fini)(struct srpc_client_rpc *), void *priv) -{ - LASSERT(nbulkiov <= LNET_MAX_IOV); - - memset(rpc, 0, offsetof(struct srpc_client_rpc, - crpc_bulk.bk_iovs[nbulkiov])); - - INIT_LIST_HEAD(&rpc->crpc_list); - swi_init_workitem(&rpc->crpc_wi, srpc_send_rpc, - lst_test_wq[lnet_cpt_of_nid(peer.nid)]); - spin_lock_init(&rpc->crpc_lock); - atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ - - rpc->crpc_dest = peer; - rpc->crpc_priv = priv; - rpc->crpc_service = service; - rpc->crpc_bulk.bk_len = bulklen; - rpc->crpc_bulk.bk_niov = nbulkiov; - rpc->crpc_done = rpc_done; - rpc->crpc_fini = rpc_fini; - LNetInvalidateMDHandle(&rpc->crpc_reqstmdh); - LNetInvalidateMDHandle(&rpc->crpc_replymdh); - LNetInvalidateMDHandle(&rpc->crpc_bulk.bk_mdh); - - /* no event is expected at this point */ - rpc->crpc_bulkev.ev_fired = 1; - rpc->crpc_reqstev.ev_fired = 1; - rpc->crpc_replyev.ev_fired = 1; - - rpc->crpc_reqstmsg.msg_magic = SRPC_MSG_MAGIC; - rpc->crpc_reqstmsg.msg_version = SRPC_MSG_VERSION; - rpc->crpc_reqstmsg.msg_type = srpc_service2request(service); -} - -static inline const char * -swi_state2str(int state) -{ -#define STATE2STR(x) case x: return #x - switch (state) { - default: - LBUG(); - STATE2STR(SWI_STATE_NEWBORN); - STATE2STR(SWI_STATE_REPLY_SUBMITTED); - STATE2STR(SWI_STATE_REPLY_SENT); - STATE2STR(SWI_STATE_REQUEST_SUBMITTED); - STATE2STR(SWI_STATE_REQUEST_SENT); - STATE2STR(SWI_STATE_REPLY_RECEIVED); - STATE2STR(SWI_STATE_BULK_STARTED); - STATE2STR(SWI_STATE_DONE); - } -#undef STATE2STR -} - -#define selftest_wait_events() \ - do { \ - set_current_state(TASK_UNINTERRUPTIBLE); \ - schedule_timeout(HZ / 10); \ - } while (0) - -#define lst_wait_until(cond, lock, fmt, ...) \ -do { \ - int __I = 2; \ - while (!(cond)) { \ - CDEBUG(is_power_of_2(++__I) ? D_WARNING : D_NET, \ - fmt, ## __VA_ARGS__); \ - spin_unlock(&(lock)); \ - \ - selftest_wait_events(); \ - \ - spin_lock(&(lock)); \ - } \ -} while (0) - -static inline void -srpc_wait_service_shutdown(struct srpc_service *sv) -{ - int i = 2; - - LASSERT(sv->sv_shuttingdown); - - while (!srpc_finish_service(sv)) { - i++; - CDEBUG(((i & -i) == i) ? 
D_WARNING : D_NET, - "Waiting for %s service to shutdown...\n", - sv->sv_name); - selftest_wait_events(); - } -} - -extern struct sfw_test_client_ops brw_test_client; -void brw_init_test_client(void); - -extern struct srpc_service brw_test_service; -void brw_init_test_service(void); - -extern struct sfw_test_client_ops ping_test_client; -void ping_init_test_client(void); - -extern struct srpc_service ping_test_service; -void ping_init_test_service(void); - -#endif /* __SELFTEST_SELFTEST_H__ */ diff --git a/drivers/staging/lustre/lnet/selftest/timer.c b/drivers/staging/lustre/lnet/selftest/timer.c deleted file mode 100644 index 582f252b3e128123809b08027f59e5ca54d4978d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/timer.c +++ /dev/null @@ -1,244 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/timer.c - * - * Author: Isaac Huang - */ - -#define DEBUG_SUBSYSTEM S_LNET - -#include "selftest.h" - -/* - * Timers are implemented as a sorted queue of expiry times. The queue - * is slotted, with each slot holding timers which expire in a - * 2**STTIMER_MINPOLL (8) second period. The timers in each slot are - * sorted by increasing expiry time. The number of slots is 2**7 (128), - * to cover a time period of 1024 seconds into the future before wrapping. 
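 *
 * For example, a timer with stt_expires = 1000 hashes to
 * STTIMER_SLOT(1000) = &stt_data.stt_hash[(1000 >> 3) & 127], i.e.
 * slot 125; that slot holds every expiry in the 8-second window
 * 1000..1007 (1000 & STTIMER_SLOTTIMEMASK = 1000), and the 128 slots
 * together cover the 128 * 8 = 1024 second horizon noted above.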
- */ -#define STTIMER_MINPOLL 3 /* log2 min poll interval (8 s) */ -#define STTIMER_SLOTTIME BIT(STTIMER_MINPOLL) -#define STTIMER_SLOTTIMEMASK (~(STTIMER_SLOTTIME - 1)) -#define STTIMER_NSLOTS BIT(7) -#define STTIMER_SLOT(t) (&stt_data.stt_hash[(((t) >> STTIMER_MINPOLL) & \ - (STTIMER_NSLOTS - 1))]) - -static struct st_timer_data { - spinlock_t stt_lock; - unsigned long stt_prev_slot; /* start time of the slot processed - * previously - */ - struct list_head stt_hash[STTIMER_NSLOTS]; - int stt_shuttingdown; - wait_queue_head_t stt_waitq; - int stt_nthreads; -} stt_data; - -void -stt_add_timer(struct stt_timer *timer) -{ - struct list_head *pos; - - spin_lock(&stt_data.stt_lock); - - LASSERT(stt_data.stt_nthreads > 0); - LASSERT(!stt_data.stt_shuttingdown); - LASSERT(timer->stt_func); - LASSERT(list_empty(&timer->stt_list)); - LASSERT(timer->stt_expires > ktime_get_real_seconds()); - - /* a simple insertion sort */ - list_for_each_prev(pos, STTIMER_SLOT(timer->stt_expires)) { - struct stt_timer *old = list_entry(pos, struct stt_timer, - stt_list); - - if (timer->stt_expires >= old->stt_expires) - break; - } - list_add(&timer->stt_list, pos); - - spin_unlock(&stt_data.stt_lock); -} - -/* - * The function returns whether it has deactivated a pending timer or not. - * (ie. del_timer() of an inactive timer returns 0, del_timer() of an - * active timer returns 1.) - * - * CAVEAT EMPTOR: - * When 0 is returned, it is possible that timer->stt_func _is_ running on - * another CPU. - */ -int -stt_del_timer(struct stt_timer *timer) -{ - int ret = 0; - - spin_lock(&stt_data.stt_lock); - - LASSERT(stt_data.stt_nthreads > 0); - LASSERT(!stt_data.stt_shuttingdown); - - if (!list_empty(&timer->stt_list)) { - ret = 1; - list_del_init(&timer->stt_list); - } - - spin_unlock(&stt_data.stt_lock); - return ret; -} - -/* called with stt_data.stt_lock held */ -static int -stt_expire_list(struct list_head *slot, time64_t now) -{ - int expired = 0; - struct stt_timer *timer; - - while (!list_empty(slot)) { - timer = list_entry(slot->next, struct stt_timer, stt_list); - - if (timer->stt_expires > now) - break; - - list_del_init(&timer->stt_list); - spin_unlock(&stt_data.stt_lock); - - expired++; - (*timer->stt_func) (timer->stt_data); - - spin_lock(&stt_data.stt_lock); - } - - return expired; -} - -static int -stt_check_timers(unsigned long *last) -{ - int expired = 0; - time64_t now; - unsigned long this_slot; - - now = ktime_get_real_seconds(); - this_slot = now & STTIMER_SLOTTIMEMASK; - - spin_lock(&stt_data.stt_lock); - - while (time_after_eq(this_slot, *last)) { - expired += stt_expire_list(STTIMER_SLOT(this_slot), now); - this_slot = this_slot - STTIMER_SLOTTIME; - } - - *last = now & STTIMER_SLOTTIMEMASK; - spin_unlock(&stt_data.stt_lock); - return expired; -} - -static int -stt_timer_main(void *arg) -{ - int rc = 0; - - while (!stt_data.stt_shuttingdown) { - stt_check_timers(&stt_data.stt_prev_slot); - - rc = wait_event_timeout(stt_data.stt_waitq, - stt_data.stt_shuttingdown, - STTIMER_SLOTTIME * HZ); - } - - spin_lock(&stt_data.stt_lock); - stt_data.stt_nthreads--; - spin_unlock(&stt_data.stt_lock); - return rc; -} - -static int -stt_start_timer_thread(void) -{ - struct task_struct *task; - - LASSERT(!stt_data.stt_shuttingdown); - - task = kthread_run(stt_timer_main, NULL, "st_timer"); - if (IS_ERR(task)) - return PTR_ERR(task); - - spin_lock(&stt_data.stt_lock); - stt_data.stt_nthreads++; - spin_unlock(&stt_data.stt_lock); - return 0; -} - -int -stt_startup(void) -{ - int rc = 0; - int i; - - 
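	/*
	 * The cursor below is aligned to a slot boundary with
	 * STTIMER_SLOTTIMEMASK so that stt_check_timers() always walks
	 * whole 8-second slots.
	 */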
stt_data.stt_shuttingdown = 0; - stt_data.stt_prev_slot = ktime_get_real_seconds() & STTIMER_SLOTTIMEMASK; - - spin_lock_init(&stt_data.stt_lock); - for (i = 0; i < STTIMER_NSLOTS; i++) - INIT_LIST_HEAD(&stt_data.stt_hash[i]); - - stt_data.stt_nthreads = 0; - init_waitqueue_head(&stt_data.stt_waitq); - rc = stt_start_timer_thread(); - if (rc) - CERROR("Can't spawn timer thread: %d\n", rc); - - return rc; -} - -void -stt_shutdown(void) -{ - int i; - - spin_lock(&stt_data.stt_lock); - - for (i = 0; i < STTIMER_NSLOTS; i++) - LASSERT(list_empty(&stt_data.stt_hash[i])); - - stt_data.stt_shuttingdown = 1; - - wake_up(&stt_data.stt_waitq); - lst_wait_until(!stt_data.stt_nthreads, stt_data.stt_lock, - "waiting for %d threads to terminate\n", - stt_data.stt_nthreads); - - spin_unlock(&stt_data.stt_lock); -} diff --git a/drivers/staging/lustre/lnet/selftest/timer.h b/drivers/staging/lustre/lnet/selftest/timer.h deleted file mode 100644 index 7f0ef9bd0cdac69160b629b6bc96b9cd706232c6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lnet/selftest/timer.h +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lnet/selftest/timer.h - * - * Author: Isaac Huang - */ -#ifndef __SELFTEST_TIMER_H__ -#define __SELFTEST_TIMER_H__ - -struct stt_timer { - struct list_head stt_list; - time64_t stt_expires; - void (*stt_func)(void *); - void *stt_data; -}; - -void stt_add_timer(struct stt_timer *timer); -int stt_del_timer(struct stt_timer *timer); -int stt_startup(void); -void stt_shutdown(void); - -#endif /* __SELFTEST_TIMER_H__ */ diff --git a/drivers/staging/lustre/lustre/Kconfig b/drivers/staging/lustre/lustre/Kconfig deleted file mode 100644 index ccb78a94599545822b4cf3f8163365c6c199b03a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/Kconfig +++ /dev/null @@ -1,45 +0,0 @@ -config LUSTRE_FS - tristate "Lustre file system client support" - depends on LNET - select CRYPTO - select CRYPTO_CRC32 - select CRYPTO_CRC32_PCLMUL if X86 - select CRYPTO_CRC32C - select CRYPTO_MD5 - select CRYPTO_SHA1 - select CRYPTO_SHA256 - select CRYPTO_SHA512 - depends on MULTIUSER - help - This option enables Lustre file system client support. Choose Y - here if you want to access a Lustre file system cluster. To compile - this file system support as a module, choose M here: the module will - be called lustre. 
- - To mount Lustre file systems, you also need to install the user space - mount.lustre and other user space commands which can be found in the - lustre-client package, available from - http://downloads.whamcloud.com/public/lustre/ - - Lustre file system is the most popular cluster file system in high - performance computing. Source code of both kernel space and user space - Lustre components can also be found at - http://git.whamcloud.com/?p=fs/lustre-release.git;a=summary - - If unsure, say N. - - See also http://wiki.lustre.org/ - -config LUSTRE_DEBUG_EXPENSIVE_CHECK - bool "Enable Lustre DEBUG checks" - depends on LUSTRE_FS - help - This option is mainly for debug purpose. It enables Lustre code to do - expensive checks that may have a performance impact. - - Use with caution. If unsure, say N. - -config LUSTRE_TRANSLATE_ERRNOS - bool - depends on LUSTRE_FS && !X86 - default y diff --git a/drivers/staging/lustre/lustre/Makefile b/drivers/staging/lustre/lustre/Makefile deleted file mode 100644 index 331e4fcdd5a256b14f7e252a8c70d82dd6d6d1d3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/Makefile +++ /dev/null @@ -1,2 +0,0 @@ -obj-$(CONFIG_LUSTRE_FS) += obdclass/ ptlrpc/ fld/ osc/ mgc/ \ - fid/ lov/ mdc/ lmv/ llite/ obdecho/ diff --git a/drivers/staging/lustre/lustre/fid/Makefile b/drivers/staging/lustre/lustre/fid/Makefile deleted file mode 100644 index 77b65b92667d6e5182572266aadaee10ed0ff03f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fid/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include/ - -obj-$(CONFIG_LUSTRE_FS) += fid.o -fid-y := fid_request.o fid_lib.o lproc_fid.o diff --git a/drivers/staging/lustre/lustre/fid/fid_internal.h b/drivers/staging/lustre/lustre/fid/fid_internal.h deleted file mode 100644 index 14569e969a314a930a780a24e040d2a7957e7b59..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fid/fid_internal.h +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fid/fid_internal.h - * - * Author: Yury Umanets - */ -#ifndef __FID_INTERNAL_H -#define __FID_INTERNAL_H - -#include - -/* Functions used internally in module. 
*/ - -extern struct lprocfs_vars seq_client_debugfs_list[]; - -#endif /* __FID_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/fid/fid_lib.c b/drivers/staging/lustre/lustre/fid/fid_lib.c deleted file mode 100644 index ac52b378c15593915e770c3ed2278f4a63aa59d4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fid/fid_lib.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fid/fid_lib.c - * - * Miscellaneous fid functions. - * - * Author: Nikita Danilov - * Author: Yury Umanets - */ - -#define DEBUG_SUBSYSTEM S_FID - -#include -#include - -/** - * A cluster-wide range from which fid-sequences are granted to servers and - * then clients. - * - * Fid namespace: - *
- * Normal FID:        seq:64 [2^33,2^64-1]      oid:32          ver:32
- * IGIF      :        0:32, ino:32              gen:32          0:32
- * IDIF      :        0:31, 1:1, ost-index:16,  objd:48         0:32
- * 
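 *
 * Worked example (illustrative values, not part of the original text):
 * the normal-FID sequence space starts at 2^33 = 0x200000000, and with
 * the first 0x400 sequences reserved (see the note just below) the
 * first sequence handed out for regular use is 0x200000400.  A FID in
 * that sequence with object id 1 and version 0 would be written as
 * [0x200000400:0x1:0x0] in the seq:oid:ver form shown above.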
- * - * The first 0x400 sequences of normal FID are reserved for special purpose. - * FID_SEQ_START + 1 is for local file id generation. - * FID_SEQ_START + 2 is for .lustre directory and its objects - */ -const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { - .lsr_start = FID_SEQ_NORMAL, - .lsr_end = (__u64)~0ULL, -}; - -/* Zero range, used for init and other purposes. */ -const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { - .lsr_start = 0, -}; - -/* Lustre Big Fs Lock fid. */ -const struct lu_fid LUSTRE_BFL_FID = { .f_seq = FID_SEQ_SPECIAL, - .f_oid = FID_OID_SPECIAL_BFL, - .f_ver = 0x0000000000000000 }; -EXPORT_SYMBOL(LUSTRE_BFL_FID); - -/** Special fid for ".lustre" directory */ -const struct lu_fid LU_DOT_LUSTRE_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, - .f_oid = FID_OID_DOT_LUSTRE, - .f_ver = 0x0000000000000000 }; -EXPORT_SYMBOL(LU_DOT_LUSTRE_FID); - -/** Special fid for "fid" special object in .lustre */ -const struct lu_fid LU_OBF_FID = { .f_seq = FID_SEQ_DOT_LUSTRE, - .f_oid = FID_OID_DOT_LUSTRE_OBF, - .f_ver = 0x0000000000000000 }; -EXPORT_SYMBOL(LU_OBF_FID); diff --git a/drivers/staging/lustre/lustre/fid/fid_request.c b/drivers/staging/lustre/lustre/fid/fid_request.c deleted file mode 100644 index a34fd90ca5e5dd01ac18379dd369c0a9e437ef25..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fid/fid_request.c +++ /dev/null @@ -1,410 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fid/fid_request.c - * - * Lustre Sequence Manager - * - * Author: Yury Umanets - */ - -#define DEBUG_SUBSYSTEM S_FID - -#include - -#include -#include -#include -#include -/* mdc RPC locks */ -#include -#include "fid_internal.h" - -static struct dentry *seq_debugfs_dir; - -static int seq_client_rpc(struct lu_client_seq *seq, - struct lu_seq_range *output, __u32 opc, - const char *opcname) -{ - struct obd_export *exp = seq->lcs_exp; - struct ptlrpc_request *req; - struct lu_seq_range *out, *in; - __u32 *op; - unsigned int debug_mask; - int rc; - - LASSERT(exp && !IS_ERR(exp)); - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_SEQ_QUERY, - LUSTRE_MDS_VERSION, SEQ_QUERY); - if (!req) - return -ENOMEM; - - /* Init operation code */ - op = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_OPC); - *op = opc; - - /* Zero out input range, this is not recovery yet. 
 */ - in = req_capsule_client_get(&req->rq_pill, &RMF_SEQ_RANGE); - lu_seq_range_init(in); - - ptlrpc_request_set_replen(req); - - in->lsr_index = seq->lcs_space.lsr_index; - if (seq->lcs_type == LUSTRE_SEQ_METADATA) - fld_range_set_mdt(in); - else - fld_range_set_ost(in); - - if (opc == SEQ_ALLOC_SUPER) { - req->rq_request_portal = SEQ_CONTROLLER_PORTAL; - req->rq_reply_portal = MDC_REPLY_PORTAL; - /* While allocating a super sequence for a data object, - * the current thread might hold the export of MDT0 (MDT0 - * is precreating objects on this OST) and will send the - * request to MDT0 here, so we cannot keep resending the - * request; otherwise, if MDT0 fails (is umounted), it - * cannot release the export of MDT0. - */ - if (seq->lcs_type == LUSTRE_SEQ_DATA) { - req->rq_no_delay = 1; - req->rq_no_resend = 1; - } - debug_mask = D_CONSOLE; - } else { - if (seq->lcs_type == LUSTRE_SEQ_METADATA) { - req->rq_reply_portal = MDC_REPLY_PORTAL; - req->rq_request_portal = SEQ_METADATA_PORTAL; - } else { - req->rq_reply_portal = OSC_REPLY_PORTAL; - req->rq_request_portal = SEQ_DATA_PORTAL; - } - debug_mask = D_INFO; - } - - ptlrpc_at_set_req_timeout(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out_req; - - out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE); - - if (!lu_seq_range_is_sane(out)) { - CERROR("%s: Invalid range received from server: " - DRANGE "\n", seq->lcs_name, PRANGE(out)); - rc = -EINVAL; - goto out_req; - } - - if (lu_seq_range_is_exhausted(out)) { - CERROR("%s: Range received from server is exhausted: " - DRANGE "\n", seq->lcs_name, PRANGE(out)); - rc = -EINVAL; - goto out_req; - } - - *output = *out; - CDEBUG_LIMIT(debug_mask, "%s: Allocated %s-sequence " DRANGE "\n", - seq->lcs_name, opcname, PRANGE(output)); - -out_req: - ptlrpc_req_finished(req); - return rc; -} - -/* Request sequence-controller node to allocate new meta-sequence. */ -static int seq_client_alloc_meta(const struct lu_env *env, - struct lu_client_seq *seq) -{ - int rc; - - do { - /* If the meta server returns -EINPROGRESS or -EAGAIN, - * it might not be ready to allocate a super sequence - * from the sequence controller (MDT0) yet. - */ - rc = seq_client_rpc(seq, &seq->lcs_space, - SEQ_ALLOC_META, "meta"); - } while (rc == -EINPROGRESS || rc == -EAGAIN); - - return rc; -} - -/* Allocate new sequence for client. */ -static int seq_client_alloc_seq(const struct lu_env *env, - struct lu_client_seq *seq, u64 *seqnr) -{ - int rc; - - LASSERT(lu_seq_range_is_sane(&seq->lcs_space)); - - if (lu_seq_range_is_exhausted(&seq->lcs_space)) { - rc = seq_client_alloc_meta(env, seq); - if (rc) { - CERROR("%s: Can't allocate new meta-sequence, rc %d\n", - seq->lcs_name, rc); - *seqnr = U64_MAX; - return rc; - } - CDEBUG(D_INFO, "%s: New range - " DRANGE "\n", - seq->lcs_name, PRANGE(&seq->lcs_space)); - } else { - rc = 0; - } - - LASSERT(!lu_seq_range_is_exhausted(&seq->lcs_space)); - *seqnr = seq->lcs_space.lsr_start; - seq->lcs_space.lsr_start += 1; - - CDEBUG(D_INFO, "%s: Allocated sequence [%#llx]\n", seq->lcs_name, - *seqnr); - - return rc; -} - -/* Allocate a new FID on the passed client @seq and save it to @fid.
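 *
 * Illustrative walk-through of the allocator below: while
 * lcs_fid.f_oid is still below lcs_width, each call just bumps f_oid
 * and returns 0.  Once the width is exhausted, seq_client_alloc_seq()
 * fetches a fresh sequence from the server, f_oid restarts at
 * LUSTRE_FID_INIT_OID, and 1 is returned so the caller knows a
 * sequence switch happened and can set up the FLD entry for it.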
 */ -int seq_client_alloc_fid(const struct lu_env *env, - struct lu_client_seq *seq, struct lu_fid *fid) -{ - int rc; - - LASSERT(seq); - LASSERT(fid); - - spin_lock(&seq->lcs_lock); - - if (OBD_FAIL_CHECK(OBD_FAIL_SEQ_EXHAUST)) - seq->lcs_fid.f_oid = seq->lcs_width; - - wait_event_cmd(seq->lcs_waitq, - (!fid_is_zero(&seq->lcs_fid) && - fid_oid(&seq->lcs_fid) < seq->lcs_width) || - !seq->lcs_update, - spin_unlock(&seq->lcs_lock), - spin_lock(&seq->lcs_lock)); - - if (!fid_is_zero(&seq->lcs_fid) && - fid_oid(&seq->lcs_fid) < seq->lcs_width) { - /* Just bump last allocated fid and return to caller. */ - seq->lcs_fid.f_oid += 1; - rc = 0; - } else { - u64 seqnr; - - LASSERT(seq->lcs_update == 0); - seq->lcs_update = 1; - spin_unlock(&seq->lcs_lock); - - rc = seq_client_alloc_seq(env, seq, &seqnr); - - spin_lock(&seq->lcs_lock); - seq->lcs_update = 0; - wake_up(&seq->lcs_waitq); - - if (rc) { - CERROR("%s: Can't allocate new sequence, rc %d\n", - seq->lcs_name, rc); - spin_unlock(&seq->lcs_lock); - return rc; - } - - CDEBUG(D_INFO, "%s: Switch to sequence [0x%16.16llx]\n", - seq->lcs_name, seqnr); - - seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID; - seq->lcs_fid.f_seq = seqnr; - seq->lcs_fid.f_ver = 0; - - /* - * Inform the caller that a sequence switch was performed, - * to allow it to set up the FLD entry for the new sequence. - */ - rc = 1; - } - - *fid = seq->lcs_fid; - spin_unlock(&seq->lcs_lock); - - CDEBUG(D_INFO, - "%s: Allocated FID " DFID "\n", seq->lcs_name, PFID(fid)); - return rc; -} -EXPORT_SYMBOL(seq_client_alloc_fid); - -/* - * Finish the current sequence due to disconnect. - * See mdc_import_event() - */ -void seq_client_flush(struct lu_client_seq *seq) -{ - - LASSERT(seq); - spin_lock(&seq->lcs_lock); - - wait_event_cmd(seq->lcs_waitq, - !seq->lcs_update, - spin_unlock(&seq->lcs_lock), - spin_lock(&seq->lcs_lock)); - - fid_zero(&seq->lcs_fid); - /* - * This id should not be used for seq range allocation; - * it is set to -1 for debug checks. - */ - - seq->lcs_space.lsr_index = -1; - - lu_seq_range_init(&seq->lcs_space); - spin_unlock(&seq->lcs_lock); -} -EXPORT_SYMBOL(seq_client_flush); - -static void seq_client_debugfs_fini(struct lu_client_seq *seq) -{ - debugfs_remove_recursive(seq->lcs_debugfs_entry); -} - -static void seq_client_debugfs_init(struct lu_client_seq *seq) -{ - seq->lcs_debugfs_entry = debugfs_create_dir(seq->lcs_name, - seq_debugfs_dir); - - ldebugfs_add_vars(seq->lcs_debugfs_entry, seq_client_debugfs_list, seq); -} - -static void seq_client_fini(struct lu_client_seq *seq) -{ - seq_client_debugfs_fini(seq); - - if (seq->lcs_exp) { - class_export_put(seq->lcs_exp); - seq->lcs_exp = NULL; - } -} - -static void seq_client_init(struct lu_client_seq *seq, struct obd_export *exp, - enum lu_cli_type type, const char *prefix) -{ - LASSERT(seq); - LASSERT(prefix); - - seq->lcs_type = type; - - spin_lock_init(&seq->lcs_lock); - if (type == LUSTRE_SEQ_METADATA) - seq->lcs_width = LUSTRE_METADATA_SEQ_MAX_WIDTH; - else - seq->lcs_width = LUSTRE_DATA_SEQ_MAX_WIDTH; - - init_waitqueue_head(&seq->lcs_waitq); - /* Make sure that things are clear before work is started.
 */ - seq_client_flush(seq); - - seq->lcs_exp = class_export_get(exp); - - snprintf(seq->lcs_name, sizeof(seq->lcs_name), - "cli-%s", prefix); - - seq_client_debugfs_init(seq); -} - -int client_fid_init(struct obd_device *obd, - struct obd_export *exp, enum lu_cli_type type) -{ - struct client_obd *cli = &obd->u.cli; - char *prefix; - int rc; - - cli->cl_seq = kzalloc(sizeof(*cli->cl_seq), GFP_NOFS); - if (!cli->cl_seq) - return -ENOMEM; - - prefix = kzalloc(MAX_OBD_NAME + 5, GFP_NOFS); - if (!prefix) { - rc = -ENOMEM; - goto out_free_seq; - } - - snprintf(prefix, MAX_OBD_NAME + 5, "cli-%s", obd->obd_name); - - /* Init client side sequence-manager */ - seq_client_init(cli->cl_seq, exp, type, prefix); - kfree(prefix); - - return 0; -out_free_seq: - kfree(cli->cl_seq); - cli->cl_seq = NULL; - return rc; -} -EXPORT_SYMBOL(client_fid_init); - -int client_fid_fini(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - - if (cli->cl_seq) { - seq_client_fini(cli->cl_seq); - kfree(cli->cl_seq); - cli->cl_seq = NULL; - } - - return 0; -} -EXPORT_SYMBOL(client_fid_fini); - -static int __init fid_init(void) -{ - int rc; - - rc = libcfs_setup(); - if (rc) - return rc; - - seq_debugfs_dir = debugfs_create_dir(LUSTRE_SEQ_NAME, - debugfs_lustre_root); - return 0; -} - -static void __exit fid_exit(void) -{ - debugfs_remove_recursive(seq_debugfs_dir); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre File IDentifier"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(fid_init); -module_exit(fid_exit); diff --git a/drivers/staging/lustre/lustre/fid/lproc_fid.c b/drivers/staging/lustre/lustre/fid/lproc_fid.c deleted file mode 100644 index 0aabf473c9bd226f73a0ca6c5480ce1deecd0866..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fid/lproc_fid.c +++ /dev/null @@ -1,225 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fid/lproc_fid.c - * - * Lustre Sequence Manager - * - * Author: Yury Umanets - */ - -#define DEBUG_SUBSYSTEM S_FID - -#include - -#include -#include -#include -#include -#include -#include "fid_internal.h" - -/* Format: [0x64BIT_INT - 0x64BIT_INT] + 32 bytes just in case */ -#define MAX_FID_RANGE_STRLEN (32 + 2 * 2 * sizeof(__u64)) -/* - * Note: this function is only used for testing; it is not safe for production - * use.
- */ -static int -ldebugfs_fid_write_common(const char __user *buffer, size_t count, - struct lu_seq_range *range) -{ - struct lu_seq_range tmp; - int rc; - char kernbuf[MAX_FID_RANGE_STRLEN]; - - LASSERT(range); - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - - kernbuf[count] = 0; - - if (count == 5 && strcmp(kernbuf, "clear") == 0) { - memset(range, 0, sizeof(*range)); - return count; - } - - /* of the form "[0x0000000240000400 - 0x000000028000400]" */ - rc = sscanf(kernbuf, "[%llx - %llx]\n", - (unsigned long long *)&tmp.lsr_start, - (unsigned long long *)&tmp.lsr_end); - if (rc != 2) - return -EINVAL; - if (!lu_seq_range_is_sane(&tmp) || lu_seq_range_is_zero(&tmp) || - tmp.lsr_start < range->lsr_start || tmp.lsr_end > range->lsr_end) - return -EINVAL; - *range = tmp; - return count; -} - -/* Client side debugfs stuff */ -static ssize_t -ldebugfs_fid_space_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct lu_client_seq *seq; - struct lu_seq_range range; - int rc; - - seq = ((struct seq_file *)file->private_data)->private; - - rc = ldebugfs_fid_write_common(buffer, count, &range); - - spin_lock(&seq->lcs_lock); - if (seq->lcs_update) - /* An RPC call is active to update lcs_space */ - rc = -EBUSY; - if (rc > 0) - seq->lcs_space = range; - spin_unlock(&seq->lcs_lock); - - if (rc > 0) { - CDEBUG(D_INFO, "%s: Space: " DRANGE "\n", - seq->lcs_name, PRANGE(&range)); - } - - return rc; -} - -static int -ldebugfs_fid_space_seq_show(struct seq_file *m, void *unused) -{ - struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - int rc = 0; - - spin_lock(&seq->lcs_lock); - if (seq->lcs_update) - rc = -EBUSY; - else - seq_printf(m, "[%#llx - %#llx]:%x:%s\n", PRANGE(&seq->lcs_space)); - spin_unlock(&seq->lcs_lock); - - return rc; -} - -static ssize_t -ldebugfs_fid_width_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct lu_client_seq *seq; - __u64 max; - int rc, val; - - seq = ((struct seq_file *)file->private_data)->private; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - spin_lock(&seq->lcs_lock); - if (seq->lcs_type == LUSTRE_SEQ_DATA) - max = LUSTRE_DATA_SEQ_MAX_WIDTH; - else - max = LUSTRE_METADATA_SEQ_MAX_WIDTH; - - if (val <= max && val > 0) { - seq->lcs_width = val; - - CDEBUG(D_INFO, "%s: Sequence size: %llu\n", seq->lcs_name, - seq->lcs_width); - } - - spin_unlock(&seq->lcs_lock); - - return count; -} - -static int -ldebugfs_fid_width_seq_show(struct seq_file *m, void *unused) -{ - struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - - spin_lock(&seq->lcs_lock); - seq_printf(m, "%llu\n", seq->lcs_width); - spin_unlock(&seq->lcs_lock); - - return 0; -} - -static int -ldebugfs_fid_fid_seq_show(struct seq_file *m, void *unused) -{ - struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - - spin_lock(&seq->lcs_lock); - seq_printf(m, DFID "\n", PFID(&seq->lcs_fid)); - spin_unlock(&seq->lcs_lock); - - return 0; -} - -static int -ldebugfs_fid_server_seq_show(struct seq_file *m, void *unused) -{ - struct lu_client_seq *seq = (struct lu_client_seq *)m->private; - struct client_obd *cli; - - if (seq->lcs_exp) { - cli = &seq->lcs_exp->exp_obd->u.cli; - seq_printf(m, "%s\n", cli->cl_target_uuid.uuid); - } - - return 0; -} - -LPROC_SEQ_FOPS(ldebugfs_fid_space); -LPROC_SEQ_FOPS(ldebugfs_fid_width); -LPROC_SEQ_FOPS_RO(ldebugfs_fid_server); -LPROC_SEQ_FOPS_RO(ldebugfs_fid_fid); 
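For orientation, the four file operations above are wired into debugfs through the table that follows. A minimal user-space sketch of driving the writable "width" file is shown below; the debugfs path is a hypothetical example (the real directory name depends on the client obd name), not part of this module's ABI.

/*
 * Illustrative only: the path below is an assumption for the example;
 * the value written is parsed by lprocfs_write_helper() as an integer.
 */
#include <stdio.h>

int main(void)
{
	const char *path =	/* hypothetical client instance */
		"/sys/kernel/debug/lustre/seq/cli-example-mdc/width";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "1024\n");	/* request a smaller allocation width */
	return fclose(f) ? 1 : 0;
}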
- -struct lprocfs_vars seq_client_debugfs_list[] = { - { .name = "space", - .fops = &ldebugfs_fid_space_fops }, - { .name = "width", - .fops = &ldebugfs_fid_width_fops }, - { .name = "server", - .fops = &ldebugfs_fid_server_fops }, - { .name = "fid", - .fops = &ldebugfs_fid_fid_fops }, - { NULL } -}; diff --git a/drivers/staging/lustre/lustre/fld/Makefile b/drivers/staging/lustre/lustre/fld/Makefile deleted file mode 100644 index 426deba8b8154766a16d4b5dd998318b1307ed63..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fld/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include/ - -obj-$(CONFIG_LUSTRE_FS) += fld.o -fld-y := fld_request.o fld_cache.o lproc_fld.o diff --git a/drivers/staging/lustre/lustre/fld/fld_cache.c b/drivers/staging/lustre/lustre/fld/fld_cache.c deleted file mode 100644 index a7415c9a1c28bbed92be5a033e2cb0006f7d4c5d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fld/fld_cache.c +++ /dev/null @@ -1,516 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2013, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fld/fld_cache.c - * - * FLD (Fids Location Database) - * - * Author: Pravin Shelar - * Author: Yury Umanets - */ - -#define DEBUG_SUBSYSTEM S_FLD - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include "fld_internal.h" - -/** - * create fld cache. - */ -struct fld_cache *fld_cache_init(const char *name, - int cache_size, int cache_threshold) -{ - struct fld_cache *cache; - - LASSERT(name); - LASSERT(cache_threshold < cache_size); - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return ERR_PTR(-ENOMEM); - - INIT_LIST_HEAD(&cache->fci_entries_head); - INIT_LIST_HEAD(&cache->fci_lru); - - cache->fci_cache_count = 0; - rwlock_init(&cache->fci_lock); - - strlcpy(cache->fci_name, name, - sizeof(cache->fci_name)); - - cache->fci_cache_size = cache_size; - cache->fci_threshold = cache_threshold; - - /* Init fld cache info. */ - memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); - - CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", - cache->fci_name, cache_size, cache_threshold); - - return cache; -} - -/** - * destroy fld cache. 
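 *
 * On teardown the hit rate below is computed as fst_cache * 100 /
 * fst_count; e.g. 50 cache hits out of 200 lookups reports
 * "Cache hits: 25%".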
- */ -void fld_cache_fini(struct fld_cache *cache) -{ - __u64 pct; - - LASSERT(cache); - fld_cache_flush(cache); - - if (cache->fci_stat.fst_count > 0) { - pct = cache->fci_stat.fst_cache * 100; - do_div(pct, cache->fci_stat.fst_count); - } else { - pct = 0; - } - - CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); - CDEBUG(D_INFO, " Total reqs: %llu\n", cache->fci_stat.fst_count); - CDEBUG(D_INFO, " Cache reqs: %llu\n", cache->fci_stat.fst_cache); - CDEBUG(D_INFO, " Cache hits: %llu%%\n", pct); - - kfree(cache); -} - -/** - * delete given node from list. - */ -static void fld_cache_entry_delete(struct fld_cache *cache, - struct fld_cache_entry *node) -{ - list_del(&node->fce_list); - list_del(&node->fce_lru); - cache->fci_cache_count--; - kfree(node); -} - -/** - * fix list by checking new entry with NEXT entry in order. - */ -static void fld_fix_new_list(struct fld_cache *cache) -{ - struct fld_cache_entry *f_curr; - struct fld_cache_entry *f_next; - struct lu_seq_range *c_range; - struct lu_seq_range *n_range; - struct list_head *head = &cache->fci_entries_head; - -restart_fixup: - - list_for_each_entry_safe(f_curr, f_next, head, fce_list) { - c_range = &f_curr->fce_range; - n_range = &f_next->fce_range; - - LASSERT(lu_seq_range_is_sane(c_range)); - if (&f_next->fce_list == head) - break; - - if (c_range->lsr_flags != n_range->lsr_flags) - continue; - - LASSERTF(c_range->lsr_start <= n_range->lsr_start, - "cur lsr_start " DRANGE " next lsr_start " DRANGE "\n", - PRANGE(c_range), PRANGE(n_range)); - - /* check merge possibility with next range */ - if (c_range->lsr_end == n_range->lsr_start) { - if (c_range->lsr_index != n_range->lsr_index) - continue; - n_range->lsr_start = c_range->lsr_start; - fld_cache_entry_delete(cache, f_curr); - continue; - } - - /* check if current range overlaps with next range. */ - if (n_range->lsr_start < c_range->lsr_end) { - if (c_range->lsr_index == n_range->lsr_index) { - n_range->lsr_start = c_range->lsr_start; - n_range->lsr_end = max(c_range->lsr_end, - n_range->lsr_end); - fld_cache_entry_delete(cache, f_curr); - } else { - if (n_range->lsr_end <= c_range->lsr_end) { - *n_range = *c_range; - fld_cache_entry_delete(cache, f_curr); - } else { - n_range->lsr_start = c_range->lsr_end; - } - } - - /* we could have overlap over next - * range too. better restart. - */ - goto restart_fixup; - } - - /* kill duplicates */ - if (c_range->lsr_start == n_range->lsr_start && - c_range->lsr_end == n_range->lsr_end) - fld_cache_entry_delete(cache, f_curr); - } -} - -/** - * add node to fld cache - */ -static inline void fld_cache_entry_add(struct fld_cache *cache, - struct fld_cache_entry *f_new, - struct list_head *pos) -{ - list_add(&f_new->fce_list, pos); - list_add(&f_new->fce_lru, &cache->fci_lru); - - cache->fci_cache_count++; - fld_fix_new_list(cache); -} - -/** - * Check if cache needs to be shrunk. If so - do it. - * Remove one entry in list and so on until cache is shrunk enough. - */ -static int fld_cache_shrink(struct fld_cache *cache) -{ - int num = 0; - - if (cache->fci_cache_count < cache->fci_cache_size) - return 0; - - while (cache->fci_cache_count + cache->fci_threshold > - cache->fci_cache_size && - !list_empty(&cache->fci_lru)) { - struct fld_cache_entry *flde = - list_last_entry(&cache->fci_lru, - struct fld_cache_entry, fce_lru); - - fld_cache_entry_delete(cache, flde); - num++; - } - - CDEBUG(D_INFO, "%s: FLD cache - Shrunk by %d entries\n", - cache->fci_name, num); - - return 0; -} - -/** - * kill all fld cache entries. 
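 *
 * Flushing works by forcing fci_cache_size to zero and then invoking
 * fld_cache_shrink(), which walks the LRU list until no entries remain.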
- */ -void fld_cache_flush(struct fld_cache *cache) -{ - write_lock(&cache->fci_lock); - cache->fci_cache_size = 0; - fld_cache_shrink(cache); - write_unlock(&cache->fci_lock); -} - -/** - * punch hole in existing range. divide this range and add new - * entry accordingly. - */ - -static void fld_cache_punch_hole(struct fld_cache *cache, - struct fld_cache_entry *f_curr, - struct fld_cache_entry *f_new) -{ - const struct lu_seq_range *range = &f_new->fce_range; - const u64 new_start = range->lsr_start; - const u64 new_end = range->lsr_end; - struct fld_cache_entry *fldt; - - fldt = kzalloc(sizeof(*fldt), GFP_ATOMIC); - if (!fldt) { - kfree(f_new); - /* overlap is not allowed, so don't mess up list. */ - return; - } - /* break f_curr RANGE into three RANGES: - * f_curr, f_new, fldt - */ - - /* f_new = *range */ - - /* fldt */ - fldt->fce_range.lsr_start = new_end; - fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end; - fldt->fce_range.lsr_index = f_curr->fce_range.lsr_index; - - /* f_curr */ - f_curr->fce_range.lsr_end = new_start; - - /* add these two entries to list */ - fld_cache_entry_add(cache, f_new, &f_curr->fce_list); - fld_cache_entry_add(cache, fldt, &f_new->fce_list); - - /* no need to fixup */ -} - -/** - * handle range overlap in fld cache. - */ -static void fld_cache_overlap_handle(struct fld_cache *cache, - struct fld_cache_entry *f_curr, - struct fld_cache_entry *f_new) -{ - const struct lu_seq_range *range = &f_new->fce_range; - const u64 new_start = range->lsr_start; - const u64 new_end = range->lsr_end; - const u32 mdt = range->lsr_index; - - /* This is the overlap case; these branches only check overlap with - * the previous range, and the fixup pass handles overlap with the - * next range. - */ - - if (f_curr->fce_range.lsr_index == mdt) { - f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start, - new_start); - - f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end, - new_end); - - kfree(f_new); - fld_fix_new_list(cache); - - } else if (new_start <= f_curr->fce_range.lsr_start && - f_curr->fce_range.lsr_end <= new_end) { - /* case 1: new range completely overshadowed existing range. - * e.g. whole range migrated. update fld cache entry - */ - - f_curr->fce_range = *range; - kfree(f_new); - fld_fix_new_list(cache); - - } else if (f_curr->fce_range.lsr_start < new_start && - new_end < f_curr->fce_range.lsr_end) { - /* case 2: new range fits within existing range. */ - - fld_cache_punch_hole(cache, f_curr, f_new); - - } else if (new_end <= f_curr->fce_range.lsr_end) { - /* case 3: overlap: - * [new_start [c_start new_end) c_end) - */ - - LASSERT(new_start <= f_curr->fce_range.lsr_start); - - f_curr->fce_range.lsr_start = new_end; - fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev); - - } else if (f_curr->fce_range.lsr_start <= new_start) { - /* case 4: overlap: - * [c_start [new_start c_end) new_end) - */ - - LASSERT(f_curr->fce_range.lsr_end <= new_end); - - f_curr->fce_range.lsr_end = new_start; - fld_cache_entry_add(cache, f_new, &f_curr->fce_list); - } else { - CERROR("NEW range =" DRANGE " curr = " DRANGE "\n", - PRANGE(range), PRANGE(&f_curr->fce_range)); - } -} - -struct fld_cache_entry -*fld_cache_entry_create(const struct lu_seq_range *range) -{ - struct fld_cache_entry *f_new; - - LASSERT(lu_seq_range_is_sane(range)); - - f_new = kzalloc(sizeof(*f_new), GFP_NOFS); - if (!f_new) - return ERR_PTR(-ENOMEM); - - f_new->fce_range = *range; - return f_new; -} - -/** - * Insert FLD entry in FLD cache.
- * - * This function handles all cases of merging and breaking up of - * ranges. - */ -static int fld_cache_insert_nolock(struct fld_cache *cache, - struct fld_cache_entry *f_new) -{ - struct fld_cache_entry *f_curr; - struct fld_cache_entry *n; - struct list_head *head; - struct list_head *prev = NULL; - const u64 new_start = f_new->fce_range.lsr_start; - const u64 new_end = f_new->fce_range.lsr_end; - __u32 new_flags = f_new->fce_range.lsr_flags; - - /* - * Duplicate entries are eliminated in the insert op, - * so we don't need to search for the new entry before - * starting the insertion loop. - */ - - if (!cache->fci_no_shrink) - fld_cache_shrink(cache); - - head = &cache->fci_entries_head; - - list_for_each_entry_safe(f_curr, n, head, fce_list) { - /* add list if next is end of list */ - if (new_end < f_curr->fce_range.lsr_start || - (new_end == f_curr->fce_range.lsr_start && - new_flags != f_curr->fce_range.lsr_flags)) - break; - - prev = &f_curr->fce_list; - /* check if this range is to the left of the new range. */ - if (new_start < f_curr->fce_range.lsr_end && - new_flags == f_curr->fce_range.lsr_flags) { - fld_cache_overlap_handle(cache, f_curr, f_new); - goto out; - } - } - - if (!prev) - prev = head; - - CDEBUG(D_INFO, "insert range " DRANGE "\n", PRANGE(&f_new->fce_range)); - /* Add new entry to cache and lru list. */ - fld_cache_entry_add(cache, f_new, prev); -out: - return 0; -} - -int fld_cache_insert(struct fld_cache *cache, - const struct lu_seq_range *range) -{ - struct fld_cache_entry *flde; - int rc; - - flde = fld_cache_entry_create(range); - if (IS_ERR(flde)) - return PTR_ERR(flde); - - write_lock(&cache->fci_lock); - rc = fld_cache_insert_nolock(cache, flde); - write_unlock(&cache->fci_lock); - if (rc) - kfree(flde); - - return rc; -} - -/** - * Lookup an FLD entry in the FLD cache without taking the cache lock. - */ -struct fld_cache_entry -*fld_cache_entry_lookup_nolock(struct fld_cache *cache, - struct lu_seq_range *range) -{ - struct fld_cache_entry *flde; - struct fld_cache_entry *got = NULL; - struct list_head *head; - - head = &cache->fci_entries_head; - list_for_each_entry(flde, head, fce_list) { - if (range->lsr_start == flde->fce_range.lsr_start || - (range->lsr_end == flde->fce_range.lsr_end && - range->lsr_flags == flde->fce_range.lsr_flags)) { - got = flde; - break; - } - } - - return got; -} - -/** - * Lookup an entry matching \a range in the fld cache. - */ -struct fld_cache_entry -*fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range) -{ - struct fld_cache_entry *got = NULL; - - read_lock(&cache->fci_lock); - got = fld_cache_entry_lookup_nolock(cache, range); - read_unlock(&cache->fci_lock); - return got; -} - -/** - * Lookup the range containing sequence \a seq in the fld cache.
- */ -int fld_cache_lookup(struct fld_cache *cache, - const u64 seq, struct lu_seq_range *range) -{ - struct fld_cache_entry *flde; - struct fld_cache_entry *prev = NULL; - struct list_head *head; - - read_lock(&cache->fci_lock); - head = &cache->fci_entries_head; - - cache->fci_stat.fst_count++; - list_for_each_entry(flde, head, fce_list) { - if (flde->fce_range.lsr_start > seq) { - if (prev) - *range = prev->fce_range; - break; - } - - prev = flde; - if (lu_seq_range_within(&flde->fce_range, seq)) { - *range = flde->fce_range; - - cache->fci_stat.fst_cache++; - read_unlock(&cache->fci_lock); - return 0; - } - } - read_unlock(&cache->fci_lock); - return -ENOENT; -} diff --git a/drivers/staging/lustre/lustre/fld/fld_internal.h b/drivers/staging/lustre/lustre/fld/fld_internal.h deleted file mode 100644 index e1d6aaa5c2b4cf695bad3ce7490a0d64fba68847..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fld/fld_internal.h +++ /dev/null @@ -1,170 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fld/fld_internal.h - * - * Subsystem Description: - * FLD is the FID Location Database, which stores where (i.e., on which MDT) - * FIDs are located. - * The database is basically a record file; each record consists of a FID - * sequence range, an MDT/OST index, and flags. The FLD for the whole FS - * is only stored on the sequence controller (MDT0) right now, but each target - * also has its local FLD, which only stores the local sequences. - * - * The FLD subsystem usually has two tasks: - * 1. Maintain the database: when the sequence controller allocates - * new sequence ranges to some nodes, it calls the FLD API to insert the - * location information into the FLDB. - * - * 2. Handle requests from other nodes: if a client needs to know where - * a FID is located and cannot find the information in its local cache, - * it sends a FLD lookup RPC to the FLD service, which looks up the - * FLDB entry and returns the location information to the client.
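 *
 * Illustrative client-side flow (see fld_request.c): a lookup first
 * tries the local cache via fld_cache_lookup(); on a miss it selects a
 * target with the round-robin hash (sequence number modulo the target
 * count, with special sequences pinned to MDT0), sends an FLD_QUERY
 * RPC through fld_client_rpc(), and caches the returned range with
 * fld_cache_insert().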
- * - * - * Author: Yury Umanets - * Author: Tom WangDi - */ -#ifndef __FLD_INTERNAL_H -#define __FLD_INTERNAL_H - -#include - -#include -#include - -struct fld_stats { - __u64 fst_count; - __u64 fst_cache; - __u64 fst_inflight; -}; - -struct lu_fld_hash { - const char *fh_name; - int (*fh_hash_func)(struct lu_client_fld *, __u64); - struct lu_fld_target *(*fh_scan_func)(struct lu_client_fld *, __u64); -}; - -struct fld_cache_entry { - struct list_head fce_lru; - struct list_head fce_list; - /** fld cache entries are sorted on range->lsr_start field. */ - struct lu_seq_range fce_range; -}; - -struct fld_cache { - /** - * Cache guard, protects fci_hash mostly because others immutable after - * init is finished. - */ - rwlock_t fci_lock; - - /** Cache shrink threshold */ - int fci_threshold; - - /** Preferred number of cached entries */ - int fci_cache_size; - - /** Current number of cached entries. Protected by \a fci_lock */ - int fci_cache_count; - - /** LRU list fld entries. */ - struct list_head fci_lru; - - /** sorted fld entries. */ - struct list_head fci_entries_head; - - /** Cache statistics. */ - struct fld_stats fci_stat; - - /** Cache name used for debug and messages. */ - char fci_name[LUSTRE_MDT_MAXNAMELEN]; - unsigned int fci_no_shrink:1; -}; - -enum { - /* 4M of FLD cache will not hurt client a lot. */ - FLD_SERVER_CACHE_SIZE = (4 * 0x100000), - - /* 1M of FLD cache will not hurt client a lot. */ - FLD_CLIENT_CACHE_SIZE = (1 * 0x100000) -}; - -enum { - /* Cache threshold is 10 percent of size. */ - FLD_SERVER_CACHE_THRESHOLD = 10, - - /* Cache threshold is 10 percent of size. */ - FLD_CLIENT_CACHE_THRESHOLD = 10 -}; - -extern struct lu_fld_hash fld_hash[]; - -int fld_client_rpc(struct obd_export *exp, - struct lu_seq_range *range, __u32 fld_op, - struct ptlrpc_request **reqp); - -extern struct lprocfs_vars fld_client_debugfs_list[]; - -struct fld_cache *fld_cache_init(const char *name, - int cache_size, int cache_threshold); - -void fld_cache_fini(struct fld_cache *cache); - -void fld_cache_flush(struct fld_cache *cache); - -int fld_cache_insert(struct fld_cache *cache, - const struct lu_seq_range *range); - -struct fld_cache_entry -*fld_cache_entry_create(const struct lu_seq_range *range); - -int fld_cache_lookup(struct fld_cache *cache, - const u64 seq, struct lu_seq_range *range); - -struct fld_cache_entry* -fld_cache_entry_lookup(struct fld_cache *cache, struct lu_seq_range *range); - -struct fld_cache_entry -*fld_cache_entry_lookup_nolock(struct fld_cache *cache, - struct lu_seq_range *range); - -static inline const char * -fld_target_name(struct lu_fld_target *tar) -{ - if (tar->ft_srv) - return tar->ft_srv->lsf_name; - - return (const char *)tar->ft_exp->exp_obd->obd_name; -} - -#endif /* __FLD_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/fld/fld_request.c b/drivers/staging/lustre/lustre/fld/fld_request.c deleted file mode 100644 index 97f7ea6323463e888f7b853bac4e3dc6ac7175f2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fld/fld_request.c +++ /dev/null @@ -1,446 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fld/fld_request.c - * - * FLD (Fids Location Database) - * - * Author: Yury Umanets - */ - -#define DEBUG_SUBSYSTEM S_FLD - -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include "fld_internal.h" - -static int fld_rrb_hash(struct lu_client_fld *fld, u64 seq) -{ - LASSERT(fld->lcf_count > 0); - return do_div(seq, fld->lcf_count); -} - -static struct lu_fld_target * -fld_rrb_scan(struct lu_client_fld *fld, u64 seq) -{ - struct lu_fld_target *target; - int hash; - - /* Because almost all special sequences are located on MDT0, - * they should go to index 0 directly instead of calculating the - * hash again; also, if other MDTs are not yet connected, the fld - * lookup requests (for sequences on MDT0) should not be blocked - * because of the other MDTs. - */ - if (fid_seq_is_norm(seq)) - hash = fld_rrb_hash(fld, seq); - else - hash = 0; - -again: - list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - if (target->ft_idx == hash) - return target; - } - - if (hash != 0) { - /* It is possible that the remote target (MDT) is not yet - * connected to the client, so refer this to MDT0, which - * should be connected during mount. - */ - hash = 0; - goto again; - } - - CERROR("%s: Can't find target by hash %d (seq %#llx). Targets (%d):\n", - fld->lcf_name, hash, seq, fld->lcf_count); - - list_for_each_entry(target, &fld->lcf_targets, ft_chain) { - const char *srv_name = target->ft_srv ? - target->ft_srv->lsf_name : ""; - const char *exp_name = target->ft_exp ? - (char *)target->ft_exp->exp_obd->obd_uuid.uuid : - ""; - - CERROR(" exp: 0x%p (%s), srv: 0x%p (%s), idx: %llu\n", - target->ft_exp, exp_name, target->ft_srv, - srv_name, target->ft_idx); - } - - /* - * If the target is not found, there is a logical error anyway, - * so LBUG() is here to catch this situation. - */ - LBUG(); - return NULL; -} - -struct lu_fld_hash fld_hash[] = { - { - .fh_name = "RRB", - .fh_hash_func = fld_rrb_hash, - .fh_scan_func = fld_rrb_scan - }, - { - NULL, - } -}; - -static struct lu_fld_target * -fld_client_get_target(struct lu_client_fld *fld, u64 seq) -{ - struct lu_fld_target *target; - - LASSERT(fld->lcf_hash); - - spin_lock(&fld->lcf_lock); - target = fld->lcf_hash->fh_scan_func(fld, seq); - spin_unlock(&fld->lcf_lock); - - if (target) { - CDEBUG(D_INFO, "%s: Found target (idx %llu) by seq %#llx\n", - fld->lcf_name, target->ft_idx, seq); - } - - return target; -} - -/* - * Add an export to the FLD. This is usually done by CMM and LMV, as they are - * the main users of the FLD module.
- */ -int fld_client_add_target(struct lu_client_fld *fld, - struct lu_fld_target *tar) -{ - const char *name; - struct lu_fld_target *target, *tmp; - - LASSERT(tar); - name = fld_target_name(tar); - LASSERT(name); - LASSERT(tar->ft_srv || tar->ft_exp); - - CDEBUG(D_INFO, "%s: Adding target %s (idx %llu)\n", - fld->lcf_name, name, tar->ft_idx); - - target = kzalloc(sizeof(*target), GFP_NOFS); - if (!target) - return -ENOMEM; - - spin_lock(&fld->lcf_lock); - list_for_each_entry(tmp, &fld->lcf_targets, ft_chain) { - if (tmp->ft_idx == tar->ft_idx) { - spin_unlock(&fld->lcf_lock); - kfree(target); - CERROR("Target %s exists in FLD and known as %s:#%llu\n", - name, fld_target_name(tmp), tmp->ft_idx); - return -EEXIST; - } - } - - target->ft_exp = tar->ft_exp; - if (target->ft_exp) - class_export_get(target->ft_exp); - target->ft_srv = tar->ft_srv; - target->ft_idx = tar->ft_idx; - - list_add_tail(&target->ft_chain, &fld->lcf_targets); - - fld->lcf_count++; - spin_unlock(&fld->lcf_lock); - - return 0; -} -EXPORT_SYMBOL(fld_client_add_target); - -/* Remove export from FLD */ -int fld_client_del_target(struct lu_client_fld *fld, __u64 idx) -{ - struct lu_fld_target *target, *tmp; - - spin_lock(&fld->lcf_lock); - list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { - if (target->ft_idx == idx) { - fld->lcf_count--; - list_del(&target->ft_chain); - spin_unlock(&fld->lcf_lock); - - if (target->ft_exp) - class_export_put(target->ft_exp); - - kfree(target); - return 0; - } - } - spin_unlock(&fld->lcf_lock); - return -ENOENT; -} - -static struct dentry *fld_debugfs_dir; - -static void fld_client_debugfs_init(struct lu_client_fld *fld) -{ - fld->lcf_debugfs_entry = debugfs_create_dir(fld->lcf_name, - fld_debugfs_dir); - - ldebugfs_add_vars(fld->lcf_debugfs_entry, fld_client_debugfs_list, fld); -} - -void fld_client_debugfs_fini(struct lu_client_fld *fld) -{ - debugfs_remove_recursive(fld->lcf_debugfs_entry); -} -EXPORT_SYMBOL(fld_client_debugfs_fini); - -static inline int hash_is_sane(int hash) -{ - return (hash >= 0 && hash < ARRAY_SIZE(fld_hash)); -} - -int fld_client_init(struct lu_client_fld *fld, - const char *prefix, int hash) -{ - int cache_size, cache_threshold; - int rc = 0; - - snprintf(fld->lcf_name, sizeof(fld->lcf_name), - "cli-%s", prefix); - - if (!hash_is_sane(hash)) { - CERROR("%s: Wrong hash function %#x\n", - fld->lcf_name, hash); - return -EINVAL; - } - - fld->lcf_count = 0; - spin_lock_init(&fld->lcf_lock); - fld->lcf_hash = &fld_hash[hash]; - INIT_LIST_HEAD(&fld->lcf_targets); - - cache_size = FLD_CLIENT_CACHE_SIZE / - sizeof(struct fld_cache_entry); - - cache_threshold = cache_size * - FLD_CLIENT_CACHE_THRESHOLD / 100; - - fld->lcf_cache = fld_cache_init(fld->lcf_name, - cache_size, cache_threshold); - if (IS_ERR(fld->lcf_cache)) { - rc = PTR_ERR(fld->lcf_cache); - fld->lcf_cache = NULL; - goto out; - } - - fld_client_debugfs_init(fld); -out: - CDEBUG(D_INFO, "%s: Using \"%s\" hash\n", - fld->lcf_name, fld->lcf_hash->fh_name); - return rc; -} -EXPORT_SYMBOL(fld_client_init); - -void fld_client_fini(struct lu_client_fld *fld) -{ - struct lu_fld_target *target, *tmp; - - spin_lock(&fld->lcf_lock); - list_for_each_entry_safe(target, tmp, &fld->lcf_targets, ft_chain) { - fld->lcf_count--; - list_del(&target->ft_chain); - if (target->ft_exp) - class_export_put(target->ft_exp); - kfree(target); - } - spin_unlock(&fld->lcf_lock); - - if (fld->lcf_cache) { - if (!IS_ERR(fld->lcf_cache)) - fld_cache_fini(fld->lcf_cache); - fld->lcf_cache = NULL; - } -} 
-EXPORT_SYMBOL(fld_client_fini); - -int fld_client_rpc(struct obd_export *exp, - struct lu_seq_range *range, __u32 fld_op, - struct ptlrpc_request **reqp) -{ - struct ptlrpc_request *req = NULL; - struct lu_seq_range *prange; - __u32 *op; - int rc = 0; - struct obd_import *imp; - - LASSERT(exp); - - imp = class_exp2cliimp(exp); - switch (fld_op) { - case FLD_QUERY: - req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_QUERY, - LUSTRE_MDS_VERSION, FLD_QUERY); - if (!req) - return -ENOMEM; - - /* - * XXX: only needed when talking to old server(< 2.6), it should - * be removed when < 2.6 server is not supported - */ - op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); - *op = FLD_LOOKUP; - - if (imp->imp_connect_flags_orig & OBD_CONNECT_MDS_MDS) - req->rq_allow_replay = 1; - break; - case FLD_READ: - req = ptlrpc_request_alloc_pack(imp, &RQF_FLD_READ, - LUSTRE_MDS_VERSION, FLD_READ); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, - RCL_SERVER, PAGE_SIZE); - break; - default: - rc = -EINVAL; - break; - } - if (rc) - return rc; - - prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); - *prange = *range; - ptlrpc_request_set_replen(req); - req->rq_request_portal = FLD_REQUEST_PORTAL; - req->rq_reply_portal = MDC_REPLY_PORTAL; - ptlrpc_at_set_req_timeout(req); - - obd_get_request_slot(&exp->exp_obd->u.cli); - rc = ptlrpc_queue_wait(req); - obd_put_request_slot(&exp->exp_obd->u.cli); - if (rc) - goto out_req; - - if (fld_op == FLD_QUERY) { - prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); - if (!prange) { - rc = -EFAULT; - goto out_req; - } - *range = *prange; - } - -out_req: - if (rc || !reqp) { - ptlrpc_req_finished(req); - req = NULL; - } - - if (reqp) - *reqp = req; - - return rc; -} - -int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, - __u32 flags, const struct lu_env *env) -{ - struct lu_seq_range res = { 0 }; - struct lu_fld_target *target; - int rc; - - rc = fld_cache_lookup(fld->lcf_cache, seq, &res); - if (rc == 0) { - *mds = res.lsr_index; - return 0; - } - - /* Can not find it in the cache */ - target = fld_client_get_target(fld, seq); - LASSERT(target); - - CDEBUG(D_INFO, - "%s: Lookup fld entry (seq: %#llx) on target %s (idx %llu)\n", - fld->lcf_name, seq, fld_target_name(target), target->ft_idx); - - res.lsr_start = seq; - fld_range_set_type(&res, flags); - rc = fld_client_rpc(target->ft_exp, &res, FLD_QUERY, NULL); - - if (rc == 0) { - *mds = res.lsr_index; - - fld_cache_insert(fld->lcf_cache, &res); - } - return rc; -} -EXPORT_SYMBOL(fld_client_lookup); - -void fld_client_flush(struct lu_client_fld *fld) -{ - fld_cache_flush(fld->lcf_cache); -} - -static int __init fld_init(void) -{ - int rc; - - rc = libcfs_setup(); - if (rc) - return rc; - - fld_debugfs_dir = debugfs_create_dir(LUSTRE_FLD_NAME, - debugfs_lustre_root); - return 0; -} - -static void __exit fld_exit(void) -{ - debugfs_remove_recursive(fld_debugfs_dir); -} - -MODULE_AUTHOR("OpenSFS, Inc. 
"); -MODULE_DESCRIPTION("Lustre FID Location Database"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(fld_init) -module_exit(fld_exit) diff --git a/drivers/staging/lustre/lustre/fld/lproc_fld.c b/drivers/staging/lustre/lustre/fld/lproc_fld.c deleted file mode 100644 index 0bcfb26ef8aaa7a11043b62df206654bc673f3ad..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/fld/lproc_fld.c +++ /dev/null @@ -1,154 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/fld/lproc_fld.c - * - * FLD (FIDs Location Database) - * - * Author: Yury Umanets - * Di Wang - */ - -#define DEBUG_SUBSYSTEM S_FLD - -#include - -#include -#include -#include -#include -#include -#include -#include "fld_internal.h" - -static int -fld_debugfs_targets_seq_show(struct seq_file *m, void *unused) -{ - struct lu_client_fld *fld = (struct lu_client_fld *)m->private; - struct lu_fld_target *target; - - spin_lock(&fld->lcf_lock); - list_for_each_entry(target, &fld->lcf_targets, ft_chain) - seq_printf(m, "%s\n", fld_target_name(target)); - spin_unlock(&fld->lcf_lock); - - return 0; -} - -static int -fld_debugfs_hash_seq_show(struct seq_file *m, void *unused) -{ - struct lu_client_fld *fld = (struct lu_client_fld *)m->private; - - spin_lock(&fld->lcf_lock); - seq_printf(m, "%s\n", fld->lcf_hash->fh_name); - spin_unlock(&fld->lcf_lock); - - return 0; -} - -static ssize_t -fld_debugfs_hash_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct lu_client_fld *fld; - struct lu_fld_hash *hash = NULL; - char fh_name[8]; - int i; - - if (count > sizeof(fh_name)) - return -ENAMETOOLONG; - - if (copy_from_user(fh_name, buffer, count) != 0) - return -EFAULT; - - fld = ((struct seq_file *)file->private_data)->private; - - for (i = 0; fld_hash[i].fh_name; i++) { - if (count != strlen(fld_hash[i].fh_name)) - continue; - - if (!strncmp(fld_hash[i].fh_name, fh_name, count)) { - hash = &fld_hash[i]; - break; - } - } - - if (hash) { - spin_lock(&fld->lcf_lock); - fld->lcf_hash = hash; - spin_unlock(&fld->lcf_lock); - - CDEBUG(D_INFO, "%s: Changed hash to \"%s\"\n", - fld->lcf_name, hash->fh_name); - } - - return count; -} - -static ssize_t -fld_debugfs_cache_flush_write(struct file *file, const char __user *buffer, - size_t count, loff_t *pos) -{ - struct lu_client_fld *fld = file->private_data; - - fld_cache_flush(fld->lcf_cache); - - 
CDEBUG(D_INFO, "%s: Lookup cache is flushed\n", fld->lcf_name); - - return count; -} - -static int -fld_debugfs_cache_flush_release(struct inode *inode, struct file *file) -{ - file->private_data = NULL; - return 0; -} - -static const struct file_operations fld_debugfs_cache_flush_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .write = fld_debugfs_cache_flush_write, - .release = fld_debugfs_cache_flush_release, -}; - -LPROC_SEQ_FOPS_RO(fld_debugfs_targets); -LPROC_SEQ_FOPS(fld_debugfs_hash); - -struct lprocfs_vars fld_client_debugfs_list[] = { - { "targets", &fld_debugfs_targets_fops }, - { "hash", &fld_debugfs_hash_fops }, - { "cache_flush", &fld_debugfs_cache_flush_fops }, - { NULL } -}; diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h deleted file mode 100644 index 6f7b991be8098afaebfa541e7a47246dbe9e6795..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/cl_object.h +++ /dev/null @@ -1,2463 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -#ifndef _LUSTRE_CL_OBJECT_H -#define _LUSTRE_CL_OBJECT_H - -/** \defgroup clio clio - * - * Client objects implement io operations and cache pages. - * - * Examples: lov and osc are implementations of cl interface. - * - * Big Theory Statement. - * - * Layered objects. - * - * Client implementation is based on the following data-types: - * - * - cl_object - * - * - cl_page - * - * - cl_lock represents an extent lock on an object. - * - * - cl_io represents high-level i/o activity such as whole read/write - * system call, or write-out of pages from under the lock being - * canceled. cl_io has sub-ios that can be stopped and resumed - * independently, thus achieving high degree of transfer - * parallelism. Single cl_io can be advanced forward by - * the multiple threads (although in the most usual case of - * read/write system call it is associated with the single user - * thread, that issued the system call). - * - * Terminology - * - * - to avoid confusion high-level I/O operation like read or write system - * call is referred to as "an io", whereas low-level I/O operation, like - * RPC, is referred to as "a transfer" - * - * - "generic code" means generic (not file system specific) code in the - * hosting environment. "cl-code" means code (mostly in cl_*.c files) that - * is not layer specific. - * - * Locking. 
- * - * - i_mutex - * - PG_locked - * - cl_object_header::coh_page_guard - * - lu_site::ls_guard - * - * See the top comment in cl_object.c for the description of overall locking and - * reference-counting design. - * - * See comments below for the description of i/o, page, and dlm-locking - * design. - * - * @{ - */ - -/* - * super-class definitions. - */ -#include -#include -#include -#include -#include -#include -#include - -struct inode; - -struct cl_device; - -struct cl_object; - -struct cl_page; -struct cl_page_slice; -struct cl_lock; -struct cl_lock_slice; - -struct cl_lock_operations; -struct cl_page_operations; - -struct cl_io; -struct cl_io_slice; - -struct cl_req_attr; - -/** - * Device in the client stack. - * - * \see vvp_device, lov_device, lovsub_device, osc_device - */ -struct cl_device { - /** Super-class. */ - struct lu_device cd_lu_dev; -}; - -/** \addtogroup cl_object cl_object - * @{ - */ -/** - * "Data attributes" of cl_object. Data attributes can be updated - * independently for a sub-object, and top-object's attributes are calculated - * from sub-objects' ones. - */ -struct cl_attr { - /** Object size, in bytes */ - loff_t cat_size; - /** - * Known minimal size, in bytes. - * - * This is only valid when at least one DLM lock is held. - */ - loff_t cat_kms; - /** Modification time. Measured in seconds since epoch. */ - time64_t cat_mtime; - /** Access time. Measured in seconds since epoch. */ - time64_t cat_atime; - /** Change time. Measured in seconds since epoch. */ - time64_t cat_ctime; - /** - * Blocks allocated to this cl_object on the server file system. - * - * \todo XXX An interface for block size is needed. - */ - __u64 cat_blocks; - /** - * User identifier for quota purposes. - */ - uid_t cat_uid; - /** - * Group identifier for quota purposes. - */ - gid_t cat_gid; - - /* nlink of the directory */ - __u64 cat_nlink; -}; - -/** - * Fields in cl_attr that are being set. - */ -enum cl_attr_valid { - CAT_SIZE = 1 << 0, - CAT_KMS = 1 << 1, - CAT_MTIME = 1 << 3, - CAT_ATIME = 1 << 4, - CAT_CTIME = 1 << 5, - CAT_BLOCKS = 1 << 6, - CAT_UID = 1 << 7, - CAT_GID = 1 << 8 -}; - -/** - * Sub-class of lu_object with methods common for objects on the client - * stacks. - * - * cl_object: represents a regular file system object, both a file and a - * stripe. cl_object is based on lu_object: it is identified by a fid, - * layered, cached, hashed, and lrued. Important distinction with the server - * side, where md_object and dt_object are used, is that cl_object "fans out" - * at the lov/sns level: depending on the file layout, single file is - * represented as a set of "sub-objects" (stripes). At the implementation - * level, struct lov_object contains an array of cl_objects. Each sub-object - * is a full-fledged cl_object, having its fid, living in the lru and hash - * table. - * - * This leads to the next important difference with the server side: on the - * client, it's quite usual to have objects with the different sequence of - * layers. For example, typical top-object is composed of the following - * layers: - * - * - vvp - * - lov - * - * whereas its sub-objects are composed of - * - * - lovsub - * - osc - * - * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep - * track of the object-subobject relationship. - * - * Sub-objects are not cached independently: when top-object is about to - * be discarded from the memory, all its sub-objects are torn-down and - * destroyed too. 
- * - * \see vvp_object, lov_object, lovsub_object, osc_object - */ -struct cl_object { - /** super class */ - struct lu_object co_lu; - /** per-object-layer operations */ - const struct cl_object_operations *co_ops; - /** offset of page slice in cl_page buffer */ - int co_slice_off; -}; - -/** - * Description of the client object configuration. This is used for the - * creation of a new client object that is identified by a more state than - * fid. - */ -struct cl_object_conf { - /** Super-class. */ - struct lu_object_conf coc_lu; - union { - /** - * Object layout. This is consumed by lov. - */ - struct lu_buf coc_layout; - /** - * Description of particular stripe location in the - * cluster. This is consumed by osc. - */ - struct lov_oinfo *coc_oinfo; - } u; - /** - * VFS inode. This is consumed by vvp. - */ - struct inode *coc_inode; - /** - * Layout lock handle. - */ - struct ldlm_lock *coc_lock; - /** - * Operation to handle layout, OBJECT_CONF_XYZ. - */ - int coc_opc; -}; - -enum { - /** configure layout, set up a new stripe, must be called while - * holding layout lock. - */ - OBJECT_CONF_SET = 0, - /** invalidate the current stripe configuration due to losing - * layout lock. - */ - OBJECT_CONF_INVALIDATE = 1, - /** wait for old layout to go away so that new layout can be set up. */ - OBJECT_CONF_WAIT = 2 -}; - -enum { - CL_LAYOUT_GEN_NONE = (u32)-2, /* layout lock was cancelled */ - CL_LAYOUT_GEN_EMPTY = (u32)-1, /* for empty layout */ -}; - -struct cl_layout { - /** the buffer to return the layout in lov_mds_md format. */ - struct lu_buf cl_buf; - /** size of layout in lov_mds_md format. */ - size_t cl_size; - /** Layout generation. */ - u32 cl_layout_gen; -}; - -/** - * Operations implemented for each cl object layer. - * - * \see vvp_ops, lov_ops, lovsub_ops, osc_ops - */ -struct cl_object_operations { - /** - * Initialize page slice for this layer. Called top-to-bottom through - * every object layer when a new cl_page is instantiated. Layer - * keeping private per-page data, or requiring its own page operations - * vector should allocate these data here, and attach then to the page - * by calling cl_page_slice_add(). \a vmpage is locked (in the VM - * sense). Optional. - * - * \retval NULL success. - * - * \retval ERR_PTR(errno) failure code. - * - * \retval valid-pointer pointer to already existing referenced page - * to be used instead of newly created. - */ - int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); - /** - * Initialize lock slice for this layer. Called top-to-bottom through - * every object layer when a new cl_lock is instantiated. Layer - * keeping private per-lock data, or requiring its own lock operations - * vector should allocate these data here, and attach then to the lock - * by calling cl_lock_slice_add(). Mandatory. - */ - int (*coo_lock_init)(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io); - /** - * Initialize io state for a given layer. - * - * called top-to-bottom once per io existence to initialize io - * state. If layer wants to keep some state for this type of io, it - * has to embed struct cl_io_slice in lu_env::le_ses, and register - * slice with cl_io_slice_add(). It is guaranteed that all threads - * participating in this io share the same session. - */ - int (*coo_io_init)(const struct lu_env *env, - struct cl_object *obj, struct cl_io *io); - /** - * Fill portion of \a attr that this layer controls. 
This method is - * called top-to-bottom through all object layers. - * - * \pre cl_object_header::coh_attr_guard of the top-object is locked. - * - * \return 0: to continue - * \return +ve: to stop iterating through layers (but 0 is returned - * from enclosing cl_object_attr_get()) - * \return -ve: to signal error - */ - int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr); - /** - * Update attributes. - * - * \a valid is a bitmask composed from enum #cl_attr_valid, and - * indicating what attributes are to be set. - * - * \pre cl_object_header::coh_attr_guard of the top-object is locked. - * - * \return the same convention as for - * cl_object_operations::coo_attr_get() is used. - */ - int (*coo_attr_update)(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid); - /** - * Update object configuration. Called top-to-bottom to modify object - * configuration. - * - * XXX error conditions and handling. - */ - int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf); - /** - * Glimpse ast. Executed when glimpse ast arrives for a lock on this - * object. Layers are supposed to fill parts of \a lvb that will be - * shipped to the glimpse originator as a glimpse result. - * - * \see vvp_object_glimpse(), lovsub_object_glimpse(), - * \see osc_object_glimpse() - */ - int (*coo_glimpse)(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb); - /** - * Object prune method. Called when the layout is going to change on - * this object, therefore each layer has to clean up their cache, - * mainly pages and locks. - */ - int (*coo_prune)(const struct lu_env *env, struct cl_object *obj); - /** - * Object getstripe method. - */ - int (*coo_getstripe)(const struct lu_env *env, struct cl_object *obj, - struct lov_user_md __user *lum); - /** - * Get FIEMAP mapping from the object. - */ - int (*coo_fiemap)(const struct lu_env *env, struct cl_object *obj, - struct ll_fiemap_info_key *fmkey, - struct fiemap *fiemap, size_t *buflen); - /** - * Get layout and generation of the object. - */ - int (*coo_layout_get)(const struct lu_env *env, struct cl_object *obj, - struct cl_layout *layout); - /** - * Get maximum size of the object. - */ - loff_t (*coo_maxbytes)(struct cl_object *obj); - /** - * Set request attributes. - */ - void (*coo_req_attr_set)(const struct lu_env *env, - struct cl_object *obj, - struct cl_req_attr *attr); -}; - -/** - * Extended header for client object. - */ -struct cl_object_header { - /** Standard lu_object_header. cl_object::co_lu::lo_header points - * here. - */ - struct lu_object_header coh_lu; - - /** - * Parent object. It is assumed that an object has a well-defined - * parent, but not a well-defined child (there may be multiple - * sub-objects, for the same top-object). cl_object_header::coh_parent - * field allows certain code to be written generically, without - * limiting possible cl_object layouts unduly. - */ - struct cl_object_header *coh_parent; - /** - * Protects consistency between cl_attr of parent object and - * attributes of sub-objects, that the former is calculated ("merged") - * from. - * - * \todo XXX this can be read/write lock if needed. - */ - spinlock_t coh_attr_guard; - /** - * Size of cl_page + page slices - */ - unsigned short coh_page_bufsize; - /** - * Number of objects above this one: 0 for a top-object, 1 for its - * sub-object, etc. 
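
The 0 / positive / negative return convention documented above for coo_attr_get() can be captured in a few lines. This is a stand-alone sketch with invented names; only the convention itself is taken from the comments.

#include <stdio.h>

struct layer {
	int (*attr_get)(long long *size);
};

static int top_attr_get(long long *size) { *size += 1; return 0; }
static int sub_attr_get(long long *size) { *size += 2; return 1; }

/* Walk the layers top-to-bottom: a negative return aborts with an
 * error, a positive return stops the iteration early, yet the
 * enclosing call still reports success (returns 0). */
static int attr_get_all(const struct layer *l, int n, long long *size)
{
	for (int i = 0; i < n; i++) {
		int rc = l[i].attr_get(size);

		if (rc < 0)
			return rc;
		if (rc > 0)
			break;
	}
	return 0;
}

int main(void)
{
	const struct layer stack[] = {
		{ top_attr_get }, { sub_attr_get },
	};
	long long size = 0;

	printf("rc=%d size=%lld\n", attr_get_all(stack, 2, &size), size);
	return 0;
}
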
- */ - unsigned char coh_nesting; -}; - -/** - * Helper macro: iterate over all layers of the object \a obj, assigning every - * layer top-to-bottom to \a slice. - */ -#define cl_object_for_each(slice, obj) \ - list_for_each_entry((slice), \ - &(obj)->co_lu.lo_header->loh_layers, \ - co_lu.lo_linkage) -/** - * Helper macro: iterate over all layers of the object \a obj, assigning every - * layer bottom-to-top to \a slice. - */ -#define cl_object_for_each_reverse(slice, obj) \ - list_for_each_entry_reverse((slice), \ - &(obj)->co_lu.lo_header->loh_layers, \ - co_lu.lo_linkage) -/** @} cl_object */ - -#define CL_PAGE_EOF ((pgoff_t)~0ull) - -/** \addtogroup cl_page cl_page - * @{ - */ - -/** \struct cl_page - * Layered client page. - * - * cl_page: represents a portion of a file, cached in the memory. All pages - * of the given file are of the same size, and are kept in the radix tree - * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects - * of the top-level file object are first class cl_objects, they have their - * own radix trees of pages and hence page is implemented as a sequence of - * struct cl_pages's, linked into double-linked list through - * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the - * corresponding radix tree at the corresponding logical offset. - * - * cl_page is associated with VM page of the hosting environment (struct - * page in Linux kernel, for example), struct page. It is assumed, that this - * association is implemented by one of cl_page layers (top layer in the - * current design) that - * - * - intercepts per-VM-page call-backs made by the environment (e.g., - * memory pressure), - * - * - translates state (page flag bits) and locking between lustre and - * environment. - * - * The association between cl_page and struct page is immutable and - * established when cl_page is created. - * - * cl_page can be "owned" by a particular cl_io (see below), guaranteeing - * this io an exclusive access to this page w.r.t. other io attempts and - * various events changing page state (such as transfer completion, or - * eviction of the page from the memory). Note, that in general cl_io - * cannot be identified with a particular thread, and page ownership is not - * exactly equal to the current thread holding a lock on the page. Layer - * implementing association between cl_page and struct page has to implement - * ownership on top of available synchronization mechanisms. - * - * While lustre client maintains the notion of an page ownership by io, - * hosting MM/VM usually has its own page concurrency control - * mechanisms. For example, in Linux, page access is synchronized by the - * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) - * takes care to acquire and release such locks as necessary around the - * calls to the file system methods (->readpage(), ->prepare_write(), - * ->commit_write(), etc.). This leads to the situation when there are two - * different ways to own a page in the client: - * - * - client code explicitly and voluntary owns the page (cl_page_own()); - * - * - VM locks a page and then calls the client, that has "to assume" - * the ownership from the VM (cl_page_assume()). - * - * Dual methods to release ownership are cl_page_disown() and - * cl_page_unassume(). - * - * cl_page is reference counted (cl_page::cp_ref). When reference counter - * drops to 0, the page is returned to the cache, unless it is in - * cl_page_state::CPS_FREEING state, in which case it is immediately - * destroyed. 
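
The reference rule just stated, where dropping the last reference returns the page to the cache unless it is already in cl_page_state::CPS_FREEING, can be modeled directly. A toy sketch, not the cl_page code; single-threaded on purpose, whereas the real counter is atomic:

#include <stdio.h>

enum toy_state { T_CACHED, T_FREEING };

struct toy_page {
	int ref;
	enum toy_state state;
};

/* Dropping the last reference returns the page to the cache unless it
 * is being freed, in which case it is destroyed immediately. */
static void toy_page_put(struct toy_page *pg)
{
	if (--pg->ref > 0)
		return;
	if (pg->state == T_FREEING)
		printf("last ref dropped: destroy page\n");
	else
		printf("last ref dropped: page stays cached\n");
}

int main(void)
{
	struct toy_page pg = { .ref = 1, .state = T_CACHED };

	toy_page_put(&pg);
	return 0;
}
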
- * - * The general logic guaranteeing the absence of "existential races" for - * pages is the following: - * - * - there are fixed known ways for a thread to obtain a new reference - * to a page: - * - * - by doing a lookup in the cl_object radix tree, protected by the - * spin-lock; - * - * - by starting from VM-locked struct page and following some - * hosting environment method (e.g., following ->private pointer in - * the case of Linux kernel), see cl_vmpage_page(); - * - * - when the page enters cl_page_state::CPS_FREEING state, all these - * ways are severed with the proper synchronization - * (cl_page_delete()); - * - * - entry into cl_page_state::CPS_FREEING is serialized by the VM page - * lock; - * - * - no new references to the page in cl_page_state::CPS_FREEING state - * are allowed (checked in cl_page_get()). - * - * Together this guarantees that when last reference to a - * cl_page_state::CPS_FREEING page is released, it is safe to destroy the - * page, as neither references to it can be acquired at that point, nor - * ones exist. - * - * cl_page is a state machine. States are enumerated in enum - * cl_page_state. Possible state transitions are enumerated in - * cl_page_state_set(). State transition process (i.e., actual changing of - * cl_page::cp_state field) is protected by the lock on the underlying VM - * page. - * - * Linux Kernel implementation. - * - * Binding between cl_page and struct page (which is a typedef for - * struct page) is implemented in the vvp layer. cl_page is attached to the - * ->private pointer of the struct page, together with the setting of - * PG_private bit in page->flags, and acquiring additional reference on the - * struct page (much like struct buffer_head, or any similar file system - * private data structures). - * - * PG_locked lock is used to implement both ownership and transfer - * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} - * states. No additional references are acquired for the duration of the - * transfer. - * - * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where - * write-out is "protected" by the special PG_writeback bit. - */ - -/** - * States of cl_page. cl_page.c assumes particular order here. - * - * The page state machine is rather crude, as it doesn't recognize finer page - * states like "dirty" or "up to date". This is because such states are not - * always well defined for the whole stack (see, for example, the - * implementation of the read-ahead, that hides page up-to-dateness to track - * cache hits accurately). Such sub-states are maintained by the layers that - * are interested in them. - */ -enum cl_page_state { - /** - * Page is in the cache, un-owned. Page leaves cached state in the - * following cases: - * - * - [cl_page_state::CPS_OWNED] io comes across the page and - * owns it; - * - * - [cl_page_state::CPS_PAGEOUT] page is dirty, the - * req-formation engine decides that it wants to include this page - * into an RPC being constructed, and yanks it from the cache; - * - * - [cl_page_state::CPS_FREEING] VM callback is executed to - * evict the page form the memory; - * - * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL - */ - CPS_CACHED, - /** - * Page is exclusively owned by some cl_io. 
Page may end up in this - * state as a result of - * - * - io creating new page and immediately owning it; - * - * - [cl_page_state::CPS_CACHED] io finding existing cached page - * and owning it; - * - * - [cl_page_state::CPS_OWNED] io finding existing owned page - * and waiting for owner to release the page; - * - * Page leaves owned state in the following cases: - * - * - [cl_page_state::CPS_CACHED] io decides to leave the page in - * the cache, doing nothing; - * - * - [cl_page_state::CPS_PAGEIN] io starts read transfer for - * this page; - * - * - [cl_page_state::CPS_PAGEOUT] io starts immediate write - * transfer for this page; - * - * - [cl_page_state::CPS_FREEING] io decides to destroy this - * page (e.g., as part of truncate or extent lock cancellation). - * - * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL - */ - CPS_OWNED, - /** - * Page is being written out, as a part of a transfer. This state is - * entered when req-formation logic decided that it wants this page to - * be sent through the wire _now_. Specifically, it means that once - * this state is achieved, transfer completion handler (with either - * success or failure indication) is guaranteed to be executed against - * this page independently of any locks and any scheduling decisions - * made by the hosting environment (that effectively means that the - * page is never put into cl_page_state::CPS_PAGEOUT state "in - * advance". This property is mentioned, because it is important when - * reasoning about possible dead-locks in the system). The page can - * enter this state as a result of - * - * - [cl_page_state::CPS_OWNED] an io requesting an immediate - * write-out of this page, or - * - * - [cl_page_state::CPS_CACHED] req-forming engine deciding - * that it has enough dirty pages cached to issue a "good" - * transfer. - * - * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer - * is completed---it is moved into cl_page_state::CPS_CACHED state. - * - * Underlying VM page is locked for the duration of transfer. - * - * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL - */ - CPS_PAGEOUT, - /** - * Page is being read in, as a part of a transfer. This is quite - * similar to the cl_page_state::CPS_PAGEOUT state, except that - * read-in is always "immediate"---there is no such thing a sudden - * construction of read request from cached, presumably not up to date, - * pages. - * - * Underlying VM page is locked for the duration of transfer. - * - * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL - */ - CPS_PAGEIN, - /** - * Page is being destroyed. This state is entered when client decides - * that page has to be deleted from its host object, as, e.g., a part - * of truncate. - * - * Once this state is reached, there is no way to escape it. - * - * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL - */ - CPS_FREEING, - CPS_NR -}; - -enum cl_page_type { - /** Host page, the page is from the host inode which the cl_page - * belongs to. - */ - CPT_CACHEABLE = 1, - - /** Transient page, the transient cl_page is used to bind a cl_page - * to vmpage which is not belonging to the same object of cl_page. - * it is used in DirectIO and lockless IO. - */ - CPT_TRANSIENT, -}; - -/** - * Fields are protected by the lock on struct page, except for atomics and - * immutables. - * - * \invariant Data type invariants are in cl_page_invariant(). 
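
The transitions spelled out in the state comments above fit in a small table; per the text, the authoritative list lives in cl_page_state_set(). The sketch below reconstructs the matrix from the comments alone, so treat it as illustrative rather than authoritative:

#include <stdbool.h>
#include <stdio.h>

enum ps { CACHED, OWNED, PAGEOUT, PAGEIN, FREEING, PS_NR };

/* Reconstructed from the comments: CPS_CACHED leaves to OWNED, PAGEOUT
 * or FREEING; CPS_OWNED to CACHED, PAGEIN, PAGEOUT or FREEING;
 * transfers complete back to CACHED; FREEING is terminal. */
static const bool allowed[PS_NR][PS_NR] = {
	[CACHED]  = { [OWNED] = true, [PAGEOUT] = true, [FREEING] = true },
	[OWNED]   = { [CACHED] = true, [PAGEIN] = true,
		      [PAGEOUT] = true, [FREEING] = true },
	[PAGEOUT] = { [CACHED] = true },
	[PAGEIN]  = { [CACHED] = true },
	[FREEING] = { false }, /* no way to escape */
};

static bool state_set(enum ps *cur, enum ps next)
{
	if (!allowed[*cur][next])
		return false;
	*cur = next;
	return true;
}

int main(void)
{
	enum ps st = CACHED;

	printf("CACHED->OWNED:  %d\n", state_set(&st, OWNED));
	printf("OWNED->PAGEIN:  %d\n", state_set(&st, PAGEIN));
	printf("PAGEIN->OWNED:  %d (forbidden)\n", state_set(&st, OWNED));
	return 0;
}
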
Basically: - * cl_page::cp_parent and cl_page::cp_child are a well-formed doubly linked - * list, consistent with the parent/child pointers in the cl_page::cp_obj and - * cl_page::cp_owner (when set). - */ -struct cl_page { - /** Reference counter. */ - atomic_t cp_ref; - /** An object this page is a part of. Immutable after creation. */ - struct cl_object *cp_obj; - /** vmpage */ - struct page *cp_vmpage; - /** Linkage of pages within a group. Pages must be owned. */ - struct list_head cp_batch; - /** List of slices. Immutable after creation. */ - struct list_head cp_layers; - /** - * Page state. This field is const to avoid accidental update; it is - * modified only internally within cl_page.c. Protected by a VM lock. - */ - const enum cl_page_state cp_state; - /** - * Page type. Only CPT_TRANSIENT is used so far. Immutable after - * creation. - */ - enum cl_page_type cp_type; - - /** - * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned - * by sub-io. Protected by a VM lock. - */ - struct cl_io *cp_owner; - /** List of references to this page, for debugging. */ - struct lu_ref cp_reference; - /** Link to an object, for debugging. */ - struct lu_ref_link cp_obj_ref; - /** Link to a queue, for debugging. */ - struct lu_ref_link cp_queue_ref; - /** Assigned if doing a sync_io */ - struct cl_sync_io *cp_sync_io; -}; - -/** - * Per-layer part of cl_page. - * - * \see vvp_page, lov_page, osc_page - */ -struct cl_page_slice { - struct cl_page *cpl_page; - pgoff_t cpl_index; - /** - * Object slice corresponding to this page slice. Immutable after - * creation. - */ - struct cl_object *cpl_obj; - const struct cl_page_operations *cpl_ops; - /** Linkage into cl_page::cp_layers. Immutable after creation. */ - struct list_head cpl_linkage; -}; - -/** - * Lock mode. For the client extent locks. - * - * \ingroup cl_lock - */ -enum cl_lock_mode { - CLM_READ, - CLM_WRITE, - CLM_GROUP -}; - -/** - * Requested transfer type. - */ -enum cl_req_type { - CRT_READ, - CRT_WRITE, - CRT_NR -}; - -/** - * Per-layer page operations. - * - * Methods taking an \a io argument are for the activity happening in the - * context of the given \a io. Page is assumed to be owned by that io, except for - * the obvious cases (like cl_page_operations::cpo_own()). - * - * \see vvp_page_ops, lov_page_ops, osc_page_ops - */ -struct cl_page_operations { - /** - * cl_page<->struct page methods. Only one layer in the stack has to - * implement these. Current code assumes that this functionality is - * provided by the topmost layer, see cl_page_disown0() as an example. - */ - - /** - * Called when \a io acquires this page into the exclusive - * ownership. When this method returns, it is guaranteed that the page is - * not owned by another io, and no transfer is going on against - * it. Optional. - * - * \see cl_page_own() - * \see vvp_page_own(), lov_page_own() - */ - int (*cpo_own)(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io, int nonblock); - /** Called when ownership is yielded. Optional. - * - * \see cl_page_disown() - * \see vvp_page_disown() - */ - void (*cpo_disown)(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io); - /** - * Called for a page that is already "owned" by \a io from the VM point of - * view. Optional. - * - * \see cl_page_assume() - * \see vvp_page_assume(), lov_page_assume() - */ - void (*cpo_assume)(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io); - /** Dual to cl_page_operations::cpo_assume(). Optional.
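
The two ownership paths contrasted earlier, cl_page_own() taking the lock itself versus cl_page_assume() inheriting a lock the VM already holds, reduce to the following miniature. A pthread mutex stands in for the VM page lock and every name is invented; this is a sketch of the idea, not of vvp_page_own()/vvp_page_assume():

#include <pthread.h>
#include <stdio.h>

struct toy_page {
	pthread_mutex_t vmlock; /* stands in for the PG_locked bit-lock */
	void *owner;            /* the io owning the page, if any */
};

static void page_own(struct toy_page *pg, void *io)
{
	pthread_mutex_lock(&pg->vmlock); /* own: acquire the lock */
	pg->owner = io;
}

static void page_disown(struct toy_page *pg)
{
	pg->owner = NULL;
	pthread_mutex_unlock(&pg->vmlock); /* dual of page_own() */
}

static void page_assume(struct toy_page *pg, void *io)
{
	pg->owner = io; /* assume: the caller (the VM path) holds the lock */
}

static void page_unassume(struct toy_page *pg)
{
	pg->owner = NULL; /* the lock stays with the VM caller */
}

int main(void)
{
	struct toy_page pg = { PTHREAD_MUTEX_INITIALIZER, NULL };
	int io;

	page_own(&pg, &io);
	page_disown(&pg);
	pthread_mutex_lock(&pg.vmlock); /* the "VM" locks the page ... */
	page_assume(&pg, &io);          /* ... and the io takes ownership */
	page_unassume(&pg);
	pthread_mutex_unlock(&pg.vmlock);
	puts("own/disown and assume/unassume both completed");
	return 0;
}
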
Called - * bottom-to-top when IO releases a page without actually unlocking - * it. - * - * \see cl_page_unassume() - * \see vvp_page_unassume() - */ - void (*cpo_unassume)(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io); - /** - * Announces whether the page contains valid data or not by \a uptodate. - * - * \see cl_page_export() - * \see vvp_page_export() - */ - void (*cpo_export)(const struct lu_env *env, - const struct cl_page_slice *slice, int uptodate); - /** - * Checks whether underlying VM page is locked (in the suitable - * sense). Used for assertions. - * - * \retval -EBUSY: page is protected by a lock of a given mode; - * \retval -ENODATA: page is not protected by a lock; - * \retval 0: this layer cannot decide. (Should never happen.) - */ - int (*cpo_is_vmlocked)(const struct lu_env *env, - const struct cl_page_slice *slice); - /** - * Page destruction. - */ - - /** - * Called when page is truncated from the object. Optional. - * - * \see cl_page_discard() - * \see vvp_page_discard(), osc_page_discard() - */ - void (*cpo_discard)(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io); - /** - * Called when page is removed from the cache, and is about to being - * destroyed. Optional. - * - * \see cl_page_delete() - * \see vvp_page_delete(), osc_page_delete() - */ - void (*cpo_delete)(const struct lu_env *env, - const struct cl_page_slice *slice); - /** Destructor. Frees resources and slice itself. */ - void (*cpo_fini)(const struct lu_env *env, - struct cl_page_slice *slice); - /** - * Optional debugging helper. Prints given page slice. - * - * \see cl_page_print() - */ - int (*cpo_print)(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t p); - /** - * \name transfer - * - * Transfer methods. - * - * @{ - */ - /** - * Request type dependent vector of operations. - * - * Transfer operations depend on transfer mode (cl_req_type). To avoid - * passing transfer mode to each and every of these methods, and to - * avoid branching on request type inside of the methods, separate - * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are - * provided. That is, method invocation usually looks like - * - * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); - */ - struct { - /** - * Called when a page is submitted for a transfer as a part of - * cl_page_list. - * - * \return 0 : page is eligible for submission; - * \return -EALREADY : skip this page; - * \return -ve : error. - * - * \see cl_page_prep() - */ - int (*cpo_prep)(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io); - /** - * Completion handler. This is guaranteed to be eventually - * fired after cl_page_operations::cpo_prep() or - * cl_page_operations::cpo_make_ready() call. - * - * This method can be called in a non-blocking context. It is - * guaranteed however, that the page involved and its object - * are pinned in memory (and, hence, calling cl_page_put() is - * safe). - * - * \see cl_page_completion() - */ - void (*cpo_completion)(const struct lu_env *env, - const struct cl_page_slice *slice, - int ioret); - /** - * Called when cached page is about to be added to the - * ptlrpc request as a part of req formation. - * - * \return 0 : proceed with this page; - * \return -EAGAIN : skip this page; - * \return -ve : error. 
- * - * \see cl_page_make_ready() - */ - int (*cpo_make_ready)(const struct lu_env *env, - const struct cl_page_slice *slice); - } io[CRT_NR]; - /** - * Tell the transfer engine that only the [from, to] part of a page should be - * transmitted. - * - * This is used for immediate transfers. - * - * \todo XXX this is not a very good interface. It would be much better - * if all transfer parameters were supplied as arguments to the - * cl_io_operations::cio_submit() call, but it is not clear how to do - * this for page queues. - * - * \see cl_page_clip() - */ - void (*cpo_clip)(const struct lu_env *env, - const struct cl_page_slice *slice, - int from, int to); - /** - * \pre the page was queued for transferring. - * \post the page is removed from the client's pending list, or -EBUSY - * is returned if it is already being transferred. - * - * This is one of the few page operations that: - * 0. is called from the top level; - * 1. runs without the vmpage locked; - * 2. requires every layer to synchronize execution of its ->cpo_cancel() - * with completion handlers. Osc uses the client obd lock for this - * purpose. Since there is no vvp_page_cancel() and no - * lov_page_cancel(), cpo_cancel() is de facto protected by the client lock. - * - * \see osc_page_cancel(). - */ - int (*cpo_cancel)(const struct lu_env *env, - const struct cl_page_slice *slice); - /** - * Write out a page by the kernel. This is only called by ll_writepage - * right now. - * - * \see cl_page_flush() - */ - int (*cpo_flush)(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io); - /** @} transfer */ -}; - -/** - * Helper macro, dumping detailed information about \a page into a log. - */ -#define CL_PAGE_DEBUG(mask, env, page, format, ...) \ -do { \ - if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ - cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ - CDEBUG(mask, format, ## __VA_ARGS__); \ - } \ -} while (0) - -/** - * Helper macro, dumping shorter information about \a page into a log. - */ -#define CL_PAGE_HEADER(mask, env, page, format, ...) \ -do { \ - if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ - cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ - CDEBUG(mask, format, ## __VA_ARGS__); \ - } \ -} while (0) - -static inline struct page *cl_page_vmpage(struct cl_page *page) -{ - LASSERT(page->cp_vmpage); - return page->cp_vmpage; -} - -/** - * Check if a cl_page is in use. - * - * The client cache holds a refcount; this refcount will be dropped when - * the page is taken out of the cache, see vvp_page_delete(). - */ -static inline bool __page_in_use(const struct cl_page *page, int refc) -{ - return (atomic_read(&page->cp_ref) > refc + 1); -} - -/** - * Caller itself holds a refcount of cl_page. - */ -#define cl_page_in_use(pg) __page_in_use(pg, 1) -/** - * Caller doesn't hold a refcount. - */ -#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) - -/** @} cl_page */ - -/** \addtogroup cl_lock cl_lock - * @{ - */ -/** \struct cl_lock - * - * Extent locking on the client. - * - * LAYERING - * - * The locking model of the new client code is built around - * - * struct cl_lock - * - * a data-type representing an extent lock on a regular file. cl_lock is a - * layered object (much like cl_object and cl_page); it consists of a header - * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked into - * the cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
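
The io[CRT_NR] vector above avoids branching on the request type inside each method: the dispatch site indexes the table instead, as in slice->cp_ops.io[req->crq_type].cpo_method(...). A stand-alone sketch of the pattern (names invented):

#include <stdio.h>

enum req_type { RT_READ, RT_WRITE, RT_NR };

struct page_ops {
	struct {
		int (*prep)(void);
	} io[RT_NR];
};

static int read_prep(void)  { puts("prep for read");  return 0; }
static int write_prep(void) { puts("prep for write"); return 0; }

static const struct page_ops ops = {
	.io = {
		[RT_READ]  = { .prep = read_prep  },
		[RT_WRITE] = { .prep = write_prep },
	},
};

int main(void)
{
	enum req_type crt = RT_WRITE;

	/* dispatch without branching on the request type */
	return ops.io[crt].prep();
}
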
- * - * Typical cl_lock consists of the two layers: - * - * - vvp_lock (vvp specific data), and - * - lov_lock (lov specific data). - * - * lov_lock contains an array of sub-locks. Each of these sub-locks is a - * normal cl_lock: it has a header (struct cl_lock) and a list of layers: - * - * - lovsub_lock, and - * - osc_lock - * - * Each sub-lock is associated with a cl_object (representing stripe - * sub-object or the file to which top-level cl_lock is associated to), and is - * linked into that cl_object::coh_locks. In this respect cl_lock is similar to - * cl_object (that at lov layer also fans out into multiple sub-objects), and - * is different from cl_page, that doesn't fan out (there is usually exactly - * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock - * a "top-lock" and its lovsub-osc portion a "sub-lock". - * - * LIFE CYCLE - * - * cl_lock is a cacheless data container for the requirements of locks to - * complete the IO. cl_lock is created before I/O starts and destroyed when the - * I/O is complete. - * - * cl_lock depends on LDLM lock to fulfill lock semantics. LDLM lock is attached - * to cl_lock at OSC layer. LDLM lock is still cacheable. - * - * INTERFACE AND USAGE - * - * Two major methods are supported for cl_lock: clo_enqueue and clo_cancel. A - * cl_lock is enqueued by cl_lock_request(), which will call clo_enqueue() - * methods for each layer to enqueue the lock. At the LOV layer, if a cl_lock - * consists of multiple sub cl_locks, each sub locks will be enqueued - * correspondingly. At OSC layer, the lock enqueue request will tend to reuse - * cached LDLM lock; otherwise a new LDLM lock will have to be requested from - * OST side. - * - * cl_lock_cancel() must be called to release a cl_lock after use. clo_cancel() - * method will be called for each layer to release the resource held by this - * lock. At OSC layer, the reference count of LDLM lock, which is held at - * clo_enqueue time, is released. - * - * LDLM lock can only be canceled if there is no cl_lock using it. - * - * Overall process of the locking during IO operation is as following: - * - * - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock() - * is called on each layer. Responsibility of this method is to add locks, - * needed by a given layer into cl_io.ci_lockset. - * - * - once locks for all layers were collected, they are sorted to avoid - * dead-locks (cl_io_locks_sort()), and enqueued. - * - * - when all locks are acquired, IO is performed; - * - * - locks are released after IO is complete. - * - * Striping introduces major additional complexity into locking. The - * fundamental problem is that it is generally unsafe to actively use (hold) - * two locks on the different OST servers at the same time, as this introduces - * inter-server dependency and can lead to cascading evictions. - * - * Basic solution is to sub-divide large read/write IOs into smaller pieces so - * that no multi-stripe locks are taken (note that this design abandons POSIX - * read/write semantics). Such pieces ideally can be executed concurrently. At - * the same time, certain types of IO cannot be sub-divived, without - * sacrificing correctness. This includes: - * - * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee - * atomicity; - * - * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken. 
- * - * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where - * buf is a part of memory mapped Lustre file, a lock or locks protecting buf - * has to be held together with the usual lock on [offset, offset + count]. - * - * Interaction with DLM - * - * In the expected setup, cl_lock is ultimately backed up by a collection of - * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is - * implemented in osc layer, that also matches DLM events (ASTs, cancellation, - * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed - * description of interaction with DLM. - */ - -/** - * Lock description. - */ -struct cl_lock_descr { - /** Object this lock is granted for. */ - struct cl_object *cld_obj; - /** Index of the first page protected by this lock. */ - pgoff_t cld_start; - /** Index of the last page (inclusive) protected by this lock. */ - pgoff_t cld_end; - /** Group ID, for group lock */ - __u64 cld_gid; - /** Lock mode. */ - enum cl_lock_mode cld_mode; - /** - * flags to enqueue lock. A combination of bit-flags from - * enum cl_enq_flags. - */ - __u32 cld_enq_flags; -}; - -#define DDESCR "%s(%d):[%lu, %lu]:%x" -#define PDESCR(descr) \ - cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ - (descr)->cld_start, (descr)->cld_end, (descr)->cld_enq_flags - -const char *cl_lock_mode_name(const enum cl_lock_mode mode); - -/** - * Layered client lock. - */ -struct cl_lock { - /** List of slices. Immutable after creation. */ - struct list_head cll_layers; - /** lock attribute, extent, cl_object, etc. */ - struct cl_lock_descr cll_descr; -}; - -/** - * Per-layer part of cl_lock - * - * \see vvp_lock, lov_lock, lovsub_lock, osc_lock - */ -struct cl_lock_slice { - struct cl_lock *cls_lock; - /** Object slice corresponding to this lock slice. Immutable after - * creation. - */ - struct cl_object *cls_obj; - const struct cl_lock_operations *cls_ops; - /** Linkage into cl_lock::cll_layers. Immutable after creation. */ - struct list_head cls_linkage; -}; - -/** - * - * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops - */ -struct cl_lock_operations { - /** @{ */ - /** - * Attempts to enqueue the lock. Called top-to-bottom. - * - * \retval 0 this layer has enqueued the lock successfully - * \retval >0 this layer has enqueued the lock, but need to wait on - * @anchor for resources - * \retval -ve failure - * - * \see vvp_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), - * \see osc_lock_enqueue() - */ - int (*clo_enqueue)(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *io, struct cl_sync_io *anchor); - /** - * Cancel a lock, release its DLM lock ref, while does not cancel the - * DLM lock - */ - void (*clo_cancel)(const struct lu_env *env, - const struct cl_lock_slice *slice); - /** @} */ - /** - * Destructor. Frees resources and the slice. - * - * \see vvp_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), - * \see osc_lock_fini() - */ - void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); - /** - * Optional debugging helper. Prints given lock slice. - */ - int (*clo_print)(const struct lu_env *env, - void *cookie, lu_printer_t p, - const struct cl_lock_slice *slice); -}; - -#define CL_LOCK_DEBUG(mask, env, lock, format, ...) 
\ -do { \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ - \ - if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ - cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ - CDEBUG(mask, format, ## __VA_ARGS__); \ - } \ -} while (0) - -#define CL_LOCK_ASSERT(expr, env, lock) do { \ - if (likely(expr)) \ - break; \ - \ - CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ - LBUG(); \ -} while (0) - -/** @} cl_lock */ - -/** \addtogroup cl_page_list cl_page_list - * Page list used to perform collective operations on a group of pages. - * - * Pages are added to the list one by one. cl_page_list acquires a reference - * for every page in it. Page list is used to perform collective operations on - * pages: - * - * - submit pages for an immediate transfer, - * - * - own pages on behalf of certain io (waiting for each page in turn), - * - * - discard pages. - * - * When list is finalized, it releases references on all pages it still has. - * - * \todo XXX concurrency control. - * - * @{ - */ -struct cl_page_list { - unsigned int pl_nr; - struct list_head pl_pages; - struct task_struct *pl_owner; -}; - -/** - * A 2-queue of pages. A convenience data-type for common use case, 2-queue - * contains an incoming page list and an outgoing page list. - */ -struct cl_2queue { - struct cl_page_list c2_qin; - struct cl_page_list c2_qout; -}; - -/** @} cl_page_list */ - -/** \addtogroup cl_io cl_io - * @{ - */ -/** \struct cl_io - * I/O - * - * cl_io represents a high level I/O activity like - * read(2)/write(2)/truncate(2) system call, or cancellation of an extent - * lock. - * - * cl_io is a layered object, much like cl_{object,page,lock} but with one - * important distinction. We want to minimize number of calls to the allocator - * in the fast path, e.g., in the case of read(2) when everything is cached: - * client already owns the lock over region being read, and data are cached - * due to read-ahead. To avoid allocation of cl_io layers in such situations, - * per-layer io state is stored in the session, associated with the io, see - * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized - * by using free-lists, see cl_env_get(). - * - * There is a small predefined number of possible io types, enumerated in enum - * cl_io_type. - * - * cl_io is a state machine, that can be advanced concurrently by the multiple - * threads. It is up to these threads to control the concurrency and, - * specifically, to detect when io is done, and its state can be safely - * released. - * - * For read/write io overall execution plan is as following: - * - * (0) initialize io state through all layers; - * - * (1) loop: prepare chunk of work to do - * - * (2) call all layers to collect locks they need to process current chunk - * - * (3) sort all locks to avoid dead-locks, and acquire them - * - * (4) process the chunk: call per-page methods - * cl_io_operations::cio_prepare_write(), - * cl_io_operations::cio_commit_write() for write) - * - * (5) release locks - * - * (6) repeat loop. - * - * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to - * address allocation efficiency issues mentioned above), and returns with the - * special error condition from per-page method when current sub-io has to - * block. This causes io loop to be repeated, and lov switches to the next - * sub-io in its cl_io_operations::cio_iter_init() implementation. 
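
Steps (0) through (6) of the execution plan above translate into a loop skeleton like the one below. The callbacks are invented stand-ins for the per-layer cio_* methods; this is a sketch of the control flow, not the real cl_io_loop():

#include <stdio.h>

struct io_ops {
	int  (*iter_init)(void); /* (1) prepare a chunk of work */
	int  (*lock)(void);      /* (2)+(3) collect, sort, acquire locks */
	int  (*start)(void);     /* (4) process the chunk */
	void (*unlock)(void);    /* (5) release locks */
	void (*iter_fini)(void); /* close this iteration */
	int  (*done)(void);      /* non-zero once all work is issued */
};

static int io_loop(const struct io_ops *op)
{
	int rc;

	do { /* (6) repeat until done or error */
		rc = op->iter_init();
		if (rc)
			break;
		rc = op->lock();
		if (rc == 0) {
			rc = op->start();
			op->unlock();
		}
		op->iter_fini();
	} while (rc == 0 && !op->done());
	return rc;
}

static int  ok(void)       { return 0; }
static void nop(void)      { }
static int  done_now(void) { return 1; }

int main(void)
{
	const struct io_ops op = { ok, ok, ok, nop, nop, done_now };

	return io_loop(&op);
}
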
- */ - -/** IO types */ -enum cl_io_type { - /** read system call */ - CIT_READ = 1, - /** write system call */ - CIT_WRITE, - /** truncate, utime system calls */ - CIT_SETATTR, - /** get data version */ - CIT_DATA_VERSION, - /** - * page fault handling - */ - CIT_FAULT, - /** - * fsync system call handling - * To write out a range of file - */ - CIT_FSYNC, - /** - * Miscellaneous io. This is used for occasional io activity that - * doesn't fit into other types. Currently this is used for: - * - * - cancellation of an extent lock. This io exists as a context - * to write dirty pages from under the lock being canceled back - * to the server; - * - * - VM induced page write-out. An io context for writing page out - * for memory cleansing; - * - * - glimpse. An io context to acquire glimpse lock. - * - * - grouplock. An io context to acquire group lock. - * - * CIT_MISC io is used simply as a context in which locks and pages - * are manipulated. Such io has no internal "process", that is, - * cl_io_loop() is never called for it. - */ - CIT_MISC, - CIT_OP_NR -}; - -/** - * States of cl_io state machine - */ -enum cl_io_state { - /** Not initialized. */ - CIS_ZERO, - /** Initialized. */ - CIS_INIT, - /** IO iteration started. */ - CIS_IT_STARTED, - /** Locks taken. */ - CIS_LOCKED, - /** Actual IO is in progress. */ - CIS_IO_GOING, - /** IO for the current iteration finished. */ - CIS_IO_FINISHED, - /** Locks released. */ - CIS_UNLOCKED, - /** Iteration completed. */ - CIS_IT_ENDED, - /** cl_io finalized. */ - CIS_FINI -}; - -/** - * IO state private for a layer. - * - * This is usually embedded into layer session data, rather than allocated - * dynamically. - * - * \see vvp_io, lov_io, osc_io - */ -struct cl_io_slice { - struct cl_io *cis_io; - /** corresponding object slice. Immutable after creation. */ - struct cl_object *cis_obj; - /** io operations. Immutable after creation. */ - const struct cl_io_operations *cis_iop; - /** - * linkage into a list of all slices for a given cl_io, hanging off - * cl_io::ci_layers. Immutable after creation. - */ - struct list_head cis_linkage; -}; - -typedef void (*cl_commit_cbt)(const struct lu_env *, struct cl_io *, - struct cl_page *); - -struct cl_read_ahead { - /* - * Maximum page index the readahead window will end. - * This is determined DLM lock coverage, RPC and stripe boundary. - * cra_end is included. - */ - pgoff_t cra_end; - /* optimal RPC size for this read, by pages */ - unsigned long cra_rpc_size; - /* - * Release callback. If readahead holds resources underneath, this - * function should be called to release it. - */ - void (*cra_release)(const struct lu_env *env, void *cbdata); - /* Callback data for cra_release routine */ - void *cra_cbdata; -}; - -static inline void cl_read_ahead_release(const struct lu_env *env, - struct cl_read_ahead *ra) -{ - if (ra->cra_release) - ra->cra_release(env, ra->cra_cbdata); - memset(ra, 0, sizeof(*ra)); -} - -/** - * Per-layer io operations. - * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops - */ -struct cl_io_operations { - /** - * Vector of io state transition methods for every io type. - * - * \see cl_page_operations::io - */ - struct { - /** - * Prepare io iteration at a given layer. - * - * Called top-to-bottom at the beginning of each iteration of - * "io loop" (if it makes sense for this type of io). Here - * layer selects what work it will do during this iteration. 
- * - * \see cl_io_operations::cio_iter_fini() - */ - int (*cio_iter_init)(const struct lu_env *env, - const struct cl_io_slice *slice); - /** - * Finalize io iteration. - * - * Called bottom-to-top at the end of each iteration of "io - * loop". Here layers can decide whether IO has to be - * continued. - * - * \see cl_io_operations::cio_iter_init() - */ - void (*cio_iter_fini)(const struct lu_env *env, - const struct cl_io_slice *slice); - /** - * Collect locks for the current iteration of io. - * - * Called top-to-bottom to collect all locks necessary for - * this iteration. This method shouldn't actually enqueue - * anything; instead, it should post a lock through - * cl_io_lock_add(). Once all locks are collected, they are - * sorted and enqueued in the proper order. - */ - int (*cio_lock)(const struct lu_env *env, - const struct cl_io_slice *slice); - /** - * Finalize unlocking. - * - * Called bottom-to-top to finish layer-specific unlocking - * functionality, after generic code released all locks - * acquired by cl_io_operations::cio_lock(). - */ - void (*cio_unlock)(const struct lu_env *env, - const struct cl_io_slice *slice); - /** - * Start io iteration. - * - * Once all locks are acquired, called top-to-bottom to - * commence actual IO. In the current implementation, - * top-level vvp_io_{read,write}_start() does all the work - * synchronously by calling generic_file_*(), so other layers - * are called when everything is done. - */ - int (*cio_start)(const struct lu_env *env, - const struct cl_io_slice *slice); - /** - * Called top-to-bottom at the end of the io loop. Here a layer - * might wait for an unfinished asynchronous io. - */ - void (*cio_end)(const struct lu_env *env, - const struct cl_io_slice *slice); - /** - * Called bottom-to-top to notify layers that a read/write IO - * iteration finished, with \a nob bytes transferred. - */ - void (*cio_advance)(const struct lu_env *env, - const struct cl_io_slice *slice, - size_t nob); - /** - * Called once per io, bottom-to-top, to release io resources. - */ - void (*cio_fini)(const struct lu_env *env, - const struct cl_io_slice *slice); - } op[CIT_OP_NR]; - - /** - * Submit pages from \a queue->c2_qin for IO, and move - * successfully submitted pages into \a queue->c2_qout. Return - * non-zero if it failed to submit even a single page. If - * submission failed after some pages were moved into \a - * queue->c2_qout, the completion callback with non-zero ioret is - * executed on them. - */ - int (*cio_submit)(const struct lu_env *env, - const struct cl_io_slice *slice, - enum cl_req_type crt, - struct cl_2queue *queue); - /** - * Queue an async page for write. - * The difference between cio_submit and cio_commit_async is that - * cio_submit is for an urgent request. - */ - int (*cio_commit_async)(const struct lu_env *env, - const struct cl_io_slice *slice, - struct cl_page_list *queue, int from, int to, - cl_commit_cbt cb); - /** - * Decide the maximum read-ahead extent. - * - * \pre io->ci_type == CIT_READ - */ - int (*cio_read_ahead)(const struct lu_env *env, - const struct cl_io_slice *slice, - pgoff_t start, struct cl_read_ahead *ra); - /** - * Optional debugging helper. Prints the given io slice. - */ - int (*cio_print)(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_io_slice *slice); -}; - -/** - * Flags to the lock enqueue procedure. - * \ingroup cl_lock - */ -enum cl_enq_flags { - /** - * instruct the server not to block if a conflicting lock is found. Instead, - * -EWOULDBLOCK is returned immediately.
- */ - CEF_NONBLOCK = 0x00000001, - /** - * take lock asynchronously (out of order), as it cannot - * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing. - */ - CEF_ASYNC = 0x00000002, - /** - * tell the server to instruct (though a flag in the blocking ast) an - * owner of the conflicting lock, that it can drop dirty pages - * protected by this lock, without sending them to the server. - */ - CEF_DISCARD_DATA = 0x00000004, - /** - * tell the sub layers that it must be a `real' lock. This is used for - * mmapped-buffer locks and glimpse locks that must be never converted - * into lockless mode. - * - * \see vvp_mmap_locks(), cl_glimpse_lock(). - */ - CEF_MUST = 0x00000008, - /** - * tell the sub layers that never request a `real' lock. This flag is - * not used currently. - * - * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless - * conversion policy: ci_lockreq describes generic information of lock - * requirement for this IO, especially for locks which belong to the - * object doing IO; however, lock itself may have precise requirements - * that are described by the enqueue flags. - */ - CEF_NEVER = 0x00000010, - /** - * for async glimpse lock. - */ - CEF_AGL = 0x00000020, - /** - * enqueue a lock to test DLM lock existence. - */ - CEF_PEEK = 0x00000040, - /** - * Lock match only. Used by group lock in I/O as group lock - * is known to exist. - */ - CEF_LOCK_MATCH = BIT(7), - /** - * mask of enq_flags. - */ - CEF_MASK = 0x000000ff, -}; - -/** - * Link between lock and io. Intermediate structure is needed, because the - * same lock can be part of multiple io's simultaneously. - */ -struct cl_io_lock_link { - /** linkage into one of cl_lockset lists. */ - struct list_head cill_linkage; - struct cl_lock cill_lock; - /** optional destructor */ - void (*cill_fini)(const struct lu_env *env, - struct cl_io_lock_link *link); -}; -#define cill_descr cill_lock.cll_descr - -/** - * Lock-set represents a collection of locks, that io needs at a - * time. Generally speaking, client tries to avoid holding multiple locks when - * possible, because - * - * - holding extent locks over multiple ost's introduces the danger of - * "cascading timeouts"; - * - * - holding multiple locks over the same ost is still dead-lock prone, - * see comment in osc_lock_enqueue(), - * - * but there are certain situations where this is unavoidable: - * - * - O_APPEND writes have to take [0, EOF] lock for correctness; - * - * - truncate has to take [new-size, EOF] lock for correctness; - * - * - SNS has to take locks across full stripe for correctness; - * - * - in the case when user level buffer, supplied to {read,write}(file0), - * is a part of a memory mapped lustre file, client has to take a dlm - * locks on file0, and all files that back up the buffer (or a part of - * the buffer, that is being processed in the current chunk, in any - * case, there are situations where at least 2 locks are necessary). - * - * In such cases we at least try to take locks in the same consistent - * order. To this end, all locks are first collected, then sorted, and then - * enqueued. - */ -struct cl_lockset { - /** locks to be acquired. */ - struct list_head cls_todo; - /** locks acquired. */ - struct list_head cls_done; -}; - -/** - * Lock requirements(demand) for IO. It should be cl_io_lock_req, - * but 'req' is always to be thought as 'request' :-) - */ -enum cl_io_lock_dmd { - /** Always lock data (e.g., O_APPEND). */ - CILR_MANDATORY = 0, - /** Layers are free to decide between local and global locking. 
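
The collect-sort-enqueue discipline that cl_lockset supports is what keeps the multi-lock cases above deadlock-safe: every io acquires its locks in one global order. A stand-alone sketch of the sort step; the ordering key is invented, and per the text the real comparator lives in cl_io_locks_sort():

#include <stdio.h>
#include <stdlib.h>

struct lock_req {
	unsigned long obj_id; /* stands in for the object identity */
	unsigned long start;  /* first page index covered */
};

static int lock_cmp(const void *a, const void *b)
{
	const struct lock_req *l = a, *r = b;

	if (l->obj_id != r->obj_id)
		return l->obj_id < r->obj_id ? -1 : 1;
	if (l->start != r->start)
		return l->start < r->start ? -1 : 1;
	return 0;
}

int main(void)
{
	struct lock_req todo[] = { { 2, 0 }, { 1, 128 }, { 1, 0 } };
	int n = sizeof(todo) / sizeof(todo[0]);

	/* sort first, acquire afterwards: two ios needing the same pair
	 * of locks then always take them in the same order */
	qsort(todo, n, sizeof(*todo), lock_cmp);
	for (int i = 0; i < n; i++)
		printf("enqueue obj %lu @ %lu\n",
		       todo[i].obj_id, todo[i].start);
	return 0;
}
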
*/ - CILR_MAYBE, - /** Never lock: there is no cache (e.g., lockless IO). */ - CILR_NEVER -}; - -enum cl_fsync_mode { - /** start writeback, do not wait for them to finish */ - CL_FSYNC_NONE = 0, - /** start writeback and wait for them to finish */ - CL_FSYNC_LOCAL = 1, - /** discard all of dirty pages in a specific file range */ - CL_FSYNC_DISCARD = 2, - /** start writeback and make sure they have reached storage before - * return. OST_SYNC RPC must be issued and finished - */ - CL_FSYNC_ALL = 3 -}; - -struct cl_io_rw_common { - loff_t crw_pos; - size_t crw_count; - int crw_nonblock; -}; - -/** - * State for io. - * - * cl_io is shared by all threads participating in this IO (in current - * implementation only one thread advances IO, but parallel IO design and - * concurrent copy_*_user() require multiple threads acting on the same IO. It - * is up to these threads to serialize their activities, including updates to - * mutable cl_io fields. - */ -struct cl_io { - /** type of this IO. Immutable after creation. */ - enum cl_io_type ci_type; - /** current state of cl_io state machine. */ - enum cl_io_state ci_state; - /** main object this io is against. Immutable after creation. */ - struct cl_object *ci_obj; - /** - * Upper layer io, of which this io is a part of. Immutable after - * creation. - */ - struct cl_io *ci_parent; - /** List of slices. Immutable after creation. */ - struct list_head ci_layers; - /** list of locks (to be) acquired by this io. */ - struct cl_lockset ci_lockset; - /** lock requirements, this is just a help info for sublayers. */ - enum cl_io_lock_dmd ci_lockreq; - union { - struct cl_rd_io { - struct cl_io_rw_common rd; - } ci_rd; - struct cl_wr_io { - struct cl_io_rw_common wr; - int wr_append; - int wr_sync; - } ci_wr; - struct cl_io_rw_common ci_rw; - struct cl_setattr_io { - struct ost_lvb sa_attr; - unsigned int sa_attr_flags; - unsigned int sa_valid; - int sa_stripe_index; - const struct lu_fid *sa_parent_fid; - } ci_setattr; - struct cl_data_version_io { - u64 dv_data_version; - int dv_flags; - } ci_data_version; - struct cl_fault_io { - /** page index within file. */ - pgoff_t ft_index; - /** bytes valid byte on a faulted page. */ - size_t ft_nob; - /** writable page? for nopage() only */ - int ft_writable; - /** page of an executable? */ - int ft_executable; - /** page_mkwrite() */ - int ft_mkwrite; - /** resulting page */ - struct cl_page *ft_page; - } ci_fault; - struct cl_fsync_io { - loff_t fi_start; - loff_t fi_end; - /** file system level fid */ - struct lu_fid *fi_fid; - enum cl_fsync_mode fi_mode; - /* how many pages were written/discarded */ - unsigned int fi_nr_written; - } ci_fsync; - } u; - struct cl_2queue ci_queue; - size_t ci_nob; - int ci_result; - unsigned int ci_continue:1, - /** - * This io has held grouplock, to inform sublayers that - * don't do lockless i/o. - */ - ci_no_srvlock:1, - /** - * The whole IO need to be restarted because layout has been changed - */ - ci_need_restart:1, - /** - * to not refresh layout - the IO issuer knows that the layout won't - * change(page operations, layout change causes all page to be - * discarded), or it doesn't matter if it changes(sync). - */ - ci_ignore_layout:1, - /** - * Check if layout changed after the IO finishes. Mainly for HSM - * requirement. If IO occurs to openning files, it doesn't need to - * verify layout because HSM won't release openning files. - * Right now, only two operations need to verify layout: glimpse - * and setattr. 
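
struct cl_io keeps its per-type state in a union selected by ci_type, as shown above. Reduced to a compilable toy (all fields invented), the pattern is:

#include <stddef.h>
#include <stdio.h>

enum io_type { IO_READ, IO_WRITE, IO_FSYNC };

struct toy_io {
	enum io_type type; /* selects the union arm, like ci_type */
	union {
		struct { long long pos; size_t count; } rw;
		struct { long long start, end; } fsync;
	} u;
};

static long long io_end_offset(const struct toy_io *io)
{
	switch (io->type) {
	case IO_READ:
	case IO_WRITE:
		return io->u.rw.pos + (long long)io->u.rw.count;
	case IO_FSYNC:
		return io->u.fsync.end;
	}
	return -1;
}

int main(void)
{
	const struct toy_io io = { .type = IO_WRITE,
				   .u.rw = { .pos = 4096, .count = 512 } };

	printf("io ends at %lld\n", io_end_offset(&io));
	return 0;
}
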
- */ - ci_verify_layout:1, - /** - * file is released, restore has to be triggered by vvp layer - */ - ci_restore_needed:1, - /** - * O_NOATIME - */ - ci_noatime:1; - /** - * Number of pages owned by this IO. For invariant checking. - */ - unsigned int ci_owned_nr; -}; - -/** @} cl_io */ - -/** - * Per-transfer attributes. - */ -struct cl_req_attr { - enum cl_req_type cra_type; - u64 cra_flags; - struct cl_page *cra_page; - - /** Generic attributes for the server consumption. */ - struct obdo *cra_oa; - /** Jobid */ - char cra_jobid[LUSTRE_JOBID_SIZE]; -}; - -enum cache_stats_item { - /** how many cache lookups were performed */ - CS_lookup = 0, - /** how many times cache lookup resulted in a hit */ - CS_hit, - /** how many entities are in the cache right now */ - CS_total, - /** how many entities in the cache are actively used (and cannot be - * evicted) right now - */ - CS_busy, - /** how many entities were created at all */ - CS_create, - CS_NR -}; - -#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } - -/** - * Stats for a generic cache (similar to inode, lu_object, etc. caches). - */ -struct cache_stats { - const char *cs_name; - atomic_t cs_stats[CS_NR]; -}; - -/** These are not exported so far */ -void cache_stats_init(struct cache_stats *cs, const char *name); - -/** - * Client-side site. This represents particular client stack. "Global" - * variables should (directly or indirectly) be added here to allow multiple - * clients to co-exist in the single address space. - */ -struct cl_site { - struct lu_site cs_lu; - /** - * Statistical counters. Atomics do not scale, something better like - * per-cpu counters is needed. - * - * These are exported as /sys/kernel/debug/lustre/llite/.../site - * - * When interpreting keep in mind that both sub-locks (and sub-pages) - * and top-locks (and top-pages) are accounted here. - */ - struct cache_stats cs_pages; - atomic_t cs_pages_state[CPS_NR]; -}; - -int cl_site_init(struct cl_site *s, struct cl_device *top); -void cl_site_fini(struct cl_site *s); -void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); - -/** - * Output client site statistical counters into a buffer. Suitable for - * ll_rd_*()-style functions. - */ -int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); - -/** - * \name helpers - * - * Type conversion and accessory functions. - */ -/** @{ */ - -static inline struct cl_site *lu2cl_site(const struct lu_site *site) -{ - return container_of(site, struct cl_site, cs_lu); -} - -static inline int lu_device_is_cl(const struct lu_device *d) -{ - return d->ld_type->ldt_tags & LU_DEVICE_CL; -} - -static inline struct cl_device *lu2cl_dev(const struct lu_device *d) -{ - LASSERT(!d || IS_ERR(d) || lu_device_is_cl(d)); - return container_of_safe(d, struct cl_device, cd_lu_dev); -} - -static inline struct lu_device *cl2lu_dev(struct cl_device *d) -{ - return &d->cd_lu_dev; -} - -static inline struct cl_object *lu2cl(const struct lu_object *o) -{ - LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); - return container_of_safe(o, struct cl_object, co_lu); -} - -static inline const struct cl_object_conf * -lu2cl_conf(const struct lu_object_conf *conf) -{ - return container_of_safe(conf, struct cl_object_conf, coc_lu); -} - -static inline struct cl_object *cl_object_next(const struct cl_object *obj) -{ - return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; -} - -static inline struct cl_device *cl_object_device(const struct cl_object *o) -{ - LASSERT(!o || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); - return container_of_safe(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); -} - -static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) -{ - return container_of_safe(h, struct cl_object_header, coh_lu); -} - -static inline struct cl_site *cl_object_site(const struct cl_object *obj) -{ - return lu2cl_site(obj->co_lu.lo_dev->ld_site); -} - -static inline -struct cl_object_header *cl_object_header(const struct cl_object *obj) -{ - return luh2coh(obj->co_lu.lo_header); -} - -static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) -{ - return lu_device_init(&d->cd_lu_dev, t); -} - -static inline void cl_device_fini(struct cl_device *d) -{ - lu_device_fini(&d->cd_lu_dev); -} - -void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, - struct cl_object *obj, pgoff_t index, - const struct cl_page_operations *ops); -void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, - struct cl_object *obj, - const struct cl_lock_operations *ops); -void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, const struct cl_io_operations *ops); -/** @} helpers */ - -/** \defgroup cl_object cl_object - * @{ - */ -struct cl_object *cl_object_top(struct cl_object *o); -struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, - const struct lu_fid *fid, - const struct cl_object_conf *c); - -int cl_object_header_init(struct cl_object_header *h); -void cl_object_put(const struct lu_env *env, struct cl_object *o); -void cl_object_get(struct cl_object *o); -void cl_object_attr_lock(struct cl_object *o); -void cl_object_attr_unlock(struct cl_object *o); -int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr); -int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid); -int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, - struct ost_lvb *lvb); -int cl_conf_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf); -int cl_object_prune(const struct lu_env *env, struct cl_object *obj); -void cl_object_kill(const struct lu_env *env, struct cl_object *obj); -int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, - struct lov_user_md __user *lum); -int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, - struct ll_fiemap_info_key *fmkey, struct fiemap *fiemap, - size_t *buflen); -int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, - struct cl_layout *cl); -loff_t cl_object_maxbytes(struct cl_object *obj); - -/** - * Returns true, iff \a o0 and \a o1 are slices of the same object. - */ -static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) -{ - return cl_object_header(o0) == cl_object_header(o1); -} - -static inline void cl_object_page_init(struct cl_object *clob, int size) -{ - clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; - cl_object_header(clob)->coh_page_bufsize += cfs_size_round(size); - WARN_ON(cl_object_header(clob)->coh_page_bufsize > 512); -} - -static inline void *cl_object_page_slice(struct cl_object *clob, - struct cl_page *page) -{ - return (void *)((char *)page + clob->co_slice_off); -} - -/** - * Return refcount of cl_object. 
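
cl_object_page_init() and cl_object_page_slice() above implement a reservation scheme: at setup time each layer claims a fixed offset into the per-page buffer, and at runtime it finds its slice with plain pointer arithmetic instead of a per-slice allocation. In miniature (names invented):

#include <stdio.h>

struct toy_layer {
	int slice_off; /* this layer's offset into the page buffer */
};

static int page_bufsize; /* running total claimed by all layers */

static void layer_page_init(struct toy_layer *l, int slice_size)
{
	l->slice_off = page_bufsize; /* claim the next chunk */
	page_bufsize += slice_size;
}

static void *layer_page_slice(const struct toy_layer *l, void *page_buf)
{
	return (char *)page_buf + l->slice_off;
}

int main(void)
{
	char buf[64] = { 0 };
	struct toy_layer top, sub;

	layer_page_init(&top, 16);
	layer_page_init(&sub, 16);
	printf("top slice at +%d, sub slice at +%d\n",
	       (int)((char *)layer_page_slice(&top, buf) - buf),
	       (int)((char *)layer_page_slice(&sub, buf) - buf));
	return 0;
}
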
- */ -static inline int cl_object_refc(struct cl_object *clob) -{ - struct lu_object_header *header = clob->co_lu.lo_header; - - return atomic_read(&header->loh_ref); -} - -/** @} cl_object */ - -/** \defgroup cl_page cl_page - * @{ - */ -enum { - CLP_GANG_OKAY = 0, - CLP_GANG_RESCHED, - CLP_GANG_AGAIN, - CLP_GANG_ABORT -}; - -/* callback of cl_page_gang_lookup() */ -struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *obj, - pgoff_t idx, struct page *vmpage, - enum cl_page_type type); -struct cl_page *cl_page_alloc(const struct lu_env *env, - struct cl_object *o, pgoff_t ind, - struct page *vmpage, - enum cl_page_type type); -void cl_page_get(struct cl_page *page); -void cl_page_put(const struct lu_env *env, struct cl_page *page); -void cl_page_print(const struct lu_env *env, void *cookie, lu_printer_t printer, - const struct cl_page *pg); -void cl_page_header_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct cl_page *pg); -struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj); - -const struct cl_page_slice *cl_page_at(const struct cl_page *page, - const struct lu_device_type *dtype); - -/** - * \name ownership - * - * Functions dealing with the ownership of page by io. - */ -/** @{ */ - -int cl_page_own(const struct lu_env *env, - struct cl_io *io, struct cl_page *page); -int cl_page_own_try(const struct lu_env *env, - struct cl_io *io, struct cl_page *page); -void cl_page_assume(const struct lu_env *env, - struct cl_io *io, struct cl_page *page); -void cl_page_unassume(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg); -void cl_page_disown(const struct lu_env *env, - struct cl_io *io, struct cl_page *page); -void cl_page_disown0(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg); -int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io); - -/** @} ownership */ - -/** - * \name transfer - * - * Functions dealing with the preparation of a page for a transfer, and - * tracking transfer state. - */ -/** @{ */ -int cl_page_prep(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg, enum cl_req_type crt); -void cl_page_completion(const struct lu_env *env, - struct cl_page *pg, enum cl_req_type crt, int ioret); -int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, - enum cl_req_type crt); -int cl_page_cache_add(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg, enum cl_req_type crt); -void cl_page_clip(const struct lu_env *env, struct cl_page *pg, - int from, int to); -int cl_page_cancel(const struct lu_env *env, struct cl_page *page); -int cl_page_flush(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg); - -/** @} transfer */ - -/** - * \name helper routines - * Functions to discard, delete and export a cl_page. 
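- *
- * The cl_offset()/cl_index() helpers below convert between byte offsets
- * and page indices within an object. As an illustrative sketch only
- * (assuming the common PAGE_SIZE-sized pages; the authoritative value
- * is cl_page_size(obj)):
- *
- *	cl_offset(obj, idx) == (loff_t)idx << PAGE_SHIFT
- *	cl_index(obj, off)  == off >> PAGE_SHIFT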
- */ -/** @{ */ -void cl_page_discard(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg); -void cl_page_delete(const struct lu_env *env, struct cl_page *pg); -int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg); -void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate); -loff_t cl_offset(const struct cl_object *obj, pgoff_t idx); -pgoff_t cl_index(const struct cl_object *obj, loff_t offset); -size_t cl_page_size(const struct cl_object *obj); -int cl_pages_prune(const struct lu_env *env, struct cl_object *obj); - -void cl_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct cl_lock *lock); -void cl_lock_descr_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, - const struct cl_lock_descr *descr); -/* @} helper */ - -/** - * Data structure managing a client's cached pages. A count of - * "unstable" pages is maintained, and an LRU of clean pages is - * maintained. "unstable" pages are pages pinned by the ptlrpc - * layer for recovery purposes. - */ -struct cl_client_cache { - /** - * # of client cache refcount - * # of users (OSCs) + 2 (held by llite and lov) - */ - atomic_t ccc_users; - /** - * # of threads are doing shrinking - */ - unsigned int ccc_lru_shrinkers; - /** - * # of LRU entries available - */ - atomic_long_t ccc_lru_left; - /** - * List of entities(OSCs) for this LRU cache - */ - struct list_head ccc_lru; - /** - * Max # of LRU entries - */ - unsigned long ccc_lru_max; - /** - * Lock to protect ccc_lru list - */ - spinlock_t ccc_lru_lock; - /** - * Set if unstable check is enabled - */ - unsigned int ccc_unstable_check:1; - /** - * # of unstable pages for this mount point - */ - atomic_long_t ccc_unstable_nr; - /** - * Waitq for awaiting unstable pages to reach zero. 
- * Used at umounting time and signaled on BRW commit - */ - wait_queue_head_t ccc_unstable_waitq; - -}; - -/** - * cl_cache functions - */ -struct cl_client_cache *cl_cache_init(unsigned long lru_page_max); -void cl_cache_incref(struct cl_client_cache *cache); -void cl_cache_decref(struct cl_client_cache *cache); - -/** @} cl_page */ - -/** \defgroup cl_lock cl_lock - * @{ - */ - -int cl_lock_request(const struct lu_env *env, struct cl_io *io, - struct cl_lock *lock); -int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, - const struct cl_io *io); -void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock); -const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, - const struct lu_device_type *dtype); -void cl_lock_release(const struct lu_env *env, struct cl_lock *lock); -int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, - struct cl_lock *lock, struct cl_sync_io *anchor); -void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); - -/** @} cl_lock */ - -/** \defgroup cl_io cl_io - * @{ - */ - -int cl_io_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, struct cl_object *obj); -int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, struct cl_object *obj); -int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, loff_t pos, size_t count); -int cl_io_loop(const struct lu_env *env, struct cl_io *io); - -void cl_io_fini(const struct lu_env *env, struct cl_io *io); -int cl_io_iter_init(const struct lu_env *env, struct cl_io *io); -void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io); -int cl_io_lock(const struct lu_env *env, struct cl_io *io); -void cl_io_unlock(const struct lu_env *env, struct cl_io *io); -int cl_io_start(const struct lu_env *env, struct cl_io *io); -void cl_io_end(const struct lu_env *env, struct cl_io *io); -int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, - struct cl_io_lock_link *link); -int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, - struct cl_lock_descr *descr); -int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, - enum cl_req_type iot, struct cl_2queue *queue); -int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, - enum cl_req_type iot, struct cl_2queue *queue, - long timeout); -int cl_io_commit_async(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, int from, int to, - cl_commit_cbt cb); -int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, - pgoff_t start, struct cl_read_ahead *ra); -int cl_io_is_going(const struct lu_env *env); - -/** - * True, iff \a io is an O_APPEND write(2). - */ -static inline int cl_io_is_append(const struct cl_io *io) -{ - return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; -} - -static inline int cl_io_is_sync_write(const struct cl_io *io) -{ - return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; -} - -static inline int cl_io_is_mkwrite(const struct cl_io *io) -{ - return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; -} - -/** - * True, iff \a io is a truncate(2). 
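- *
- * For illustration, a truncate(2) is driven through the generic io
- * entry points above roughly as follows (a sketch only; error handling
- * and the iteration details of cl_io_loop() are omitted):
- *
- *	cl_io_init(env, io, CIT_SETATTR, obj);
- *	io->u.ci_setattr.sa_valid |= ATTR_SIZE;
- *	cl_io_loop(env, io);
- *	cl_io_fini(env, io);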
- */
-static inline int cl_io_is_trunc(const struct cl_io *io)
-{
-	return io->ci_type == CIT_SETATTR &&
-	       (io->u.ci_setattr.sa_valid & ATTR_SIZE);
-}
-
-struct cl_io *cl_io_top(struct cl_io *io);
-
-#define CL_IO_SLICE_CLEAN(foo_io, base)					\
-do {									\
-	typeof(foo_io) __foo_io = (foo_io);				\
-									\
-	BUILD_BUG_ON(offsetof(typeof(*__foo_io), base) != 0);		\
-	memset(&__foo_io->base + 1, 0,					\
-	       sizeof(*__foo_io) - sizeof(__foo_io->base));		\
-} while (0)
-
-/** @} cl_io */
-
-/** \defgroup cl_page_list cl_page_list
- * @{
- */
-
-/**
- * Last page in the page list.
- */
-static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist)
-{
-	LASSERT(plist->pl_nr > 0);
-	return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch);
-}
-
-static inline struct cl_page *cl_page_list_first(struct cl_page_list *plist)
-{
-	LASSERT(plist->pl_nr > 0);
-	return list_entry(plist->pl_pages.next, struct cl_page, cp_batch);
-}
-
-/**
- * Iterate over pages in a page list.
- */
-#define cl_page_list_for_each(page, list)				\
-	list_for_each_entry((page), &(list)->pl_pages, cp_batch)
-
-/**
- * Iterate over pages in a page list, taking possible removals into account.
- */
-#define cl_page_list_for_each_safe(page, temp, list)			\
-	list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
-
-void cl_page_list_init(struct cl_page_list *plist);
-void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page);
-void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
-		       struct cl_page *page);
-void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src,
-			    struct cl_page *page);
-void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head);
-void cl_page_list_del(const struct lu_env *env, struct cl_page_list *plist,
-		      struct cl_page *page);
-void cl_page_list_disown(const struct lu_env *env,
-			 struct cl_io *io, struct cl_page_list *plist);
-void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist);
-
-void cl_2queue_init(struct cl_2queue *queue);
-void cl_2queue_disown(const struct lu_env *env,
-		      struct cl_io *io, struct cl_2queue *queue);
-void cl_2queue_discard(const struct lu_env *env,
-		       struct cl_io *io, struct cl_2queue *queue);
-void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue);
-void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
-
-/** @} cl_page_list */
-
-void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj,
-		     struct cl_req_attr *attr);
-
-/** \defgroup cl_sync_io cl_sync_io
- * @{
- */
-
-/**
- * Anchor for synchronous transfer. This is allocated on the stack by the
- * thread doing a synchronous transfer, and a pointer to this structure is
- * set up in every page submitted for transfer. The transfer completion
- * routine updates the anchor and wakes up the waiting thread when the
- * transfer is complete.
- */
-struct cl_sync_io {
-	/** number of pages yet to be transferred. */
-	atomic_t		csi_sync_nr;
-	/** error code. */
-	int			csi_sync_rc;
-	/** barrier protecting destruction of this structure */
-	atomic_t		csi_barrier;
-	/** completion to be signaled when transfer is complete.
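- *
- * The expected sequence on the submitting side is, as a sketch
- * (page submission details omitted):
- *
- *	cl_sync_io_init(&anchor, nr_pages, end_fn);
- *	... submit the pages, each referencing &anchor ...
- *	rc = cl_sync_io_wait(env, &anchor, timeout);
- *
- * while each completed transfer calls cl_sync_io_note() to decrement
- * csi_sync_nr and wake this queue when it reaches zero.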
*/ - wait_queue_head_t csi_waitq; - /** callback to invoke when this IO is finished */ - void (*csi_end_io)(const struct lu_env *, - struct cl_sync_io *); -}; - -void cl_sync_io_init(struct cl_sync_io *anchor, int nr, - void (*end)(const struct lu_env *, struct cl_sync_io *)); -int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, - long timeout); -void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, - int ioret); -void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor); - -/** @} cl_sync_io */ - -/** \defgroup cl_env cl_env - * - * lu_env handling for a client. - * - * lu_env is an environment within which lustre code executes. Its major part - * is lu_context---a fast memory allocation mechanism that is used to conserve - * precious kernel stack space. Originally lu_env was designed for a server, - * where - * - * - there is a (mostly) fixed number of threads, and - * - * - call chains have no non-lustre portions inserted between lustre code. - * - * On a client both these assumption fails, because every user thread can - * potentially execute lustre code as part of a system call, and lustre calls - * into VFS or MM that call back into lustre. - * - * To deal with that, cl_env wrapper functions implement the following - * optimizations: - * - * - allocation and destruction of environment is amortized by caching no - * longer used environments instead of destroying them; - * - * \see lu_env, lu_context, lu_context_key - * @{ - */ - -struct lu_env *cl_env_get(u16 *refcheck); -struct lu_env *cl_env_alloc(u16 *refcheck, __u32 tags); -void cl_env_put(struct lu_env *env, u16 *refcheck); -unsigned int cl_env_cache_purge(unsigned int nr); -struct lu_env *cl_env_percpu_get(void); -void cl_env_percpu_put(struct lu_env *env); - -/** @} cl_env */ - -/* - * Misc - */ -void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); - -struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, - struct lu_device_type *ldt, - struct lu_device *next); -/** @} clio */ - -int cl_global_init(void); -void cl_global_fini(void); - -#endif /* _LINUX_CL_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h deleted file mode 100644 index 7d119c1a046907f62c0d0a0a2a489ef3f1c411d3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/interval_tree.h +++ /dev/null @@ -1,119 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. 
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lustre/include/interval_tree.h
- *
- * Author: Huang Wei
- * Author: Jay Xiong
- */
-
-#ifndef _INTERVAL_H__
-#define _INTERVAL_H__
-
-#include
-#include
-#include
-
-struct interval_node {
-	struct interval_node	*in_left;
-	struct interval_node	*in_right;
-	struct interval_node	*in_parent;
-	unsigned		in_color:1,
-				in_intree:1, /** set if the node is in the tree */
-				in_res1:30;
-	__u8			in_res2[4];  /** tags, 8-byte aligned */
-	__u64			in_max_high;
-	struct interval_node_extent {
-		__u64 start;
-		__u64 end;
-	} in_extent;
-};
-
-enum interval_iter {
-	INTERVAL_ITER_CONT = 1,
-	INTERVAL_ITER_STOP = 2
-};
-
-static inline int interval_is_intree(struct interval_node *node)
-{
-	return node->in_intree == 1;
-}
-
-static inline __u64 interval_low(struct interval_node *node)
-{
-	return node->in_extent.start;
-}
-
-static inline __u64 interval_high(struct interval_node *node)
-{
-	return node->in_extent.end;
-}
-
-static inline int interval_set(struct interval_node *node,
-			       __u64 start, __u64 end)
-{
-	if (start > end)
-		return -ERANGE;
-	node->in_extent.start = start;
-	node->in_extent.end = end;
-	node->in_max_high = end;
-	return 0;
-}
-
-/*
- * Rules for writing an interval callback:
- * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration
-
-   should be stopped. This causes the iteration function to return
-   immediately with return value INTERVAL_ITER_STOP.
- * - callbacks for interval_iterate and interval_iterate_reverse: every
-   node in the tree is passed as @node when the callback is called.
- * - callback for interval_search: only overlapping nodes are passed as
-   @node when the callback is called.
- */
-typedef enum interval_iter (*interval_callback_t)(struct interval_node *node,
-						  void *args);
-
-struct interval_node *interval_insert(struct interval_node *node,
-				      struct interval_node **root);
-void interval_erase(struct interval_node *node, struct interval_node **root);
-
-/*
- * Search the extents in the tree and call @func for each overlapping
- * extent.
- */
-enum interval_iter interval_search(struct interval_node *root,
-				   struct interval_node_extent *ex,
-				   interval_callback_t func, void *data);
-
-enum interval_iter interval_iterate_reverse(struct interval_node *root,
-					    interval_callback_t func,
-					    void *data);
-
-#endif
diff --git a/drivers/staging/lustre/lustre/include/llog_swab.h b/drivers/staging/lustre/lustre/include/llog_swab.h
deleted file mode 100644
index 0433b79efdcb7dc3abd19245a1a42edbb5d082a5..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/include/llog_swab.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2014, Intel Corporation. - * - * Copyright 2015 Cray Inc, all rights reserved. - * Author: Ben Evans. - * - * We assume all nodes are either little-endian or big-endian, and we - * always send messages in the sender's native format. The receiver - * detects the message format by checking the 'magic' field of the message - * (see lustre_msg_swabbed() below). - * - * Each type has corresponding 'lustre_swab_xxxtypexxx()' routines - * are implemented in ptlrpc/pack_generic.c. These 'swabbers' convert the - * type from "other" endian, in-place in the message buffer. - * - * A swabber takes a single pointer argument. The caller must already have - * verified that the length of the message buffer >= sizeof (type). - * - * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine - * may be defined that swabs just the variable part, after the caller has - * verified that the message buffer is large enough. - */ - -#ifndef _LLOG_SWAB_H_ -#define _LLOG_SWAB_H_ - -#include - -struct lustre_cfg; - -void lustre_swab_lu_fid(struct lu_fid *fid); -void lustre_swab_ost_id(struct ost_id *oid); -void lustre_swab_llogd_body(struct llogd_body *d); -void lustre_swab_llog_hdr(struct llog_log_hdr *h); -void lustre_swab_llogd_conn_body(struct llogd_conn_body *d); -void lustre_swab_llog_rec(struct llog_rec_hdr *rec); -void lustre_swab_lu_seq_range(struct lu_seq_range *range); -void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); -void lustre_swab_cfg_marker(struct cfg_marker *marker, - int swab, int size); - -#endif diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h deleted file mode 100644 index 495e6f5f676bdb94d2f5a0328dd33f38d7b6f80f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lprocfs_status.h +++ /dev/null @@ -1,646 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- *
- * lustre/include/lprocfs_status.h
- *
- * Top level header file for LProc SNMP
- *
- * Author: Hariharan Thantry thantry@users.sourceforge.net
- */
-#ifndef _LPROCFS_SNMP_H
-#define _LPROCFS_SNMP_H
-
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-
-struct lprocfs_vars {
-	const char			*name;
-	const struct file_operations	*fops;
-	void				*data;
-	/**
-	 * sysfs file mode.
-	 */
-	umode_t				proc_mode;
-};
-
-struct lprocfs_static_vars {
-	struct lprocfs_vars		*obd_vars;
-	const struct attribute_group	*sysfs_vars;
-};
-
-/* if we find more consumers this could be generalized */
-#define OBD_HIST_MAX 32
-struct obd_histogram {
-	spinlock_t	oh_lock;
-	unsigned long	oh_buckets[OBD_HIST_MAX];
-};
-
-enum {
-	BRW_R_PAGES = 0,
-	BRW_W_PAGES,
-	BRW_R_RPC_HIST,
-	BRW_W_RPC_HIST,
-	BRW_R_IO_TIME,
-	BRW_W_IO_TIME,
-	BRW_R_DISCONT_PAGES,
-	BRW_W_DISCONT_PAGES,
-	BRW_R_DISCONT_BLOCKS,
-	BRW_W_DISCONT_BLOCKS,
-	BRW_R_DISK_IOSIZE,
-	BRW_W_DISK_IOSIZE,
-	BRW_R_DIO_FRAGS,
-	BRW_W_DIO_FRAGS,
-	BRW_LAST,
-};
-
-struct brw_stats {
-	struct obd_histogram hist[BRW_LAST];
-};
-
-enum {
-	RENAME_SAMEDIR_SIZE = 0,
-	RENAME_CROSSDIR_SRC_SIZE,
-	RENAME_CROSSDIR_TGT_SIZE,
-	RENAME_LAST,
-};
-
-struct rename_stats {
-	struct obd_histogram hist[RENAME_LAST];
-};
-
-/* An lprocfs counter can be configured using the enum bit masks below.
- *
- * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
- * protects this counter from concurrent updates. If not specified,
- * lprocfs uses an internal per-counter lock variable. External locks
- * are not used to protect counter increments, but are used to protect
- * counter readout and resets.
- *
- * LPROCFS_CNTR_AVGMINMAX indicates that the counter takes multi-valued
- * samples (i.e. the counter can be incremented by more than "1"). When
- * specified, the counter maintains min, max and sum in addition to a
- * simple invocation count, which allows averages to be computed.
- * If not specified, the counter is an increment-by-1 counter;
- * min, max, sum, etc. are not maintained.
- *
- * LPROCFS_CNTR_STDDEV indicates that the counter should track the sum
- * of squares (for multi-valued counter samples only). This allows
- * external computation of the standard deviation, but involves a 64-bit
- * multiply per counter increment.
- */
-
-enum {
-	LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
-	LPROCFS_CNTR_AVGMINMAX	  = 0x0002,
-	LPROCFS_CNTR_STDDEV	  = 0x0004,
-
-	/* counter data type */
-	LPROCFS_TYPE_REGS	  = 0x0100,
-	LPROCFS_TYPE_BYTES	  = 0x0200,
-	LPROCFS_TYPE_PAGES	  = 0x0400,
-	LPROCFS_TYPE_CYCLE	  = 0x0800,
-};
-
-#define LC_MIN_INIT ((~(__u64)0) >> 1)
-
-struct lprocfs_counter_header {
-	unsigned int	lc_config;
-	const char	*lc_name;	/* must be static */
-	const char	*lc_units;	/* must be static */
-};
-
-struct lprocfs_counter {
-	__s64	lc_count;
-	__s64	lc_min;
-	__s64	lc_max;
-	__s64	lc_sumsquare;
-	/*
-	 * Every counter has lc_array_sum[0]; lc_array_sum[1] is only used
-	 * for irq context counters, i.e.
stats with - * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need - * lc_array_sum[1] - */ - __s64 lc_array_sum[1]; -}; - -#define lc_sum lc_array_sum[0] -#define lc_sum_irq lc_array_sum[1] - -struct lprocfs_percpu { -#ifndef __GNUC__ - __s64 pad; -#endif - struct lprocfs_counter lp_cntr[0]; -}; - -enum lprocfs_stats_lock_ops { - LPROCFS_GET_NUM_CPU = 0x0001, /* number allocated per-CPU stats */ - LPROCFS_GET_SMP_ID = 0x0002, /* current stat to be updated */ -}; - -enum lprocfs_stats_flags { - LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ - LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu - * area and need locking - */ - LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ -}; - -enum lprocfs_fields_flags { - LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, - LPROCFS_FIELDS_FLAGS_SUM = 0x0002, - LPROCFS_FIELDS_FLAGS_MIN = 0x0003, - LPROCFS_FIELDS_FLAGS_MAX = 0x0004, - LPROCFS_FIELDS_FLAGS_AVG = 0x0005, - LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, - LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, -}; - -struct lprocfs_stats { - /* # of counters */ - unsigned short ls_num; - /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ - unsigned short ls_biggest_alloc_num; - enum lprocfs_stats_flags ls_flags; - /* Lock used when there are no percpu stats areas; For percpu stats, - * it is used to protect ls_biggest_alloc_num change - */ - spinlock_t ls_lock; - - /* has ls_num of counter headers */ - struct lprocfs_counter_header *ls_cnt_header; - struct lprocfs_percpu *ls_percpu[0]; -}; - -#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) - -/* Pack all opcodes down into a single monotonically increasing index */ -static inline int opcode_offset(__u32 opc) -{ - if (opc < OST_LAST_OPC) { - /* OST opcode */ - return (opc - OST_FIRST_OPC); - } else if (opc < MDS_LAST_OPC) { - /* MDS opcode */ - return (opc - MDS_FIRST_OPC + - OPC_RANGE(OST)); - } else if (opc < LDLM_LAST_OPC) { - /* LDLM Opcode */ - return (opc - LDLM_FIRST_OPC + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < MGS_LAST_OPC) { - /* MGS Opcode */ - return (opc - MGS_FIRST_OPC + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < OBD_LAST_OPC) { - /* OBD Ping */ - return (opc - OBD_FIRST_OPC + - OPC_RANGE(MGS) + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < LLOG_LAST_OPC) { - /* LLOG Opcode */ - return (opc - LLOG_FIRST_OPC + - OPC_RANGE(OBD) + - OPC_RANGE(MGS) + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < QUOTA_LAST_OPC) { - /* LQUOTA Opcode */ - return (opc - QUOTA_FIRST_OPC + - OPC_RANGE(LLOG) + - OPC_RANGE(OBD) + - OPC_RANGE(MGS) + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < SEQ_LAST_OPC) { - /* SEQ opcode */ - return (opc - SEQ_FIRST_OPC + - OPC_RANGE(QUOTA) + - OPC_RANGE(LLOG) + - OPC_RANGE(OBD) + - OPC_RANGE(MGS) + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < SEC_LAST_OPC) { - /* SEC opcode */ - return (opc - SEC_FIRST_OPC + - OPC_RANGE(SEQ) + - OPC_RANGE(QUOTA) + - OPC_RANGE(LLOG) + - OPC_RANGE(OBD) + - OPC_RANGE(MGS) + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else if (opc < FLD_LAST_OPC) { - /* FLD opcode */ - return (opc - FLD_FIRST_OPC + - OPC_RANGE(SEC) + - OPC_RANGE(SEQ) + - OPC_RANGE(QUOTA) + - OPC_RANGE(LLOG) + - OPC_RANGE(OBD) + - OPC_RANGE(MGS) + - OPC_RANGE(LDLM) + - OPC_RANGE(MDS) + - OPC_RANGE(OST)); - } else { - /* Unknown Opcode */ - return -1; - } -} - -#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ 
-			    OPC_RANGE(MDS)  + \
-			    OPC_RANGE(LDLM) + \
-			    OPC_RANGE(MGS)  + \
-			    OPC_RANGE(OBD)  + \
-			    OPC_RANGE(LLOG) + \
-			    OPC_RANGE(QUOTA) + \
-			    OPC_RANGE(SEQ)  + \
-			    OPC_RANGE(SEC)  + \
-			    OPC_RANGE(FLD))
-
-#define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \
-			   OPC_RANGE(EXTRA))
-
-enum {
-	PTLRPC_REQWAIT_CNTR = 0,
-	PTLRPC_REQQDEPTH_CNTR,
-	PTLRPC_REQACTIVE_CNTR,
-	PTLRPC_TIMEOUT,
-	PTLRPC_REQBUF_AVAIL_CNTR,
-	PTLRPC_LAST_CNTR
-};
-
-#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR
-
-enum {
-	LDLM_GLIMPSE_ENQUEUE = 0,
-	LDLM_PLAIN_ENQUEUE,
-	LDLM_EXTENT_ENQUEUE,
-	LDLM_FLOCK_ENQUEUE,
-	LDLM_IBITS_ENQUEUE,
-	MDS_REINT_SETATTR,
-	MDS_REINT_CREATE,
-	MDS_REINT_LINK,
-	MDS_REINT_UNLINK,
-	MDS_REINT_RENAME,
-	MDS_REINT_OPEN,
-	MDS_REINT_SETXATTR,
-	BRW_READ_BYTES,
-	BRW_WRITE_BYTES,
-	EXTRA_LAST_OPC
-};
-
-#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
-/* class_obd.c */
-extern struct dentry *debugfs_lustre_root;
-extern struct kobject *lustre_kobj;
-
-struct obd_device;
-struct obd_histogram;
-
-/* Days / hours / mins / seconds format */
-struct dhms {
-	int d, h, m, s;
-};
-
-static inline void s2dhms(struct dhms *ts, time64_t secs64)
-{
-	unsigned int secs;
-
-	ts->d = div_u64_rem(secs64, 86400, &secs);
-	ts->h = secs / 3600;
-	secs = secs % 3600;
-	ts->m = secs / 60;
-	ts->s = secs % 60;
-}
-
-#define DHMS_FMT "%dd%dh%02dm%02ds"
-#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s
-
-#define JOBSTATS_JOBID_VAR_MAX_LEN 20
-#define JOBSTATS_DISABLE "disable"
-#define JOBSTATS_PROCNAME_UID "procname_uid"
-#define JOBSTATS_NODELOCAL "nodelocal"
-
-/* obd_config.c */
-void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
-
-int lprocfs_write_frac_helper(const char __user *buffer,
-			      unsigned long count, int *val, int mult);
-int lprocfs_read_frac_helper(char *buffer, unsigned long count,
-			     long val, int mult);
-
-int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
-			    unsigned int cpuid);
-int lprocfs_stats_lock(struct lprocfs_stats *stats,
-		       enum lprocfs_stats_lock_ops opc,
-		       unsigned long *flags);
-void lprocfs_stats_unlock(struct lprocfs_stats *stats,
-			  enum lprocfs_stats_lock_ops opc,
-			  unsigned long *flags);
-
-static inline unsigned int
-lprocfs_stats_counter_size(struct lprocfs_stats *stats)
-{
-	unsigned int percpusize;
-
-	percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);
-
-	/* irq safe stats need lc_array_sum[1] */
-	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
-		percpusize += stats->ls_num * sizeof(__s64);
-
-	if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
-		percpusize = L1_CACHE_ALIGN(percpusize);
-
-	return percpusize;
-}
-
-static inline struct lprocfs_counter *
-lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
-			  int index)
-{
-	struct lprocfs_counter *cntr;
-
-	cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];
-
-	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
-		cntr = (void *)cntr + index * sizeof(__s64);
-
-	return cntr;
-}
-
-/* Two optimized LPROCFS counter increment functions are provided:
- *     lprocfs_counter_incr(stats, idx) - optimized for by-one counters
- *     lprocfs_counter_add(stats, idx, amount) - use for multi-valued counters
- * Counter data layout allows the config flag, the counter lock and the
- * count itself to reside within a single cache line.
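- *
- * A minimal usage sketch (a hypothetical one-counter example built only
- * from the declarations below):
- *
- *	stats = lprocfs_alloc_stats(1, LPROCFS_STATS_FLAG_NONE);
- *	lprocfs_counter_init(stats, 0, LPROCFS_CNTR_AVGMINMAX,
- *			     "read_bytes", "bytes");
- *	lprocfs_counter_add(stats, 0, nbytes);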
- */ - -void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount); -void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount); - -#define lprocfs_counter_incr(stats, idx) \ - lprocfs_counter_add(stats, idx, 1) -#define lprocfs_counter_decr(stats, idx) \ - lprocfs_counter_sub(stats, idx, 1) - -__s64 lprocfs_read_helper(struct lprocfs_counter *lc, - struct lprocfs_counter_header *header, - enum lprocfs_stats_flags flags, - enum lprocfs_fields_flags field); -__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, - enum lprocfs_fields_flags field); - -extern struct lprocfs_stats * -lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); -void lprocfs_clear_stats(struct lprocfs_stats *stats); -void lprocfs_free_stats(struct lprocfs_stats **stats); -void lprocfs_counter_init(struct lprocfs_stats *stats, int index, - unsigned int conf, const char *name, - const char *units); -struct obd_export; -int lprocfs_exp_cleanup(struct obd_export *exp); -extern const struct file_operations lprocfs_stats_seq_fops; - -/* lprocfs_status.c */ -void ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *var, - void *data); - -int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list, - const struct attribute_group *attrs); -int lprocfs_obd_cleanup(struct obd_device *obd); - -/* Generic callbacks */ - -int lprocfs_rd_uint(struct seq_file *m, void *data); -int lprocfs_wr_uint(struct file *file, const char __user *buffer, - unsigned long count, void *data); -int lprocfs_rd_server_uuid(struct seq_file *m, void *data); -int lprocfs_rd_conn_uuid(struct seq_file *m, void *data); -int lprocfs_rd_import(struct seq_file *m, void *data); -int lprocfs_rd_state(struct seq_file *m, void *data); -int lprocfs_rd_connect_flags(struct seq_file *m, void *data); - -struct adaptive_timeout; -int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at); -int lprocfs_rd_timeouts(struct seq_file *m, void *data); -int lprocfs_wr_ping(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -int lprocfs_wr_import(struct file *file, const char __user *buffer, - size_t count, loff_t *off); -int lprocfs_rd_pinger_recov(struct seq_file *m, void *n); -int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer, - size_t count, loff_t *off); - -/* Statfs helpers */ - -int lprocfs_write_helper(const char __user *buffer, unsigned long count, - int *val); -int lprocfs_write_u64_helper(const char __user *buffer, - unsigned long count, __u64 *val); -int lprocfs_write_frac_u64_helper(const char __user *buffer, - unsigned long count, - __u64 *val, int mult); -char *lprocfs_find_named_value(const char *buffer, const char *name, - size_t *count); -void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); -void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); -void lprocfs_oh_clear(struct obd_histogram *oh); -unsigned long lprocfs_oh_sum(struct obd_histogram *oh); - -void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, - struct lprocfs_counter *cnt); - -int lprocfs_single_release(struct inode *inode, struct file *file); -int lprocfs_seq_release(struct inode *inode, struct file *file); - -/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only - * proc entries; otherwise, you will define name##_seq_write function also for - * a read-write proc entry, and then call LPROC_SEQ_SEQ instead. 
Finally, - * call ldebugfs_obd_seq_create(obd, filename, 0444, &name#_fops, data); - */ -#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ -static int name##_single_open(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, name##_seq_show, inode->i_private); \ -} \ -static const struct file_operations name##_fops = { \ - .owner = THIS_MODULE, \ - .open = name##_single_open, \ - .read = seq_read, \ - .write = custom_seq_write, \ - .llseek = seq_lseek, \ - .release = lprocfs_single_release, \ -} - -#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) -#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) - -#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ - static int name##_##type##_seq_show(struct seq_file *m, void *v)\ - { \ - return lprocfs_rd_##type(m, m->private); \ - } \ - LPROC_SEQ_FOPS_RO(name##_##type) - -#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ - static int name##_##type##_seq_show(struct seq_file *m, void *v)\ - { \ - return lprocfs_rd_##type(m, m->private); \ - } \ - static ssize_t name##_##type##_seq_write(struct file *file, \ - const char __user *buffer, size_t count, \ - loff_t *off) \ - { \ - struct seq_file *seq = file->private_data; \ - return lprocfs_wr_##type(file, buffer, \ - count, seq->private); \ - } \ - LPROC_SEQ_FOPS(name##_##type) - -#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ - static ssize_t name##_##type##_write(struct file *file, \ - const char __user *buffer, size_t count, \ - loff_t *off) \ - { \ - return lprocfs_wr_##type(file, buffer, count, off); \ - } \ - static int name##_##type##_open(struct inode *inode, struct file *file) \ - { \ - return single_open(file, NULL, inode->i_private); \ - } \ - static const struct file_operations name##_##type##_fops = { \ - .open = name##_##type##_open, \ - .write = name##_##type##_write, \ - .release = lprocfs_single_release, \ - } - -struct lustre_attr { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct attribute *attr, - char *buf); - ssize_t (*store)(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len); -}; - -#define LUSTRE_ATTR(name, mode, show, store) \ -static struct lustre_attr lustre_attr_##name = __ATTR(name, mode, show, store) - -#define LUSTRE_RO_ATTR(name) LUSTRE_ATTR(name, 0444, name##_show, NULL) -#define LUSTRE_RW_ATTR(name) LUSTRE_ATTR(name, 0644, name##_show, name##_store) - -extern const struct sysfs_ops lustre_sysfs_ops; - -struct root_squash_info; -int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, - struct root_squash_info *squash, char *name); -int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, - struct root_squash_info *squash, char *name); - -/* all quota proc functions */ -int lprocfs_quota_rd_bunit(char *page, char **start, - loff_t off, int count, - int *eof, void *data); -int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_quota_rd_btune(char *page, char **start, - loff_t off, int count, - int *eof, void *data); -int lprocfs_quota_wr_btune(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_quota_rd_iunit(char *page, char **start, - loff_t off, int count, - int *eof, void *data); -int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_quota_rd_itune(char *page, char **start, - loff_t off, int count, - int *eof, void *data); -int lprocfs_quota_wr_itune(struct file *file, const char *buffer, - 
unsigned long count, void *data); -int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count, - int *eof, void *data); -int lprocfs_quota_wr_type(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_switch_seconds(struct file *file, - const char *buffer, - unsigned long count, void *data); -int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_switch_qs(struct file *file, - const char *buffer, unsigned long count, - void *data); -int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_boundary_factor(struct file *file, - const char *buffer, unsigned long count, - void *data); -int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_least_bunit(struct file *file, - const char *buffer, unsigned long count, - void *data); -int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_least_iunit(struct file *file, - const char *buffer, unsigned long count, - void *data); -int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off, - int count, int *eof, void *data); -int lprocfs_quota_wr_qs_factor(struct file *file, - const char *buffer, unsigned long count, - void *data); -#endif /* LPROCFS_SNMP_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h deleted file mode 100644 index f29bbca5af656d5a605d4504487548d9bb6eca1f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lu_object.h +++ /dev/null @@ -1,1305 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#ifndef __LUSTRE_LU_OBJECT_H -#define __LUSTRE_LU_OBJECT_H - -#include -#include -#include -#include -#include - -struct seq_file; -struct lustre_cfg; -struct lprocfs_stats; - -/** \defgroup lu lu - * lu_* data-types represent server-side entities shared by data and meta-data - * stacks. - * - * Design goals: - * - * -# support for layering. - * - * Server side object is split into layers, one per device in the - * corresponding device stack. Individual layer is represented by struct - * lu_object. Compound layered object --- by struct lu_object_header. Most - * interface functions take lu_object as an argument and operate on the - * whole compound object. This decision was made due to the following - * reasons: - * - * - it's envisaged that lu_object will be used much more often than - * lu_object_header; - * - * - we want lower (non-top) layers to be able to initiate operations - * on the whole object. - * - * Generic code supports layering more complex than simple stacking, e.g., - * it is possible that at some layer object "spawns" multiple sub-objects - * on the lower layer. - * - * -# fid-based identification. - * - * Compound object is uniquely identified by its fid. Objects are indexed - * by their fids (hash table is used for index). - * - * -# caching and life-cycle management. - * - * Object's life-time is controlled by reference counting. When reference - * count drops to 0, object is returned to cache. Cached objects still - * retain their identity (i.e., fid), and can be recovered from cache. - * - * Objects are kept in the global LRU list, and lu_site_purge() function - * can be used to reclaim given number of unused objects from the tail of - * the LRU. - * - * -# avoiding recursion. - * - * Generic code tries to replace recursion through layers by iterations - * where possible. Additionally to the end of reducing stack consumption, - * data, when practically possible, are allocated through lu_context_key - * interface rather than on stack. - * @{ - */ - -struct lu_site; -struct lu_object; -struct lu_device; -struct lu_object_header; -struct lu_context; -struct lu_env; - -/** - * Operations common for data and meta-data devices. - */ -struct lu_device_operations { - /** - * Allocate object for the given device (without lower-layer - * parts). This is called by lu_object_operations::loo_object_init() - * from the parent layer, and should setup at least lu_object::lo_dev - * and lu_object::lo_ops fields of resulting lu_object. - * - * Object creation protocol. - * - * Due to design goal of avoiding recursion, object creation (see - * lu_object_alloc()) is somewhat involved: - * - * - first, lu_device_operations::ldo_object_alloc() method of the - * top-level device in the stack is called. It should allocate top - * level object (including lu_object_header), but without any - * lower-layer sub-object(s). - * - * - then lu_object_alloc() sets fid in the header of newly created - * object. - * - * - then lu_object_operations::loo_object_init() is called. It has - * to allocate lower-layer object(s). To do this, - * lu_object_operations::loo_object_init() calls ldo_object_alloc() - * of the lower-layer device(s). - * - * - for all new objects allocated by - * lu_object_operations::loo_object_init() (and inserted into object - * stack), lu_object_operations::loo_object_init() is called again - * repeatedly, until no new objects are created. 
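- *
- * For a hypothetical two-layer stack ("top" over "sub"), the above
- * unrolls as a sketch to: top's ldo_object_alloc() creates the header
- * and the top slice; top's loo_object_init() calls sub's
- * ldo_object_alloc() for the lower slice; sub's loo_object_init()
- * allocates nothing further, so allocation terminates.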
- *
- * \post ergo(!IS_ERR(result), result->lo_dev == d &&
- *			       result->lo_ops != NULL);
- */
-	struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
-					      const struct lu_object_header *h,
-					      struct lu_device *d);
-	/**
-	 * Process configuration specific to the device.
-	 */
-	int (*ldo_process_config)(const struct lu_env *env,
-				  struct lu_device *, struct lustre_cfg *);
-	int (*ldo_recovery_complete)(const struct lu_env *,
-				     struct lu_device *);
-
-	/**
-	 * Initialize local objects for the device. This method is called
-	 * after the layer has been initialized (after the LCFG_SETUP stage)
-	 * and before it starts serving user requests.
-	 */
-
-	int (*ldo_prepare)(const struct lu_env *,
-			   struct lu_device *parent,
-			   struct lu_device *dev);
-
-};
-
-/**
- * For lu_object_conf flags
- */
-enum loc_flags {
-	/* This is a new object to be allocated, or the file
-	 * corresponding to the object does not exist yet.
-	 */
-	LOC_F_NEW	= 0x00000001,
-};
-
-/**
- * Object configuration, describing particulars of the object being created.
- * On the server this is not used, as server objects are fully identified by
- * fid. On the client the configuration contains struct lustre_md.
- */
-struct lu_object_conf {
-	/**
-	 * Some hints for obj find and alloc.
-	 */
-	enum loc_flags loc_flags;
-};
-
-/**
- * Type of "printer" function used by lu_object_operations::loo_object_print()
- * method.
- *
- * A printer function is needed to provide some flexibility in (semi-)debugging
- * output; possible implementations: printk, CDEBUG, sysfs/seq_file.
- */
-typedef int (*lu_printer_t)(const struct lu_env *env,
-			    void *cookie, const char *format, ...)
-	__printf(3, 4);
-
-/**
- * Operations specific to a particular lu_object.
- */
-struct lu_object_operations {
-	/**
-	 * Allocate lower-layer parts of the object by calling
-	 * lu_device_operations::ldo_object_alloc() of the corresponding
-	 * underlying device.
-	 *
-	 * This method is called once for each object inserted into the
-	 * object stack. It is the responsibility of this method to insert
-	 * the lower-layer object(s) it creates into the appropriate places
-	 * of the object stack.
-	 */
-	int (*loo_object_init)(const struct lu_env *env,
-			       struct lu_object *o,
-			       const struct lu_object_conf *conf);
-	/**
-	 * Called (in top-to-bottom order) during object allocation after all
-	 * layers were allocated and initialized. Can be used to perform
-	 * initialization depending on lower layers.
-	 */
-	int (*loo_object_start)(const struct lu_env *env,
-				struct lu_object *o);
-	/**
-	 * Called before lu_object_operations::loo_object_free() to signal
-	 * that object is being destroyed. Dual to
-	 * lu_object_operations::loo_object_init().
-	 */
-	void (*loo_object_delete)(const struct lu_env *env,
-				  struct lu_object *o);
-	/**
-	 * Dual to lu_device_operations::ldo_object_alloc(). Called when
-	 * object is removed from memory.
-	 */
-	void (*loo_object_free)(const struct lu_env *env,
-				struct lu_object *o);
-	/**
-	 * Called when last active reference to the object is released (and
-	 * object returns to the cache). This method is optional.
-	 */
-	void (*loo_object_release)(const struct lu_env *env,
-				   struct lu_object *o);
-	/**
-	 * Optional debugging helper. Print given object.
-	 */
-	int (*loo_object_print)(const struct lu_env *env, void *cookie,
-				lu_printer_t p, const struct lu_object *o);
-	/**
-	 * Optional debugging method. Returns true iff the object is
-	 * internally consistent.
-	 */
-	int (*loo_object_invariant)(const struct lu_object *o);
-};
-
-/**
- * Type of lu_device.
- */ -struct lu_device_type; - -/** - * Device: a layer in the server side abstraction stacking. - */ -struct lu_device { - /** - * reference count. This is incremented, in particular, on each object - * created at this layer. - * - * \todo XXX which means that atomic_t is probably too small. - */ - atomic_t ld_ref; - /** - * Pointer to device type. Never modified once set. - */ - struct lu_device_type *ld_type; - /** - * Operation vector for this device. - */ - const struct lu_device_operations *ld_ops; - /** - * Stack this device belongs to. - */ - struct lu_site *ld_site; - - /** \todo XXX: temporary back pointer into obd. */ - struct obd_device *ld_obd; - /** - * A list of references to this object, for debugging. - */ - struct lu_ref ld_reference; - /** - * Link the device to the site. - **/ - struct list_head ld_linkage; -}; - -struct lu_device_type_operations; - -/** - * Tag bits for device type. They are used to distinguish certain groups of - * device types. - */ -enum lu_device_tag { - /** this is meta-data device */ - LU_DEVICE_MD = (1 << 0), - /** this is data device */ - LU_DEVICE_DT = (1 << 1), - /** data device in the client stack */ - LU_DEVICE_CL = (1 << 2) -}; - -/** - * Type of device. - */ -struct lu_device_type { - /** - * Tag bits. Taken from enum lu_device_tag. Never modified once set. - */ - __u32 ldt_tags; - /** - * Name of this class. Unique system-wide. Never modified once set. - */ - char *ldt_name; - /** - * Operations for this type. - */ - const struct lu_device_type_operations *ldt_ops; - /** - * \todo XXX: temporary pointer to associated obd_type. - */ - struct obd_type *ldt_obd_type; - /** - * \todo XXX: temporary: context tags used by obd_*() calls. - */ - __u32 ldt_ctx_tags; - /** - * Number of existing device type instances. - */ - atomic_t ldt_device_nr; - /** - * Linkage into a global list of all device types. - * - * \see lu_device_types. - */ - struct list_head ldt_linkage; -}; - -/** - * Operations on a device type. - */ -struct lu_device_type_operations { - /** - * Allocate new device. - */ - struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *lcfg); - /** - * Free device. Dual to - * lu_device_type_operations::ldto_device_alloc(). Returns pointer to - * the next device in the stack. - */ - struct lu_device *(*ldto_device_free)(const struct lu_env *, - struct lu_device *); - - /** - * Initialize the devices after allocation - */ - int (*ldto_device_init)(const struct lu_env *env, - struct lu_device *, const char *, - struct lu_device *); - /** - * Finalize device. Dual to - * lu_device_type_operations::ldto_device_init(). Returns pointer to - * the next device in the stack. - */ - struct lu_device *(*ldto_device_fini)(const struct lu_env *env, - struct lu_device *); - /** - * Initialize device type. This is called on module load. - */ - int (*ldto_init)(struct lu_device_type *t); - /** - * Finalize device type. Dual to - * lu_device_type_operations::ldto_init(). Called on module unload. - */ - void (*ldto_fini)(struct lu_device_type *t); - /** - * Called when the first device is created. - */ - void (*ldto_start)(struct lu_device_type *t); - /** - * Called when number of devices drops to 0. - */ - void (*ldto_stop)(struct lu_device_type *t); -}; - -static inline int lu_device_is_md(const struct lu_device *d) -{ - return ergo(d, d->ld_type->ldt_tags & LU_DEVICE_MD); -} - -/** - * Common object attributes. 
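- *
- * Only the fields flagged in la_valid are meaningful; as an
- * illustrative sketch, an attribute set carrying just a new size and
- * mtime would have la_valid == (LA_SIZE | LA_MTIME) (see enum la_valid
- * below).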
- */ -struct lu_attr { - /** size in bytes */ - __u64 la_size; - /** modification time in seconds since Epoch */ - s64 la_mtime; - /** access time in seconds since Epoch */ - s64 la_atime; - /** change time in seconds since Epoch */ - s64 la_ctime; - /** 512-byte blocks allocated to object */ - __u64 la_blocks; - /** permission bits and file type */ - __u32 la_mode; - /** owner id */ - __u32 la_uid; - /** group id */ - __u32 la_gid; - /** object flags */ - __u32 la_flags; - /** number of persistent references to this object */ - __u32 la_nlink; - /** blk bits of the object*/ - __u32 la_blkbits; - /** blk size of the object*/ - __u32 la_blksize; - /** real device */ - __u32 la_rdev; - /** - * valid bits - * - * \see enum la_valid - */ - __u64 la_valid; -}; - -/** Bit-mask of valid attributes */ -enum la_valid { - LA_ATIME = 1 << 0, - LA_MTIME = 1 << 1, - LA_CTIME = 1 << 2, - LA_SIZE = 1 << 3, - LA_MODE = 1 << 4, - LA_UID = 1 << 5, - LA_GID = 1 << 6, - LA_BLOCKS = 1 << 7, - LA_TYPE = 1 << 8, - LA_FLAGS = 1 << 9, - LA_NLINK = 1 << 10, - LA_RDEV = 1 << 11, - LA_BLKSIZE = 1 << 12, - LA_KILL_SUID = 1 << 13, - LA_KILL_SGID = 1 << 14, -}; - -/** - * Layer in the layered object. - */ -struct lu_object { - /** - * Header for this object. - */ - struct lu_object_header *lo_header; - /** - * Device for this layer. - */ - struct lu_device *lo_dev; - /** - * Operations for this object. - */ - const struct lu_object_operations *lo_ops; - /** - * Linkage into list of all layers. - */ - struct list_head lo_linkage; - /** - * Link to the device, for debugging. - */ - struct lu_ref_link lo_dev_ref; -}; - -enum lu_object_header_flags { - /** - * Don't keep this object in cache. Object will be destroyed as soon - * as last reference to it is released. This flag cannot be cleared - * once set. - */ - LU_OBJECT_HEARD_BANSHEE = 0, - /** - * Mark this object has already been taken out of cache. - */ - LU_OBJECT_UNHASHED = 1, -}; - -enum lu_object_header_attr { - LOHA_EXISTS = 1 << 0, - LOHA_REMOTE = 1 << 1, - /** - * UNIX file type is stored in S_IFMT bits. - */ - LOHA_FT_START = 001 << 12, /**< S_IFIFO */ - LOHA_FT_END = 017 << 12, /**< S_IFMT */ -}; - -/** - * "Compound" object, consisting of multiple layers. - * - * Compound object with given fid is unique with given lu_site. - * - * Note, that object does *not* necessary correspond to the real object in the - * persistent storage: object is an anchor for locking and method calling, so - * it is created for things like not-yet-existing child created by mkdir or - * create calls. lu_object_operations::loo_exists() can be used to check - * whether object is backed by persistent storage entity. - */ -struct lu_object_header { - /** - * Fid, uniquely identifying this object. - */ - struct lu_fid loh_fid; - /** - * Object flags from enum lu_object_header_flags. Set and checked - * atomically. - */ - unsigned long loh_flags; - /** - * Object reference count. Protected by lu_site::ls_guard. - */ - atomic_t loh_ref; - /** - * Common object attributes, cached for efficiency. From enum - * lu_object_header_attr. - */ - __u32 loh_attr; - /** - * Linkage into per-site hash table. Protected by lu_site::ls_guard. - */ - struct hlist_node loh_hash; - /** - * Linkage into per-site LRU list. Protected by lu_site::ls_guard. - */ - struct list_head loh_lru; - /** - * Linkage into list of layers. Never modified once set (except lately - * during object destruction). No locking is necessary. 
- */ - struct list_head loh_layers; - /** - * A list of references to this object, for debugging. - */ - struct lu_ref loh_reference; -}; - -struct fld; -struct lu_site_bkt_data; - -enum { - LU_SS_CREATED = 0, - LU_SS_CACHE_HIT, - LU_SS_CACHE_MISS, - LU_SS_CACHE_RACE, - LU_SS_CACHE_DEATH_RACE, - LU_SS_LRU_PURGED, - LU_SS_LAST_STAT -}; - -/** - * lu_site is a "compartment" within which objects are unique, and LRU - * discipline is maintained. - * - * lu_site exists so that multiple layered stacks can co-exist in the same - * address space. - * - * lu_site has the same relation to lu_device as lu_object_header to - * lu_object. - */ -struct lu_site { - /** - * objects hash table - */ - struct cfs_hash *ls_obj_hash; - /** - * index of bucket on hash table while purging - */ - unsigned int ls_purge_start; - /** - * Top-level device for this stack. - */ - struct lu_device *ls_top_dev; - /** - * Bottom-level device for this stack - */ - struct lu_device *ls_bottom_dev; - /** - * Linkage into global list of sites. - */ - struct list_head ls_linkage; - /** - * List for lu device for this site, protected - * by ls_ld_lock. - **/ - struct list_head ls_ld_linkage; - spinlock_t ls_ld_lock; - - /** - * Lock to serialize site purge. - */ - struct mutex ls_purge_mutex; - - /** - * lu_site stats - */ - struct lprocfs_stats *ls_stats; - /** - * XXX: a hack! fld has to find md_site via site, remove when possible - */ - struct seq_server_site *ld_seq_site; - /** - * Number of objects in lsb_lru_lists - used for shrinking - */ - struct percpu_counter ls_lru_len_counter; -}; - -wait_queue_head_t * -lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid); - -static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) -{ - return s->ld_seq_site; -} - -/** \name ctors - * Constructors/destructors. - * @{ - */ - -int lu_site_init(struct lu_site *s, struct lu_device *d); -void lu_site_fini(struct lu_site *s); -int lu_site_init_finish(struct lu_site *s); -void lu_stack_fini(const struct lu_env *env, struct lu_device *top); -void lu_device_get(struct lu_device *d); -void lu_device_put(struct lu_device *d); -int lu_device_init(struct lu_device *d, struct lu_device_type *t); -void lu_device_fini(struct lu_device *d); -int lu_object_header_init(struct lu_object_header *h); -void lu_object_header_fini(struct lu_object_header *h); -int lu_object_init(struct lu_object *o, - struct lu_object_header *h, struct lu_device *d); -void lu_object_fini(struct lu_object *o); -void lu_object_add_top(struct lu_object_header *h, struct lu_object *o); -void lu_object_add(struct lu_object *before, struct lu_object *o); - -/** - * Helpers to initialize and finalize device types. - */ - -int lu_device_type_init(struct lu_device_type *ldt); -void lu_device_type_fini(struct lu_device_type *ldt); - -/** @} ctors */ - -/** \name caching - * Caching and reference counting. - * @{ - */ - -/** - * Acquire additional reference to the given object. This function is used to - * attain additional reference. To acquire initial reference use - * lu_object_find(). - */ -static inline void lu_object_get(struct lu_object *o) -{ - LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); - atomic_inc(&o->lo_header->loh_ref); -} - -/** - * Return true of object will not be cached after last reference to it is - * released. 
-
-void lu_object_put(const struct lu_env *env, struct lu_object *o);
-void lu_object_unhash(const struct lu_env *env, struct lu_object *o);
-int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, int nr,
-			  bool canblock);
-
-static inline int lu_site_purge(const struct lu_env *env, struct lu_site *s,
-				int nr)
-{
-	return lu_site_purge_objects(env, s, nr, true);
-}
-
-void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
-		   lu_printer_t printer);
-struct lu_object *lu_object_find_at(const struct lu_env *env,
-				    struct lu_device *dev,
-				    const struct lu_fid *f,
-				    const struct lu_object_conf *conf);
-struct lu_object *lu_object_find_slice(const struct lu_env *env,
-				       struct lu_device *dev,
-				       const struct lu_fid *f,
-				       const struct lu_object_conf *conf);
-/** @} caching */
-
-/** \name helpers
- * Helpers.
- * @{
- */
-
-/**
- * First (topmost) sub-object of given compound object
- */
-static inline struct lu_object *lu_object_top(struct lu_object_header *h)
-{
-	LASSERT(!list_empty(&h->loh_layers));
-	return list_first_entry(&h->loh_layers, struct lu_object, lo_linkage);
-}
-
-/**
- * Next sub-object in the layering
- */
-static inline const struct lu_object *lu_object_next(const struct lu_object *o)
-{
-	return list_next_entry(o, lo_linkage);
-}
-
-/**
- * Pointer to the fid of this object.
- */
-static inline const struct lu_fid *lu_object_fid(const struct lu_object *o)
-{
-	return &o->lo_header->loh_fid;
-}
-
-/**
- * Return the device operations vector for this object.
- */
-static inline const struct lu_device_operations *
-lu_object_ops(const struct lu_object *o)
-{
-	return o->lo_dev->ld_ops;
-}
-
-/**
- * Given a compound object, find its slice corresponding to the device type
- * \a dtype.
- */
-struct lu_object *lu_object_locate(struct lu_object_header *h,
-				   const struct lu_device_type *dtype);
-
-/**
- * Printer function emitting messages through libcfs_debug_msg().
- */
-int lu_cdebug_printer(const struct lu_env *env,
-		      void *cookie, const char *format, ...);
-
-/**
- * Print object description followed by a user-supplied message.
- */
-#define LU_OBJECT_DEBUG(mask, env, object, format, ...)			  \
-do {									  \
-	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			  \
-		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);	  \
-		lu_object_print(env, &msgdata, lu_cdebug_printer, object);\
-		CDEBUG(mask, format "\n", ## __VA_ARGS__);		  \
-	}								  \
-} while (0)
-
-/**
- * Print short object description followed by a user-supplied message.
- */
-#define LU_OBJECT_HEADER(mask, env, object, format, ...)		\
-do {									\
-	if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) {			\
-		LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL);	\
-		lu_object_header_print(env, &msgdata, lu_cdebug_printer,\
-				       (object)->lo_header);		\
-		lu_cdebug_printer(env, &msgdata, "\n");			\
-		CDEBUG(mask, format, ## __VA_ARGS__);			\
-	}								\
-} while (0)
-
-void lu_object_print(const struct lu_env *env, void *cookie,
-		     lu_printer_t printer, const struct lu_object *o);
-void lu_object_header_print(const struct lu_env *env, void *cookie,
-			    lu_printer_t printer,
-			    const struct lu_object_header *hdr);
-
-/**
- * Check object consistency.
- */
-int lu_object_invariant(const struct lu_object *o);
-
-/**
- * Check whether the object exists, on either local or remote storage.
- * Note: LOHA_EXISTS will be set once someone has created the object;
- * the object does not need to be committed to storage to be considered
- * existing.
- */
-#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS)
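The lookup functions above are normally used in a find-use-put cycle; a hedged sketch (dev, fid and env are assumed to exist; a NULL conf requests default lookup behavior):

/* Sketch: look up an object by fid, note whether it is backed by
 * storage, and drop the reference. Error handling is minimal.
 */
static int my_lookup_example(const struct lu_env *env, struct lu_device *dev,
			     const struct lu_fid *fid)
{
	struct lu_object *o;

	o = lu_object_find_at(env, dev, fid, NULL);
	if (IS_ERR(o))
		return PTR_ERR(o);

	if (lu_object_exists(o))
		LU_OBJECT_DEBUG(D_INFO, env, o, "found backed object");

	lu_object_put(env, o);
	return 0;
}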
-
-/**
- * Check whether the object is on remote storage.
- */
-#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE)
-
-static inline int lu_object_assert_exists(const struct lu_object *o)
-{
-	return lu_object_exists(o);
-}
-
-static inline int lu_object_assert_not_exists(const struct lu_object *o)
-{
-	return !lu_object_exists(o);
-}
-
-/**
- * Attributes of this object.
- */
-static inline __u32 lu_object_attr(const struct lu_object *o)
-{
-	LASSERT(lu_object_exists(o) != 0);
-	return o->lo_header->loh_attr;
-}
-
-static inline void lu_object_ref_add(struct lu_object *o,
-				     const char *scope,
-				     const void *source)
-{
-	lu_ref_add(&o->lo_header->loh_reference, scope, source);
-}
-
-static inline void lu_object_ref_add_at(struct lu_object *o,
-					struct lu_ref_link *link,
-					const char *scope,
-					const void *source)
-{
-	lu_ref_add_at(&o->lo_header->loh_reference, link, scope, source);
-}
-
-static inline void lu_object_ref_del(struct lu_object *o,
-				     const char *scope, const void *source)
-{
-	lu_ref_del(&o->lo_header->loh_reference, scope, source);
-}
-
-static inline void lu_object_ref_del_at(struct lu_object *o,
-					struct lu_ref_link *link,
-					const char *scope, const void *source)
-{
-	lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source);
-}
-
-/** input params, should be filled out by mdt */
-struct lu_rdpg {
-	/** hash */
-	__u64			rp_hash;
-	/** count in bytes */
-	unsigned int		rp_count;
-	/** number of pages */
-	unsigned int		rp_npages;
-	/** requested attr */
-	__u32			rp_attrs;
-	/** pointers to pages */
-	struct page		**rp_pages;
-};
-
-enum lu_xattr_flags {
-	LU_XATTR_REPLACE = (1 << 0),
-	LU_XATTR_CREATE  = (1 << 1)
-};
-
-/** @} helpers */
-
-/** \name lu_context
- * @{
- */
-
-/** For lu_context health-checks */
-enum lu_context_state {
-	LCS_INITIALIZED = 1,
-	LCS_ENTERED,
-	LCS_LEFT,
-	LCS_FINALIZED
-};
-
-/**
- * lu_context. Execution context for lu_object methods. Currently associated
- * with a thread.
- *
- * All lu_object methods, except device and device type methods (called
- * during system initialization and shutdown), are executed "within" some
- * lu_context. This means that a pointer to some "current" lu_context is
- * passed as an argument to all methods.
- *
- * All service ptlrpc threads create a lu_context as part of their
- * initialization. It is possible to create a "stand-alone" context for
- * other execution environments (like system calls).
- *
- * lu_object methods mainly use lu_context through the lu_context_key
- * interface, which allows each layer to associate arbitrary pieces of data
- * with each context (see pthread_key_create(3) for a similar interface).
- *
- * On a client, lu_context is bound to a thread, see cl_env_get().
- *
- * \see lu_context_key
- */
-struct lu_context {
-	/**
-	 * lu_context is used on the client side too. Yet we don't want to
-	 * allocate values of server-side keys for the client contexts and
-	 * vice versa.
-	 *
-	 * To achieve this, a set of tags is introduced. Contexts and keys
-	 * are marked with tags. Key values are created only for contexts
-	 * whose set of tags has a non-empty intersection with that of the
-	 * key. Tags are taken from enum lu_context_tag.
-	 */
-	__u32			lc_tags;
-	enum lu_context_state	lc_state;
-	/**
-	 * Pointer to the home service thread. NULL for other execution
-	 * contexts.
-	 */
-	struct ptlrpc_thread	*lc_thread;
-	/**
-	 * Pointer to an array with key values. Internal implementation
-	 * detail.
-	 */
-	void			**lc_value;
-	/**
-	 * Linkage into a list of all remembered contexts. Only
-	 * `non-transient' contexts, i.e., ones created for service threads,
-	 * are placed here.
-	 */
-	struct list_head	lc_remember;
-	/**
-	 * Version counter used to skip calls to lu_context_refill() when no
-	 * keys were registered.
-	 */
-	unsigned int		lc_version;
-	/**
-	 * Debugging cookie.
-	 */
-	unsigned int		lc_cookie;
-};
-
-/**
- * lu_context_key interface. Similar to pthread_key.
- */
-
-enum lu_context_tag {
-	/**
-	 * Thread on md server
-	 */
-	LCT_MD_THREAD = 1 << 0,
-	/**
-	 * Thread on dt server
-	 */
-	LCT_DT_THREAD = 1 << 1,
-	/**
-	 * Context for transaction handle
-	 */
-	LCT_TX_HANDLE = 1 << 2,
-	/**
-	 * Thread on client
-	 */
-	LCT_CL_THREAD = 1 << 3,
-	/**
-	 * A per-request session on a server, and a per-system-call session on
-	 * a client.
-	 */
-	LCT_SESSION = 1 << 4,
-	/**
-	 * Per-request data on OSP device
-	 */
-	LCT_OSP_THREAD = 1 << 5,
-	/**
-	 * MGS device thread
-	 */
-	LCT_MG_THREAD = 1 << 6,
-	/**
-	 * Context for local operations
-	 */
-	LCT_LOCAL = 1 << 7,
-	/**
-	 * Session for server thread
-	 **/
-	LCT_SERVER_SESSION = BIT(8),
-	/**
-	 * Set when at least one of the keys having values in this context
-	 * has a non-NULL lu_context_key::lct_exit() method. This is used to
-	 * optimize lu_context_exit() calls.
-	 */
-	LCT_HAS_EXIT = 1 << 28,
-	/**
-	 * Don't add references for modules creating key values in that
-	 * context. This is only for contexts used internally by the
-	 * lu_object framework.
-	 */
-	LCT_NOREF = 1 << 29,
-	/**
-	 * Key is being prepared for retiring, don't create new values for it.
-	 */
-	LCT_QUIESCENT = 1 << 30,
-	/**
-	 * Context should be remembered.
-	 */
-	LCT_REMEMBER = 1 << 31,
-	/**
-	 * Contexts usable in cache shrinker thread.
-	 */
-	LCT_SHRINKER = LCT_MD_THREAD | LCT_DT_THREAD | LCT_CL_THREAD |
-		       LCT_NOREF
-};
-
-/**
- * Key. Represents a per-context value slot.
- *
- * Keys are usually registered when the module owning the key is initialized,
- * and de-registered when the module is unloaded. Once a key is registered,
- * all new contexts with matching tags will get the key value. "Old"
- * contexts, already initialized at the time of key registration, can be
- * forced to get the key value by calling lu_context_refill().
- *
- * Every key value is counted in lu_context_key::lct_used and acquires a
- * reference on the owning module. This means that all key values have to be
- * destroyed before the module can be unloaded. This is usually achieved by
- * stopping threads started by the module, that created contexts in their
- * entry functions. The situation is complicated by threads shared by
- * multiple modules, like the ptlrpcd daemon on a client. To work around
- * this problem, contexts created in such threads are `remembered' (see
- * LCT_REMEMBER)---i.e., added into a global list. When a module is preparing
- * for unloading it does the following:
- *
- *     - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT),
- *       preventing new key values from being allocated in the new contexts,
- *       and
- *
- *     - scans the list of remembered contexts, destroying values of module
- *       keys, thus releasing references to the module.
- *
- * This is done by lu_context_key_quiesce(). If a module is re-activated
- * before the key has been de-registered, a lu_context_key_revive() call
- * clears the `quiescent' marker.
- *
- * lu_context code doesn't provide any internal synchronization for these
- * activities---it's assumed that startup (including threads start-up) and
- * shutdown are serialized by some external means.
- *
- * \see lu_context
- */
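The tag-matching rule described above reduces to a bit-mask intersection; a standalone sketch (the helper name is illustrative, not part of the API):

/* Sketch: a key instantiates a value in a context iff lct_tags and
 * lc_tags intersect. Tag values mirror enum lu_context_tag above.
 */
#include <stdint.h>
#include <stdio.h>

#define LCT_MD_THREAD (1u << 0)
#define LCT_CL_THREAD (1u << 3)

static int key_matches_context(uint32_t lct_tags, uint32_t lc_tags)
{
	return (lct_tags & lc_tags) != 0;
}

int main(void)
{
	uint32_t key_tags = LCT_MD_THREAD;	/* server-side-only key */

	/* a client context does not get a value for this key ... */
	printf("client ctx: %d\n", key_matches_context(key_tags, LCT_CL_THREAD));
	/* ... but an md-server context does */
	printf("md ctx:     %d\n", key_matches_context(key_tags, LCT_MD_THREAD));
	return 0;
}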
-struct lu_context_key {
-	/**
-	 * Set of tags for which values of this key are to be instantiated.
-	 */
-	__u32 lct_tags;
-	/**
-	 * Value constructor. This is called when a new value is created for
-	 * a context. Returns a pointer to the new value or an error pointer.
-	 */
-	void *(*lct_init)(const struct lu_context *ctx,
-			  struct lu_context_key *key);
-	/**
-	 * Value destructor. Called when a context with a previously
-	 * allocated value of this slot is destroyed. \a data is the value
-	 * that was returned by a matching call to
-	 * lu_context_key::lct_init().
-	 */
-	void (*lct_fini)(const struct lu_context *ctx,
-			 struct lu_context_key *key, void *data);
-	/**
-	 * Optional method called on lu_context_exit() for all allocated
-	 * keys. Can be used by debugging code checking that locks are
-	 * released, etc.
-	 */
-	void (*lct_exit)(const struct lu_context *ctx,
-			 struct lu_context_key *key, void *data);
-	/**
-	 * Internal implementation detail: index within lu_context::lc_value[]
-	 * reserved for this key.
-	 */
-	int lct_index;
-	/**
-	 * Internal implementation detail: number of values created for this
-	 * key.
-	 */
-	atomic_t lct_used;
-	/**
-	 * Internal implementation detail: module for this key.
-	 */
-	struct module *lct_owner;
-	/**
-	 * References to this key. For debugging.
-	 */
-	struct lu_ref lct_reference;
-};
-
-#define LU_KEY_INIT(mod, type)					\
-	static void *mod##_key_init(const struct lu_context *ctx, \
-				    struct lu_context_key *key)	\
-	{							\
-		type *value;					\
-								\
-		BUILD_BUG_ON(sizeof(*value) > PAGE_SIZE);	\
-								\
-		value = kzalloc(sizeof(*value), GFP_NOFS);	\
-		if (!value)					\
-			value = ERR_PTR(-ENOMEM);		\
-								\
-		return value;					\
-	}							\
-	struct __##mod##__dummy_init {; } /* semicolon catcher */
-
-#define LU_KEY_FINI(mod, type)					\
-	static void mod##_key_fini(const struct lu_context *ctx, \
-				   struct lu_context_key *key, void *data) \
-	{							\
-		type *info = data;				\
-								\
-		kfree(info);					\
-	}							\
-	struct __##mod##__dummy_fini {; } /* semicolon catcher */
-
-#define LU_KEY_INIT_FINI(mod, type)	\
-	LU_KEY_INIT(mod, type);		\
-	LU_KEY_FINI(mod, type)
-
-#define LU_CONTEXT_KEY_DEFINE(mod, tags)		\
-	struct lu_context_key mod##_thread_key = {	\
-		.lct_tags = tags,			\
-		.lct_init = mod##_key_init,		\
-		.lct_fini = mod##_key_fini		\
-	}
-
-#define LU_CONTEXT_KEY_INIT(key)	\
-do {					\
-	(key)->lct_owner = THIS_MODULE;	\
-} while (0)
-
-int lu_context_key_register(struct lu_context_key *key);
-void lu_context_key_degister(struct lu_context_key *key);
-void *lu_context_key_get(const struct lu_context *ctx,
-			 const struct lu_context_key *key);
-void lu_context_key_quiesce(struct lu_context_key *key);
-void lu_context_key_revive(struct lu_context_key *key);
-
-/*
- * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an
- * owning module.
- */
-
-#define LU_KEY_INIT_GENERIC(mod)					\
-	static void mod##_key_init_generic(struct lu_context_key *k, ...) \
-	{								\
-		struct lu_context_key *key = k;				\
-		va_list args;						\
-									\
-		va_start(args, k);					\
-		do {							\
-			LU_CONTEXT_KEY_INIT(key);			\
-			key = va_arg(args, struct lu_context_key *);	\
-		} while (key);						\
-		va_end(args);						\
-	}
-
-#define LU_TYPE_INIT(mod, ...)						\
-	LU_KEY_INIT_GENERIC(mod)					\
-	static int mod##_type_init(struct lu_device_type *t)		\
-	{								\
-		mod##_key_init_generic(__VA_ARGS__, NULL);		\
-		return lu_context_key_register_many(__VA_ARGS__, NULL);	\
-	}								\
-	struct __##mod##_dummy_type_init {; }
-
-#define LU_TYPE_FINI(mod, ...)
\ - static void mod##_type_fini(struct lu_device_type *t) \ - { \ - lu_context_key_degister_many(__VA_ARGS__, NULL); \ - } \ - struct __##mod##_dummy_type_fini {; } - -#define LU_TYPE_START(mod, ...) \ - static void mod##_type_start(struct lu_device_type *t) \ - { \ - lu_context_key_revive_many(__VA_ARGS__, NULL); \ - } \ - struct __##mod##_dummy_type_start {; } - -#define LU_TYPE_STOP(mod, ...) \ - static void mod##_type_stop(struct lu_device_type *t) \ - { \ - lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ - } \ - struct __##mod##_dummy_type_stop {; } - -#define LU_TYPE_INIT_FINI(mod, ...) \ - LU_TYPE_INIT(mod, __VA_ARGS__); \ - LU_TYPE_FINI(mod, __VA_ARGS__); \ - LU_TYPE_START(mod, __VA_ARGS__); \ - LU_TYPE_STOP(mod, __VA_ARGS__) - -int lu_context_init(struct lu_context *ctx, __u32 tags); -void lu_context_fini(struct lu_context *ctx); -void lu_context_enter(struct lu_context *ctx); -void lu_context_exit(struct lu_context *ctx); -int lu_context_refill(struct lu_context *ctx); - -/* - * Helper functions to operate on multiple keys. These are used by the default - * device type operations, defined by LU_TYPE_INIT_FINI(). - */ - -int lu_context_key_register_many(struct lu_context_key *k, ...); -void lu_context_key_degister_many(struct lu_context_key *k, ...); -void lu_context_key_revive_many(struct lu_context_key *k, ...); -void lu_context_key_quiesce_many(struct lu_context_key *k, ...); - -/** - * Environment. - */ -struct lu_env { - /** - * "Local" context, used to store data instead of stack. - */ - struct lu_context le_ctx; - /** - * "Session" context for per-request data. - */ - struct lu_context *le_ses; -}; - -int lu_env_init(struct lu_env *env, __u32 tags); -void lu_env_fini(struct lu_env *env); -int lu_env_refill(struct lu_env *env); - -/** @} lu_context */ - -/** - * Output site statistical counters into a buffer. Suitable for - * ll_rd_*()-style functions. - */ -int lu_site_stats_print(const struct lu_site *s, struct seq_file *m); - -/** - * Common name structure to be passed around for various name related methods. - */ -struct lu_name { - const char *ln_name; - int ln_namelen; -}; - -/** - * Validate names (path components) - * - * To be valid \a name must be non-empty, '\0' terminated of length \a - * name_len, and not contain '/'. The maximum length of a name (before - * say -ENAMETOOLONG will be returned) is really controlled by llite - * and the server. We only check for something insane coming from bad - * integer handling here. - */ -static inline bool lu_name_is_valid_2(const char *name, size_t name_len) -{ - return name && name_len > 0 && name_len < INT_MAX && - name[name_len] == '\0' && strlen(name) == name_len && - !memchr(name, '/', name_len); -} - -/** - * Common buffer structure to be passed around for various xattr_{s,g}et() - * methods. - */ -struct lu_buf { - void *lb_buf; - size_t lb_len; -}; - -/** - * One-time initializers, called at obdclass module initialization, not - * exported. - */ - -/** - * Initialization of global lu_* data. - */ -int lu_global_init(void); - -/** - * Dual to lu_global_init(). 
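Putting the key macros together, a module typically defines its per-thread slot like this (a sketch for a hypothetical module "foo", mirroring how existing thread keys are declared; not a definitive recipe):

/* Sketch: per-thread info for module "foo", instantiated only in
 * client-thread contexts. The macros above generate foo_key_init(),
 * foo_key_fini() and foo_thread_key.
 */
struct foo_thread_info {
	int fti_counter;
};

LU_KEY_INIT_FINI(foo, struct foo_thread_info);
LU_CONTEXT_KEY_DEFINE(foo, LCT_CL_THREAD);

/* module init:  LU_CONTEXT_KEY_INIT(&foo_thread_key);
 *               lu_context_key_register(&foo_thread_key);
 * in a method:  info = lu_context_key_get(&env->le_ctx, &foo_thread_key);
 * module exit:  lu_context_key_degister(&foo_thread_key);
 */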
- */ -void lu_global_fini(void); - -struct lu_kmem_descr { - struct kmem_cache **ckd_cache; - const char *ckd_name; - const size_t ckd_size; -}; - -int lu_kmem_init(struct lu_kmem_descr *caches); -void lu_kmem_fini(struct lu_kmem_descr *caches); - -extern __u32 lu_context_tags_default; -extern __u32 lu_session_tags_default; - -/** @} lu */ -#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h deleted file mode 100644 index ad0c24d29ffa3db3657bb2f14a977b9435804d58..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lu_ref.h +++ /dev/null @@ -1,178 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - * - * Author: Nikita Danilov - * - * This file is part of Lustre, http://www.lustre.org. - * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef __LUSTRE_LU_REF_H -#define __LUSTRE_LU_REF_H - -#include - -/** \defgroup lu_ref lu_ref - * - * An interface to track references between objects. Mostly for debugging. - * - * Suppose there is a reference counted data-structure struct foo. To track - * who acquired references to instance of struct foo, add lu_ref field to it: - * - * \code - * struct foo { - * atomic_t foo_refcount; - * struct lu_ref foo_reference; - * ... - * }; - * \endcode - * - * foo::foo_reference has to be initialized by calling - * lu_ref_init(). Typically there will be functions or macros to increment and - * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) - * and foo_put(struct foo *foo), respectively. - * - * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() - * has to be called to insert into foo::foo_reference a record, describing - * acquired reference. Dually, lu_ref_del() removes matching record. Typical - * usages are: - * - * \code - * struct bar *bar; - * - * // bar owns a reference to foo. - * bar->bar_foo = foo_get(foo); - * lu_ref_add(&foo->foo_reference, "bar", bar); - * - * ... - * - * // reference from bar to foo is released. - * lu_ref_del(&foo->foo_reference, "bar", bar); - * foo_put(bar->bar_foo); - * - * - * // current thread acquired a temporary reference to foo. - * foo_get(foo); - * lu_ref_add(&foo->reference, __func__, current); - * - * ... - * - * // temporary reference is released. - * lu_ref_del(&foo->reference, __func__, current); - * foo_put(foo); - * \endcode - * - * \e Et \e cetera. Often it makes sense to include lu_ref_add() and - * lu_ref_del() calls into foo_get() and foo_put(). When an instance of struct - * foo is destroyed, lu_ref_fini() has to be called that checks that no - * pending references remain. lu_ref_print() can be used to dump a list of - * pending references, while hunting down a leak. - * - * For objects to which a large number of references can be acquired, - * lu_ref_del() can become cpu consuming, as it has to scan the list of - * references. 
To work around this, remember result of lu_ref_add() (usually - * in the same place where pointer to struct foo is stored), and use - * lu_ref_del_at(): - * - * \code - * // There is a large number of bar's for a single foo. - * bar->bar_foo = foo_get(foo); - * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); - * - * ... - * - * // reference from bar to foo is released. - * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); - * foo_put(bar->bar_foo); - * \endcode - * - * lu_ref interface degrades gracefully in case of memory shortages. - * - * @{ - */ - -/* - * dummy data structures/functions to pass compile for now. - * We need to reimplement them with kref. - */ -struct lu_ref {}; -struct lu_ref_link {}; - -static inline void lu_ref_init(struct lu_ref *ref) -{ -} - -static inline void lu_ref_fini(struct lu_ref *ref) -{ -} - -static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref, - const char *scope, - const void *source) -{ - return NULL; -} - -static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref, - const char *scope, - const void *source) -{ - return NULL; -} - -static inline void lu_ref_add_at(struct lu_ref *ref, - struct lu_ref_link *link, - const char *scope, - const void *source) -{ -} - -static inline void lu_ref_del(struct lu_ref *ref, const char *scope, - const void *source) -{ -} - -static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, - const char *scope, const void *source0, - const void *source1) -{ -} - -static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, - const char *scope, const void *source) -{ -} - -static inline int lu_ref_global_init(void) -{ - return 0; -} - -static inline void lu_ref_global_fini(void) -{ -} - -static inline void lu_ref_print(const struct lu_ref *ref) -{ -} - -static inline void lu_ref_print_all(void) -{ -} - -/** @} lu */ - -#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h deleted file mode 100644 index e7575a172b5fe8bef16a0c69ced935b7557037ef..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_acl.h +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/include/lustre_acl.h - */ - -#ifndef _LUSTRE_ACL_H -#define _LUSTRE_ACL_H - -#include -#include -#ifdef CONFIG_FS_POSIX_ACL -#include - -#define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 -#define LUSTRE_POSIX_ACL_MAX_SIZE_OLD \ - (sizeof(struct posix_acl_xattr_header) + \ - LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(struct posix_acl_xattr_entry)) - -#else /* ! CONFIG_FS_POSIX_ACL */ -#define LUSTRE_POSIX_ACL_MAX_SIZE_OLD 0 -#endif /* CONFIG_FS_POSIX_ACL */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_compat.h b/drivers/staging/lustre/lustre/include/lustre_compat.h deleted file mode 100644 index 3c6db0d632dc3636de2e30b17334e09a447e2159..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_compat.h +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#ifndef _LUSTRE_COMPAT_H -#define _LUSTRE_COMPAT_H - -#include -#include -#include -#include - -#include - -/* - * set ATTR_BLOCKS to a high value to avoid any risk of collision with other - * ATTR_* attributes (see bug 13828) - */ -#define ATTR_BLOCKS (1 << 27) - -#define current_ngroups current_cred()->group_info->ngroups -#define current_groups current_cred()->group_info->small_block - -/* - * OBD need working random driver, thus all our - * initialization routines must be called after device - * driver initialization - */ -#ifndef MODULE -#undef module_init -#define module_init(a) late_initcall(a) -#endif - -#define LTIME_S(time) (time.tv_sec) - -#ifndef QUOTA_OK -# define QUOTA_OK 0 -#endif -#ifndef NO_QUOTA -# define NO_QUOTA (-EDQUOT) -#endif - -#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) -# define ext2_set_bit __test_and_set_bit_le -# define ext2_clear_bit __test_and_clear_bit_le -# define ext2_test_bit test_bit_le -# define ext2_find_first_zero_bit find_first_zero_bit_le -# define ext2_find_next_zero_bit find_next_zero_bit_le -#endif - -#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) - -#endif /* _LUSTRE_COMPAT_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h deleted file mode 100644 index 721a81f923e30c14d0b138a990239703d605933f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_debug.h +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _LUSTRE_DEBUG_H -#define _LUSTRE_DEBUG_H - -/** \defgroup debug debug - * - * @{ - */ - -#include -#include - -/* lib/debug.c */ -int dump_req(struct ptlrpc_request *req); -int block_debug_setup(void *addr, int len, __u64 off, __u64 id); -int block_debug_check(char *who, void *addr, int len, __u64 off, __u64 id); - -/** @} debug */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_disk.h b/drivers/staging/lustre/lustre/include/lustre_disk.h deleted file mode 100644 index 886e817644d69feeb5345a0b9558c26880007bb6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_disk.h +++ /dev/null @@ -1,152 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_disk.h - * - * Lustre disk format definitions. - * - * Author: Nathan Rutman - */ - -#ifndef _LUSTRE_DISK_H -#define _LUSTRE_DISK_H - -/** \defgroup disk disk - * - * @{ - */ - -#include -#include -#include - -/****************** persistent mount data *********************/ - -#define LDD_F_SV_TYPE_MDT 0x0001 -#define LDD_F_SV_TYPE_OST 0x0002 -#define LDD_F_SV_TYPE_MGS 0x0004 -#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ - LDD_F_SV_TYPE_OST | \ - LDD_F_SV_TYPE_MGS) -#define LDD_F_SV_ALL 0x0008 - -/****************** mount command *********************/ - -/* The lmd is only used internally by Lustre; mount simply passes - * everything as string options - */ - -#define LMD_MAGIC 0xbdacbd03 -#define LMD_PARAMS_MAXLEN 4096 - -/* gleaned from the mount command - no persistent info here */ -struct lustre_mount_data { - __u32 lmd_magic; - __u32 lmd_flags; /* lustre mount flags */ - int lmd_mgs_failnodes; /* mgs failover node count */ - int lmd_exclude_count; - int lmd_recovery_time_soft; - int lmd_recovery_time_hard; - char *lmd_dev; /* device name */ - char *lmd_profile; /* client only */ - char *lmd_mgssec; /* sptlrpc flavor to mgs */ - char *lmd_opts; /* lustre mount options (as opposed to - * _device_ mount options) - */ - char *lmd_params; /* lustre params */ - __u32 *lmd_exclude; /* array of OSTs to ignore */ - char *lmd_mgs; /* MGS nid */ - char *lmd_osd_type; /* OSD type */ -}; - -#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ -#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ -#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ -#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, - * no other services - */ -#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, - * reusing existing MGS services - */ -#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ -#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ -#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ -#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ -#define LMD_FLG_IAM 0x0400 /* IAM dir */ -#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ -#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ -#define LMD_FLG_UPDATE 0x2000 /* update parameters */ -#define LMD_FLG_HSM 0x4000 /* Start coordinator */ - -#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) - -/****************** superblock additional info *********************/ - -struct ll_sb_info; - -struct lustre_sb_info 
{ - int lsi_flags; - struct obd_device *lsi_mgc; /* mgc obd */ - struct lustre_mount_data *lsi_lmd; /* mount command info */ - struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ - struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ - atomic_t lsi_mounts; /* references to the srv_mnt */ - char lsi_svname[MTI_NAME_MAXLEN]; - char lsi_osd_obdname[64]; - char lsi_osd_uuid[64]; - struct obd_export *lsi_osd_exp; - char lsi_osd_type[16]; - char lsi_fstype[16]; -}; - -#define LSI_UMOUNT_FAILOVER 0x00200000 - -#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) -#define s2lsi_nocast(sb) ((sb)->s_fs_info) - -#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) - -/****************** prototypes *********************/ - -/* obd_mount.c */ - -int lustre_start_mgc(struct super_block *sb); -void lustre_register_super_ops(struct module *mod, - int (*cfs)(struct super_block *sb), - void (*ksc)(struct super_block *sb)); -int lustre_common_put_super(struct super_block *sb); - -int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); - -/** @} disk */ - -#endif /* _LUSTRE_DISK_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h deleted file mode 100644 index 2c55241258ccde5b90e4877bd0da22394ae7c417..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_dlm.h +++ /dev/null @@ -1,1346 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -/** \defgroup LDLM Lustre Distributed Lock Manager - * - * Lustre DLM is based on VAX DLM. - * Its two main roles are: - * - To provide locking assuring consistency of data on all Lustre nodes. - * - To allow clients to cache state protected by a lock by holding the - * lock until a conflicting lock is requested or it is expired by the LRU. 
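The s2lsi()/lmd_is_client() helpers above compose naturally; a hedged sketch (sb is assumed to be a valid, Lustre-mounted VFS superblock):

/* Sketch: recover the Lustre superblock info from a VFS superblock
 * and test whether this is a client mount.
 */
static int my_is_client_mount(struct super_block *sb)
{
	struct lustre_sb_info *lsi = s2lsi(sb);

	return lsi && lsi->lsi_lmd && lmd_is_client(lsi->lsi_lmd);
}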
- * - * @{ - */ - -#ifndef _LUSTRE_DLM_H__ -#define _LUSTRE_DLM_H__ - -#include -#include -#include -#include -#include /* for interval_node{}, ldlm_extent */ -#include - -#include "lustre_dlm_flags.h" - -struct obd_ops; -struct obd_device; - -#define OBD_LDLM_DEVICENAME "ldlm" - -#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) -#define LDLM_DEFAULT_MAX_ALIVE (65 * 60 * HZ) /* 65 min */ -#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 - -/** - * LDLM non-error return states - */ -enum ldlm_error { - ELDLM_OK = 0, - ELDLM_LOCK_MATCHED = 1, - - ELDLM_LOCK_CHANGED = 300, - ELDLM_LOCK_ABORTED = 301, - ELDLM_LOCK_REPLACED = 302, - ELDLM_NO_LOCK_DATA = 303, - ELDLM_LOCK_WOULDBLOCK = 304, - - ELDLM_NAMESPACE_EXISTS = 400, - ELDLM_BAD_NAMESPACE = 401 -}; - -/** - * LDLM namespace type. - * The "client" type is actually an indication that this is a narrow local view - * into complete namespace on the server. Such namespaces cannot make any - * decisions about lack of conflicts or do any autonomous lock granting without - * first speaking to a server. - */ -enum ldlm_side { - LDLM_NAMESPACE_SERVER = 1 << 0, - LDLM_NAMESPACE_CLIENT = 1 << 1 -}; - -/** - * The blocking callback is overloaded to perform two functions. These flags - * indicate which operation should be performed. - */ -#define LDLM_CB_BLOCKING 1 -#define LDLM_CB_CANCELING 2 - -/** - * \name Lock Compatibility Matrix. - * - * A lock has both a type (extent, flock, inode bits, or plain) and a mode. - * Lock types are described in their respective implementation files: - * ldlm_{extent,flock,inodebits,plain}.c. - * - * There are six lock modes along with a compatibility matrix to indicate if - * two locks are compatible. - * - * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock - * on the parent. - * - PW: Protective Write (normal write) mode. When a client requests a write - * lock from an OST, a lock with PW mode will be issued. - * - PR: Protective Read (normal read) mode. When a client requests a read from - * an OST, a lock with PR mode will be issued. Also, if the client opens a - * file for execution, it is granted a lock with PR mode. - * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client - * requests a write lock during a file open operation. - * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants - * an inodebit lock with the CR mode on the intermediate path component. - * - NL Null mode. - * - *
- *       NL  CR  CW  PR  PW  EX
- *  NL    1   1   1   1   1   1
- *  CR    1   1   1   1   1   0
- *  CW    1   1   1   0   0   0
- *  PR    1   1   0   1   0   0
- *  PW    1   1   0   0   0   0
- *  EX    1   0   0   0   0   0
- * 
- * </PRE>
- */
-/** @{ */
-#define LCK_COMPAT_EX	LCK_NL
-#define LCK_COMPAT_PW	(LCK_COMPAT_EX | LCK_CR)
-#define LCK_COMPAT_PR	(LCK_COMPAT_PW | LCK_PR)
-#define LCK_COMPAT_CW	(LCK_COMPAT_PW | LCK_CW)
-#define LCK_COMPAT_CR	(LCK_COMPAT_CW | LCK_PR | LCK_PW)
-#define LCK_COMPAT_NL	(LCK_COMPAT_CR | LCK_EX | LCK_GROUP)
-#define LCK_COMPAT_GROUP	(LCK_GROUP | LCK_NL)
-#define LCK_COMPAT_COS	(LCK_COS)
-/** @} Lock Compatibility Matrix */
-
-extern enum ldlm_mode lck_compat_array[];
-
-static inline void lockmode_verify(enum ldlm_mode mode)
-{
-	LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE);
-}
-
-static inline int lockmode_compat(enum ldlm_mode exist_mode,
-				  enum ldlm_mode new_mode)
-{
-	return (lck_compat_array[exist_mode] & new_mode);
-}
-
-/*
- *
- * cluster name spaces
- *
- */
-
-#define DLM_OST_NAMESPACE 1
-#define DLM_MDS_NAMESPACE 2
-
-/* XXX
-   - do we just separate this by security domains and use a prefix for
-     multiple namespaces in the same domain?
-   -
-*/
-
-/**
- * Locking rules for LDLM:
- *
- * lr_lock
- *
- * lr_lock
- *	waiting_locks_spinlock
- *
- * lr_lock
- *	led_lock
- *
- * lr_lock
- *	ns_lock
- *
- * lr_lvb_mutex
- *	lr_lock
- *
- */
-
-struct ldlm_pool;
-struct ldlm_lock;
-struct ldlm_resource;
-struct ldlm_namespace;
-
-/**
- * Operations on LDLM pools.
- * An LDLM pool is a pool of locks in a namespace without any implicitly
- * specified limits.
- * Locks in the pool are organized in an LRU.
- * Local memory pressure or server instructions (e.g. mempressure on server)
- * can trigger freeing of locks from the pool.
- */
-struct ldlm_pool_ops {
-	/** Recalculate pool \a pl usage */
-	int (*po_recalc)(struct ldlm_pool *pl);
-	/** Cancel at least \a nr locks from pool \a pl */
-	int (*po_shrink)(struct ldlm_pool *pl, int nr,
-			 gfp_t gfp_mask);
-};
-
-/** One second for pools thread check interval. Each pool has its own period. */
-#define LDLM_POOLS_THREAD_PERIOD (1)
-
-/** ~6% margin for modest pools. See ldlm_pool.c for details. */
-#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4)
-
-/** Default recalc period for server side pools in sec. */
-#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1)
-
-/** Default recalc period for client side pools in sec. */
-#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10)
-
-/**
- * LDLM pool structure to track granted locks.
- * Used for determining when to release locks on e.g. memory pressure.
- * This feature is commonly referred to as lru_resize.
- */
-struct ldlm_pool {
-	/** Pool debugfs directory. */
-	struct dentry		*pl_debugfs_entry;
-	/** Pool name, must be long enough to hold compound proc entry name. */
-	char			pl_name[100];
-	/** Lock for protecting SLV/CLV updates. */
-	spinlock_t		pl_lock;
-	/** Number of allowed locks in pool, both client and server side. */
-	atomic_t		pl_limit;
-	/** Number of granted locks in pool. */
-	atomic_t		pl_granted;
-	/** Grant rate per T. */
-	atomic_t		pl_grant_rate;
-	/** Cancel rate per T. */
-	atomic_t		pl_cancel_rate;
-	/** Server lock volume (SLV). Protected by pl_lock. */
-	__u64			pl_server_lock_volume;
-	/** Current biggest client lock volume. Protected by pl_lock. */
-	__u64			pl_client_lock_volume;
-	/** Lock volume factor. SLV on client is calculated as follows:
-	 *  server_slv * lock_volume_factor.
-	 */
-	atomic_t		pl_lock_volume_factor;
-	/** Time when last SLV from server was obtained. */
-	time64_t		pl_recalc_time;
-	/** Recalculation period for pool. */
-	time64_t		pl_recalc_period;
-	/** Recalculation and shrink operations. */
-	const struct ldlm_pool_ops	*pl_ops;
-	/** Number of planned locks for next period.
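The compatibility check above is just one array lookup plus a bit-wise AND; a standalone sketch modeled on lck_compat_array and lockmode_compat(), with modes as bits and rows taken from the matrix (PR conflicts with PW, PR does not conflict with PR):

/* Sketch: stand-alone model of the lock-mode compatibility check. */
#include <stdio.h>

enum mode { NL = 1 << 0, CR = 1 << 1, CW = 1 << 2,
	    PR = 1 << 3, PW = 1 << 4, EX = 1 << 5 };

/* compat_X = set of modes compatible with an existing lock of mode X */
static const unsigned compat_pr = NL | CR | PR;
static const unsigned compat_pw = NL | CR;

int main(void)
{
	printf("PR vs PR compatible: %d\n", !!(compat_pr & PR)); /* 1 */
	printf("PR vs PW compatible: %d\n", !!(compat_pr & PW)); /* 0 */
	printf("PW vs PW compatible: %d\n", !!(compat_pw & PW)); /* 0 */
	return 0;
}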
*/ - int pl_grant_plan; - /** Pool statistics. */ - struct lprocfs_stats *pl_stats; - - /* sysfs object */ - struct kobject pl_kobj; - struct completion pl_kobj_unregister; -}; - -typedef int (*ldlm_cancel_cbt)(struct ldlm_lock *lock); - -/** - * LVB operations. - * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could - * be associated with an LDLM lock and transferred from client to server and - * back. - * - * Currently LVBs are used by: - * - OSC-OST code to maintain current object size/times - * - layout lock code to return the layout when the layout lock is granted - */ -struct ldlm_valblock_ops { - int (*lvbo_init)(struct ldlm_resource *res); - int (*lvbo_update)(struct ldlm_resource *res, - struct ptlrpc_request *r, - int increase); - int (*lvbo_free)(struct ldlm_resource *res); - /* Return size of lvb data appropriate RPC size can be reserved */ - int (*lvbo_size)(struct ldlm_lock *lock); - /* Called to fill in lvb data to RPC buffer @buf */ - int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); -}; - -/** - * LDLM pools related, type of lock pool in the namespace. - * Greedy means release cached locks aggressively - */ -enum ldlm_appetite { - LDLM_NAMESPACE_GREEDY = 1 << 0, - LDLM_NAMESPACE_MODEST = 1 << 1 -}; - -struct ldlm_ns_bucket { - /** back pointer to namespace */ - struct ldlm_namespace *nsb_namespace; - /** - * Estimated lock callback time. Used by adaptive timeout code to - * avoid spurious client evictions due to unresponsiveness when in - * fact the network or overall system load is at fault - */ - struct adaptive_timeout nsb_at_estimate; -}; - -enum { - /** LDLM namespace lock stats */ - LDLM_NSS_LOCKS = 0, - LDLM_NSS_LAST -}; - -enum ldlm_ns_type { - /** invalid type */ - LDLM_NS_TYPE_UNKNOWN = 0, - /** mdc namespace */ - LDLM_NS_TYPE_MDC, - /** mds namespace */ - LDLM_NS_TYPE_MDT, - /** osc namespace */ - LDLM_NS_TYPE_OSC, - /** ost namespace */ - LDLM_NS_TYPE_OST, - /** mgc namespace */ - LDLM_NS_TYPE_MGC, - /** mgs namespace */ - LDLM_NS_TYPE_MGT, -}; - -/** - * LDLM Namespace. - * - * Namespace serves to contain locks related to a particular service. - * There are two kinds of namespaces: - * - Server namespace has knowledge of all locks and is therefore authoritative - * to make decisions like what locks could be granted and what conflicts - * exist during new lock enqueue. - * - Client namespace only has limited knowledge about locks in the namespace, - * only seeing locks held by the client. - * - * Every Lustre service has one server namespace present on the server serving - * that service. Every client connected to the service has a client namespace - * for it. - * Every lock obtained by client in that namespace is actually represented by - * two in-memory locks. One on the server and one on the client. The locks are - * linked by a special cookie by which one node can tell to the other which lock - * it actually means during communications. Such locks are called remote locks. - * The locks held by server only without any reference to a client are called - * local locks. - */ -struct ldlm_namespace { - /** Backward link to OBD, required for LDLM pool to store new SLV. */ - struct obd_device *ns_obd; - - /** Flag indicating if namespace is on client instead of server */ - enum ldlm_side ns_client; - - /** name of this namespace */ - char *ns_name; - - /** Resource hash table for namespace. 
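A namespace opts into LVB handling by pointing ns_lvbo at a struct ldlm_valblock_ops; a minimal, hypothetical wiring sketch (only lvbo_init populated, all names beyond the ops struct are invented for illustration):

/* Sketch: minimal LVB ops for a hypothetical server-side namespace. */
static int my_lvbo_init(struct ldlm_resource *res)
{
	/* allocate/initialize the lock value block for this resource */
	return 0;
}

static struct ldlm_valblock_ops my_lvb_ops = {
	.lvbo_init = my_lvbo_init,
	/* other hooks intentionally left NULL */
};

/* during namespace setup: ns->ns_lvbo = &my_lvb_ops; */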
*/ - struct cfs_hash *ns_rs_hash; - - /** serialize */ - spinlock_t ns_lock; - - /** big refcount (by bucket) */ - atomic_t ns_bref; - - /** - * Namespace connect flags supported by server (may be changed via - * sysfs, LRU resize may be disabled/enabled). - */ - __u64 ns_connect_flags; - - /** Client side original connect flags supported by server. */ - __u64 ns_orig_connect_flags; - - /* namespace debugfs dir entry */ - struct dentry *ns_debugfs_entry; - - /** - * Position in global namespace list linking all namespaces on - * the node. - */ - struct list_head ns_list_chain; - - /** - * List of unused locks for this namespace. This list is also called - * LRU lock list. - * Unused locks are locks with zero reader/writer reference counts. - * This list is only used on clients for lock caching purposes. - * When we want to release some locks voluntarily or if server wants - * us to release some locks due to e.g. memory pressure, we take locks - * to release from the head of this list. - * Locks are linked via l_lru field in \see struct ldlm_lock. - */ - struct list_head ns_unused_list; - /** Number of locks in the LRU list above */ - int ns_nr_unused; - - /** - * Maximum number of locks permitted in the LRU. If 0, means locks - * are managed by pools and there is no preset limit, rather it is all - * controlled by available memory on this client and on server. - */ - unsigned int ns_max_unused; - /** Maximum allowed age (last used time) for locks in the LRU */ - unsigned int ns_max_age; - - /** - * Used to rate-limit ldlm_namespace_dump calls. - * \see ldlm_namespace_dump. Increased by 10 seconds every time - * it is called. - */ - unsigned long ns_next_dump; - - /** - * LVB operations for this namespace. - * \see struct ldlm_valblock_ops - */ - struct ldlm_valblock_ops *ns_lvbo; - - /** - * Used by filter code to store pointer to OBD of the service. - * Should be dropped in favor of \a ns_obd - */ - void *ns_lvbp; - - /** - * Wait queue used by __ldlm_namespace_free. Gets woken up every time - * a resource is removed. - */ - wait_queue_head_t ns_waitq; - /** LDLM pool structure for this namespace */ - struct ldlm_pool ns_pool; - /** Definition of how eagerly unused locks will be released from LRU */ - enum ldlm_appetite ns_appetite; - - /** Limit of parallel AST RPC count. */ - unsigned ns_max_parallel_ast; - - /** - * Callback to check if a lock is good to be canceled by ELC or - * during recovery. - */ - ldlm_cancel_cbt ns_cancel; - - /** LDLM lock stats */ - struct lprocfs_stats *ns_stats; - - /** - * Flag to indicate namespace is being freed. Used to determine if - * recalculation of LDLM pool statistics should be skipped. - */ - unsigned ns_stopping:1; - - struct kobject ns_kobj; /* sysfs object */ - struct completion ns_kobj_unregister; -}; - -/** - * Returns 1 if namespace \a ns supports early lock cancel (ELC). - */ -static inline int ns_connect_cancelset(struct ldlm_namespace *ns) -{ - return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); -} - -/** - * Returns 1 if this namespace supports lru_resize. - */ -static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) -{ - return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); -} - -static inline void ns_register_cancel(struct ldlm_namespace *ns, - ldlm_cancel_cbt arg) -{ - ns->ns_cancel = arg; -} - -struct ldlm_lock; - -/** Type for blocking callback function of a lock. 
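The ns_register_cancel()/ns_connect_cancelset() helpers above are typically combined at namespace setup; a hedged sketch (my_ns and the trivial policy are illustrative only):

/* Sketch: a trivial "may this lock be cancelled early" policy,
 * wired in only when the server advertises ELC support.
 */
static int my_cancel_check(struct ldlm_lock *lock)
{
	return 1;	/* always allow ELC/recovery cancellation */
}

/* during setup:
 *	if (ns_connect_cancelset(my_ns))
 *		ns_register_cancel(my_ns, my_cancel_check);
 */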
 */
-typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock,
-				       struct ldlm_lock_desc *new, void *data,
-				       int flag);
-/** Type for completion callback function of a lock. */
-typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags,
-					void *data);
-/** Type for glimpse callback function of a lock. */
-typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data);
-
-/** Work list for sending GL ASTs to multiple locks. */
-struct ldlm_glimpse_work {
-	struct ldlm_lock	*gl_lock; /* lock to glimpse */
-	struct list_head	 gl_list; /* linkage to other gl work structs */
-	__u32			 gl_flags; /* see LDLM_GL_WORK_* below */
-	union ldlm_gl_desc	*gl_desc; /* glimpse descriptor to be packed in
-					   * glimpse callback request
-					   */
-};
-
-/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */
-#define LDLM_GL_WORK_NOFREE 0x1
-
-/** Interval node data for each LDLM_EXTENT lock. */
-struct ldlm_interval {
-	struct interval_node	li_node;  /* node for tree management */
-	struct list_head	li_group; /* the locks which have the same
-					   * policy - group of the policy
-					   */
-};
-
-#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node)
-
-/**
- * Interval tree for extent locks.
- * The interval tree must be accessed under the resource lock.
- * Interval trees are used for granted extent locks to speed up conflict
- * lookup. See ldlm/interval_tree.c for more details.
- */
-struct ldlm_interval_tree {
-	/** Tree size. */
-	int			lit_size;
-	enum ldlm_mode		lit_mode;  /* lock mode */
-	struct interval_node	*lit_root; /* actual ldlm_interval */
-};
-
-/** Whether to track references to exports by LDLM locks. */
-#define LUSTRE_TRACKS_LOCK_EXP_REFS (0)
-
-/** Cancel flags. */
-enum ldlm_cancel_flags {
-	LCF_ASYNC  = 0x1, /* Cancel locks asynchronously. */
-	LCF_LOCAL  = 0x2, /* Cancel locks locally, without notifying server */
-	LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST
-			   * in the same RPC
-			   */
-};
-
-struct ldlm_flock {
-	__u64 start;
-	__u64 end;
-	__u64 owner;
-	__u64 blocking_owner;
-	struct obd_export *blocking_export;
-	__u32 pid;
-};
-
-union ldlm_policy_data {
-	struct ldlm_extent l_extent;
-	struct ldlm_flock l_flock;
-	struct ldlm_inodebits l_inodebits;
-};
-
-void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type,
-				  const union ldlm_wire_policy_data *wpolicy,
-				  union ldlm_policy_data *lpolicy);
-
-enum lvb_type {
-	LVB_T_NONE	= 0,
-	LVB_T_OST	= 1,
-	LVB_T_LQUOTA	= 2,
-	LVB_T_LAYOUT	= 3,
-};
-
-/**
- * LDLM_GID_ANY is used to match any group id in ldlm_lock_match().
- */
-#define LDLM_GID_ANY ((__u64)-1)
-
-/**
- * LDLM lock structure
- *
- * Represents a single LDLM lock and its state in memory. Each lock is
- * associated with a single ldlm_resource, the object which is being
- * locked. There may be multiple ldlm_locks on a single resource,
- * depending on the lock type and whether the locks are conflicting or
- * not.
- */
-struct ldlm_lock {
-	/**
-	 * Local lock handle.
-	 * When the remote side wants to tell us about a lock, it addresses
-	 * the lock by this opaque handle. The handle does not hold a
-	 * reference on the ldlm_lock, so it can be safely passed to
-	 * other threads or nodes. When the lock needs to be accessed
-	 * from the handle, it is looked up again in the lock table, and
-	 * may no longer exist.
-	 *
-	 * Must be first in the structure.
-	 */
-	struct portals_handle	l_handle;
-	/**
-	 * Lock reference count.
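The blocking-callback typedef above has a canonical two-phase shape; a sketch modeled on existing client callbacks (details of the cancel path are elided, and the function name is hypothetical):

/* Sketch: a blocking AST handler runs once for the initial conflict
 * (LDLM_CB_BLOCKING) and once more at cancellation (LDLM_CB_CANCELING).
 */
static int my_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
			   void *data, int flag)
{
	switch (flag) {
	case LDLM_CB_BLOCKING:
		/* a conflicting lock was queued: initiate cancellation */
		break;
	case LDLM_CB_CANCELING:
		/* the lock is going away: drop cached state it protected */
		break;
	}
	return 0;
}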
- * This is how many users have pointers to actual structure, so that - * we do not accidentally free lock structure that is in use. - */ - atomic_t l_refc; - /** - * Internal spinlock protects l_resource. We should hold this lock - * first before taking res_lock. - */ - spinlock_t l_lock; - /** - * Pointer to actual resource this lock is in. - * ldlm_lock_change_resource() can change this. - */ - struct ldlm_resource *l_resource; - /** - * List item for client side LRU list. - * Protected by ns_lock in struct ldlm_namespace. - */ - struct list_head l_lru; - /** - * Linkage to resource's lock queues according to current lock state. - * (could be granted, waiting or converting) - * Protected by lr_lock in struct ldlm_resource. - */ - struct list_head l_res_link; - /** - * Tree node for ldlm_extent. - */ - struct ldlm_interval *l_tree_node; - /** - * Requested mode. - * Protected by lr_lock. - */ - enum ldlm_mode l_req_mode; - /** - * Granted mode, also protected by lr_lock. - */ - enum ldlm_mode l_granted_mode; - /** Lock completion handler pointer. Called when lock is granted. */ - ldlm_completion_callback l_completion_ast; - /** - * Lock blocking AST handler pointer. - * It plays two roles: - * - as a notification of an attempt to queue a conflicting lock (once) - * - as a notification when the lock is being cancelled. - * - * As such it's typically called twice: once for the initial conflict - * and then once more when the last user went away and the lock is - * cancelled (could happen recursively). - */ - ldlm_blocking_callback l_blocking_ast; - /** - * Lock glimpse handler. - * Glimpse handler is used to obtain LVB updates from a client by - * server - */ - ldlm_glimpse_callback l_glimpse_ast; - - /** - * Lock export. - * This is a pointer to actual client export for locks that were granted - * to clients. Used server-side. - */ - struct obd_export *l_export; - /** - * Lock connection export. - * Pointer to server export on a client. - */ - struct obd_export *l_conn_export; - - /** - * Remote lock handle. - * If the lock is remote, this is the handle of the other side lock - * (l_handle) - */ - struct lustre_handle l_remote_handle; - - /** - * Representation of private data specific for a lock type. - * Examples are: extent range for extent lock or bitmask for ibits locks - */ - union ldlm_policy_data l_policy_data; - - /** - * Lock state flags. Protected by lr_lock. - * \see lustre_dlm_flags.h where the bits are defined. - */ - __u64 l_flags; - - /** - * Lock r/w usage counters. - * Protected by lr_lock. - */ - __u32 l_readers; - __u32 l_writers; - /** - * If the lock is granted, a process sleeps on this waitq to learn when - * it's no longer in use. If the lock is not granted, a process sleeps - * on this waitq to learn when it becomes granted. - */ - wait_queue_head_t l_waitq; - - /** - * Seconds. It will be updated if there is any activity related to - * the lock, e.g. enqueue the lock or send blocking AST. - */ - time64_t l_last_activity; - - /** - * Time last used by e.g. being matched by lock match. - * Jiffies. Should be converted to time if needed. - */ - unsigned long l_last_used; - - /** Originally requested extent for the extent lock. */ - struct ldlm_extent l_req_extent; - - /* - * Client-side-only members. - */ - - enum lvb_type l_lvb_type; - - /** - * Temporary storage for a LVB received during an enqueue operation. - */ - __u32 l_lvb_len; - void *l_lvb_data; - - /** Private storage for lock user. Opaque to LDLM. 
*/ - void *l_ast_data; - - /* - * Server-side-only members. - */ - - /** - * Connection cookie for the client originating the operation. - * Used by Commit on Share (COS) code. Currently only used for - * inodebits locks on MDS. - */ - __u64 l_client_cookie; - - /** - * List item for locks waiting for cancellation from clients. - * The lists this could be linked into are: - * waiting_locks_list (protected by waiting_locks_spinlock), - * then if the lock timed out, it is moved to - * expired_lock_thread.elt_expired_locks for further processing. - * Protected by elt_lock. - */ - struct list_head l_pending_chain; - - /** - * Set when lock is sent a blocking AST. Time in seconds when timeout - * is reached and client holding this lock could be evicted. - * This timeout could be further extended by e.g. certain IO activity - * under this lock. - * \see ost_rw_prolong_locks - */ - unsigned long l_callback_timeout; - - /** Local PID of process which created this lock. */ - __u32 l_pid; - - /** - * Number of times blocking AST was sent for this lock. - * This is for debugging. Valid values are 0 and 1, if there is an - * attempt to send blocking AST more than once, an assertion would be - * hit. \see ldlm_work_bl_ast_lock - */ - int l_bl_ast_run; - /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ - struct list_head l_bl_ast; - /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ - struct list_head l_cp_ast; - /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ - struct list_head l_rk_ast; - - /** - * Pointer to a conflicting lock that caused blocking AST to be sent - * for this lock - */ - struct ldlm_lock *l_blocking_lock; - - /** - * Protected by lr_lock, linkages to "skip lists". - * For more explanations of skip lists see ldlm/ldlm_inodebits.c - */ - struct list_head l_sl_mode; - struct list_head l_sl_policy; - - /** Reference tracking structure to debug leaked locks. */ - struct lu_ref l_reference; -#if LUSTRE_TRACKS_LOCK_EXP_REFS - /* Debugging stuff for bug 20498, for tracking export references. */ - /** number of export references taken */ - int l_exp_refs_nr; - /** link all locks referencing one export */ - struct list_head l_exp_refs_link; - /** referenced export object */ - struct obd_export *l_exp_refs_target; -#endif -}; - -/** - * LDLM resource description. - * Basically, resource is a representation for a single object. - * Object has a name which is currently 4 64-bit integers. LDLM user is - * responsible for creation of a mapping between objects it wants to be - * protected and resource names. - * - * A resource can only hold locks of a single lock type, though there may be - * multiple ldlm_locks on a single resource, depending on the lock type and - * whether the locks are conflicting or not. - */ -struct ldlm_resource { - struct ldlm_ns_bucket *lr_ns_bucket; - - /** - * List item for list in namespace hash. - * protected by ns_lock - */ - struct hlist_node lr_hash; - - /** Spinlock to protect locks under this resource. */ - spinlock_t lr_lock; - - /** - * protected by lr_lock - * @{ - */ - /** List of locks in granted state */ - struct list_head lr_granted; - /** - * List of locks that could not be granted due to conflicts and - * that are waiting for conflicts to go away - */ - struct list_head lr_waiting; - /** @} */ - - /** Type of locks this resource can hold. Only one type per resource. 
*/ - enum ldlm_type lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ - - /** Resource name */ - struct ldlm_res_id lr_name; - /** Reference count for this resource */ - atomic_t lr_refcount; - - /** - * Interval trees (only for extent locks) for all modes of this resource - */ - struct ldlm_interval_tree lr_itree[LCK_MODE_NUM]; - - /** - * Server-side-only lock value block elements. - * To serialize lvbo_init. - */ - struct mutex lr_lvb_mutex; - int lr_lvb_len; - - /** When the resource was considered as contended. */ - unsigned long lr_contention_time; - /** List of references to this resource. For debugging. */ - struct lu_ref lr_reference; - - struct inode *lr_lvb_inode; -}; - -static inline bool ldlm_has_layout(struct ldlm_lock *lock) -{ - return lock->l_resource->lr_type == LDLM_IBITS && - lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; -} - -static inline char * -ldlm_ns_name(struct ldlm_namespace *ns) -{ - return ns->ns_name; -} - -static inline struct ldlm_namespace * -ldlm_res_to_ns(struct ldlm_resource *res) -{ - return res->lr_ns_bucket->nsb_namespace; -} - -static inline struct ldlm_namespace * -ldlm_lock_to_ns(struct ldlm_lock *lock) -{ - return ldlm_res_to_ns(lock->l_resource); -} - -static inline char * -ldlm_lock_to_ns_name(struct ldlm_lock *lock) -{ - return ldlm_ns_name(ldlm_lock_to_ns(lock)); -} - -static inline struct adaptive_timeout * -ldlm_lock_to_ns_at(struct ldlm_lock *lock) -{ - return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; -} - -static inline int ldlm_lvbo_init(struct ldlm_resource *res) -{ - struct ldlm_namespace *ns = ldlm_res_to_ns(res); - - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) - return ns->ns_lvbo->lvbo_init(res); - - return 0; -} - -static inline int ldlm_lvbo_size(struct ldlm_lock *lock) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_size) - return ns->ns_lvbo->lvbo_size(lock); - - return 0; -} - -static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - if (ns->ns_lvbo) - return ns->ns_lvbo->lvbo_fill(lock, buf, len); - - return 0; -} - -struct ldlm_ast_work { - struct ldlm_lock *w_lock; - int w_blocking; - struct ldlm_lock_desc w_desc; - struct list_head w_list; - int w_flags; - void *w_data; - int w_datalen; -}; - -/** - * Common ldlm_enqueue parameters - */ -struct ldlm_enqueue_info { - enum ldlm_type ei_type; /** Type of the lock being enqueued. */ - enum ldlm_mode ei_mode; /** Mode of the lock being enqueued. */ - void *ei_cb_bl; /** blocking lock callback */ - void *ei_cb_cp; /** lock completion callback */ - void *ei_cb_gl; /** lock glimpse callback */ - void *ei_cbdata; /** Data to be passed into callbacks. */ - unsigned int ei_enq_slave:1; /* whether enqueue slave stripes */ -}; - -extern struct obd_ops ldlm_obd_ops; - -extern char *ldlm_lockname[]; -const char *ldlm_it2str(enum ldlm_intent_flags it); - -/** - * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. - * For the cases where we do not have actual lock to print along - * with a debugging message that is ldlm-related - */ -#define LDLM_DEBUG_NOLOCK(format, a...) \ - CDEBUG(D_DLMTRACE, "### " format "\n", ##a) - -/** - * Support function for lock information printing into debug logs. - * \see LDLM_DEBUG - */ -#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) 
do { \ - CFS_CHECK_STACK(msgdata, mask, cdls); \ - \ - if (((mask) & D_CANTMASK) != 0 || \ - ((libcfs_debug & (mask)) != 0 && \ - (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ - _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ -} while (0) - -void _ldlm_lock_debug(struct ldlm_lock *lock, - struct libcfs_debug_msg_data *data, - const char *fmt, ...) - __printf(3, 4); - -/** - * Rate-limited version of lock printing function. - */ -#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ - static struct cfs_debug_limit_state _ldlm_cdls; \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ - ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt, ##a);\ -} while (0) - -#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) -#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) - -/** Non-rate-limited lock printing function for debugging purposes. */ -#define LDLM_DEBUG(lock, fmt, a...) do { \ - if (likely(lock)) { \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ - ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ - "### " fmt, ##a); \ - } else { \ - LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ - } \ -} while (0) - -typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, - int first_enq, enum ldlm_error *err, - struct list_head *work_list); - -/** - * Return values for lock iterators. - * Also used during deciding of lock grants and cancellations. - */ -#define LDLM_ITER_CONTINUE 1 /* keep iterating */ -#define LDLM_ITER_STOP 2 /* stop iterating */ - -typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); -typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); - -/** \defgroup ldlm_iterator Lock iterators - * - * LDLM provides for a way to iterate through every lock on a resource or - * namespace or every resource in a namespace. - * @{ - */ -int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, - ldlm_iterator_t iter, void *data); -/** @} ldlm_iterator */ - -int ldlm_replay_locks(struct obd_import *imp); - -/* ldlm_flock.c */ -int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); - -/* ldlm_extent.c */ -__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); - -struct ldlm_callback_suite { - ldlm_completion_callback lcs_completion; - ldlm_blocking_callback lcs_blocking; - ldlm_glimpse_callback lcs_glimpse; -}; - -/* ldlm_lockd.c */ -int ldlm_get_ref(void); -void ldlm_put_ref(void); -struct ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); - -/* ldlm_lock.c */ -void ldlm_lock2handle(const struct ldlm_lock *lock, - struct lustre_handle *lockh); -struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); -void ldlm_cancel_callback(struct ldlm_lock *); -int ldlm_lock_remove_from_lru(struct ldlm_lock *); -int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data); - -/** - * Obtain a lock reference by its handle. 
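- * Returns NULL if the handle no longer resolves to a lock. The
- * reference must be dropped with LDLM_LOCK_PUT() (defined below) once
- * the caller is done; a minimal usage sketch:
- *
- *     struct ldlm_lock *lock = ldlm_handle2lock(lockh);
- *
- *     if (lock) {
- *             // ... use the lock under appropriate locking ...
- *             LDLM_LOCK_PUT(lock);
- *     }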
- */ -static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) -{ - return __ldlm_handle2lock(h, 0); -} - -#define LDLM_LOCK_REF_DEL(lock) \ - lu_ref_del(&lock->l_reference, "handle", current) - -static inline struct ldlm_lock * -ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) -{ - struct ldlm_lock *lock; - - lock = __ldlm_handle2lock(h, flags); - if (lock) - LDLM_LOCK_REF_DEL(lock); - return lock; -} - -/** - * Update Lock Value Block Operations (LVBO) on a resource taking into account - * data from request \a r - */ -static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, - struct ptlrpc_request *r, int increase) -{ - if (ldlm_res_to_ns(res)->ns_lvbo && - ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { - return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r, - increase); - } - return 0; -} - -int ldlm_error2errno(enum ldlm_error error); - -#if LUSTRE_TRACKS_LOCK_EXP_REFS -void ldlm_dump_export_locks(struct obd_export *exp); -#endif - -/** - * Release a temporary lock reference obtained by ldlm_handle2lock() or - * __ldlm_handle2lock(). - */ -#define LDLM_LOCK_PUT(lock) \ -do { \ - LDLM_LOCK_REF_DEL(lock); \ - /*LDLM_DEBUG((lock), "put");*/ \ - ldlm_lock_put(lock); \ -} while (0) - -/** - * Release a lock reference obtained by some other means (see - * LDLM_LOCK_PUT()). - */ -#define LDLM_LOCK_RELEASE(lock) \ -do { \ - /*LDLM_DEBUG((lock), "put");*/ \ - ldlm_lock_put(lock); \ -} while (0) - -#define LDLM_LOCK_GET(lock) \ -({ \ - ldlm_lock_get(lock); \ - /*LDLM_DEBUG((lock), "get");*/ \ - lock; \ -}) - -#define ldlm_lock_list_put(head, member, count) \ -({ \ - struct ldlm_lock *_lock, *_next; \ - int c = count; \ - list_for_each_entry_safe(_lock, _next, head, member) { \ - if (c-- == 0) \ - break; \ - list_del_init(&_lock->member); \ - LDLM_LOCK_RELEASE(_lock); \ - } \ - LASSERT(c <= 0); \ -}) - -struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); -void ldlm_lock_put(struct ldlm_lock *lock); -void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); -void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode); -int ldlm_lock_addref_try(const struct lustre_handle *lockh, - enum ldlm_mode mode); -void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode); -void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, - enum ldlm_mode mode); -void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); -void ldlm_lock_allow_match(struct ldlm_lock *lock); -void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); -enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, - const struct ldlm_res_id *, - enum ldlm_type type, union ldlm_policy_data *, - enum ldlm_mode mode, struct lustre_handle *, - int unref); -enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, - __u64 *bits); -void ldlm_lock_cancel(struct ldlm_lock *lock); -void ldlm_lock_dump_handle(int level, const struct lustre_handle *); -void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); - -/* resource.c */ -struct ldlm_namespace * -ldlm_namespace_new(struct obd_device *obd, char *name, - enum ldlm_side client, enum ldlm_appetite apt, - enum ldlm_ns_type ns_type); -int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); -void ldlm_namespace_free_prior(struct ldlm_namespace *ns, - struct obd_import *imp, - int force); -void ldlm_namespace_free_post(struct ldlm_namespace *ns); -void ldlm_namespace_get(struct ldlm_namespace *ns); -void ldlm_namespace_put(struct ldlm_namespace 
*ns); -void ldlm_debugfs_setup(void); -void ldlm_debugfs_cleanup(void); - -/* resource.c - internal */ -struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, - struct ldlm_resource *parent, - const struct ldlm_res_id *, - enum ldlm_type type, int create); -void ldlm_resource_putref(struct ldlm_resource *res); -void ldlm_resource_add_lock(struct ldlm_resource *res, - struct list_head *head, - struct ldlm_lock *lock); -void ldlm_resource_unlink_lock(struct ldlm_lock *lock); -void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); -void ldlm_dump_all_namespaces(enum ldlm_side client, int level); -void ldlm_namespace_dump(int level, struct ldlm_namespace *); -void ldlm_resource_dump(int level, struct ldlm_resource *); -int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, - const struct ldlm_res_id *); - -#define LDLM_RESOURCE_ADDREF(res) do { \ - lu_ref_add_atomic(&(res)->lr_reference, __func__, current); \ -} while (0) - -#define LDLM_RESOURCE_DELREF(res) do { \ - lu_ref_del(&(res)->lr_reference, __func__, current); \ -} while (0) - -/* ldlm_request.c */ -/** \defgroup ldlm_local_ast Default AST handlers for local locks - * These AST handlers are typically used for server-side local locks and are - * also used by client-side lock handlers to perform minimum level base - * processing. - * @{ - */ -int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); -/** @} ldlm_local_ast */ - -/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. - * These are typically used by client and server (*_local versions) - * to obtain and release locks. - * @{ - */ -int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, - struct ldlm_enqueue_info *einfo, - const struct ldlm_res_id *res_id, - union ldlm_policy_data const *policy, __u64 *flags, - void *lvb, __u32 lvb_len, enum lvb_type lvb_type, - struct lustre_handle *lockh, int async); -int ldlm_prep_enqueue_req(struct obd_export *exp, - struct ptlrpc_request *req, - struct list_head *cancels, - int count); -int ldlm_prep_elc_req(struct obd_export *exp, - struct ptlrpc_request *req, - int version, int opc, int canceloff, - struct list_head *cancels, int count); - -int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, - enum ldlm_type type, __u8 with_policy, - enum ldlm_mode mode, - __u64 *flags, void *lvb, __u32 lvb_len, - const struct lustre_handle *lockh, int rc); -int ldlm_cli_update_pool(struct ptlrpc_request *req); -int ldlm_cli_cancel(const struct lustre_handle *lockh, - enum ldlm_cancel_flags cancel_flags); -int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, - enum ldlm_cancel_flags flags, void *opaque); -int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, - const struct ldlm_res_id *res_id, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - enum ldlm_cancel_flags flags, - void *opaque); -int ldlm_cancel_resource_local(struct ldlm_resource *res, - struct list_head *cancels, - union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 lock_flags, - enum ldlm_cancel_flags cancel_flags, - void *opaque); -int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, - enum ldlm_cancel_flags flags); -int ldlm_cli_cancel_list(struct list_head *head, int count, - struct ptlrpc_request *req, - enum ldlm_cancel_flags flags); -/** @} ldlm_cli_api */ - -/* mds/handler.c */ -/* This has to be here because recursive inclusion sucks. 
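- * These helpers test/set server disposition bits in the intent reply
- * (a sketch, assuming a disposition bit such as DISP_IT_EXECD from the
- * wire protocol headers):
- *
- *     if (intent_disposition(rep, DISP_IT_EXECD))
- *             handle_executed_intent(rep);  // illustrative callee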
*/ -int intent_disposition(struct ldlm_reply *rep, int flag); -void intent_set_disposition(struct ldlm_reply *rep, int flag); - -/** - * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more - * than one lock_res is dead-lock safe. - */ -enum lock_res_type { - LRT_NORMAL, - LRT_NEW -}; - -/** Lock resource. */ -static inline void lock_res(struct ldlm_resource *res) -{ - spin_lock(&res->lr_lock); -} - -/** Lock resource with a way to instruct lockdep code about nestedness-safe. */ -static inline void lock_res_nested(struct ldlm_resource *res, - enum lock_res_type mode) -{ - spin_lock_nested(&res->lr_lock, mode); -} - -/** Unlock resource. */ -static inline void unlock_res(struct ldlm_resource *res) -{ - spin_unlock(&res->lr_lock); -} - -/** Check if resource is already locked, assert if not. */ -static inline void check_res_locked(struct ldlm_resource *res) -{ - assert_spin_locked(&res->lr_lock); -} - -struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock); -void unlock_res_and_lock(struct ldlm_lock *lock); - -/* ldlm_pool.c */ -/** \defgroup ldlm_pools Various LDLM pool related functions - * There are not used outside of ldlm. - * @{ - */ -int ldlm_pools_init(void); -void ldlm_pools_fini(void); - -int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, - int idx, enum ldlm_side client); -void ldlm_pool_fini(struct ldlm_pool *pl); -void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); -void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); -/** @} */ - -static inline int ldlm_extent_overlap(const struct ldlm_extent *ex1, - const struct ldlm_extent *ex2) -{ - return ex1->start <= ex2->end && ex2->start <= ex1->end; -} - -/* check if @ex1 contains @ex2 */ -static inline int ldlm_extent_contain(const struct ldlm_extent *ex1, - const struct ldlm_extent *ex2) -{ - return ex1->start <= ex2->start && ex1->end >= ex2->end; -} - -#endif -/** @} LDLM */ diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h b/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h deleted file mode 100644 index 53db031c4c8c0ed2f77b55a569cb3a6d44a0a6d7..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_dlm_flags.h +++ /dev/null @@ -1,402 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* -*- buffer-read-only: t -*- vi: set ro: - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * Lustre is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program. If not, see . - */ -/** - * \file lustre_dlm_flags.h - * The flags and collections of flags (masks) for \see struct ldlm_lock. - * - * \addtogroup LDLM Lustre Distributed Lock Manager - * @{ - * - * \name flags - * The flags and collections of flags (masks) for \see struct ldlm_lock. 
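- *
- * Every flag below follows the same pattern: an LDLM_FL_* bit value
- * plus matching ldlm_is_*()/ldlm_set_*()/ldlm_clear_*() wrappers over
- * the LDLM_TEST_FLAG()/LDLM_SET_FLAG()/LDLM_CLEAR_FLAG() macros at the
- * end of this file. A usage sketch:
- *
- *     if (!ldlm_is_cancel(lock))      // tests l_flags & (1ULL << 36)
- *             ldlm_set_cancel(lock);  // mark cancel callback as run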
- * @{ - */ -#ifndef LDLM_ALL_FLAGS_MASK - -/** l_flags bits marked as "all_flags" bits */ -#define LDLM_FL_ALL_FLAGS_MASK 0x00FFFFFFC08F932FULL - -/** extent, mode, or resource changed */ -#define LDLM_FL_LOCK_CHANGED 0x0000000000000001ULL /* bit 0 */ -#define ldlm_is_lock_changed(_l) LDLM_TEST_FLAG((_l), 1ULL << 0) -#define ldlm_set_lock_changed(_l) LDLM_SET_FLAG((_l), 1ULL << 0) -#define ldlm_clear_lock_changed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 0) - -/** - * Server placed lock on granted list, or a recovering client wants the - * lock added to the granted list, no questions asked. - */ -#define LDLM_FL_BLOCK_GRANTED 0x0000000000000002ULL /* bit 1 */ -#define ldlm_is_block_granted(_l) LDLM_TEST_FLAG((_l), 1ULL << 1) -#define ldlm_set_block_granted(_l) LDLM_SET_FLAG((_l), 1ULL << 1) -#define ldlm_clear_block_granted(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 1) - -/** - * Server placed lock on conv list, or a recovering client wants the lock - * added to the conv list, no questions asked. - */ -#define LDLM_FL_BLOCK_CONV 0x0000000000000004ULL /* bit 2 */ -#define ldlm_is_block_conv(_l) LDLM_TEST_FLAG((_l), 1ULL << 2) -#define ldlm_set_block_conv(_l) LDLM_SET_FLAG((_l), 1ULL << 2) -#define ldlm_clear_block_conv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 2) - -/** - * Server placed lock on wait list, or a recovering client wants the lock - * added to the wait list, no questions asked. - */ -#define LDLM_FL_BLOCK_WAIT 0x0000000000000008ULL /* bit 3 */ -#define ldlm_is_block_wait(_l) LDLM_TEST_FLAG((_l), 1ULL << 3) -#define ldlm_set_block_wait(_l) LDLM_SET_FLAG((_l), 1ULL << 3) -#define ldlm_clear_block_wait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 3) - -/** blocking or cancel packet was queued for sending. */ -#define LDLM_FL_AST_SENT 0x0000000000000020ULL /* bit 5 */ -#define ldlm_is_ast_sent(_l) LDLM_TEST_FLAG((_l), 1ULL << 5) -#define ldlm_set_ast_sent(_l) LDLM_SET_FLAG((_l), 1ULL << 5) -#define ldlm_clear_ast_sent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 5) - -/** - * Lock is being replayed. This could probably be implied by the fact that - * one of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. - */ -#define LDLM_FL_REPLAY 0x0000000000000100ULL /* bit 8 */ -#define ldlm_is_replay(_l) LDLM_TEST_FLAG((_l), 1ULL << 8) -#define ldlm_set_replay(_l) LDLM_SET_FLAG((_l), 1ULL << 8) -#define ldlm_clear_replay(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 8) - -/** Don't grant lock, just do intent. 
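- * That is, the enqueue carries an intent operation for the server to
- * execute; no lock is granted back to the client (see also
- * LDLM_FL_HAS_INTENT below).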
 */
-#define LDLM_FL_INTENT_ONLY 0x0000000000000200ULL /* bit 9 */
-#define ldlm_is_intent_only(_l) LDLM_TEST_FLAG((_l), 1ULL << 9)
-#define ldlm_set_intent_only(_l) LDLM_SET_FLAG((_l), 1ULL << 9)
-#define ldlm_clear_intent_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 9)
-
-/** lock request has intent */
-#define LDLM_FL_HAS_INTENT 0x0000000000001000ULL /* bit 12 */
-#define ldlm_is_has_intent(_l) LDLM_TEST_FLAG((_l), 1ULL << 12)
-#define ldlm_set_has_intent(_l) LDLM_SET_FLAG((_l), 1ULL << 12)
-#define ldlm_clear_has_intent(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 12)
-
-/** flock deadlock detected */
-#define LDLM_FL_FLOCK_DEADLOCK 0x0000000000008000ULL /* bit 15 */
-#define ldlm_is_flock_deadlock(_l) LDLM_TEST_FLAG((_l), 1ULL << 15)
-#define ldlm_set_flock_deadlock(_l) LDLM_SET_FLAG((_l), 1ULL << 15)
-#define ldlm_clear_flock_deadlock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 15)
-
-/** discard (no writeback) on cancel */
-#define LDLM_FL_DISCARD_DATA 0x0000000000010000ULL /* bit 16 */
-#define ldlm_is_discard_data(_l) LDLM_TEST_FLAG((_l), 1ULL << 16)
-#define ldlm_set_discard_data(_l) LDLM_SET_FLAG((_l), 1ULL << 16)
-#define ldlm_clear_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 16)
-
-/** Blocked by group lock - wait indefinitely */
-#define LDLM_FL_NO_TIMEOUT 0x0000000000020000ULL /* bit 17 */
-#define ldlm_is_no_timeout(_l) LDLM_TEST_FLAG((_l), 1ULL << 17)
-#define ldlm_set_no_timeout(_l) LDLM_SET_FLAG((_l), 1ULL << 17)
-#define ldlm_clear_no_timeout(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 17)
-
-/**
- * Server told not to wait if blocked. For AGL, OST will not send glimpse
- * callback.
- */
-#define LDLM_FL_BLOCK_NOWAIT 0x0000000000040000ULL /* bit 18 */
-#define ldlm_is_block_nowait(_l) LDLM_TEST_FLAG((_l), 1ULL << 18)
-#define ldlm_set_block_nowait(_l) LDLM_SET_FLAG((_l), 1ULL << 18)
-#define ldlm_clear_block_nowait(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 18)
-
-/** return blocking lock */
-#define LDLM_FL_TEST_LOCK 0x0000000000080000ULL /* bit 19 */
-#define ldlm_is_test_lock(_l) LDLM_TEST_FLAG((_l), 1ULL << 19)
-#define ldlm_set_test_lock(_l) LDLM_SET_FLAG((_l), 1ULL << 19)
-#define ldlm_clear_test_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 19)
-
-/** match lock only */
-#define LDLM_FL_MATCH_LOCK 0x0000000000100000ULL /* bit 20 */
-
-/**
- * Immediately cancel such locks when they block some other locks. Send
- * cancel notification to original lock holder, but expect no reply. This
- * is for clients (like liblustre) that cannot be expected to reliably
- * respond to blocking ASTs.
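- * Note that this flag is part of LDLM_FL_INHERIT_MASK at the end of
- * this file, i.e. it is inherited from the wire on enqueue/reply
- * between client and server.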
- */
-#define LDLM_FL_CANCEL_ON_BLOCK 0x0000000000800000ULL /* bit 23 */
-#define ldlm_is_cancel_on_block(_l) LDLM_TEST_FLAG((_l), 1ULL << 23)
-#define ldlm_set_cancel_on_block(_l) LDLM_SET_FLAG((_l), 1ULL << 23)
-#define ldlm_clear_cancel_on_block(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 23)
-
-/**
- * measure lock contention and return -EUSERS if locking contention is high
- */
-#define LDLM_FL_DENY_ON_CONTENTION 0x0000000040000000ULL /* bit 30 */
-#define ldlm_is_deny_on_contention(_l) LDLM_TEST_FLAG((_l), 1ULL << 30)
-#define ldlm_set_deny_on_contention(_l) LDLM_SET_FLAG((_l), 1ULL << 30)
-#define ldlm_clear_deny_on_contention(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 30)
-
-/**
- * These are flags that are mapped into the flags and ASTs of blocking
- * locks; FL_DISCARD is added to blocking ASTs.
- */
-#define LDLM_FL_AST_DISCARD_DATA 0x0000000080000000ULL /* bit 31 */
-#define ldlm_is_ast_discard_data(_l) LDLM_TEST_FLAG((_l), 1ULL << 31)
-#define ldlm_set_ast_discard_data(_l) LDLM_SET_FLAG((_l), 1ULL << 31)
-#define ldlm_clear_ast_discard_data(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 31)
-
-/**
- * Used for marking lock as a target for -EINTR while cp_ast sleep emulation
- * + race with upcoming bl_ast.
- */
-#define LDLM_FL_FAIL_LOC 0x0000000100000000ULL /* bit 32 */
-#define ldlm_is_fail_loc(_l) LDLM_TEST_FLAG((_l), 1ULL << 32)
-#define ldlm_set_fail_loc(_l) LDLM_SET_FLAG((_l), 1ULL << 32)
-#define ldlm_clear_fail_loc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 32)
-
-/**
- * Used while processing the unused list to know that we have already
- * handled this lock and decided to skip it.
- */
-#define LDLM_FL_SKIPPED 0x0000000200000000ULL /* bit 33 */
-#define ldlm_is_skipped(_l) LDLM_TEST_FLAG((_l), 1ULL << 33)
-#define ldlm_set_skipped(_l) LDLM_SET_FLAG((_l), 1ULL << 33)
-#define ldlm_clear_skipped(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 33)
-
-/** this lock is being destroyed */
-#define LDLM_FL_CBPENDING 0x0000000400000000ULL /* bit 34 */
-#define ldlm_is_cbpending(_l) LDLM_TEST_FLAG((_l), 1ULL << 34)
-#define ldlm_set_cbpending(_l) LDLM_SET_FLAG((_l), 1ULL << 34)
-#define ldlm_clear_cbpending(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 34)
-
-/** not a real flag, not saved in lock */
-#define LDLM_FL_WAIT_NOREPROC 0x0000000800000000ULL /* bit 35 */
-#define ldlm_is_wait_noreproc(_l) LDLM_TEST_FLAG((_l), 1ULL << 35)
-#define ldlm_set_wait_noreproc(_l) LDLM_SET_FLAG((_l), 1ULL << 35)
-#define ldlm_clear_wait_noreproc(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 35)
-
-/** cancellation callback already run */
-#define LDLM_FL_CANCEL 0x0000001000000000ULL /* bit 36 */
-#define ldlm_is_cancel(_l) LDLM_TEST_FLAG((_l), 1ULL << 36)
-#define ldlm_set_cancel(_l) LDLM_SET_FLAG((_l), 1ULL << 36)
-#define ldlm_clear_cancel(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 36)
-
-/** whatever it might mean -- never transmitted?
 */
-#define LDLM_FL_LOCAL_ONLY 0x0000002000000000ULL /* bit 37 */
-#define ldlm_is_local_only(_l) LDLM_TEST_FLAG((_l), 1ULL << 37)
-#define ldlm_set_local_only(_l) LDLM_SET_FLAG((_l), 1ULL << 37)
-#define ldlm_clear_local_only(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 37)
-
-/** don't run the cancel callback under ldlm_cli_cancel_unused */
-#define LDLM_FL_FAILED 0x0000004000000000ULL /* bit 38 */
-#define ldlm_is_failed(_l) LDLM_TEST_FLAG((_l), 1ULL << 38)
-#define ldlm_set_failed(_l) LDLM_SET_FLAG((_l), 1ULL << 38)
-#define ldlm_clear_failed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 38)
-
-/** lock cancel has already been sent */
-#define LDLM_FL_CANCELING 0x0000008000000000ULL /* bit 39 */
-#define ldlm_is_canceling(_l) LDLM_TEST_FLAG((_l), 1ULL << 39)
-#define ldlm_set_canceling(_l) LDLM_SET_FLAG((_l), 1ULL << 39)
-#define ldlm_clear_canceling(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 39)
-
-/** local lock (i.e., no srv/cli split) */
-#define LDLM_FL_LOCAL 0x0000010000000000ULL /* bit 40 */
-#define ldlm_is_local(_l) LDLM_TEST_FLAG((_l), 1ULL << 40)
-#define ldlm_set_local(_l) LDLM_SET_FLAG((_l), 1ULL << 40)
-#define ldlm_clear_local(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 40)
-
-/**
- * XXX FIXME: This is being added to b_size as a low-risk fix to the
- * fact that the LVB filling happens _after_ the lock has been granted,
- * so another thread can match it before the LVB has been updated. As a
- * dirty hack, we set LDLM_FL_LVB_READY only after we've filled in the LVB.
- * This is only needed on LOV/OSC now, where the LVB is actually used and
- * callers must set it in input flags.
- *
- * The proper fix is to do the granting inside of the completion AST,
- * which can be replaced with an LVB-aware wrapping function for OSC locks.
- * That change is pretty high-risk, though, and would need a lot more
- * testing.
- */
-#define LDLM_FL_LVB_READY 0x0000020000000000ULL /* bit 41 */
-#define ldlm_is_lvb_ready(_l) LDLM_TEST_FLAG((_l), 1ULL << 41)
-#define ldlm_set_lvb_ready(_l) LDLM_SET_FLAG((_l), 1ULL << 41)
-#define ldlm_clear_lvb_ready(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 41)
-
-/**
- * A lock contributes to the known minimum size (KMS) calculation until it
- * has finished the part of its cancellation that performs write back on its
- * dirty pages. It can remain on the granted list during this whole time.
- * Threads racing to update the KMS after performing their writeback need
- * to know to exclude each other's locks from the calculation as they walk
- * the granted list.
- */
-#define LDLM_FL_KMS_IGNORE 0x0000040000000000ULL /* bit 42 */
-#define ldlm_is_kms_ignore(_l) LDLM_TEST_FLAG((_l), 1ULL << 42)
-#define ldlm_set_kms_ignore(_l) LDLM_SET_FLAG((_l), 1ULL << 42)
-#define ldlm_clear_kms_ignore(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 42)
-
-/** completion AST to be executed */
-#define LDLM_FL_CP_REQD 0x0000080000000000ULL /* bit 43 */
-#define ldlm_is_cp_reqd(_l) LDLM_TEST_FLAG((_l), 1ULL << 43)
-#define ldlm_set_cp_reqd(_l) LDLM_SET_FLAG((_l), 1ULL << 43)
-#define ldlm_clear_cp_reqd(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 43)
-
-/** cleanup_resource has already handled the lock */
-#define LDLM_FL_CLEANED 0x0000100000000000ULL /* bit 44 */
-#define ldlm_is_cleaned(_l) LDLM_TEST_FLAG((_l), 1ULL << 44)
-#define ldlm_set_cleaned(_l) LDLM_SET_FLAG((_l), 1ULL << 44)
-#define ldlm_clear_cleaned(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 44)
-
-/**
- * optimization hint: LDLM can run the blocking callback from the current
- * context without involving a separate thread, in order to decrease the
- * context-switch rate.
- */
-#define LDLM_FL_ATOMIC_CB 0x0000200000000000ULL /* bit 45 */
-#define ldlm_is_atomic_cb(_l) LDLM_TEST_FLAG((_l), 1ULL << 45)
-#define ldlm_set_atomic_cb(_l) LDLM_SET_FLAG((_l), 1ULL << 45)
-#define ldlm_clear_atomic_cb(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 45)
-
-/**
- * It may happen that a client initiates two operations, e.g. unlink and
- * mkdir, such that the server sends a blocking AST for conflicting locks
- * to this client for the first operation, whereas the second operation
- * has canceled this lock and is waiting for rpc_lock which is taken by
- * the first operation. LDLM_FL_BL_AST is set by ldlm_callback_handler() in
- * the lock to prevent the Early Lock Cancel (ELC) code from cancelling it.
- */
-#define LDLM_FL_BL_AST 0x0000400000000000ULL /* bit 46 */
-#define ldlm_is_bl_ast(_l) LDLM_TEST_FLAG((_l), 1ULL << 46)
-#define ldlm_set_bl_ast(_l) LDLM_SET_FLAG((_l), 1ULL << 46)
-#define ldlm_clear_bl_ast(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 46)
-
-/**
- * Set by ldlm_cancel_callback() when lock cache is dropped to let
- * ldlm_callback_handler() return EINVAL to the server. It is used when
- * ELC RPC is already prepared and is waiting for rpc_lock, too late to
- * send a separate CANCEL RPC.
- */
-#define LDLM_FL_BL_DONE 0x0000800000000000ULL /* bit 47 */
-#define ldlm_is_bl_done(_l) LDLM_TEST_FLAG((_l), 1ULL << 47)
-#define ldlm_set_bl_done(_l) LDLM_SET_FLAG((_l), 1ULL << 47)
-#define ldlm_clear_bl_done(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 47)
-
-/**
- * Don't put lock into the LRU list, so that it is not canceled due
- * to aging. Used by MGC locks, which are cancelled only at unmount or
- * by callback.
- */
-#define LDLM_FL_NO_LRU 0x0001000000000000ULL /* bit 48 */
-#define ldlm_is_no_lru(_l) LDLM_TEST_FLAG((_l), 1ULL << 48)
-#define ldlm_set_no_lru(_l) LDLM_SET_FLAG((_l), 1ULL << 48)
-#define ldlm_clear_no_lru(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 48)
-
-/**
- * Set for locks that failed and where the server has been notified.
- *
- * Protected by lock and resource locks.
- */
-#define LDLM_FL_FAIL_NOTIFIED 0x0002000000000000ULL /* bit 49 */
-#define ldlm_is_fail_notified(_l) LDLM_TEST_FLAG((_l), 1ULL << 49)
-#define ldlm_set_fail_notified(_l) LDLM_SET_FLAG((_l), 1ULL << 49)
-#define ldlm_clear_fail_notified(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 49)
-
-/**
- * Set for locks that were removed from class hash table and will
- * be destroyed when last reference to them is released. Set by
- * ldlm_lock_destroy_internal().
- *
- * Protected by lock and resource locks.
- */
-#define LDLM_FL_DESTROYED 0x0004000000000000ULL /* bit 50 */
-#define ldlm_is_destroyed(_l) LDLM_TEST_FLAG((_l), 1ULL << 50)
-#define ldlm_set_destroyed(_l) LDLM_SET_FLAG((_l), 1ULL << 50)
-#define ldlm_clear_destroyed(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 50)
-
-/** flag whether this is a server namespace lock */
-#define LDLM_FL_SERVER_LOCK 0x0008000000000000ULL /* bit 51 */
-#define ldlm_is_server_lock(_l) LDLM_TEST_FLAG((_l), 1ULL << 51)
-#define ldlm_set_server_lock(_l) LDLM_SET_FLAG((_l), 1ULL << 51)
-#define ldlm_clear_server_lock(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 51)
-
-/**
- * It's set in lock_res_and_lock() and unset in unlock_res_and_lock().
- *
- * NB: compared with check_res_locked(), checking this bit is cheaper.
- * Also, spin_is_locked() is deprecated for kernel code; one reason is
- * because it works only on SMP, so users need to add extra macros like
- * LASSERT_SPIN_LOCKED for uniprocessor kernels.
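- *
- * A usage sketch:
- *
- *     lock_res_and_lock(lock);        // sets LDLM_FL_RES_LOCKED
- *     LASSERT(ldlm_is_res_locked(lock));
- *     unlock_res_and_lock(lock);      // clears it again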
- */ -#define LDLM_FL_RES_LOCKED 0x0010000000000000ULL /* bit 52 */ -#define ldlm_is_res_locked(_l) LDLM_TEST_FLAG((_l), 1ULL << 52) -#define ldlm_set_res_locked(_l) LDLM_SET_FLAG((_l), 1ULL << 52) -#define ldlm_clear_res_locked(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 52) - -/** - * It's set once we call ldlm_add_waiting_lock_res_locked() to start the - * lock-timeout timer and it will never be reset. - * - * Protected by lock and resource locks. - */ -#define LDLM_FL_WAITED 0x0020000000000000ULL /* bit 53 */ -#define ldlm_is_waited(_l) LDLM_TEST_FLAG((_l), 1ULL << 53) -#define ldlm_set_waited(_l) LDLM_SET_FLAG((_l), 1ULL << 53) -#define ldlm_clear_waited(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 53) - -/** Flag whether this is a server namespace lock. */ -#define LDLM_FL_NS_SRV 0x0040000000000000ULL /* bit 54 */ -#define ldlm_is_ns_srv(_l) LDLM_TEST_FLAG((_l), 1ULL << 54) -#define ldlm_set_ns_srv(_l) LDLM_SET_FLAG((_l), 1ULL << 54) -#define ldlm_clear_ns_srv(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 54) - -/** Flag whether this lock can be reused. Used by exclusive open. */ -#define LDLM_FL_EXCL 0x0080000000000000ULL /* bit 55 */ -#define ldlm_is_excl(_l) LDLM_TEST_FLAG((_l), 1ULL << 55) -#define ldlm_set_excl(_l) LDLM_SET_FLAG((_l), 1ULL << 55) -#define ldlm_clear_excl(_l) LDLM_CLEAR_FLAG((_l), 1ULL << 55) - -/** l_flags bits marked as "ast" bits */ -#define LDLM_FL_AST_MASK (LDLM_FL_FLOCK_DEADLOCK |\ - LDLM_FL_AST_DISCARD_DATA) - -/** l_flags bits marked as "blocked" bits */ -#define LDLM_FL_BLOCKED_MASK (LDLM_FL_BLOCK_GRANTED |\ - LDLM_FL_BLOCK_CONV |\ - LDLM_FL_BLOCK_WAIT) - -/** l_flags bits marked as "gone" bits */ -#define LDLM_FL_GONE_MASK (LDLM_FL_DESTROYED |\ - LDLM_FL_FAILED) - -/** l_flags bits marked as "inherit" bits */ -/* Flags inherited from wire on enqueue/reply between client/server. */ -/* NO_TIMEOUT flag to force ldlm_lock_match() to wait with no timeout. */ -/* TEST_LOCK flag to not let TEST lock to be granted. */ -#define LDLM_FL_INHERIT_MASK (LDLM_FL_CANCEL_ON_BLOCK |\ - LDLM_FL_NO_TIMEOUT |\ - LDLM_FL_TEST_LOCK) - -/** test for ldlm_lock flag bit set */ -#define LDLM_TEST_FLAG(_l, _b) (((_l)->l_flags & (_b)) != 0) - -/** multi-bit test: are any of mask bits set? */ -#define LDLM_HAVE_MASK(_l, _m) ((_l)->l_flags & LDLM_FL_##_m##_MASK) - -/** set a ldlm_lock flag bit */ -#define LDLM_SET_FLAG(_l, _b) ((_l)->l_flags |= (_b)) - -/** clear a ldlm_lock flag bit */ -#define LDLM_CLEAR_FLAG(_l, _b) ((_l)->l_flags &= ~(_b)) - -/** @} subgroup */ -/** @} group */ - -#endif /* LDLM_ALL_FLAGS_MASK */ diff --git a/drivers/staging/lustre/lustre/include/lustre_errno.h b/drivers/staging/lustre/lustre/include/lustre_errno.h deleted file mode 100644 index 59fbb9f47ff15f9e7afed1bcfc4a141f777baec7..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_errno.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.txt - * - * GPL HEADER END - */ -/* - * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. - * - * Copyright (c) 2013, Intel Corporation. - */ - -#ifndef LUSTRE_ERRNO_H -#define LUSTRE_ERRNO_H - -/* - * Only "network" errnos, which are defined below, are allowed on wire (or on - * disk). Generic routines exist to help translate between these and a subset - * of the "host" errnos. Some host errnos (e.g., EDEADLOCK) are intentionally - * left out. See also the comment on lustre_errno_hton_mapping[]. - * - * To maintain compatibility with existing x86 clients and servers, each of - * these network errnos has the same numerical value as its corresponding host - * errno on x86. - */ -#define LUSTRE_EPERM 1 /* Operation not permitted */ -#define LUSTRE_ENOENT 2 /* No such file or directory */ -#define LUSTRE_ESRCH 3 /* No such process */ -#define LUSTRE_EINTR 4 /* Interrupted system call */ -#define LUSTRE_EIO 5 /* I/O error */ -#define LUSTRE_ENXIO 6 /* No such device or address */ -#define LUSTRE_E2BIG 7 /* Argument list too long */ -#define LUSTRE_ENOEXEC 8 /* Exec format error */ -#define LUSTRE_EBADF 9 /* Bad file number */ -#define LUSTRE_ECHILD 10 /* No child processes */ -#define LUSTRE_EAGAIN 11 /* Try again */ -#define LUSTRE_ENOMEM 12 /* Out of memory */ -#define LUSTRE_EACCES 13 /* Permission denied */ -#define LUSTRE_EFAULT 14 /* Bad address */ -#define LUSTRE_ENOTBLK 15 /* Block device required */ -#define LUSTRE_EBUSY 16 /* Device or resource busy */ -#define LUSTRE_EEXIST 17 /* File exists */ -#define LUSTRE_EXDEV 18 /* Cross-device link */ -#define LUSTRE_ENODEV 19 /* No such device */ -#define LUSTRE_ENOTDIR 20 /* Not a directory */ -#define LUSTRE_EISDIR 21 /* Is a directory */ -#define LUSTRE_EINVAL 22 /* Invalid argument */ -#define LUSTRE_ENFILE 23 /* File table overflow */ -#define LUSTRE_EMFILE 24 /* Too many open files */ -#define LUSTRE_ENOTTY 25 /* Not a typewriter */ -#define LUSTRE_ETXTBSY 26 /* Text file busy */ -#define LUSTRE_EFBIG 27 /* File too large */ -#define LUSTRE_ENOSPC 28 /* No space left on device */ -#define LUSTRE_ESPIPE 29 /* Illegal seek */ -#define LUSTRE_EROFS 30 /* Read-only file system */ -#define LUSTRE_EMLINK 31 /* Too many links */ -#define LUSTRE_EPIPE 32 /* Broken pipe */ -#define LUSTRE_EDOM 33 /* Math argument out of func domain */ -#define LUSTRE_ERANGE 34 /* Math result not representable */ -#define LUSTRE_EDEADLK 35 /* Resource deadlock would occur */ -#define LUSTRE_ENAMETOOLONG 36 /* File name too long */ -#define LUSTRE_ENOLCK 37 /* No record locks available */ -#define LUSTRE_ENOSYS 38 /* Function not implemented */ -#define LUSTRE_ENOTEMPTY 39 /* Directory not empty */ -#define LUSTRE_ELOOP 40 /* Too many symbolic links found */ -#define LUSTRE_ENOMSG 42 /* No message of desired type */ -#define LUSTRE_EIDRM 43 /* Identifier removed */ -#define LUSTRE_ECHRNG 44 /* Channel number out of range */ -#define LUSTRE_EL2NSYNC 45 /* Level 2 not synchronized */ -#define LUSTRE_EL3HLT 46 /* Level 3 halted */ -#define LUSTRE_EL3RST 47 /* Level 3 reset */ -#define LUSTRE_ELNRNG 48 /* Link number out of range */ -#define LUSTRE_EUNATCH 49 /* Protocol driver not attached */ -#define LUSTRE_ENOCSI 50 /* No CSI structure available */ -#define LUSTRE_EL2HLT 51 /* Level 2 halted */ -#define LUSTRE_EBADE 52 /* Invalid exchange */ -#define LUSTRE_EBADR 53 /* Invalid request 
descriptor */ -#define LUSTRE_EXFULL 54 /* Exchange full */ -#define LUSTRE_ENOANO 55 /* No anode */ -#define LUSTRE_EBADRQC 56 /* Invalid request code */ -#define LUSTRE_EBADSLT 57 /* Invalid slot */ -#define LUSTRE_EBFONT 59 /* Bad font file format */ -#define LUSTRE_ENOSTR 60 /* Device not a stream */ -#define LUSTRE_ENODATA 61 /* No data available */ -#define LUSTRE_ETIME 62 /* Timer expired */ -#define LUSTRE_ENOSR 63 /* Out of streams resources */ -#define LUSTRE_ENONET 64 /* Machine is not on the network */ -#define LUSTRE_ENOPKG 65 /* Package not installed */ -#define LUSTRE_EREMOTE 66 /* Object is remote */ -#define LUSTRE_ENOLINK 67 /* Link has been severed */ -#define LUSTRE_EADV 68 /* Advertise error */ -#define LUSTRE_ESRMNT 69 /* Srmount error */ -#define LUSTRE_ECOMM 70 /* Communication error on send */ -#define LUSTRE_EPROTO 71 /* Protocol error */ -#define LUSTRE_EMULTIHOP 72 /* Multihop attempted */ -#define LUSTRE_EDOTDOT 73 /* RFS specific error */ -#define LUSTRE_EBADMSG 74 /* Not a data message */ -#define LUSTRE_EOVERFLOW 75 /* Value too large for data type */ -#define LUSTRE_ENOTUNIQ 76 /* Name not unique on network */ -#define LUSTRE_EBADFD 77 /* File descriptor in bad state */ -#define LUSTRE_EREMCHG 78 /* Remote address changed */ -#define LUSTRE_ELIBACC 79 /* Can't access needed shared library */ -#define LUSTRE_ELIBBAD 80 /* Access corrupted shared library */ -#define LUSTRE_ELIBSCN 81 /* .lib section in a.out corrupted */ -#define LUSTRE_ELIBMAX 82 /* Trying to link too many libraries */ -#define LUSTRE_ELIBEXEC 83 /* Cannot exec a shared lib directly */ -#define LUSTRE_EILSEQ 84 /* Illegal byte sequence */ -#define LUSTRE_ERESTART 85 /* Restart interrupted system call */ -#define LUSTRE_ESTRPIPE 86 /* Streams pipe error */ -#define LUSTRE_EUSERS 87 /* Too many users */ -#define LUSTRE_ENOTSOCK 88 /* Socket operation on non-socket */ -#define LUSTRE_EDESTADDRREQ 89 /* Destination address required */ -#define LUSTRE_EMSGSIZE 90 /* Message too long */ -#define LUSTRE_EPROTOTYPE 91 /* Protocol wrong type for socket */ -#define LUSTRE_ENOPROTOOPT 92 /* Protocol not available */ -#define LUSTRE_EPROTONOSUPPORT 93 /* Protocol not supported */ -#define LUSTRE_ESOCKTNOSUPPORT 94 /* Socket type not supported */ -#define LUSTRE_EOPNOTSUPP 95 /* Operation not supported */ -#define LUSTRE_EPFNOSUPPORT 96 /* Protocol family not supported */ -#define LUSTRE_EAFNOSUPPORT 97 /* Address family not supported */ -#define LUSTRE_EADDRINUSE 98 /* Address already in use */ -#define LUSTRE_EADDRNOTAVAIL 99 /* Cannot assign requested address */ -#define LUSTRE_ENETDOWN 100 /* Network is down */ -#define LUSTRE_ENETUNREACH 101 /* Network is unreachable */ -#define LUSTRE_ENETRESET 102 /* Network connection drop for reset */ -#define LUSTRE_ECONNABORTED 103 /* Software caused connection abort */ -#define LUSTRE_ECONNRESET 104 /* Connection reset by peer */ -#define LUSTRE_ENOBUFS 105 /* No buffer space available */ -#define LUSTRE_EISCONN 106 /* Transport endpoint is connected */ -#define LUSTRE_ENOTCONN 107 /* Transport endpoint not connected */ -#define LUSTRE_ESHUTDOWN 108 /* Cannot send after shutdown */ -#define LUSTRE_ETOOMANYREFS 109 /* Too many references: cannot splice */ -#define LUSTRE_ETIMEDOUT 110 /* Connection timed out */ -#define LUSTRE_ECONNREFUSED 111 /* Connection refused */ -#define LUSTRE_EHOSTDOWN 112 /* Host is down */ -#define LUSTRE_EHOSTUNREACH 113 /* No route to host */ -#define LUSTRE_EALREADY 114 /* Operation already in progress */ -#define 
LUSTRE_EINPROGRESS 115 /* Operation now in progress */ -#define LUSTRE_ESTALE 116 /* Stale file handle */ -#define LUSTRE_EUCLEAN 117 /* Structure needs cleaning */ -#define LUSTRE_ENOTNAM 118 /* Not a XENIX named type file */ -#define LUSTRE_ENAVAIL 119 /* No XENIX semaphores available */ -#define LUSTRE_EISNAM 120 /* Is a named type file */ -#define LUSTRE_EREMOTEIO 121 /* Remote I/O error */ -#define LUSTRE_EDQUOT 122 /* Quota exceeded */ -#define LUSTRE_ENOMEDIUM 123 /* No medium found */ -#define LUSTRE_EMEDIUMTYPE 124 /* Wrong medium type */ -#define LUSTRE_ECANCELED 125 /* Operation Canceled */ -#define LUSTRE_ENOKEY 126 /* Required key not available */ -#define LUSTRE_EKEYEXPIRED 127 /* Key has expired */ -#define LUSTRE_EKEYREVOKED 128 /* Key has been revoked */ -#define LUSTRE_EKEYREJECTED 129 /* Key was rejected by service */ -#define LUSTRE_EOWNERDEAD 130 /* Owner died */ -#define LUSTRE_ENOTRECOVERABLE 131 /* State not recoverable */ -#define LUSTRE_ERESTARTSYS 512 -#define LUSTRE_ERESTARTNOINTR 513 -#define LUSTRE_ERESTARTNOHAND 514 /* restart if no handler.. */ -#define LUSTRE_ENOIOCTLCMD 515 /* No ioctl command */ -#define LUSTRE_ERESTART_RESTARTBLOCK 516 /* restart via sys_restart_syscall */ -#define LUSTRE_EBADHANDLE 521 /* Illegal NFS file handle */ -#define LUSTRE_ENOTSYNC 522 /* Update synchronization mismatch */ -#define LUSTRE_EBADCOOKIE 523 /* Cookie is stale */ -#define LUSTRE_ENOTSUPP 524 /* Operation is not supported */ -#define LUSTRE_ETOOSMALL 525 /* Buffer or request is too small */ -#define LUSTRE_ESERVERFAULT 526 /* An untranslatable error occurred */ -#define LUSTRE_EBADTYPE 527 /* Type not supported by server */ -#define LUSTRE_EJUKEBOX 528 /* Request won't finish until timeout */ -#define LUSTRE_EIOCBQUEUED 529 /* iocb queued await completion event */ -#define LUSTRE_EIOCBRETRY 530 /* iocb queued, will trigger a retry */ - -/* - * Translations are optimized away on x86. Host errnos that shouldn't be put - * on wire could leak through as a result. Do not count on this side effect. - */ -#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS -unsigned int lustre_errno_hton(unsigned int h); -unsigned int lustre_errno_ntoh(unsigned int n); -#else -#define lustre_errno_hton(h) (h) -#define lustre_errno_ntoh(n) (n) -#endif - -#endif /* LUSTRE_ERRNO_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h deleted file mode 100644 index 79ad5aae86b993bd6e74a8744e2cc69b32fab658..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_export.h +++ /dev/null @@ -1,250 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/** \defgroup obd_export PortalRPC export definitions - * - * @{ - */ - -#ifndef __EXPORT_H -#define __EXPORT_H - -/** \defgroup export export - * - * @{ - */ - -#include -#include -#include - -enum obd_option { - OBD_OPT_FORCE = 0x0001, - OBD_OPT_FAILOVER = 0x0002, - OBD_OPT_ABORT_RECOV = 0x0004, -}; - -/** - * Export structure. Represents target-side of connection in portals. - * Also used in Lustre to connect between layers on the same node when - * there is no network-connection in-between. - * For every connected client there is an export structure on the server - * attached to the same obd device. - */ -struct obd_export { - /** - * Export handle, it's id is provided to client on connect - * Subsequent client RPCs contain this handle id to identify - * what export they are talking to. - */ - struct portals_handle exp_handle; - atomic_t exp_refcount; - /** - * Set of counters below is to track where export references are - * kept. The exp_rpc_count is used for reconnect handling also, - * the cb_count and locks_count are for debug purposes only for now. - * The sum of them should be less than exp_refcount by 3 - */ - atomic_t exp_rpc_count; /* RPC references */ - atomic_t exp_cb_count; /* Commit callback references */ - /** Number of queued replay requests to be processes */ - atomic_t exp_replay_count; - atomic_t exp_locks_count; /** Lock references */ -#if LUSTRE_TRACKS_LOCK_EXP_REFS - struct list_head exp_locks_list; - spinlock_t exp_locks_list_guard; -#endif - /** UUID of client connected to this export */ - struct obd_uuid exp_client_uuid; - /** To link all exports on an obd device */ - struct list_head exp_obd_chain; - /** work_struct for destruction of export */ - struct work_struct exp_zombie_work; - struct rhash_head exp_uuid_hash; /** uuid-export hash*/ - /** Obd device of this export */ - struct obd_device *exp_obd; - /** - * "reverse" import to send requests (e.g. 
from ldlm) back to client - * exp_lock protect its change - */ - struct obd_import *exp_imp_reverse; - struct lprocfs_stats *exp_md_stats; - /** Active connection */ - struct ptlrpc_connection *exp_connection; - /** Connection count value from last successful reconnect rpc */ - __u32 exp_conn_cnt; - struct list_head exp_outstanding_replies; - struct list_head exp_uncommitted_replies; - spinlock_t exp_uncommitted_replies_lock; - /** Last committed transno for this export */ - __u64 exp_last_committed; - /** On replay all requests waiting for replay are linked here */ - struct list_head exp_req_replay_queue; - /** - * protects exp_flags, exp_outstanding_replies and the change - * of exp_imp_reverse - */ - spinlock_t exp_lock; - /** Compatibility flags for this export are embedded into - * exp_connect_data - */ - struct obd_connect_data exp_connect_data; - enum obd_option exp_flags; - unsigned long exp_failed:1, - exp_disconnected:1, - exp_connecting:1, - exp_flvr_changed:1, - exp_flvr_adapt:1; - /* also protected by exp_lock */ - enum lustre_sec_part exp_sp_peer; - struct sptlrpc_flavor exp_flvr; /* current */ - struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ - time64_t exp_flvr_expire[2]; /* seconds */ - - /** protects exp_hp_rpcs */ - spinlock_t exp_rpc_lock; - struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ - - /** blocking dlm lock list, protected by exp_bl_list_lock */ - struct list_head exp_bl_list; - spinlock_t exp_bl_list_lock; -}; - -static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) -{ - return &exp->exp_connect_data.ocd_connect_flags; -} - -static inline __u64 exp_connect_flags(struct obd_export *exp) -{ - return *exp_connect_flags_ptr(exp); -} - -static inline int exp_max_brw_size(struct obd_export *exp) -{ - if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) - return exp->exp_connect_data.ocd_brw_size; - - return ONE_MB_BRW_SIZE; -} - -static inline int exp_connect_multibulk(struct obd_export *exp) -{ - return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; -} - -static inline int exp_connect_cancelset(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); -} - -static inline int exp_connect_lru_resize(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_LRU_RESIZE); -} - -static inline int exp_connect_vbr(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); -} - -static inline int exp_connect_som(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM); -} - -static inline int exp_connect_umask(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); -} - -static inline int imp_connect_lru_resize(struct obd_import *imp) -{ - struct obd_connect_data *ocd; - - ocd = &imp->imp_connect_data; - return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); -} - -static inline int exp_connect_layout(struct obd_export *exp) -{ - return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); -} - -static inline bool exp_connect_lvb_type(struct obd_export *exp) -{ - if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) - return true; - else - return false; -} - -static inline bool imp_connect_lvb_type(struct obd_import *imp) -{ - struct obd_connect_data *ocd; - - ocd = &imp->imp_connect_data; - if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) - return true; - else - return false; -} - -static inline __u64 exp_connect_ibits(struct obd_export *exp) -{ - struct obd_connect_data *ocd; - - ocd = &exp->exp_connect_data; - return 
ocd->ocd_ibits_known; -} - -static inline bool imp_connect_disp_stripe(struct obd_import *imp) -{ - struct obd_connect_data *ocd; - - ocd = &imp->imp_connect_data; - return ocd->ocd_connect_flags & OBD_CONNECT_DISP_STRIPE; -} - -struct obd_export *class_conn2export(struct lustre_handle *conn); - -#define KKUC_CT_DATA_MAGIC 0x092013cea -struct kkuc_ct_data { - __u32 kcd_magic; - struct obd_uuid kcd_uuid; - __u32 kcd_archive; -}; - -/** @} export */ - -#endif /* __EXPORT_H */ -/** @} obd_export */ diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h deleted file mode 100644 index 094ad282de2c9ef5ff871c69c9459bfd38d6ca12..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_fid.h +++ /dev/null @@ -1,676 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_fid.h - * - * Author: Yury Umanets - */ - -#ifndef __LUSTRE_FID_H -#define __LUSTRE_FID_H - -/** \defgroup fid fid - * - * @{ - * - * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs - * describes the FID namespace and interoperability requirements for FIDs. - * The important parts of that document are included here for reference. - * - * FID - * File IDentifier generated by client from range allocated by the SEQuence - * service and stored in struct lu_fid. The FID is composed of three parts: - * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem - * unique 64-bit integer, and only one client is ever assigned any SEQ value. - * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved - * for system use. The OID component is a 32-bit value generated by the - * client on a per-SEQ basis to allow creating many unique FIDs without - * communication with the server. The VER component is a 32-bit value that - * distinguishes between different FID instantiations, such as snapshots or - * separate subtrees within the filesystem. FIDs with the same VER field - * are considered part of the same namespace. - * - * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and - * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while - * OSTs use 64-bit Lustre object IDs and generation numbers. - * - * NEW filesystems are those formatted since the introduction of FIDs. 
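- *
- * As a worked sketch (values illustrative only), the first FID a
- * client allocates from a freshly granted normal sequence could be:
- *
- *     fid->f_seq = 0x200000400ULL;  // SEQ from the SEQuence service
- *     fid->f_oid = 1;               // client-local, per-SEQ counter
- *     fid->f_ver = 0;               // VER, normally zero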
- * - * IGIF - * Inode and Generation In FID, a surrogate FID used to globally identify - * an existing object on OLD formatted MDT file system. This would only be - * used on MDT0 in a DNE filesystem, because there cannot be more than one - * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] - * range, where inode number is stored in SEQ, and inode generation is in OID. - * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, - * which is the maximum possible for an ldiskfs backend. It also assumes - * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible - * to clients, which has always been true. - * - * IDIF - * object ID In FID, a surrogate FID used to globally identify an existing - * OST object on OLD formatted OST file system. Belongs to a sequence in - * [2^32, 2^33 - 1]. Sequence number is calculated as: - * - * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) - * - * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object - * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to - * be identified in the FLD correctly. The OID field is calculated as: - * - * objid & 0xffffffff - * - * that is, it consists of lower 32 bits of object ID. For objects within - * the IDIF range, object ID extraction will be: - * - * o_id = (fid->f_seq & 0x7fff) << 16 | fid->f_oid; - * o_seq = 0; // formerly group number - * - * NOTE: This assumes that no more than 2^48-1 objects have ever been created - * on any OST, and that no more than 65535 OSTs are in use. Both are very - * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming - * a maximum creation rate of 1M objects per second for a maximum of 9 years, - * or combinations thereof. - * - * OST_MDT0 - * Surrogate FID used to identify an existing object on OLD formatted OST - * filesystem. Belongs to the reserved SEQuence 0, and is used prior to - * the introduction of FID-on-OST, at which point IDIF will be used to - * identify objects as residing on a specific OST. - * - * LLOG - * For Lustre Log objects the object sequence 1 is used. This is compatible - * with both OLD and NEW namespaces, as this SEQ number is in the - * ext3/ldiskfs reserved inode range and does not conflict with IGIF - * sequence numbers. - * - * ECHO - * For testing OST IO performance the object sequence 2 is used. This is - * compatible with both OLD and NEW namespaces, as this SEQ number is in - * the ext3/ldiskfs reserved inode range and does not conflict with IGIF - * sequence numbers. - * - * OST_MDT1 .. OST_MAX - * For testing with multiple MDTs the object sequence 3 through 9 is used, - * allowing direct mapping of MDTs 1 through 7 respectively, for a total - * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" - * mappings. However, this SEQ range is only for testing prior to any - * production DNE release, as the objects in this range conflict across all - * OSTs, as the OST index is not part of the FID. For production DNE usage, - * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. - * - * DLM OST objid to IDIF mapping - * For compatibility with existing OLD OST network protocol structures, the - * FID must map onto the o_id and o_seq in a manner that ensures existing - * objects are identified consistently for IO, as well as onto the LDLM - * namespace to ensure IDIFs there is only a single resource name for any - * object in the DLM. 
-#include -#include -#include -#include - -struct lu_env; -struct lu_site; -struct lu_context; -struct obd_device; -struct obd_export; - -/* Whole sequences space range and zero range definitions */ -extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; -extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; -extern const struct lu_fid LUSTRE_BFL_FID; -extern const struct lu_fid LU_OBF_FID; -extern const struct lu_fid LU_DOT_LUSTRE_FID; - -enum { - /* - * This is how many metadata FIDs may be allocated in one sequence (128k) - */ - LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, - - /* - * This is how many data FIDs could be allocated in one sequence (4B - 1) - */ - LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, - - /* - * How many sequences to allocate to a client at once. - */ - LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, - - /* - * seq allocation pool size. - */ - LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, - - /* - * This is how many sequences may be in one super-sequence allocated to - * MDTs. - */ - LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) -}; - -enum { - /** 2^6 FIDs for OI containers */ - OSD_OI_FID_OID_BITS = 6, - /** reserve enough FIDs in case we want more in the future */ - OSD_OI_FID_OID_BITS_MAX = 10, -}; - -/** special OID for local objects */ -enum local_oid { - /** \see fld_mod_init */ - FLD_INDEX_OID = 3UL, - /** \see fid_mod_init */ - FID_SEQ_CTL_OID = 4UL, - FID_SEQ_SRV_OID = 5UL, - /** \see mdd_mod_init */ - MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ - MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ - MDD_LOV_OBJ_OID = 8UL, - MDD_CAPA_KEYS_OID = 9UL, - /** \see mdt_mod_init */ - LAST_RECV_OID = 11UL, - OSD_FS_ROOT_OID = 13UL, - ACCT_USER_OID = 15UL, - ACCT_GROUP_OID = 16UL, - LFSCK_BOOKMARK_OID = 17UL, - OTABLE_IT_OID = 18UL, - /* These two definitions are obsolete - * OFD_GROUP0_LAST_OID = 20UL, - * OFD_GROUP4K_LAST_OID = 20UL+4096, - */ - OFD_LAST_GROUP_OID = 4117UL, - LLOG_CATALOGS_OID = 4118UL, - MGS_CONFIGS_OID = 4119UL, - OFD_HEALTH_CHECK_OID = 4120UL, - MDD_LOV_OBJ_OSEQ = 4121UL, - LFSCK_NAMESPACE_OID = 4122UL, - REMOTE_PARENT_DIR_OID = 4123UL, - SLAVE_LLOG_CATALOGS_OID = 4124UL, -}; - -static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) -{ - fid->f_seq = FID_SEQ_LOCAL_FILE; - fid->f_oid = oid; - fid->f_ver = 0; -} - -static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) -{ - fid->f_seq = FID_SEQ_LOCAL_NAME; - fid->f_oid = oid; - fid->f_ver = 0; -} - -/* For a new FS (>= 2.4), the root FID will be changed to - * [FID_SEQ_ROOT:1:0]; for an existing FS upgraded to 2.4, - * the root FID will still be an IGIF - */ -static inline int fid_is_root(const struct lu_fid *fid) -{ - return unlikely((fid_seq(fid) == FID_SEQ_ROOT && - fid_oid(fid) == 1)); -}
-static inline int -fid_is_dot_lustre(const struct lu_fid *fid) -{ - return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && - fid_oid(fid) == FID_OID_DOT_LUSTRE); -} - -static inline int fid_is_obf(const struct lu_fid *fid) -{ - return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && - fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); -} - -static inline int fid_is_otable_it(const struct lu_fid *fid) -{ - return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && - fid_oid(fid) == OTABLE_IT_OID); -} - -static inline int fid_is_acct(const struct lu_fid *fid) -{ - return fid_seq(fid) == FID_SEQ_LOCAL_FILE && - (fid_oid(fid) == ACCT_USER_OID || - fid_oid(fid) == ACCT_GROUP_OID); -} - -static inline int fid_is_quota(const struct lu_fid *fid) -{ - return fid_seq(fid) == FID_SEQ_QUOTA || - fid_seq(fid) == FID_SEQ_QUOTA_GLB; -} - -static inline int fid_seq_in_fldb(__u64 seq) -{ - return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || - fid_seq_is_root(seq) || fid_seq_is_dot(seq); -} - -static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq, __u32 ost_idx) -{ - if (fid_seq_is_mdt0(seq)) { - fid->f_seq = fid_idif_seq(0, ost_idx); - } else { - LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || - fid_seq_is_idif(seq), "%#llx\n", seq); - fid->f_seq = seq; - } - fid->f_oid = 0; - fid->f_ver = 0; -} - -/* seq client type */ -enum lu_cli_type { - LUSTRE_SEQ_METADATA = 1, - LUSTRE_SEQ_DATA -}; - -enum lu_mgr_type { - LUSTRE_SEQ_SERVER, - LUSTRE_SEQ_CONTROLLER -}; - -/* Client sequence manager interface. */ -struct lu_client_seq { - /* Sequence-controller export. */ - struct obd_export *lcs_exp; - spinlock_t lcs_lock; - - /* - * Range of sequences allowed for allocation. When lu_client_seq is used - * on clients, this contains the meta-sequence range; for servers it - * contains the super-sequence range. - */ - struct lu_seq_range lcs_space; - - /* Seq related proc */ - struct dentry *lcs_debugfs_entry; - - /* This holds the last allocated fid in the last obtained seq */ - struct lu_fid lcs_fid; - - /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ - enum lu_cli_type lcs_type; - - /* - * Service UUID, passed from the MDT and combined with the seq name to - * form a unique seq name for use with procfs. - */ - char lcs_name[LUSTRE_MDT_MAXNAMELEN]; - - /* - * Sequence width, that is, how many objects may be allocated in one - * sequence. The default value is LUSTRE_SEQ_MAX_WIDTH. - */ - __u64 lcs_width; - - /* wait queue for fid allocation and update indicator */ - wait_queue_head_t lcs_waitq; - int lcs_update; -}; - -/* Client methods */ -void seq_client_flush(struct lu_client_seq *seq); - -int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, - struct lu_fid *fid); -/* Fids common stuff */ -int fid_is_local(const struct lu_env *env, - struct lu_site *site, const struct lu_fid *fid); - -enum lu_cli_type; -int client_fid_init(struct obd_device *obd, struct obd_export *exp, - enum lu_cli_type type); -int client_fid_fini(struct obd_device *obd); - -/* fid locking */ - -struct ldlm_namespace; - -/* - * Build (DLM) resource name from FID. - * - * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], - * but was moved into name[1] along with the OID to avoid consuming the - * remaining name[2,3] fields that need to be used for the quota identifier.
- */ -static inline void -fid_build_reg_res_name(const struct lu_fid *fid, struct ldlm_res_id *res) -{ - memset(res, 0, sizeof(*res)); - res->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(fid); - res->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(fid); -} - -/* - * Return true if resource is for object identified by FID. - */ -static inline bool fid_res_name_eq(const struct lu_fid *fid, - const struct ldlm_res_id *res) -{ - return res->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(fid) && - res->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(fid); -} - -/* - * Extract FID from LDLM resource. Reverse of fid_build_reg_res_name(). - */ -static inline void -fid_extract_from_res_name(struct lu_fid *fid, const struct ldlm_res_id *res) -{ - fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; - fid->f_oid = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF]); - fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); - LASSERT(fid_res_name_eq(fid, res)); -} - -/* - * Build (DLM) resource identifier from global quota FID and quota ID. - */ -static inline void -fid_build_quota_res_name(const struct lu_fid *glb_fid, union lquota_id *qid, - struct ldlm_res_id *res) -{ - fid_build_reg_res_name(glb_fid, res); - res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); - res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); -} - -/* - * Extract global FID and quota ID from resource name - */ -static inline void fid_extract_from_quota_res(struct lu_fid *glb_fid, - union lquota_id *qid, - const struct ldlm_res_id *res) -{ - fid_extract_from_res_name(glb_fid, res); - qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; - qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; - qid->qid_fid.f_ver = - (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); -} - -static inline void -fid_build_pdo_res_name(const struct lu_fid *fid, unsigned int hash, - struct ldlm_res_id *res) -{ - fid_build_reg_res_name(fid, res); - res->name[LUSTRE_RES_ID_HSH_OFF] = hash; -} - -/** - * Build DLM resource name from object id & seq, which will finally be - * removed when we replace ost_id with FID in the data stack. - * - * Currently, the resid from an old client, where res[0] = object_id and - * res[1] = object_seq, is just the opposite of the metadata - * resid, where res[0] = fid->f_seq and res[1] = fid->f_oid. - * To unify the resid identification, we will reverse the data - * resid to keep it the same as the metadata resid, i.e. - * - * For a resid from an old client, - * res[0] = objid, res[1] = 0, still keeping the original order, - * for compatibility. - * - * For a new resid, - * res will be built from the normal FID directly, i.e. res[0] = f_seq, - * res[1] = f_oid + f_ver. - */ -static inline void ostid_build_res_name(const struct ost_id *oi, - struct ldlm_res_id *name) -{ - memset(name, 0, sizeof(*name)); - if (fid_seq_is_mdt0(ostid_seq(oi))) { - name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); - name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); - } else { - fid_build_reg_res_name(&oi->oi_fid, name); - } -}
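A brief usage sketch of the resource-name helpers above. It is a standalone mimic rather than the real headers; the 0/1 slot indices are assumptions of the sketch, following the resource[] = {SEQ, OID|VER, ...} layout described earlier.

#include <stdint.h>
#include <assert.h>

/* Assumed slot numbers for this sketch, not copied from the headers. */
#define RES_ID_SEQ_OFF     0
#define RES_ID_VER_OID_OFF 1

struct demo_fid { uint64_t f_seq; uint32_t f_oid, f_ver; };
struct demo_res_id { uint64_t name[4]; };

/* Same packing as fid_ver_oid() implies: VER in the high 32 bits. */
static uint64_t demo_ver_oid(const struct demo_fid *fid)
{
	return ((uint64_t)fid->f_ver << 32) | fid->f_oid;
}

static void demo_build_res_name(const struct demo_fid *fid,
				struct demo_res_id *res)
{
	*res = (struct demo_res_id){ { 0 } };
	res->name[RES_ID_SEQ_OFF] = fid->f_seq;
	res->name[RES_ID_VER_OID_OFF] = demo_ver_oid(fid);
}

static void demo_extract_from_res_name(struct demo_fid *fid,
					const struct demo_res_id *res)
{
	fid->f_seq = res->name[RES_ID_SEQ_OFF];
	fid->f_oid = (uint32_t)res->name[RES_ID_VER_OID_OFF];
	fid->f_ver = (uint32_t)(res->name[RES_ID_VER_OID_OFF] >> 32);
}

int main(void)
{
	struct demo_fid in = { .f_seq = 0x200000401ULL, .f_oid = 42 }, out;
	struct demo_res_id res;

	demo_build_res_name(&in, &res);		/* FID -> DLM resource name */
	demo_extract_from_res_name(&out, &res);	/* ...and back, losslessly */
	assert(out.f_seq == in.f_seq && out.f_oid == in.f_oid &&
	       out.f_ver == in.f_ver);
	return 0;
}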
-/** - * Return true if the resource is for the object identified by this id & group. - */ -static inline int ostid_res_name_eq(const struct ost_id *oi, - const struct ldlm_res_id *name) -{ - /* Note: it is just a trick here to save some effort; probably the - * correct way would be to turn them into FIDs and compare them - */ - if (fid_seq_is_mdt0(ostid_seq(oi))) { - return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && - name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); - } else { - return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && - name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); - } -} - -/** - * Note: we need to check oi_seq to decide where to set oi_id, - * so oi_seq should always be set ahead of oi_id. - */ -static inline int ostid_set_id(struct ost_id *oi, __u64 oid) -{ - if (fid_seq_is_mdt0(oi->oi.oi_seq)) { - if (oid >= IDIF_MAX_OID) - return -E2BIG; - oi->oi.oi_id = oid; - } else if (fid_is_idif(&oi->oi_fid)) { - if (oid >= IDIF_MAX_OID) - return -E2BIG; - oi->oi_fid.f_seq = fid_idif_seq(oid, - fid_idif_ost_idx(&oi->oi_fid)); - oi->oi_fid.f_oid = oid; - oi->oi_fid.f_ver = oid >> 48; - } else { - if (oid >= OBIF_MAX_OID) - return -E2BIG; - oi->oi_fid.f_oid = oid; - } - return 0; -} - -/* pack any OST FID into an ostid (id/seq) for the wire/disk */ -static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) -{ - int rc = 0; - - if (fid_seq_is_igif(fid->f_seq)) - return -EBADF; - - if (fid_is_idif(fid)) { - u64 objid = fid_idif_id(fid_seq(fid), fid_oid(fid), - fid_ver(fid)); - - ostid_set_seq_mdt0(ostid); - rc = ostid_set_id(ostid, objid); - } else { - ostid->oi_fid = *fid; - } - - return rc; -} - -/* The same as osc_build_res_name() */ -static inline void ost_fid_build_resid(const struct lu_fid *fid, - struct ldlm_res_id *resname) -{ - if (fid_is_mdt0(fid) || fid_is_idif(fid)) { - struct ost_id oi; - - oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ - if (fid_to_ostid(fid, &oi) != 0) - return; - ostid_build_res_name(&oi, resname); - } else { - fid_build_reg_res_name(fid, resname); - } -} - -/** - * Flatten 128-bit FID values into a 64-bit value for use as an inode number. - * For non-IGIF FIDs this starts just over 2^32, and continues without - * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ - * into the range where there may not be many OID values in use, to minimize - * the risk of conflict. - * - * Assuming LUSTRE_SEQ_MAX_WIDTH is less than (1 << 24), which is currently - * true, the time between reused inode numbers is very long - 2^40 SEQ numbers, - * or about 2^40 client mounts, if clients create fewer than 2^24 files/mount. - */ -static inline __u64 fid_flatten(const struct lu_fid *fid) -{ - __u64 ino; - __u64 seq; - - if (fid_is_igif(fid)) { - ino = lu_igif_ino(fid); - return ino; - } - - seq = fid_seq(fid); - - ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); - - return ino ? ino : fid_oid(fid); -} - -static inline __u32 fid_hash(const struct lu_fid *f, int bits) -{ - /* all objects with the same id and different versions will belong to the - * same collisions list. - */ - return hash_long(fid_flatten(f), bits); -} - -/** - * Map a FID to a 32-bit value for the ino on 32-bit systems. - */ -static inline __u32 fid_flatten32(const struct lu_fid *fid) -{ - __u32 ino; - __u64 seq; - - if (fid_is_igif(fid)) { - ino = lu_igif_ino(fid); - return ino; - } - - seq = fid_seq(fid) - FID_SEQ_START; - - /* Map the high bits of the OID into higher bits of the inode number so - * that inodes generated at about the same time have a reduced chance - * of collisions. This will give a period of 2^12 = 4096 unique clients - * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects - * (from OID), or up to 128M inodes without collisions for new files. - */ - ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + - (seq >> (64 - (40 - 8)) & 0xffffff00) + - (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); - - return ino ? ino : fid_oid(fid); -}
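To make the fid_flatten() folding concrete, here is a standalone numeric sketch. The masks and shifts are copied from the function above; the IGIF early-return is omitted, and the sample SEQ/OID values are arbitrary.

#include <stdint.h>
#include <stdio.h>

/* Same folding as fid_flatten() above: the SEQ is shifted up by 24 bits
 * and its high bits are wrapped back down into a sparsely used part of
 * the OID space. */
static uint64_t demo_flatten(uint64_t seq, uint32_t oid)
{
	uint64_t ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + oid;

	return ino ? ino : oid;
}

int main(void)
{
	/* A typical FID_SEQ_NORMAL sequence just above 2^33. */
	uint64_t seq = 0x200000401ULL;

	/* Two FIDs from the same SEQ differ only in the low bits of ino. */
	printf("ino(oid=1) = %#llx\n", (unsigned long long)demo_flatten(seq, 1));
	printf("ino(oid=2) = %#llx\n", (unsigned long long)demo_flatten(seq, 2));
	return 0;
}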
-static inline int lu_fid_diff(const struct lu_fid *fid1, - const struct lu_fid *fid2) -{ - LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:" DFID ", fid2:" DFID "\n", - PFID(fid1), PFID(fid2)); - - if (fid_is_idif(fid1) && fid_is_idif(fid2)) - return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - - fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); - - return fid_oid(fid1) - fid_oid(fid2); -} - -#define LUSTRE_SEQ_SRV_NAME "seq_srv" -#define LUSTRE_SEQ_CTL_NAME "seq_ctl" - -/* Range common stuff */ -static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) -{ - dst->lsr_start = cpu_to_le64(src->lsr_start); - dst->lsr_end = cpu_to_le64(src->lsr_end); - dst->lsr_index = cpu_to_le32(src->lsr_index); - dst->lsr_flags = cpu_to_le32(src->lsr_flags); -} - -static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) -{ - dst->lsr_start = le64_to_cpu(src->lsr_start); - dst->lsr_end = le64_to_cpu(src->lsr_end); - dst->lsr_index = le32_to_cpu(src->lsr_index); - dst->lsr_flags = le32_to_cpu(src->lsr_flags); -} - -static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) -{ - dst->lsr_start = cpu_to_be64(src->lsr_start); - dst->lsr_end = cpu_to_be64(src->lsr_end); - dst->lsr_index = cpu_to_be32(src->lsr_index); - dst->lsr_flags = cpu_to_be32(src->lsr_flags); -} - -static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) -{ - dst->lsr_start = be64_to_cpu(src->lsr_start); - dst->lsr_end = be64_to_cpu(src->lsr_end); - dst->lsr_index = be32_to_cpu(src->lsr_index); - dst->lsr_flags = be32_to_cpu(src->lsr_flags); -} - -/** @} fid */ - -#endif /* __LUSTRE_FID_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_fld.h b/drivers/staging/lustre/lustre/include/lustre_fld.h deleted file mode 100644 index f42122a4dfaa7c1411be16d4bea925bd9c668f36..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_fld.h +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation.
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LINUX_FLD_H -#define __LINUX_FLD_H - -/** \defgroup fld fld - * - * @{ - */ - -#include -#include - -struct lu_client_fld; -struct lu_server_fld; -struct lu_fld_hash; -struct fld_cache; - -extern const struct dt_index_features fld_index_features; -extern const char fld_index_name[]; - -/* - * FLD (Fid Location Database) interface. - */ -enum { - LUSTRE_CLI_FLD_HASH_DHT = 0, - LUSTRE_CLI_FLD_HASH_RRB -}; - -struct lu_fld_target { - struct list_head ft_chain; - struct obd_export *ft_exp; - struct lu_server_fld *ft_srv; - __u64 ft_idx; -}; - -struct lu_server_fld { - /** - * super sequence controller export, needed to forward fld - * lookup requests. - */ - struct obd_export *lsf_control_exp; - - /** Client FLD cache. */ - struct fld_cache *lsf_cache; - - /** Protect index modifications */ - struct mutex lsf_lock; - - /** Fld service name in form "fld-srv-lustre-MDTXXX" */ - char lsf_name[LUSTRE_MDT_MAXNAMELEN]; - -}; - -struct lu_client_fld { - /** Client side debugfs entry. */ - struct dentry *lcf_debugfs_entry; - - /** List of exports client FLD knows about. */ - struct list_head lcf_targets; - - /** Current hash to be used to choose an export. */ - struct lu_fld_hash *lcf_hash; - - /** Exports count. */ - int lcf_count; - - /** Lock protecting exports list and fld_hash. */ - spinlock_t lcf_lock; - - /** Client FLD cache. */ - struct fld_cache *lcf_cache; - - /** Client fld debugfs entry name. */ - char lcf_name[LUSTRE_MDT_MAXNAMELEN]; -}; - -/* Client methods */ -int fld_client_init(struct lu_client_fld *fld, - const char *prefix, int hash); - -void fld_client_fini(struct lu_client_fld *fld); - -void fld_client_flush(struct lu_client_fld *fld); - -int fld_client_lookup(struct lu_client_fld *fld, u64 seq, u32 *mds, - __u32 flags, const struct lu_env *env); - -int fld_client_create(struct lu_client_fld *fld, - struct lu_seq_range *range, - const struct lu_env *env); - -int fld_client_delete(struct lu_client_fld *fld, u64 seq, - const struct lu_env *env); - -int fld_client_add_target(struct lu_client_fld *fld, - struct lu_fld_target *tar); - -int fld_client_del_target(struct lu_client_fld *fld, - __u64 idx); - -void fld_client_debugfs_fini(struct lu_client_fld *fld); - -/** @} fld */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h deleted file mode 100644 index cbd68985ada92ba1472e0022637664b4ae92a1d3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_ha.h +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code).
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _LUSTRE_HA_H -#define _LUSTRE_HA_H - -/** \defgroup ha ha - * - * @{ - */ - -struct obd_import; -struct obd_export; -struct obd_device; -struct ptlrpc_request; - -int ptlrpc_replay(struct obd_import *imp); -int ptlrpc_resend(struct obd_import *imp); -void ptlrpc_free_committed(struct obd_import *imp); -void ptlrpc_wake_delayed(struct obd_import *imp); -int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); -int ptlrpc_set_import_active(struct obd_import *imp, int active); -void ptlrpc_activate_import(struct obd_import *imp); -void ptlrpc_deactivate_import(struct obd_import *imp); -void ptlrpc_invalidate_import(struct obd_import *imp); -void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); -void ptlrpc_pinger_force(struct obd_import *imp); - -/** @} ha */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h deleted file mode 100644 index 3556ce8d94e8ffe5e8c52e9c85b49448b3ccb711..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_handles.h +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LUSTRE_HANDLES_H_ -#define __LUSTRE_HANDLES_H_ - -/** \defgroup handles handles - * - * @{ - */ - -#include -#include -#include -#include -#include - -struct portals_handle_ops { - void (*hop_addref)(void *object); - void (*hop_free)(void *object, int size); -}; - -/* These handles are most easily used by having them appear at the very top of - * whatever object that you want to make handles for. ie: - * - * struct ldlm_lock { - * struct portals_handle handle; - * ... - * }; - * - * Now you're able to assign the results of cookie2handle directly to an - * ldlm_lock. If it's not at the top, you'll want to use container_of() - * to compute the start of the structure based on the handle field. 
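The embedding pattern described in the comment above can be shown with a small runnable stand-in. The demo types are invented for illustration; the real struct portals_handle carries more fields, and the kernel's container_of() is used instead of the local fallback below.

#include <stddef.h>
#include <stdio.h>

#ifndef container_of
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#endif

/* Skeletal stand-ins, just to show the pattern. */
struct demo_handle { unsigned long long h_cookie; };

struct demo_lock {
	int l_something_else;		/* handle is NOT first here... */
	struct demo_handle l_handle;
};

int main(void)
{
	struct demo_lock lock = { .l_handle = { .h_cookie = 0xdeadbeefULL } };
	struct demo_handle *h = &lock.l_handle;

	/* ...so container_of() is needed to get back to the owning object. */
	struct demo_lock *owner = container_of(h, struct demo_lock, l_handle);

	printf("cookie %#llx, owner %p\n", owner->l_handle.h_cookie,
	       (void *)owner);
	return 0;
}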
- */ -struct portals_handle { - struct list_head h_link; - __u64 h_cookie; - const void *h_owner; - struct portals_handle_ops *h_ops; - - /* newly added fields to handle the RCU issue. -jxiong */ - struct rcu_head h_rcu; - spinlock_t h_lock; - unsigned int h_size:31; - unsigned int h_in:1; -}; - -/* handles.c */ - -/* Add a handle to the hash table */ -void class_handle_hash(struct portals_handle *, - struct portals_handle_ops *ops); -void class_handle_unhash(struct portals_handle *); -void *class_handle2object(__u64 cookie, const void *owner); -void class_handle_free_cb(struct rcu_head *rcu); -int class_handle_init(void); -void class_handle_cleanup(void); - -/** @} handles */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h deleted file mode 100644 index ac3805ead620cf52064167d475ba3131958d0183..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_import.h +++ /dev/null @@ -1,369 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/** \defgroup obd_import PtlRPC import definitions - * Imports are client-side representation of remote obd target. 
- * - * @{ - */ - -#ifndef __IMPORT_H -#define __IMPORT_H - -/** \defgroup export export - * - * @{ - */ - -#include -#include -#include - -/** - * Adaptive Timeout stuff - * - * @{ - */ -#define D_ADAPTTO D_OTHER -#define AT_BINS 4 /* "bin" means "N seconds of history" */ -#define AT_FLG_NOHIST 0x1 /* use last reported value only */ - -struct adaptive_timeout { - time64_t at_binstart; /* bin start time */ - unsigned int at_hist[AT_BINS]; /* timeout history bins */ - unsigned int at_flags; - unsigned int at_current; /* current timeout value */ - unsigned int at_worst_ever; /* worst-ever timeout value */ - time64_t at_worst_time; /* worst-ever timeout timestamp */ - spinlock_t at_lock; -}; - -struct ptlrpc_at_array { - struct list_head *paa_reqs_array; /** array to hold requests */ - __u32 paa_size; /** the size of array */ - __u32 paa_count; /** the total count of reqs */ - time64_t paa_deadline; /** the earliest deadline of reqs */ - __u32 *paa_reqs_count; /** the count of reqs in each entry */ -}; - -#define IMP_AT_MAX_PORTALS 8 -struct imp_at { - int iat_portal[IMP_AT_MAX_PORTALS]; - struct adaptive_timeout iat_net_latency; - struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; -}; - -/** @} */ - -/** Possible import states */ -enum lustre_imp_state { - LUSTRE_IMP_CLOSED = 1, - LUSTRE_IMP_NEW = 2, - LUSTRE_IMP_DISCON = 3, - LUSTRE_IMP_CONNECTING = 4, - LUSTRE_IMP_REPLAY = 5, - LUSTRE_IMP_REPLAY_LOCKS = 6, - LUSTRE_IMP_REPLAY_WAIT = 7, - LUSTRE_IMP_RECOVER = 8, - LUSTRE_IMP_FULL = 9, - LUSTRE_IMP_EVICTED = 10, -}; - -/** Returns text string representation of numeric import state \a state */ -static inline char *ptlrpc_import_state_name(enum lustre_imp_state state) -{ - static char *import_state_names[] = { - "", "CLOSED", "NEW", "DISCONN", - "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", - "RECOVER", "FULL", "EVICTED", - }; - - LASSERT(state <= LUSTRE_IMP_EVICTED); - return import_state_names[state]; -} - -/** - * List of import event types - */ -enum obd_import_event { - IMP_EVENT_DISCON = 0x808001, - IMP_EVENT_INACTIVE = 0x808002, - IMP_EVENT_INVALIDATE = 0x808003, - IMP_EVENT_ACTIVE = 0x808004, - IMP_EVENT_OCD = 0x808005, - IMP_EVENT_DEACTIVATE = 0x808006, - IMP_EVENT_ACTIVATE = 0x808007, -}; - -/** - * Definition of import connection structure - */ -struct obd_import_conn { - /** Item for linking connections together */ - struct list_head oic_item; - /** Pointer to actual PortalRPC connection */ - struct ptlrpc_connection *oic_conn; - /** uuid of remote side */ - struct obd_uuid oic_uuid; - /** - * Time (64 bit jiffies) of last connection attempt on this connection - */ - __u64 oic_last_attempt; -}; - -/* state history */ -#define IMP_STATE_HIST_LEN 16 -struct import_state_hist { - enum lustre_imp_state ish_state; - time64_t ish_time; -};
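The 16-entry array above is used as a ring of recent state transitions. Below is a minimal standalone sketch of the usual update step; the helper name, the modulo advance, and the use of time() are assumptions of this sketch, not code from the header.

#include <time.h>

#define DEMO_STATE_HIST_LEN 16

struct demo_state_hist {
	int ish_state;
	time_t ish_time;
};

struct demo_import {
	struct demo_state_hist imp_state_hist[DEMO_STATE_HIST_LEN];
	int imp_state_hist_idx;
	int imp_state;
};

/* Record the new state in the ring and advance the cursor modulo 16,
 * so the most recent transitions are always retained. */
static void demo_import_set_state(struct demo_import *imp, int state)
{
	int idx = imp->imp_state_hist_idx;

	imp->imp_state_hist[idx].ish_state = state;
	imp->imp_state_hist[idx].ish_time = time(NULL);
	imp->imp_state_hist_idx = (idx + 1) % DEMO_STATE_HIST_LEN;
	imp->imp_state = state;
}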
-/** - * Definition of PortalRPC import structure. - * Imports represent the client-side view of a remote target. - */ -struct obd_import { - /** Local handle (== id) for this import. */ - struct portals_handle imp_handle; - /** Reference counter */ - atomic_t imp_refcount; - struct lustre_handle imp_dlm_handle; /* client's ldlm export */ - /** Currently active connection */ - struct ptlrpc_connection *imp_connection; - /** PortalRPC client structure for this import */ - struct ptlrpc_client *imp_client; - /** List element for linking into pinger chain */ - struct list_head imp_pinger_chain; - /** work struct for destruction of import */ - struct work_struct imp_zombie_work; - - /** - * Lists of requests that are retained for replay, waiting for a reply, - * or waiting for recovery to complete, respectively. - * @{ - */ - struct list_head imp_replay_list; - struct list_head imp_sending_list; - struct list_head imp_delayed_list; - /** @} */ - - /** - * List of requests that are retained for committed open replay. Once - * open is committed, open replay request will be moved from the - * imp_replay_list into the imp_committed_list. - * The imp_replay_cursor is for accelerating searching during replay. - * @{ - */ - struct list_head imp_committed_list; - struct list_head *imp_replay_cursor; - /** @} */ - - /** List of requests that have not been replied to */ - struct list_head imp_unreplied_list; - /** Known maximal replied XID */ - __u64 imp_known_replied_xid; - - /** obd device for this import */ - struct obd_device *imp_obd; - - /** - * some security-related fields - * @{ - */ - struct ptlrpc_sec *imp_sec; - struct mutex imp_sec_mutex; - time64_t imp_sec_expire; - /** @} */ - - /** Wait queue for those who need to wait for recovery completion */ - wait_queue_head_t imp_recovery_waitq; - - /** Number of requests currently in-flight */ - atomic_t imp_inflight; - /** Number of requests currently unregistering */ - atomic_t imp_unregistering; - /** Number of replay requests inflight */ - atomic_t imp_replay_inflight; - /** Number of currently happening import invalidations */ - atomic_t imp_inval_count; - /** Number of request timeouts */ - atomic_t imp_timeouts; - /** Current import state */ - enum lustre_imp_state imp_state; - /** Last replay state */ - enum lustre_imp_state imp_replay_state; - /** History of import states */ - struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; - int imp_state_hist_idx; - /** Current import generation. Incremented on every reconnect */ - int imp_generation; - /** Incremented every time we send reconnection request */ - __u32 imp_conn_cnt; - /** - * \see ptlrpc_free_committed remembers imp_generation value here - * after a check to save on unnecessary replay list iterations - */ - int imp_last_generation_checked; - /** Last transno we replayed */ - __u64 imp_last_replay_transno; - /** Last transno committed on remote side */ - __u64 imp_peer_committed_transno; - /** - * \see ptlrpc_free_committed remembers last_transno since its last - * check here and if last_transno did not change since last run of - * ptlrpc_free_committed and import generation is the same, we can - * skip looking for requests to remove from replay list as optimisation - */ - __u64 imp_last_transno_checked; - /** - * Remote export handle. This is how remote side knows what export - * we are talking to. Filled from response to connect request - */ - struct lustre_handle imp_remote_handle; - /** When to perform next ping. time in jiffies. */ - unsigned long imp_next_ping; - /** When we last successfully connected. time in 64bit jiffies */ - __u64 imp_last_success_conn; - - /** List of all possible connections for import.
 */ - struct list_head imp_conn_list; - /** - * Current connection. \a imp_connection is imp_conn_current->oic_conn - */ - struct obd_import_conn *imp_conn_current; - - /** Protects flags, level, generation, conn_cnt, *_list */ - spinlock_t imp_lock; - - /* flags */ - unsigned long imp_no_timeout:1, /* timeouts are disabled */ - imp_invalid:1, /* evicted */ - /* administratively disabled */ - imp_deactive:1, - /* try to recover the import */ - imp_replayable:1, - /* don't run recovery (timeout instead) */ - imp_dlm_fake:1, - /* use 1/2 timeout on MDS' OSCs */ - imp_server_timeout:1, - /* VBR: imp in delayed recovery */ - imp_delayed_recovery:1, - /* VBR: if gap was found then no lock replays - */ - imp_no_lock_replay:1, - /* recovery by versions was failed */ - imp_vbr_failed:1, - /* force an immediate ping */ - imp_force_verify:1, - /* force a scheduled ping */ - imp_force_next_verify:1, - /* pingable */ - imp_pingable:1, - /* resend for replay */ - imp_resend_replay:1, - /* disable normal recovery, for test only. */ - imp_no_pinger_recover:1, -#if OBD_OCD_VERSION(3, 0, 53, 0) > LUSTRE_VERSION_CODE - /* need IR MNE swab */ - imp_need_mne_swab:1, -#endif - /* import must be reconnected instead of - * choosing a new connection - */ - imp_force_reconnect:1, - /* import has tried to connect with server */ - imp_connect_tried:1, - /* connected but not FULL yet */ - imp_connected:1; - __u32 imp_connect_op; - struct obd_connect_data imp_connect_data; - __u64 imp_connect_flags_orig; - int imp_connect_error; - - __u32 imp_msg_magic; - __u32 imp_msghdr_flags; /* adjusted based on server capability */ - - struct imp_at imp_at; /* adaptive timeout data */ - time64_t imp_last_reply_time; /* for health check */ -}; - -/* import.c */ -static inline unsigned int at_est2timeout(unsigned int val) -{ - /* add an arbitrary minimum: 125% +5 sec */ - return (val + (val >> 2) + 5); -} - -static inline unsigned int at_timeout2est(unsigned int val) -{ - /* restore estimate value from timeout: e=4/5(t-5) */ - LASSERT(val); - return (max((val << 2) / 5, 5U) - 4); -} - -static inline void at_reset(struct adaptive_timeout *at, int val) -{ - spin_lock(&at->at_lock); - at->at_current = val; - at->at_worst_ever = val; - at->at_worst_time = ktime_get_real_seconds(); - spin_unlock(&at->at_lock); -} - -static inline void at_init(struct adaptive_timeout *at, int val, int flags) -{ - memset(at, 0, sizeof(*at)); - spin_lock_init(&at->at_lock); - at->at_flags = flags; - at_reset(at, val); -} - -extern unsigned int at_min; -static inline int at_get(struct adaptive_timeout *at) -{ - return (at->at_current > at_min) ? at->at_current : at_min; -} - -int at_measured(struct adaptive_timeout *at, unsigned int val); -int import_at_get_index(struct obd_import *imp, int portal); -extern unsigned int at_max; -#define AT_OFF (at_max == 0) - -/* genops.c */ -struct obd_export; -struct obd_import *class_exp2cliimp(struct obd_export *); - -/** @} import */ - -#endif /* __IMPORT_H */ - -/** @} obd_import */ diff --git a/drivers/staging/lustre/lustre/include/lustre_intent.h b/drivers/staging/lustre/lustre/include/lustre_intent.h deleted file mode 100644 index 51e5c0e03872008e7008328c62023790524a4af2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_intent.h +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef LUSTRE_INTENT_H -#define LUSTRE_INTENT_H - -#include - -/* intent IT_XXX are defined in lustre/include/obd.h */ - -struct lookup_intent { - int it_op; - int it_create_mode; - __u64 it_flags; - int it_disposition; - int it_status; - __u64 it_lock_handle; - __u64 it_lock_bits; - int it_lock_mode; - int it_remote_lock_mode; - __u64 it_remote_lock_handle; - struct ptlrpc_request *it_request; - unsigned int it_lock_set:1; -}; - -static inline int it_disposition(struct lookup_intent *it, int flag) -{ - return it->it_disposition & flag; -} - -static inline void it_set_disposition(struct lookup_intent *it, int flag) -{ - it->it_disposition |= flag; -} - -static inline void it_clear_disposition(struct lookup_intent *it, int flag) -{ - it->it_disposition &= ~flag; -} - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_kernelcomm.h b/drivers/staging/lustre/lustre/include/lustre_kernelcomm.h deleted file mode 100644 index 2b3fa84301857e00e26bd85b09289c13cd641835..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_kernelcomm.h +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2013 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * - * Author: Nathan Rutman - * - * Kernel <-> userspace communication routines. - * The definitions below are used in the kernel and userspace. 
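For orientation, here is a sketch of what a consumer of this interface looks like. The callback matches the libcfs_kkuc_cb_t typedef declared just below, while the payload type and the group number are hypothetical.

/* Hypothetical payload; real users pass their own message types. */
struct demo_kuc_msg {
	unsigned int dk_len;
	void *dk_data;
};

/* Matches the libcfs_kkuc_cb_t signature declared below: called once per
 * registration in the group; return 0 to keep iterating. */
static int demo_kuc_cb(void *data, void *cb_arg)
{
	struct demo_kuc_msg *msg = cb_arg;

	/* 'data' is the per-registration cookie passed to group_add(). */
	(void)data;
	(void)msg;
	return 0;
}

/* A caller would then walk every reader registered in a (hypothetical)
 * group 2 with:
 *
 *	libcfs_kkuc_group_foreach(2, demo_kuc_cb, &msg);
 */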
- */ - -#ifndef __LUSTRE_KERNELCOMM_H__ -#define __LUSTRE_KERNELCOMM_H__ - -/* For declarations shared with userspace */ -#include - -/* prototype for callback function on kuc groups */ -typedef int (*libcfs_kkuc_cb_t)(void *data, void *cb_arg); - -/* Kernel methods */ -int libcfs_kkuc_msg_put(struct file *fp, void *payload); -int libcfs_kkuc_group_put(unsigned int group, void *payload); -int libcfs_kkuc_group_add(struct file *fp, int uid, unsigned int group, - void *data, size_t data_len); -int libcfs_kkuc_group_rem(int uid, unsigned int group); -int libcfs_kkuc_group_foreach(unsigned int group, libcfs_kkuc_cb_t cb_func, - void *cb_arg); - -#endif /* __LUSTRE_KERNELCOMM_H__ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h deleted file mode 100644 index 87748e9902a7f011b1386c8b295a7054754cdb8c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_lib.h +++ /dev/null @@ -1,126 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_lib.h - * - * Basic Lustre library routines. 
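A usage sketch for the l_wait_event_abortable() family defined further below in this header. This is a kernel-side fragment; the wait queue and completion flag are hypothetical.

/* Hypothetical waiter: sleep until 'done' is set, but allow the fixed
 * LUSTRE_FATAL_SIGS set (and only that set) to abort the wait. */
static int demo_wait_for_completion(wait_queue_head_t *waitq, int *done)
{
	int rc;

	rc = l_wait_event_abortable(*waitq, *done != 0);
	if (rc < 0)		/* interrupted by a fatal signal */
		return rc;	/* typically -ERESTARTSYS */
	return 0;
}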
- */ - -#ifndef _LUSTRE_LIB_H -#define _LUSTRE_LIB_H - -/** \defgroup lib lib - * - * @{ - */ - -#include -#include -#include -#include -#include -#include - -/* target.c */ -struct ptlrpc_request; -struct obd_export; -struct lu_target; -struct l_wait_info; -#include -#include - -#define LI_POISON 0x5a5a5a5a -#if BITS_PER_LONG > 32 -# define LL_POISON 0x5a5a5a5a5a5a5a5aL -#else -# define LL_POISON 0x5a5a5a5aL -#endif -#define LP_POISON ((void *)LL_POISON) - -int target_pack_pool_reply(struct ptlrpc_request *req); -int do_set_info_async(struct obd_import *imp, - int opcode, int version, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set); - -void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); - -#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \ - sigmask(SIGTERM) | sigmask(SIGQUIT) | \ - sigmask(SIGALRM)) -static inline int l_fatal_signal_pending(struct task_struct *p) -{ - return signal_pending(p) && sigtestsetmask(&p->pending.signal, LUSTRE_FATAL_SIGS); -} - -/** @} lib */ - - - -/* l_wait_event_abortable() is a bit like wait_event_killable() - * except there is a fixed set of signals which will abort: - * LUSTRE_FATAL_SIGS - */ -#define l_wait_event_abortable(wq, condition) \ -({ \ - sigset_t __new_blocked, __old_blocked; \ - int __ret = 0; \ - siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ - sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ - __ret = wait_event_interruptible(wq, condition); \ - sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ - __ret; \ -}) - -#define l_wait_event_abortable_timeout(wq, condition, timeout) \ -({ \ - sigset_t __new_blocked, __old_blocked; \ - int __ret = 0; \ - siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ - sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ - __ret = wait_event_interruptible_timeout(wq, condition, timeout);\ - sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ - __ret; \ -}) - -#define l_wait_event_abortable_exclusive(wq, condition) \ -({ \ - sigset_t __new_blocked, __old_blocked; \ - int __ret = 0; \ - siginitset(&__new_blocked, LUSTRE_FATAL_SIGS); \ - sigprocmask(SIG_BLOCK, &__new_blocked, &__old_blocked); \ - __ret = wait_event_interruptible_exclusive(wq, condition); \ - sigprocmask(SIG_SETMASK, &__old_blocked, NULL); \ - __ret; \ -}) -#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h deleted file mode 100644 index 03db1511bfd33bf0bab17996b75b9e7c2ff7a3fb..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_linkea.h +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2013, 2014, Intel Corporation. - * Use is subject to license terms. - * - * Author: di wang - */ - -/* There are several reasons to restrict the linkEA size: - * - * 1. Under DNE mode, if we do not restrict the linkEA size, and if there - * are too many cross-MDT hard links to the same object, then it will - * cause the llog to overflow. - * - * 2. Some backends have a limited size for EAs. For example, without large - * EA support enabled, ldiskfs makes all EAs share one (4K) EA block. - * - * 3. Too many entries in the linkEA will seriously affect linkEA performance, - * because linkEA entries can only be located by scanning them consecutively. - */ -#define MAX_LINKEA_SIZE 4096 - -struct linkea_data { - /** - * Buffer to keep link EA body. - */ - struct lu_buf *ld_buf; - /** - * The matched header, entry and its length in the EA - */ - struct link_ea_header *ld_leh; - struct link_ea_entry *ld_lee; - int ld_reclen; -}; - -int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); -int linkea_init(struct linkea_data *ldata); -int linkea_init_with_rec(struct linkea_data *ldata); -void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, - struct lu_name *lname, struct lu_fid *pfid); -int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, - const struct lu_fid *pfid); -int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, - const struct lu_fid *pfid); -void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname); -int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, - const struct lu_fid *pfid); - -static inline void linkea_first_entry(struct linkea_data *ldata) -{ - LASSERT(ldata); - LASSERT(ldata->ld_leh); - - if (ldata->ld_leh->leh_reccount == 0) - ldata->ld_lee = NULL; - else - ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); -} - -static inline void linkea_next_entry(struct linkea_data *ldata) -{ - LASSERT(ldata); - LASSERT(ldata->ld_leh); - - if (ldata->ld_lee) { - ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + - ldata->ld_reclen); - if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + - ldata->ld_leh->leh_len)) - ldata->ld_lee = NULL; - } -} diff --git a/drivers/staging/lustre/lustre/include/lustre_lmv.h b/drivers/staging/lustre/lustre/include/lustre_lmv.h deleted file mode 100644 index 080ec1f8e19f88ad84b400cd2b707928e05f86db..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_lmv.h +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License version 2 for more details. A copy is - * included in the COPYING file that accompanied this code.
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * GPL HEADER END - */ -/* - * Copyright (c) 2013, Intel Corporation. - */ -/* - * lustre/include/lustre_lmv.h - * - * Lustre LMV structures and functions. - * - * Author: Di Wang - */ - -#ifndef _LUSTRE_LMV_H -#define _LUSTRE_LMV_H -#include - -struct lmv_oinfo { - struct lu_fid lmo_fid; - u32 lmo_mds; - struct inode *lmo_root; -}; - -struct lmv_stripe_md { - __u32 lsm_md_magic; - __u32 lsm_md_stripe_count; - __u32 lsm_md_master_mdt_index; - __u32 lsm_md_hash_type; - __u32 lsm_md_layout_version; - __u32 lsm_md_default_count; - __u32 lsm_md_default_index; - char lsm_md_pool_name[LOV_MAXPOOLNAME + 1]; - struct lmv_oinfo lsm_md_oinfo[0]; -}; - -static inline bool -lsm_md_eq(const struct lmv_stripe_md *lsm1, const struct lmv_stripe_md *lsm2) -{ - __u32 idx; - - if (lsm1->lsm_md_magic != lsm2->lsm_md_magic || - lsm1->lsm_md_stripe_count != lsm2->lsm_md_stripe_count || - lsm1->lsm_md_master_mdt_index != lsm2->lsm_md_master_mdt_index || - lsm1->lsm_md_hash_type != lsm2->lsm_md_hash_type || - lsm1->lsm_md_layout_version != lsm2->lsm_md_layout_version || - strcmp(lsm1->lsm_md_pool_name, lsm2->lsm_md_pool_name) != 0) - return false; - - for (idx = 0; idx < lsm1->lsm_md_stripe_count; idx++) { - if (!lu_fid_eq(&lsm1->lsm_md_oinfo[idx].lmo_fid, - &lsm2->lsm_md_oinfo[idx].lmo_fid)) - return false; - } - - return true; -} - -union lmv_mds_md; - -void lmv_free_memmd(struct lmv_stripe_md *lsm); - -static inline void lmv1_le_to_cpu(struct lmv_mds_md_v1 *lmv_dst, - const struct lmv_mds_md_v1 *lmv_src) -{ - __u32 i; - - lmv_dst->lmv_magic = le32_to_cpu(lmv_src->lmv_magic); - lmv_dst->lmv_stripe_count = le32_to_cpu(lmv_src->lmv_stripe_count); - lmv_dst->lmv_master_mdt_index = - le32_to_cpu(lmv_src->lmv_master_mdt_index); - lmv_dst->lmv_hash_type = le32_to_cpu(lmv_src->lmv_hash_type); - lmv_dst->lmv_layout_version = le32_to_cpu(lmv_src->lmv_layout_version); - - for (i = 0; i < lmv_src->lmv_stripe_count; i++) - fid_le_to_cpu(&lmv_dst->lmv_stripe_fids[i], - &lmv_src->lmv_stripe_fids[i]); -} - -static inline void lmv_le_to_cpu(union lmv_mds_md *lmv_dst, - const union lmv_mds_md *lmv_src) -{ - switch (le32_to_cpu(lmv_src->lmv_magic)) { - case LMV_MAGIC_V1: - lmv1_le_to_cpu(&lmv_dst->lmv_md_v1, &lmv_src->lmv_md_v1); - break; - default: - break; - } -} - -/* This hash is only for testing purpose */ -static inline unsigned int -lmv_hash_all_chars(unsigned int count, const char *name, int namelen) -{ - const unsigned char *p = (const unsigned char *)name; - unsigned int c = 0; - - while (--namelen >= 0) - c += p[namelen]; - - c = c % count; - - return c; -} - -static inline unsigned int -lmv_hash_fnv1a(unsigned int count, const char *name, int namelen) -{ - __u64 hash; - - hash = lustre_hash_fnv_1a_64(name, namelen); - - return do_div(hash, count); -} - -static inline int lmv_name_to_stripe_index(__u32 lmv_hash_type, - unsigned int stripe_count, - const char *name, int namelen) -{ - __u32 hash_type = lmv_hash_type & LMV_HASH_TYPE_MASK; - int idx; - - LASSERT(namelen > 0); - if (stripe_count <= 1) - return 0; - - /* for migrating object, always start from 0 stripe */ - if (lmv_hash_type & LMV_HASH_FLAG_MIGRATION) - return 0; - - switch (hash_type) { - case LMV_HASH_TYPE_ALL_CHARS: - idx = lmv_hash_all_chars(stripe_count, name, namelen); - break; - case LMV_HASH_TYPE_FNV_1A_64: - idx = lmv_hash_fnv1a(stripe_count, name, namelen); 
- break; - default: - idx = -EBADFD; - break; - } - CDEBUG(D_INFO, "name %.*s hash_type %d idx %d\n", namelen, name, - hash_type, idx); - - return idx; -} - -static inline bool lmv_is_known_hash_type(__u32 type) -{ - return (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_FNV_1A_64 || - (type & LMV_HASH_TYPE_MASK) == LMV_HASH_TYPE_ALL_CHARS; -} - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h deleted file mode 100644 index 07f4e600386bb23c26ae5e5ccdec96aab67c2197..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_log.h +++ /dev/null @@ -1,382 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_log.h - * - * Generic infrastructure for managing a collection of logs. 
- * These logs are used for: - * - * - orphan recovery: OST adds record on create - * - mtime/size consistency: the OST adds a record on first write - * - open/unlinked objects: OST adds a record on destroy - * - * - mds unlink log: the MDS adds an entry upon delete - * - * - raid1 replication log between OSTs - * - MDS replication logs - */ - -#ifndef _LUSTRE_LOG_H -#define _LUSTRE_LOG_H - -/** \defgroup log log - * - * @{ - */ - -#include -#include - -#define LOG_NAME_LIMIT(logname, name) \ - snprintf(logname, sizeof(logname), "LOGS/%s", name) -#define LLOG_EEMPTY 4711 - -enum llog_open_param { - LLOG_OPEN_EXISTS = 0x0000, - LLOG_OPEN_NEW = 0x0001, -}; - -struct plain_handle_data { - struct list_head phd_entry; - struct llog_handle *phd_cat_handle; - struct llog_cookie phd_cookie; /* cookie of this log in its cat */ -}; - -struct cat_handle_data { - struct list_head chd_head; - struct llog_handle *chd_current_log; /* currently open log */ - struct llog_handle *chd_next_log; /* llog to be used next */ -}; - -struct llog_handle; - -/* llog.c - general API */ -int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, - int flags, struct obd_uuid *uuid); -int llog_process(const struct lu_env *env, struct llog_handle *loghandle, - llog_cb_t cb, void *data, void *catdata); -int llog_process_or_fork(const struct lu_env *env, - struct llog_handle *loghandle, - llog_cb_t cb, void *data, void *catdata, bool fork); -int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_handle **lgh, struct llog_logid *logid, - char *name, enum llog_open_param open_param); -int llog_close(const struct lu_env *env, struct llog_handle *cathandle); - -/* llog_process flags */ -#define LLOG_FLAG_NODEAMON 0x0001 - -/* llog_cat.c - catalog api */ -struct llog_process_data { - /** - * Any useful data needed while processing the catalog. This is - * passed later to the process callback. - */ - void *lpd_data; - /** - * Catalog process callback function, called for each record - * in the catalog. - */ - llog_cb_t lpd_cb; - /** - * Start processing the catalog from startcat/startidx - */ - int lpd_startcat; - int lpd_startidx; -}; - -struct llog_process_cat_data { - /** - * Temporarily stored first_idx while scanning the log. - */ - int lpcd_first_idx; - /** - * Temporarily stored last_idx while scanning the log. - */ - int lpcd_last_idx; -};
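A sketch of a typical llog_process() caller, given the API declared above. It assumes the usual llog_cb_t signature of (env, handle, rec, data), which is not reproduced in this header excerpt.

/* Hypothetical per-record callback for llog_process(): invoked once per
 * llog record; returning 0 continues the scan, and returning
 * LLOG_PROC_BREAK (defined further below) stops it early. */
static int demo_llog_count_cb(const struct lu_env *env,
			      struct llog_handle *handle,
			      struct llog_rec_hdr *rec, void *data)
{
	unsigned int *count = data;

	(*count)++;
	return 0;
}

/* Typical invocation (no catalog data, so catdata == NULL):
 *
 *	unsigned int count = 0;
 *	rc = llog_process(env, loghandle, demo_llog_count_cb, &count, NULL);
 */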
- -struct thandle; - -int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); -int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, - llog_cb_t cb, void *data, int startcat, int startidx); - -/* llog_obd.c */ -int llog_setup(const struct lu_env *env, struct obd_device *obd, - struct obd_llog_group *olg, int index, - struct obd_device *disk_obd, struct llog_operations *op); -int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); -int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); - -/* llog_net.c */ -int llog_initiator_connect(struct llog_ctxt *ctxt); - -struct llog_operations { - int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h, - int *curr_idx, int next_idx, __u64 *offset, - void *buf, int len); - int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h, - int prev_idx, void *buf, int len); - int (*lop_read_header)(const struct lu_env *env, - struct llog_handle *handle); - int (*lop_setup)(const struct lu_env *env, struct obd_device *obd, - struct obd_llog_group *olg, int ctxt_idx, - struct obd_device *disk_obd); - int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp, - int flags); - int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt); - int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_cookie *cookies, int flags); - int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid, - struct llog_gen *gen, struct obd_uuid *uuid); - /** - * Any llog file must be opened first using llog_open(). An llog can be - * opened by name, by logid, or with neither, in which case a new logid - * will be generated. - */ - int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh, - struct llog_logid *logid, char *name, - enum llog_open_param); - /** - * An opened llog may not exist, and this must be checked where needed - * using the llog_exist() call. - */ - int (*lop_exist)(struct llog_handle *lgh); - /** - * Close the llog file and call llog_free_handle() implicitly. - * Any opened llog must be closed by an llog_close() call. - */ - int (*lop_close)(const struct lu_env *env, struct llog_handle *handle); - /** - * Create a new llog file. The llog must be opened. - * Must be used only for local llog operations. - */ - int (*lop_declare_create)(const struct lu_env *env, - struct llog_handle *handle, - struct thandle *th); - /** - * Write a new record in the llog. It usually appends records, but can - * edit existing records too. - */ - int (*lop_declare_write_rec)(const struct lu_env *env, - struct llog_handle *lgh, - struct llog_rec_hdr *rec, - int idx, struct thandle *th); - int (*lop_write_rec)(const struct lu_env *env, - struct llog_handle *loghandle, - struct llog_rec_hdr *rec, - struct llog_cookie *cookie, int cookiecount, - void *buf, int idx, struct thandle *th); - /** - * Add a new record in the llog catalog. Does the same as - * llog_write_rec() but through the llog catalog.
- */ - int (*lop_declare_add)(const struct lu_env *env, - struct llog_handle *lgh, - struct llog_rec_hdr *rec, struct thandle *th); - int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh, - struct llog_rec_hdr *rec, struct llog_cookie *cookie, - void *buf, struct thandle *th); -}; - -/* In-memory descriptor for a log object or log catalog */ -struct llog_handle { - struct rw_semaphore lgh_lock; - spinlock_t lgh_hdr_lock; /* protect lgh_hdr data */ - struct llog_logid lgh_id; /* id of this log */ - struct llog_log_hdr *lgh_hdr; - size_t lgh_hdr_size; - int lgh_last_idx; - int lgh_cur_idx; /* used during llog_process */ - __u64 lgh_cur_offset; /* used during llog_process */ - struct llog_ctxt *lgh_ctxt; - union { - struct plain_handle_data phd; - struct cat_handle_data chd; - } u; - char *lgh_name; - void *private_data; - struct llog_operations *lgh_logops; - atomic_t lgh_refcount; -}; - -#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 -#define LLOG_CTXT_FLAG_STOP 0x00000002 - -struct llog_ctxt { - int loc_idx; /* my index the obd array of ctxt's */ - struct obd_device *loc_obd; /* points back to the containing obd*/ - struct obd_llog_group *loc_olg; /* group containing that ctxt */ - struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */ - struct obd_import *loc_imp; /* to use in RPC's: can be backward - * pointing import - */ - struct llog_operations *loc_logops; - struct llog_handle *loc_handle; - struct mutex loc_mutex; /* protect loc_imp */ - atomic_t loc_refcount; - long loc_flags; /* flags, see above defines */ - /* - * llog chunk size, and llog record size can not be bigger than - * loc_chunk_size - */ - __u32 loc_chunk_size; -}; - -#define LLOG_PROC_BREAK 0x0001 -#define LLOG_DEL_RECORD 0x0002 - -static inline int llog_handle2ops(struct llog_handle *loghandle, - struct llog_operations **lop) -{ - if (!loghandle || !loghandle->lgh_logops) - return -EINVAL; - - *lop = loghandle->lgh_logops; - return 0; -} - -static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) -{ - atomic_inc(&ctxt->loc_refcount); - CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", ctxt, - atomic_read(&ctxt->loc_refcount)); - return ctxt; -} - -static inline void llog_ctxt_put(struct llog_ctxt *ctxt) -{ - if (!ctxt) - return; - LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); - CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, - atomic_read(&ctxt->loc_refcount) - 1); - __llog_ctxt_put(NULL, ctxt); -} - -static inline void llog_group_init(struct obd_llog_group *olg) -{ - init_waitqueue_head(&olg->olg_waitq); - spin_lock_init(&olg->olg_lock); - mutex_init(&olg->olg_cat_processing); -} - -static inline int llog_group_set_ctxt(struct obd_llog_group *olg, - struct llog_ctxt *ctxt, int index) -{ - LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); - - spin_lock(&olg->olg_lock); - if (olg->olg_ctxts[index]) { - spin_unlock(&olg->olg_lock); - return -EEXIST; - } - olg->olg_ctxts[index] = ctxt; - spin_unlock(&olg->olg_lock); - return 0; -} - -static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, - int index) -{ - struct llog_ctxt *ctxt; - - LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); - - spin_lock(&olg->olg_lock); - if (!olg->olg_ctxts[index]) - ctxt = NULL; - else - ctxt = llog_ctxt_get(olg->olg_ctxts[index]); - spin_unlock(&olg->olg_lock); - return ctxt; -} - -static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) -{ - LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); - spin_lock(&olg->olg_lock); - 
olg->olg_ctxts[index] = NULL; - spin_unlock(&olg->olg_lock); -} - -static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, - int index) -{ - return llog_group_get_ctxt(&obd->obd_olg, index); -} - -static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) -{ - return (!olg->olg_ctxts[index]); -} - -static inline int llog_ctxt_null(struct obd_device *obd, int index) -{ - return llog_group_ctxt_null(&obd->obd_olg, index); -} - -static inline int llog_next_block(const struct lu_env *env, - struct llog_handle *loghandle, int *cur_idx, - int next_idx, __u64 *cur_offset, void *buf, - int len) -{ - struct llog_operations *lop; - int rc; - - rc = llog_handle2ops(loghandle, &lop); - if (rc) - return rc; - if (!lop->lop_next_block) - return -EOPNOTSUPP; - - rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, - cur_offset, buf, len); - return rc; -} - -/* llog.c */ -int llog_declare_write_rec(const struct lu_env *env, - struct llog_handle *handle, - struct llog_rec_hdr *rec, int idx, - struct thandle *th); -int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, - struct llog_rec_hdr *rec, struct llog_cookie *logcookies, - int numcookies, void *buf, int idx, struct thandle *th); -int lustre_process_log(struct super_block *sb, char *logname, - struct config_llog_instance *cfg); -int lustre_end_log(struct super_block *sb, char *logname, - struct config_llog_instance *cfg); -/** @} log */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h deleted file mode 100644 index a9c9992a2502f3ef2d7f553fb9511f9ff8f4de2b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_mdc.h +++ /dev/null @@ -1,229 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_mdc.h - * - * MDS data structures. - * See also lustre_idl.h for wire formats of requests. - */ - -#ifndef _LUSTRE_MDC_H -#define _LUSTRE_MDC_H - -/** \defgroup mdc mdc - * - * @{ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct ptlrpc_client; -struct obd_export; -struct ptlrpc_request; -struct obd_device; - -/** - * Serializes in-flight MDT-modifying RPC requests to preserve idempotency. - * - * This mutex is used to implement execute-once semantics on the MDT. 
- * The MDT stores the last transaction ID and result for every client in - * its last_rcvd file. If the client doesn't get a reply, it can safely - * resend the request and the MDT will reconstruct the reply being aware - * that the request has already been executed. Without this lock, - * execution status of concurrent in-flight requests would be - * overwritten. - * - * This design limits the extent to which we can keep a full pipeline of - * in-flight requests from a single client. This limitation could be - * overcome by allowing multiple slots per client in the last_rcvd file. - */ -struct mdc_rpc_lock { - /** Lock protecting in-flight RPC concurrency. */ - struct mutex rpcl_mutex; - /** Intent associated with currently executing request. */ - struct lookup_intent *rpcl_it; - /** Used for MDS/RPC load testing purposes. */ - int rpcl_fakes; -}; - -#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) - -static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) -{ - mutex_init(&lck->rpcl_mutex); - lck->rpcl_it = NULL; -} - -static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) -{ - if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - return; - - /* This would normally block until the existing request finishes. - * If fail_loc is set it will block until the regular request is - * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set - * it will only be cleared when all fake requests are finished. - * Only when all fake requests are finished can normal requests - * be sent, to ensure they are recoverable again. - */ - again: - mutex_lock(&lck->rpcl_mutex); - - if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { - lck->rpcl_it = MDC_FAKE_RPCL_IT; - lck->rpcl_fakes++; - mutex_unlock(&lck->rpcl_mutex); - return; - } - - /* This will only happen when the CFS_FAIL_CHECK() was - * just turned off but there are still requests in progress. - * Wait until they finish. It doesn't need to be efficient - * in this extremely rare case, just have low overhead in - * the common case when it isn't true. 
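- *
- * For context, the broader calling convention (an illustrative sketch,
- * not verbatim from the tree) brackets every MDT-modifying request with
- * this get/put pair, so at most one such request per client is in flight
- * and its slot in last_rcvd cannot be clobbered:
- *
- *	mdc_get_rpc_lock(rpc_lock, it);
- *	rc = ptlrpc_queue_wait(req);
- *	mdc_put_rpc_lock(rpc_lock, it);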
- */ - while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { - mutex_unlock(&lck->rpcl_mutex); - schedule_timeout(HZ / 4); - goto again; - } - - LASSERT(!lck->rpcl_it); - lck->rpcl_it = it; -} - -static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, - struct lookup_intent *it) -{ - if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - return; - - if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ - mutex_lock(&lck->rpcl_mutex); - - LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); - lck->rpcl_fakes--; - - if (lck->rpcl_fakes == 0) - lck->rpcl_it = NULL; - - } else { - LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); - lck->rpcl_it = NULL; - } - - mutex_unlock(&lck->rpcl_mutex); -} - -static inline void mdc_get_mod_rpc_slot(struct ptlrpc_request *req, - struct lookup_intent *it) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - u32 opc; - u16 tag; - - opc = lustre_msg_get_opc(req->rq_reqmsg); - tag = obd_get_mod_rpc_slot(cli, opc, it); - lustre_msg_set_tag(req->rq_reqmsg, tag); -} - -static inline void mdc_put_mod_rpc_slot(struct ptlrpc_request *req, - struct lookup_intent *it) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - u32 opc; - u16 tag; - - opc = lustre_msg_get_opc(req->rq_reqmsg); - tag = lustre_msg_get_tag(req->rq_reqmsg); - obd_put_mod_rpc_slot(cli, opc, it, tag); -} - -/** - * Update the maximum possible easize. - * - * This value is learned from ptlrpc replies sent by the MDT. The - * default easize is initialized to the minimum value but allowed - * to grow up to a single page in size if required to handle the - * common case. - * - * \see client_obd::cl_default_mds_easize - * - * \param[in] exp export for MDC device - * \param[in] body body of ptlrpc reply from MDT - * - */ -static inline void mdc_update_max_ea_from_body(struct obd_export *exp, - struct mdt_body *body) -{ - if (body->mbo_valid & OBD_MD_FLMODEASIZE) { - struct client_obd *cli = &exp->exp_obd->u.cli; - u32 def_easize; - - if (cli->cl_max_mds_easize < body->mbo_max_mdsize) - cli->cl_max_mds_easize = body->mbo_max_mdsize; - - def_easize = min_t(__u32, body->mbo_max_mdsize, - OBD_MAX_DEFAULT_EA_SIZE); - cli->cl_default_mds_easize = def_easize; - } -} - -/* mdc/mdc_locks.c */ -int it_open_error(int phase, struct lookup_intent *it); - -static inline bool cl_is_lov_delay_create(unsigned int flags) -{ - return (flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE; -} - -static inline void cl_lov_delay_create_clear(unsigned int *flags) -{ - if ((*flags & O_LOV_DELAY_CREATE) == O_LOV_DELAY_CREATE) - *flags &= ~O_LOV_DELAY_CREATE; -} - -/** @} mdc */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h deleted file mode 100644 index f665556556ec461a263e0761e321aabf9fd6323a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_mds.h +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_mds.h - * - * MDS data structures. - * See also lustre_idl.h for wire formats of requests. - */ - -#ifndef _LUSTRE_MDS_H -#define _LUSTRE_MDS_H - -/** \defgroup mds mds - * - * @{ - */ - -#include -#include -#include -#include - -struct mds_group_info { - struct obd_uuid *uuid; - int group; -}; - -#define MDD_OBD_NAME "mdd_obd" -#define MDD_OBD_UUID "mdd_obd_uuid" - -/** @} mds */ - -#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h deleted file mode 100644 index 35b43a77eb1804351dabe7290db6989c0a615491..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_net.h +++ /dev/null @@ -1,2360 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/** \defgroup PtlRPC Portal RPC and networking module. - * - * PortalRPC is the layer used by rest of lustre code to achieve network - * communications: establish connections with corresponding export and import - * states, listen for a service, send and receive RPCs. - * PortalRPC also includes base recovery framework: packet resending and - * replaying, reconnections, pinger. - * - * PortalRPC utilizes LNet as its transport layer. 
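- *
- * As a sizing note for the constants defined below: assuming the default
- * PTLRPC_BULK_OPS_BITS = 4 and the 1 MB LNET_MTU, one bulk RPC can carry
- * up to 2^4 = 16 LNet transfers of 1 MB each, i.e. PTLRPC_MAX_BRW_SIZE =
- * 16 MB, which is 4096 pages on a system with a 4 KB PAGE_SIZE.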
- * - * @{ - */ - -#ifndef _LUSTRE_NET_H -#define _LUSTRE_NET_H - -/** \defgroup net net - * - * @{ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* MD flags we _always_ use */ -#define PTLRPC_MD_OPTIONS 0 - -/** - * log2 max # of bulk operations in one request: 2=4MB/RPC, 5=32MB/RPC, ... - * In order for the client and server to properly negotiate the maximum - * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two - * value. The client is free to limit the actual RPC size for any bulk - * transfer via cl_max_pages_per_rpc to some non-power-of-two value. - * NOTE: This is limited to 16 (=64GB RPCs) by IOOBJ_MAX_BRW_BITS. - */ -#define PTLRPC_BULK_OPS_BITS 4 -#if PTLRPC_BULK_OPS_BITS > 16 -#error "More than 65536 BRW RPCs not allowed by IOOBJ_MAX_BRW_BITS." -#endif -#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) -/** - * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and - * should not be used on the server at all. Otherwise, it imposes a - * protocol limitation on the maximum RPC size that can be used by any - * RPC sent to that server in the future. Instead, the server should - * use the negotiated per-client ocd_brw_size to determine the bulk - * RPC count. - */ -#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) - -/** - * Define maxima for bulk I/O. - * - * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT - * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the - * currently supported maximum between peers at connect via ocd_brw_size. - */ -#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) -#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS) -#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_SHIFT) - -#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS) -#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) -#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_SHIFT) -#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE -#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_SHIFT) -#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) - -/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ -# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) -# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" -# endif -# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_SIZE)) -# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_SIZE" -# endif -# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) -# error "PTLRPC_MAX_BRW_SIZE too big" -# endif -# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) -# error "PTLRPC_MAX_BRW_PAGES too big" -# endif - -#define PTLRPC_NTHRS_INIT 2 - -/** - * Buffer Constants - * - * Constants determine how memory is used to buffer incoming service requests. - * - * ?_NBUFS # buffers to allocate when growing the pool - * ?_BUFSIZE # bytes in a single request buffer - * ?_MAXREQSIZE # maximum request service will receive - * - * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk - * of ?_NBUFS is added to the pool. - * - * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are - * considered full when less than ?_MAXREQSIZE is left in them. - */ -/** - * Thread Constants - * - * Constants determine how threads are created for ptlrpc service. - * - * ?_NTHRS_INIT # threads to create for each service partition on - * initializing. 
If it's non-affinity service and - * there is only one partition, it's the overall # - * threads for the service while initializing. - * ?_NTHRS_BASE # threads should be created at least for each - * ptlrpc partition to keep the service healthy. - * It's the low-water mark of threads upper-limit - * for each partition. - * ?_THR_FACTOR # threads can be added on threads upper-limit for - * each CPU core. This factor is only for reference, - * we might decrease value of factor if number of cores - * per CPT is above a limit. - * ?_NTHRS_MAX # overall threads can be created for a service, - * it's a soft limit because if service is running - * on machine with hundreds of cores and tens of - * CPU partitions, we need to guarantee each partition - * has ?_NTHRS_BASE threads, which means total threads - * will be ?_NTHRS_BASE * number_of_cpts which can - * exceed ?_NTHRS_MAX. - * - * Examples - * - * #define MDS_NTHRS_INIT 2 - * #define MDS_NTHRS_BASE 64 - * #define MDS_NTHRS_FACTOR 8 - * #define MDS_NTHRS_MAX 1024 - * - * Example 1): - * --------------------------------------------------------------------- - * Server(A) has 16 cores, user configured it to 4 partitions so each - * partition has 4 cores, then actual number of service threads on each - * partition is: - * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 - * - * Total number of threads for the service is: - * 96 * partitions(4) = 384 - * - * Example 2): - * --------------------------------------------------------------------- - * Server(B) has 32 cores, user configured it to 4 partitions so each - * partition has 8 cores, then actual number of service threads on each - * partition is: - * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 - * - * Total number of threads for the service is: - * 128 * partitions(4) = 512 - * - * Example 3): - * --------------------------------------------------------------------- - * Server(B) has 96 cores, user configured it to 8 partitions so each - * partition has 12 cores, then actual number of service threads on each - * partition is: - * MDS_NTHRS_BASE(64) + cores(12) * MDS_NTHRS_FACTOR(8) = 160 - * - * Total number of threads for the service is: - * 160 * partitions(8) = 1280 - * - * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number - * as upper limit of threads number for each partition: - * MDS_NTHRS_MAX(1024) / partitions(8) = 128 - * - * Example 4): - * --------------------------------------------------------------------- - * Server(C) have a thousand of cores and user configured it to 32 partitions - * MDS_NTHRS_BASE(64) * 32 = 2048 - * - * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need - * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads - * to keep service healthy, so total number of threads will just be 2048. - * - * NB: we don't suggest to choose server with that many cores because backend - * filesystem itself, buffer cache, or underlying network stack might - * have some SMP scalability issues at that large scale. 
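- *
- * The sizing policy of the examples above reduces to the following helper
- * (an illustrative simplification of ptlrpc_server_nthreads_check(), not
- * code from the tree):
- *
- *	static int svc_nthrs_per_cpt(int base, int factor, int ncores,
- *				     int ncpts, int max)
- *	{
- *		int nthrs = base + ncores * factor;
- *
- *		if (nthrs * ncpts > max)	/* soft limit exceeded */
- *			nthrs = max / ncpts;
- *		return nthrs < base ? base : nthrs; /* never below base */
- *	}
- *
- * which yields 96, 128, 128 and 64 threads per partition for examples
- * 1) through 4) respectively.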
- * - * If user already has a fat machine with hundreds or thousands of cores, - * there are two choices for configuration: - * a) create CPU table from subset of all CPUs and run Lustre on - * top of this subset - * b) bind service threads on a few partitions, see modparameters of - * MDS and OSS for details -* - * NB: these calculations (and examples below) are simplified to help - * understanding, the real implementation is a little more complex, - * please see ptlrpc_server_nthreads_check() for details. - * - */ - - /* - * LDLM threads constants: - * - * Given 8 as factor and 24 as base threads number - * - * example 1) - * On 4-core machine we will have 24 + 8 * 4 = 56 threads. - * - * example 2) - * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 - * threads for each partition and total threads number will be 112. - * - * example 3) - * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) - * threads for each partition to keep service healthy, so total threads - * number should be 24 * 8 = 192. - * - * So with these constants, threads number will be at the similar level - * of old versions, unless target machine has over a hundred cores - */ -#define LDLM_THR_FACTOR 8 -#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT -#define LDLM_NTHRS_BASE 24 -#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) - -#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT -#define LDLM_CLIENT_NBUFS 1 -#define LDLM_SERVER_NBUFS 64 -#define LDLM_BUFSIZE (8 * 1024) -#define LDLM_MAXREQSIZE (5 * 1024) -#define LDLM_MAXREPSIZE (1024) - -#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ - -/** - * FIEMAP request can be 4K+ for now - */ -#define OST_MAXREQSIZE (16 * 1024) - -/* Macro to hide a typecast. */ -#define ptlrpc_req_async_args(req) ((void *)&req->rq_async_args) - -struct ptlrpc_replay_async_args { - int praa_old_state; - int praa_old_status; -}; - -/** - * Structure to single define portal connection. - */ -struct ptlrpc_connection { - /** linkage for connections hash table */ - struct rhash_head c_hash; - /** Our own lnet nid for this connection */ - lnet_nid_t c_self; - /** Remote side nid for this connection */ - struct lnet_process_id c_peer; - /** UUID of the other side */ - struct obd_uuid c_remote_uuid; - /** reference counter for this connection */ - atomic_t c_refcount; -}; - -/** Client definition for PortalRPC */ -struct ptlrpc_client { - /** What lnet portal does this client send messages to by default */ - __u32 cli_request_portal; - /** What portal do we expect replies on */ - __u32 cli_reply_portal; - /** Name of the client */ - char *cli_name; -}; - -/** state flags of requests */ -/* XXX only ones left are those used by the bulk descs as well! */ -#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ -#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ - -#define REQ_MAX_ACK_LOCKS 8 - -union ptlrpc_async_args { - /** - * Scratchpad for passing args to completion interpreter. Users - * cast to the struct of their choosing, and BUILD_BUG_ON oversized - * arguments. For _tons_ of context, kmalloc a struct and store - * a pointer to it here. The pointer_arg ensures this struct is at - * least big enough for that. - */ - void *pointer_arg[11]; - __u64 space[7]; -}; - -struct ptlrpc_request_set; -typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); -typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); - -/** - * Definition of request set structure. 
- * Request set is a list of requests (not necessarily to the same target)
- * that, once populated with RPCs, could be sent in parallel.
- * There are two kinds of request sets: general purpose, and sets with a
- * dedicated serving thread. An example of the latter is the ptlrpcd set.
- * For general purpose sets, once the set has started sending it is
- * impossible to add new requests to it.
- * Provides a way to call "completion callbacks" when all requests in the
- * set have returned.
- */
-struct ptlrpc_request_set {
-	atomic_t set_refcount;
-	/** number of in-queue requests */
-	atomic_t set_new_count;
-	/** number of uncompleted requests */
-	atomic_t set_remaining;
-	/** wait queue to wait on for request events */
-	wait_queue_head_t set_waitq;
-	wait_queue_head_t *set_wakeup_ptr;
-	/** List of requests in the set */
-	struct list_head set_requests;
-	/**
-	 * List of completion callbacks to be called when the set is completed.
-	 * This is only used if \a set_interpret is NULL.
-	 * Links struct ptlrpc_set_cbdata.
-	 */
-	struct list_head set_cblist;
-	/** Completion callback, if only one. */
-	set_interpreter_func set_interpret;
-	/** opaq argument passed to completion \a set_interpret callback. */
-	void *set_arg;
-	/**
-	 * Lock for \a set_new_requests manipulations;
-	 * locked so that any old caller can communicate requests to
-	 * the set holder who can then fold them into the lock-free set
-	 */
-	spinlock_t set_new_req_lock;
-	/** List of new yet unsent requests. Only used with ptlrpcd now. */
-	struct list_head set_new_requests;
-
-	/** rq_status of requests that have been freed already */
-	int set_rc;
-	/** Additional fields used by the flow control extension */
-	/** Maximum number of RPCs in flight */
-	int set_max_inflight;
-	/** Callback function used to generate RPCs */
-	set_producer_func set_producer;
-	/** opaq argument passed to the producer callback */
-	void *set_producer_arg;
-};
-
-/**
- * Description of a single ptlrpc_set callback
- */
-struct ptlrpc_set_cbdata {
-	/** List linkage item */
-	struct list_head psc_item;
-	/** Pointer to interpreting function */
-	set_interpreter_func psc_interpret;
-	/** Opaq argument to pass to the callback */
-	void *psc_data;
-};
-
-struct ptlrpc_bulk_desc;
-struct ptlrpc_service_part;
-struct ptlrpc_service;
-
-/**
- * ptlrpc callback & work item stuff
- */
-struct ptlrpc_cb_id {
-	void (*cbid_fn)(struct lnet_event *ev);	/* specific callback fn */
-	void *cbid_arg;	/* additional arg */
-};
-
-/** Maximum number of locks to fit into reply state */
-#define RS_MAX_LOCKS 8
-#define RS_DEBUG 0
-
-/**
- * Structure to define reply state on the server.
- * Reply state holds various reply message information. Also for "difficult"
- * replies (rep-ack case) we store the state after sending the reply and wait
- * for the client to acknowledge its reception. In these cases locks could be
- * added to the state for replay/failover consistency guarantees.
- */ -struct ptlrpc_reply_state { - /** Callback description */ - struct ptlrpc_cb_id rs_cb_id; - /** Linkage for list of all reply states in a system */ - struct list_head rs_list; - /** Linkage for list of all reply states on same export */ - struct list_head rs_exp_list; - /** Linkage for list of all reply states for same obd */ - struct list_head rs_obd_list; -#if RS_DEBUG - struct list_head rs_debug_list; -#endif - /** A spinlock to protect the reply state flags */ - spinlock_t rs_lock; - /** Reply state flags */ - unsigned long rs_difficult:1; /* ACK/commit stuff */ - unsigned long rs_no_ack:1; /* no ACK, even for - * difficult requests - */ - unsigned long rs_scheduled:1; /* being handled? */ - unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ - unsigned long rs_handled:1; /* been handled yet? */ - unsigned long rs_on_net:1; /* reply_out_callback pending? */ - unsigned long rs_prealloc:1; /* rs from prealloc list */ - unsigned long rs_committed:1;/* the transaction was committed - * and the rs was dispatched - */ - atomic_t rs_refcount; /* number of users */ - /** Number of locks awaiting client ACK */ - int rs_nlocks; - - /** Size of the state */ - int rs_size; - /** opcode */ - __u32 rs_opc; - /** Transaction number */ - __u64 rs_transno; - /** xid */ - __u64 rs_xid; - struct obd_export *rs_export; - struct ptlrpc_service_part *rs_svcpt; - /** Lnet metadata handle for the reply */ - struct lnet_handle_md rs_md_h; - - /** Context for the service thread */ - struct ptlrpc_svc_ctx *rs_svc_ctx; - /** Reply buffer (actually sent to the client), encoded if needed */ - struct lustre_msg *rs_repbuf; /* wrapper */ - /** Size of the reply buffer */ - int rs_repbuf_len; /* wrapper buf length */ - /** Size of the reply message */ - int rs_repdata_len; /* wrapper msg length */ - /** - * Actual reply message. Its content is encrypted (if needed) to - * produce reply buffer for actual sending. In simple case - * of no network encryption we just set \a rs_repbuf to \a rs_msg - */ - struct lustre_msg *rs_msg; /* reply message */ - - /** Handles of locks awaiting client reply ACK */ - struct lustre_handle rs_locks[RS_MAX_LOCKS]; - /** Lock modes of locks in \a rs_locks */ - enum ldlm_mode rs_modes[RS_MAX_LOCKS]; -}; - -struct ptlrpc_thread; - -/** RPC stages */ -enum rq_phase { - RQ_PHASE_NEW = 0xebc0de00, - RQ_PHASE_RPC = 0xebc0de01, - RQ_PHASE_BULK = 0xebc0de02, - RQ_PHASE_INTERPRET = 0xebc0de03, - RQ_PHASE_COMPLETE = 0xebc0de04, - RQ_PHASE_UNREG_RPC = 0xebc0de05, - RQ_PHASE_UNREG_BULK = 0xebc0de06, - RQ_PHASE_UNDEFINED = 0xebc0de07 -}; - -/** Type of request interpreter call-back */ -typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, - struct ptlrpc_request *req, - void *arg, int rc); - -/** - * Definition of request pool structure. - * The pool is used to store empty preallocated requests for the case - * when we would actually need to send something without performing - * any allocations (to avoid e.g. OOM). - */ -struct ptlrpc_request_pool { - /** Locks the list */ - spinlock_t prp_lock; - /** list of ptlrpc_request structs */ - struct list_head prp_req_list; - /** Maximum message size that would fit into a request from this pool */ - int prp_rq_size; - /** Function to allocate more requests for this pool */ - int (*prp_populate)(struct ptlrpc_request_pool *, int); -}; - -struct lu_context; -struct lu_env; - -struct ldlm_lock; - -#include - -/** - * Basic request prioritization operations structure. 
- * The whole idea is centered around locks and RPCs that might affect locks. - * When a lock is contended we try to give priority to RPCs that might lead - * to fastest release of that lock. - * Currently only implemented for OSTs only in a way that makes all - * IO and truncate RPCs that are coming from a locked region where a lock is - * contended a priority over other requests. - */ -struct ptlrpc_hpreq_ops { - /** - * Check if the lock handle of the given lock is the same as - * taken from the request. - */ - int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); - /** - * Check if the request is a high priority one. - */ - int (*hpreq_check)(struct ptlrpc_request *); - /** - * Called after the request has been handled. - */ - void (*hpreq_fini)(struct ptlrpc_request *); -}; - -struct ptlrpc_cli_req { - /** For bulk requests on client only: bulk descriptor */ - struct ptlrpc_bulk_desc *cr_bulk; - /** optional time limit for send attempts */ - long cr_delay_limit; - /** time request was first queued */ - unsigned long cr_queued_time; - /** request sent timeval */ - struct timespec64 cr_sent_tv; - /** time for request really sent out */ - time64_t cr_sent_out; - /** when req reply unlink must finish. */ - time64_t cr_reply_deadline; - /** when req bulk unlink must finish. */ - time64_t cr_bulk_deadline; - /** when req unlink must finish. */ - time64_t cr_req_deadline; - /** Portal to which this request would be sent */ - short cr_req_ptl; - /** Portal where to wait for reply and where reply would be sent */ - short cr_rep_ptl; - /** request resending number */ - unsigned int cr_resend_nr; - /** What was import generation when this request was sent */ - int cr_imp_gen; - enum lustre_imp_state cr_send_state; - /** Per-request waitq introduced by bug 21938 for recovery waiting */ - wait_queue_head_t cr_set_waitq; - /** Link item for request set lists */ - struct list_head cr_set_chain; - /** link to waited ctx */ - struct list_head cr_ctx_chain; - - /** client's half ctx */ - struct ptlrpc_cli_ctx *cr_cli_ctx; - /** Link back to the request set */ - struct ptlrpc_request_set *cr_set; - /** outgoing request MD handle */ - struct lnet_handle_md cr_req_md_h; - /** request-out callback parameter */ - struct ptlrpc_cb_id cr_req_cbid; - /** incoming reply MD handle */ - struct lnet_handle_md cr_reply_md_h; - wait_queue_head_t cr_reply_waitq; - /** reply callback parameter */ - struct ptlrpc_cb_id cr_reply_cbid; - /** Async completion handler, called when reply is received */ - ptlrpc_interpterer_t cr_reply_interp; - /** Async completion context */ - union ptlrpc_async_args cr_async_args; - /** Opaq data for replay and commit callbacks. */ - void *cr_cb_data; - /** Link to the imp->imp_unreplied_list */ - struct list_head cr_unreplied_list; - /** - * Commit callback, called when request is committed and about to be - * freed. 
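- *
- * A sender arms it along with the opaque argument, e.g. (hypothetical
- * callback name, using the rq_commit_cb/rq_cb_data aliases below):
- *
- *	req->rq_commit_cb = my_commit_cb;
- *	req->rq_cb_data = my_cookie;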
- */
-	void (*cr_commit_cb)(struct ptlrpc_request *);
-	/** Replay callback, called after request is replayed at recovery */
-	void (*cr_replay_cb)(struct ptlrpc_request *);
-};
-
-/** client request member aliases */
-/* NB: these aliases should NOT be used by any new code, instead they should
- * be removed step by step to avoid potential abuse
- */
-#define rq_bulk rq_cli.cr_bulk
-#define rq_delay_limit rq_cli.cr_delay_limit
-#define rq_queued_time rq_cli.cr_queued_time
-#define rq_sent_tv rq_cli.cr_sent_tv
-#define rq_real_sent rq_cli.cr_sent_out
-#define rq_reply_deadline rq_cli.cr_reply_deadline
-#define rq_bulk_deadline rq_cli.cr_bulk_deadline
-#define rq_req_deadline rq_cli.cr_req_deadline
-#define rq_nr_resend rq_cli.cr_resend_nr
-#define rq_request_portal rq_cli.cr_req_ptl
-#define rq_reply_portal rq_cli.cr_rep_ptl
-#define rq_import_generation rq_cli.cr_imp_gen
-#define rq_send_state rq_cli.cr_send_state
-#define rq_set_chain rq_cli.cr_set_chain
-#define rq_ctx_chain rq_cli.cr_ctx_chain
-#define rq_set rq_cli.cr_set
-#define rq_set_waitq rq_cli.cr_set_waitq
-#define rq_cli_ctx rq_cli.cr_cli_ctx
-#define rq_req_md_h rq_cli.cr_req_md_h
-#define rq_req_cbid rq_cli.cr_req_cbid
-#define rq_reply_md_h rq_cli.cr_reply_md_h
-#define rq_reply_waitq rq_cli.cr_reply_waitq
-#define rq_reply_cbid rq_cli.cr_reply_cbid
-#define rq_interpret_reply rq_cli.cr_reply_interp
-#define rq_async_args rq_cli.cr_async_args
-#define rq_cb_data rq_cli.cr_cb_data
-#define rq_unreplied_list rq_cli.cr_unreplied_list
-#define rq_commit_cb rq_cli.cr_commit_cb
-#define rq_replay_cb rq_cli.cr_replay_cb
-
-struct ptlrpc_srv_req {
-	/** initial thread servicing this request */
-	struct ptlrpc_thread *sr_svc_thread;
-	/**
-	 * Server-side list of incoming unserved requests sorted by arrival
-	 * time. Traversed from time to time to notice about-to-expire
-	 * requests, and to send back "early replies" to clients to let them
-	 * know the server is alive and well, just too busy to service their
-	 * requests in time.
-	 */
-	struct list_head sr_timed_list;
-	/** server-side per-export list */
-	struct list_head sr_exp_list;
-	/** server-side history, used for debugging purposes.
*/ - struct list_head sr_hist_list; - /** history sequence # */ - __u64 sr_hist_seq; - /** the index of service's srv_at_array into which request is linked */ - time64_t sr_at_index; - /** authed uid */ - uid_t sr_auth_uid; - /** authed uid mapped to */ - uid_t sr_auth_mapped_uid; - /** RPC is generated from what part of Lustre */ - enum lustre_sec_part sr_sp_from; - /** request session context */ - struct lu_context sr_ses; - /** \addtogroup nrs - * @{ - */ - /** stub for NRS request */ - struct ptlrpc_nrs_request sr_nrq; - /** @} nrs */ - /** request arrival time */ - struct timespec64 sr_arrival_time; - /** server's half ctx */ - struct ptlrpc_svc_ctx *sr_svc_ctx; - /** (server side), pointed directly into req buffer */ - struct ptlrpc_user_desc *sr_user_desc; - /** separated reply state */ - struct ptlrpc_reply_state *sr_reply_state; - /** server-side hp handlers */ - struct ptlrpc_hpreq_ops *sr_ops; - /** incoming request buffer */ - struct ptlrpc_request_buffer_desc *sr_rqbd; -}; - -/** server request member alias */ -/* NB: these alias should NOT be used by any new code, instead they should - * be removed step by step to avoid potential abuse - */ -#define rq_svc_thread rq_srv.sr_svc_thread -#define rq_timed_list rq_srv.sr_timed_list -#define rq_exp_list rq_srv.sr_exp_list -#define rq_history_list rq_srv.sr_hist_list -#define rq_history_seq rq_srv.sr_hist_seq -#define rq_at_index rq_srv.sr_at_index -#define rq_auth_uid rq_srv.sr_auth_uid -#define rq_auth_mapped_uid rq_srv.sr_auth_mapped_uid -#define rq_sp_from rq_srv.sr_sp_from -#define rq_session rq_srv.sr_ses -#define rq_nrq rq_srv.sr_nrq -#define rq_arrival_time rq_srv.sr_arrival_time -#define rq_reply_state rq_srv.sr_reply_state -#define rq_svc_ctx rq_srv.sr_svc_ctx -#define rq_user_desc rq_srv.sr_user_desc -#define rq_ops rq_srv.sr_ops -#define rq_rqbd rq_srv.sr_rqbd - -/** - * Represents remote procedure call. - * - * This is a staple structure used by everybody wanting to send a request - * in Lustre. - */ -struct ptlrpc_request { - /* Request type: one of PTL_RPC_MSG_* */ - int rq_type; - /** Result of request processing */ - int rq_status; - /** - * Linkage item through which this request is included into - * sending/delayed lists on client and into rqbd list on server - */ - struct list_head rq_list; - /** Lock to protect request flags and some other important bits, like - * rq_list - */ - spinlock_t rq_lock; - /** client-side flags are serialized by rq_lock @{ */ - unsigned int rq_intr:1, rq_replied:1, rq_err:1, - rq_timedout:1, rq_resend:1, rq_restart:1, - /** - * when ->rq_replay is set, request is kept by the client even - * after server commits corresponding transaction. This is - * used for operations that require sequence of multiple - * requests to be replayed. The only example currently is file - * open/close. When last request in such a sequence is - * committed, ->rq_replay is cleared on all requests in the - * sequence. 
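- * For example, an open RPC and its matching close RPC both stay on the
- * replay list with ->rq_replay set; only once the close transaction
- * commits on the server is ->rq_replay cleared on both of them.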
- */ - rq_replay:1, - rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, - rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, - rq_early:1, - rq_req_unlinked:1, /* unlinked request buffer from lnet */ - rq_reply_unlinked:1, /* unlinked reply buffer from lnet */ - rq_memalloc:1, /* req originated from "kswapd" */ - rq_committed:1, - rq_reply_truncated:1, - /** whether the "rq_set" is a valid one */ - rq_invalid_rqset:1, - rq_generation_set:1, - /** do not resend request on -EINPROGRESS */ - rq_no_retry_einprogress:1, - /* allow the req to be sent if the import is in recovery - * status - */ - rq_allow_replay:1, - /* bulk request, sent to server, but uncommitted */ - rq_unstable:1; - /** @} */ - - /** server-side flags @{ */ - unsigned int - rq_hp:1, /**< high priority RPC */ - rq_at_linked:1, /**< link into service's srv_at_array */ - rq_packed_final:1; /**< packed final reply */ - /** @} */ - - /** one of RQ_PHASE_* */ - enum rq_phase rq_phase; - /** one of RQ_PHASE_* to be used next */ - enum rq_phase rq_next_phase; - /** - * client-side refcount for SENT race, server-side refcount - * for multiple replies - */ - atomic_t rq_refcount; - /** - * client-side: - * !rq_truncate : # reply bytes actually received, - * rq_truncate : required repbuf_len for resend - */ - int rq_nob_received; - /** Request length */ - int rq_reqlen; - /** Reply length */ - int rq_replen; - /** Pool if request is from preallocated list */ - struct ptlrpc_request_pool *rq_pool; - /** Request message - what client sent */ - struct lustre_msg *rq_reqmsg; - /** Reply message - server response */ - struct lustre_msg *rq_repmsg; - /** Transaction number */ - __u64 rq_transno; - /** xid */ - __u64 rq_xid; - /** bulk match bits */ - u64 rq_mbits; - /** - * List item to for replay list. Not yet committed requests get linked - * there. - * Also see \a rq_replay comment above. 
- * It's also link chain on obd_export::exp_req_replay_queue - */ - struct list_head rq_replay_list; - /** non-shared members for client & server request*/ - union { - struct ptlrpc_cli_req rq_cli; - struct ptlrpc_srv_req rq_srv; - }; - /** - * security and encryption data - * @{ - */ - /** description of flavors for client & server */ - struct sptlrpc_flavor rq_flvr; - - /* client/server security flags */ - unsigned int - rq_ctx_init:1, /* context initiation */ - rq_ctx_fini:1, /* context destroy */ - rq_bulk_read:1, /* request bulk read */ - rq_bulk_write:1, /* request bulk write */ - /* server authentication flags */ - rq_auth_gss:1, /* authenticated by gss */ - rq_auth_usr_root:1, /* authed as root */ - rq_auth_usr_mdt:1, /* authed as mdt */ - rq_auth_usr_ost:1, /* authed as ost */ - /* security tfm flags */ - rq_pack_udesc:1, - rq_pack_bulk:1, - /* doesn't expect reply FIXME */ - rq_no_reply:1, - rq_pill_init:1, /* pill initialized */ - rq_srv_req:1; /* server request */ - - /** various buffer pointers */ - struct lustre_msg *rq_reqbuf; /**< req wrapper */ - char *rq_repbuf; /**< rep buffer */ - struct lustre_msg *rq_repdata; /**< rep wrapper msg */ - /** only in priv mode */ - struct lustre_msg *rq_clrbuf; - int rq_reqbuf_len; /* req wrapper buf len */ - int rq_reqdata_len; /* req wrapper msg len */ - int rq_repbuf_len; /* rep buffer len */ - int rq_repdata_len; /* rep wrapper msg len */ - int rq_clrbuf_len; /* only in priv mode */ - int rq_clrdata_len; /* only in priv mode */ - - /** early replies go to offset 0, regular replies go after that */ - unsigned int rq_reply_off; - - /** @} */ - - /** Fields that help to see if request and reply were swabbed or not */ - __u32 rq_req_swab_mask; - __u32 rq_rep_swab_mask; - - /** how many early replies (for stats) */ - int rq_early_count; - - /** Server-side, export on which request was received */ - struct obd_export *rq_export; - /** import where request is being sent */ - struct obd_import *rq_import; - /** our LNet NID */ - lnet_nid_t rq_self; - /** Peer description (the other side) */ - struct lnet_process_id rq_peer; - /** - * service time estimate (secs) - * If the request is not served by this time, it is marked as timed out. - */ - int rq_timeout; - /** - * when request/reply sent (secs), or time when request should be sent - */ - time64_t rq_sent; - /** when request must finish. */ - time64_t rq_deadline; - /** request format description */ - struct req_capsule rq_pill; -}; - -/** - * Call completion handler for rpc if any, return it's status or original - * rc if there was no handler defined for this request. - */ -static inline int ptlrpc_req_interpret(const struct lu_env *env, - struct ptlrpc_request *req, int rc) -{ - if (req->rq_interpret_reply) { - req->rq_status = req->rq_interpret_reply(env, req, - &req->rq_async_args, - rc); - return req->rq_status; - } - return rc; -} - -/* - * Can the request be moved from the regular NRS head to the high-priority NRS - * head (of the same PTLRPC service partition), if any? - * - * For a reliable result, this should be checked under svcpt->scp_req lock. - */ -static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) -{ - struct ptlrpc_nrs_request *nrq = &req->rq_nrq; - - /** - * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the - * request has been enqueued first, and ptlrpc_nrs_request::nr_started - * to make sure it has not been scheduled yet (analogous to previous - * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list). 
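- *
- * In other words, a request is movable only in the window after it has
- * been enqueued and before it is first scheduled, and only if it is not
- * already flagged as high priority.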
- */ - return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; -} - -/** @} nrs */ - -/** - * Returns 1 if request buffer at offset \a index was already swabbed - */ -static inline int lustre_req_swabbed(struct ptlrpc_request *req, size_t index) -{ - LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); - return req->rq_req_swab_mask & (1 << index); -} - -/** - * Returns 1 if request reply buffer at offset \a index was already swabbed - */ -static inline int lustre_rep_swabbed(struct ptlrpc_request *req, size_t index) -{ - LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); - return req->rq_rep_swab_mask & (1 << index); -} - -/** - * Returns 1 if request needs to be swabbed into local cpu byteorder - */ -static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) -{ - return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); -} - -/** - * Returns 1 if request reply needs to be swabbed into local cpu byteorder - */ -static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) -{ - return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); -} - -/** - * Mark request buffer at offset \a index that it was already swabbed - */ -static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, - size_t index) -{ - LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); - LASSERT((req->rq_req_swab_mask & (1 << index)) == 0); - req->rq_req_swab_mask |= 1 << index; -} - -/** - * Mark request reply buffer at offset \a index that it was already swabbed - */ -static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, - size_t index) -{ - LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); - LASSERT((req->rq_rep_swab_mask & (1 << index)) == 0); - req->rq_rep_swab_mask |= 1 << index; -} - -/** - * Convert numerical request phase value \a phase into text string description - */ -static inline const char * -ptlrpc_phase2str(enum rq_phase phase) -{ - switch (phase) { - case RQ_PHASE_NEW: - return "New"; - case RQ_PHASE_RPC: - return "Rpc"; - case RQ_PHASE_BULK: - return "Bulk"; - case RQ_PHASE_INTERPRET: - return "Interpret"; - case RQ_PHASE_COMPLETE: - return "Complete"; - case RQ_PHASE_UNREG_RPC: - return "UnregRPC"; - case RQ_PHASE_UNREG_BULK: - return "UnregBULK"; - default: - return "?Phase?"; - } -} - -/** - * Convert numerical request phase of the request \a req into text stringi - * description - */ -static inline const char * -ptlrpc_rqphase2str(struct ptlrpc_request *req) -{ - return ptlrpc_phase2str(req->rq_phase); -} - -/** - * Debugging functions and helpers to print request structure into debug log - * @{ - */ -/* Spare the preprocessor, spoil the bugs. */ -#define FLAG(field, str) (field ? str : "") - -/** Convert bit flags into a string */ -#define DEBUG_REQ_FLAGS(req) \ - ptlrpc_rqphase2str(req), \ - FLAG(req->rq_intr, "I"), FLAG(req->rq_replied, "R"), \ - FLAG(req->rq_err, "E"), FLAG(req->rq_net_err, "e"), \ - FLAG(req->rq_timedout, "X") /* eXpired */, FLAG(req->rq_resend, "S"), \ - FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ - FLAG(req->rq_no_resend, "N"), \ - FLAG(req->rq_waiting, "W"), \ - FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H"), \ - FLAG(req->rq_committed, "M") - -#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s%s" - -void _debug_req(struct ptlrpc_request *req, - struct libcfs_debug_msg_data *data, const char *fmt, ...) - __printf(3, 4); - -/** - * Helper that decides if we need to print request according to current debug - * level settings - */ -#define debug_req(msgdata, mask, cdls, req, fmt, a...) 
\ -do { \ - CFS_CHECK_STACK(msgdata, mask, cdls); \ - \ - if (((mask) & D_CANTMASK) != 0 || \ - ((libcfs_debug & (mask)) != 0 && \ - (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ - _debug_req((req), msgdata, fmt, ##a); \ -} while (0) - -/** - * This is the debug print function you need to use to print request structure - * content into lustre debug log. - * for most callers (level is a constant) this is resolved at compile time - */ -#define DEBUG_REQ(level, req, fmt, args...) \ -do { \ - if ((level) & (D_ERROR | D_WARNING)) { \ - static struct cfs_debug_limit_state cdls; \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ - debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ - } else { \ - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ - debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ - } \ -} while (0) -/** @} */ - -/** - * Structure that defines a single page of a bulk transfer - */ -struct ptlrpc_bulk_page { - /** Linkage to list of pages in a bulk */ - struct list_head bp_link; - /** - * Number of bytes in a page to transfer starting from \a bp_pageoffset - */ - int bp_buflen; - /** offset within a page */ - int bp_pageoffset; - /** The page itself */ - struct page *bp_page; -}; - -enum ptlrpc_bulk_op_type { - PTLRPC_BULK_OP_ACTIVE = 0x00000001, - PTLRPC_BULK_OP_PASSIVE = 0x00000002, - PTLRPC_BULK_OP_PUT = 0x00000004, - PTLRPC_BULK_OP_GET = 0x00000008, - PTLRPC_BULK_BUF_KVEC = 0x00000010, - PTLRPC_BULK_BUF_KIOV = 0x00000020, - PTLRPC_BULK_GET_SOURCE = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_GET, - PTLRPC_BULK_PUT_SINK = PTLRPC_BULK_OP_PASSIVE | PTLRPC_BULK_OP_PUT, - PTLRPC_BULK_GET_SINK = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_GET, - PTLRPC_BULK_PUT_SOURCE = PTLRPC_BULK_OP_ACTIVE | PTLRPC_BULK_OP_PUT, -}; - -static inline bool ptlrpc_is_bulk_op_get(enum ptlrpc_bulk_op_type type) -{ - return (type & PTLRPC_BULK_OP_GET) == PTLRPC_BULK_OP_GET; -} - -static inline bool ptlrpc_is_bulk_get_source(enum ptlrpc_bulk_op_type type) -{ - return (type & PTLRPC_BULK_GET_SOURCE) == PTLRPC_BULK_GET_SOURCE; -} - -static inline bool ptlrpc_is_bulk_put_sink(enum ptlrpc_bulk_op_type type) -{ - return (type & PTLRPC_BULK_PUT_SINK) == PTLRPC_BULK_PUT_SINK; -} - -static inline bool ptlrpc_is_bulk_get_sink(enum ptlrpc_bulk_op_type type) -{ - return (type & PTLRPC_BULK_GET_SINK) == PTLRPC_BULK_GET_SINK; -} - -static inline bool ptlrpc_is_bulk_put_source(enum ptlrpc_bulk_op_type type) -{ - return (type & PTLRPC_BULK_PUT_SOURCE) == PTLRPC_BULK_PUT_SOURCE; -} - -static inline bool ptlrpc_is_bulk_desc_kvec(enum ptlrpc_bulk_op_type type) -{ - return ((type & PTLRPC_BULK_BUF_KVEC) | (type & PTLRPC_BULK_BUF_KIOV)) - == PTLRPC_BULK_BUF_KVEC; -} - -static inline bool ptlrpc_is_bulk_desc_kiov(enum ptlrpc_bulk_op_type type) -{ - return ((type & PTLRPC_BULK_BUF_KVEC) | (type & PTLRPC_BULK_BUF_KIOV)) - == PTLRPC_BULK_BUF_KIOV; -} - -static inline bool ptlrpc_is_bulk_op_active(enum ptlrpc_bulk_op_type type) -{ - return ((type & PTLRPC_BULK_OP_ACTIVE) | - (type & PTLRPC_BULK_OP_PASSIVE)) == PTLRPC_BULK_OP_ACTIVE; -} - -static inline bool ptlrpc_is_bulk_op_passive(enum ptlrpc_bulk_op_type type) -{ - return ((type & PTLRPC_BULK_OP_ACTIVE) | - (type & PTLRPC_BULK_OP_PASSIVE)) == PTLRPC_BULK_OP_PASSIVE; -} - -struct ptlrpc_bulk_frag_ops { - /** - * Add a page \a page to the bulk descriptor \a desc - * Data to transfer in the page starts at offset \a pageoffset and - * amount of data to transfer from the page is \a len - */ - void (*add_kiov_frag)(struct ptlrpc_bulk_desc *desc, - 
			      struct page *page, int pageoffset, int len);
-
-	/*
-	 * Add a \a fragment to the bulk descriptor \a desc.
-	 * Data to transfer in the fragment is pointed to by \a frag.
-	 * The size of the fragment is \a len.
-	 */
-	int (*add_iov_frag)(struct ptlrpc_bulk_desc *desc, void *frag, int len);
-
-	/**
-	 * Uninitialize and free bulk descriptor \a desc.
-	 * Works on bulk descriptors from both the server and client side.
-	 */
-	void (*release_frags)(struct ptlrpc_bulk_desc *desc);
-};
-
-extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops;
-extern const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops;
-
-/**
- * Definition of bulk descriptor.
- * Bulks are special "two phase" RPCs where the initial request message
- * is sent first and is followed by a transfer (or reception) of a large
- * amount of data to be settled into pages referenced from the bulk descriptors.
- * Bulk transfers (the actual data following the small requests) are done
- * on separate LNet portals.
- * In Lustre we use bulk transfers for READ and WRITE transfers from/to OSTs.
- * Another user is readpage for the MDT.
- */
-struct ptlrpc_bulk_desc {
-	/** completed with failure */
-	unsigned long bd_failure:1;
-	/** client side */
-	unsigned long bd_registered:1;
-	/** For serialization with callback */
-	spinlock_t bd_lock;
-	/** Import generation when request for this bulk was sent */
-	int bd_import_generation;
-	/** {put,get}{source,sink}{kvec,kiov} */
-	enum ptlrpc_bulk_op_type bd_type;
-	/** LNet portal for this bulk */
-	__u32 bd_portal;
-	/** Server side - export this bulk was created for */
-	struct obd_export *bd_export;
-	/** Client side - import this bulk was sent on */
-	struct obd_import *bd_import;
-	/** Back pointer to the request */
-	struct ptlrpc_request *bd_req;
-	struct ptlrpc_bulk_frag_ops *bd_frag_ops;
-	wait_queue_head_t bd_waitq;	/* server side only WQ */
-	int bd_iov_count;	/* # entries in bd_iov */
-	int bd_max_iov;	/* allocated size of bd_iov */
-	int bd_nob;	/* # bytes covered */
-	int bd_nob_transferred; /* # bytes GOT/PUT */
-
-	u64 bd_last_mbits;
-
-	struct ptlrpc_cb_id bd_cbid;	/* network callback info */
-	lnet_nid_t bd_sender;	/* stash event::sender */
-	int bd_md_count;	/* # valid entries in bd_mds */
-	int bd_md_max_brw;	/* max entries in bd_mds */
-	/** array of associated MDs */
-	struct lnet_handle_md bd_mds[PTLRPC_BULK_OPS_COUNT];
-
-	union {
-		struct {
-			/*
-			 * encrypt iov, size is either 0 or bd_iov_count.
- */ - struct bio_vec *bd_enc_vec; - struct bio_vec *bd_vec; /* Array of bio_vecs */ - } bd_kiov; - - struct { - struct kvec *bd_enc_kvec; - struct kvec *bd_kvec; /* Array of kvecs */ - } bd_kvec; - } bd_u; -}; - -#define GET_KIOV(desc) ((desc)->bd_u.bd_kiov.bd_vec) -#define BD_GET_KIOV(desc, i) ((desc)->bd_u.bd_kiov.bd_vec[i]) -#define GET_ENC_KIOV(desc) ((desc)->bd_u.bd_kiov.bd_enc_vec) -#define BD_GET_ENC_KIOV(desc, i) ((desc)->bd_u.bd_kiov.bd_enc_vec[i]) -#define GET_KVEC(desc) ((desc)->bd_u.bd_kvec.bd_kvec) -#define BD_GET_KVEC(desc, i) ((desc)->bd_u.bd_kvec.bd_kvec[i]) -#define GET_ENC_KVEC(desc) ((desc)->bd_u.bd_kvec.bd_enc_kvec) -#define BD_GET_ENC_KVEC(desc, i) ((desc)->bd_u.bd_kvec.bd_enc_kvec[i]) - -enum { - SVC_STOPPED = 1 << 0, - SVC_STOPPING = 1 << 1, - SVC_STARTING = 1 << 2, - SVC_RUNNING = 1 << 3, -}; - -#define PTLRPC_THR_NAME_LEN 32 -/** - * Definition of server service thread structure - */ -struct ptlrpc_thread { - /** - * List of active threads in svc->srv_threads - */ - struct list_head t_link; - /** - * thread-private data (preallocated memory) - */ - void *t_data; - __u32 t_flags; - /** - * service thread index, from ptlrpc_start_threads - */ - unsigned int t_id; - /** - * service thread pid - */ - pid_t t_pid; - /** - * put watchdog in the structure per thread b=14840 - * - * Lustre watchdog is removed for client in the hope - * of a generic watchdog can be merged in kernel. - * When that happens, we should add below back. - * - * struct lc_watchdog *t_watchdog; - */ - /** - * the svc this thread belonged to b=18582 - */ - struct ptlrpc_service_part *t_svcpt; - wait_queue_head_t t_ctl_waitq; - struct lu_env *t_env; - char t_name[PTLRPC_THR_NAME_LEN]; -}; - -static inline int thread_is_stopped(struct ptlrpc_thread *thread) -{ - return !!(thread->t_flags & SVC_STOPPED); -} - -static inline int thread_is_stopping(struct ptlrpc_thread *thread) -{ - return !!(thread->t_flags & SVC_STOPPING); -} - -static inline int thread_is_starting(struct ptlrpc_thread *thread) -{ - return !!(thread->t_flags & SVC_STARTING); -} - -static inline int thread_is_running(struct ptlrpc_thread *thread) -{ - return !!(thread->t_flags & SVC_RUNNING); -} - -static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) -{ - thread->t_flags &= ~flags; -} - -static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) -{ - thread->t_flags = flags; -} - -static inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) -{ - thread->t_flags |= flags; -} - -static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, - __u32 flags) -{ - if (thread->t_flags & flags) { - thread->t_flags &= ~flags; - return 1; - } - return 0; -} - -/** - * Request buffer descriptor structure. - * This is a structure that contains one posted request buffer for service. - * Once data land into a buffer, event callback creates actual request and - * notifies wakes one of the service threads to process new incoming request. - * More than one request can fit into the buffer. 
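- * For example, with the LDLM service's LDLM_BUFSIZE of 8 KB and
- * LDLM_MAXREQSIZE of 5 KB defined earlier, a buffer is retired from
- * receive duty once less than 5 KB of it remains free.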
- */
-struct ptlrpc_request_buffer_desc {
- /** Link item for rqbds on a service */
- struct list_head rqbd_list;
- /** History of requests for this buffer */
- struct list_head rqbd_reqs;
- /** Back pointer to service for which this buffer is registered */
- struct ptlrpc_service_part *rqbd_svcpt;
- /** LNet descriptor */
- struct lnet_handle_md rqbd_md_h;
- int rqbd_refcount;
- /** The buffer itself */
- char *rqbd_buffer;
- struct ptlrpc_cb_id rqbd_cbid;
- /**
-  * This "embedded" request structure is only used for the
-  * last request to fit into the buffer
-  */
- struct ptlrpc_request rqbd_req;
-};
-
-typedef int (*svc_handler_t)(struct ptlrpc_request *req);
-
-struct ptlrpc_service_ops {
- /**
-  * if non-NULL called during thread creation (ptlrpc_start_thread())
-  * to initialize service specific per-thread state.
-  */
- int (*so_thr_init)(struct ptlrpc_thread *thr);
- /**
-  * if non-NULL called during thread shutdown (ptlrpc_main()) to
-  * destroy state created by ->so_thr_init().
-  */
- void (*so_thr_done)(struct ptlrpc_thread *thr);
- /**
-  * Handler function for incoming requests for this service
-  */
- int (*so_req_handler)(struct ptlrpc_request *req);
- /**
-  * function to determine the priority of the request; it is called
-  * for every new request
-  */
- int (*so_hpreq_handler)(struct ptlrpc_request *);
- /**
-  * service-specific print fn
-  */
- void (*so_req_printer)(void *, struct ptlrpc_request *);
-};
-
-#ifndef __cfs_cacheline_aligned
-/* NB: put it here for reducing patch dependence */
-# define __cfs_cacheline_aligned
-#endif
-
-/**
- * How many high priority requests to serve before serving one normal
- * priority request
- */
-#define PTLRPC_SVC_HP_RATIO 10
-
-/**
- * Definition of PortalRPC service.
- * The service is listening on a particular portal (like tcp port)
- * and performs actions for a specific server like IO service for OST
- * or general metadata service for MDS.
- */
-struct ptlrpc_service {
- /** serialize sysfs operations */
- spinlock_t srv_lock;
- /** most often accessed fields */
- /** chain thru all services */
- struct list_head srv_list;
- /** service operations table */
- struct ptlrpc_service_ops srv_ops;
- /** only statically allocated strings here; we don't clean them */
- char *srv_name;
- /** only statically allocated strings here; we don't clean them */
- char *srv_thread_name;
- /** service thread list */
- struct list_head srv_threads;
- /** # of threads to create for each partition at initialization */
- int srv_nthrs_cpt_init;
- /** limit on the number of threads for each partition */
- int srv_nthrs_cpt_limit;
- /** Root of debugfs dir tree for this service */
- struct dentry *srv_debugfs_entry;
- /** Pointer to statistic data for this service */
- struct lprocfs_stats *srv_stats;
- /** # hp per lp reqs to handle */
- int srv_hpreq_ratio;
- /** biggest request to receive */
- int srv_max_req_size;
- /** biggest reply to send */
- int srv_max_reply_size;
- /** size of individual buffers */
- int srv_buf_size;
- /** # buffers to allocate in 1 group */
- int srv_nbuf_per_group;
- /** Local portal on which to receive requests */
- __u32 srv_req_portal;
- /** Portal on the client to send replies to */
- __u32 srv_rep_portal;
- /**
-  * Tags for lu_context associated with this thread, see struct
-  * lu_context.
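PTLRPC_SVC_HP_RATIO above bounds starvation of normal requests: the server takes at most that many high-priority requests in a row before it must serve one regular request. The userspace sketch below illustrates only that ratio rule; the queues, counters, and output are invented for the example and do not reflect the real dispatch path.

#include <stdio.h>

#define DEMO_HP_RATIO 10	/* stands in for PTLRPC_SVC_HP_RATIO */

/* Toy queues: each counter is the number of queued requests. */
static int hp_queued = 25, reg_queued = 5, hp_streak;

static void serve_one(void)
{
	/* After DEMO_HP_RATIO consecutive HP requests, force one regular
	 * request through so the normal queue cannot starve. */
	if (hp_queued && (hp_streak < DEMO_HP_RATIO || !reg_queued)) {
		hp_queued--;
		hp_streak++;
		printf("served HP  (streak %d)\n", hp_streak);
	} else if (reg_queued) {
		reg_queued--;
		hp_streak = 0;
		printf("served REG (streak reset)\n");
	}
}

int main(void)
{
	while (hp_queued || reg_queued)
		serve_one();
	return 0;
}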
- */
- __u32 srv_ctx_tags;
- /** soft watchdog timeout multiplier */
- int srv_watchdog_factor;
- /** under unregister_service */
- unsigned srv_is_stopping:1;
-
- /** max # request buffers in history per partition */
- int srv_hist_nrqbds_cpt_max;
- /** number of CPTs this service is bound to */
- int srv_ncpts;
- /** array of CPTs this service is bound to */
- __u32 *srv_cpts;
- /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */
- int srv_cpt_bits;
- /** CPT table this service is running over */
- struct cfs_cpt_table *srv_cptable;
-
- /* sysfs object */
- struct kobject srv_kobj;
- struct completion srv_kobj_unregister;
- /**
-  * partition data for ptlrpc service
-  */
- struct ptlrpc_service_part *srv_parts[0];
-};
-
-/**
- * Definition of PortalRPC service partition data.
- * Although a service only has one instance of it right now, we
- * will have multiple instances very soon (one instance per CPT).
- *
- * It has four locks:
- * \a scp_lock
- *    serialize operations on rqbd and requests waiting for preprocess
- * \a scp_req_lock
- *    serialize operations on active requests sent to this portal
- * \a scp_at_lock
- *    serialize adaptive timeout stuff
- * \a scp_rep_lock
- *    serialize operations on RS list (reply states)
- *
- * We don't have any use-case to take two or more locks at the same time
- * for now, so there is no lock order issue.
- */
-struct ptlrpc_service_part {
- /** back reference to owner */
- struct ptlrpc_service *scp_service __cfs_cacheline_aligned;
- /* CPT id, reserved */
- int scp_cpt;
- /** always increasing number */
- int scp_thr_nextid;
- /** # of starting threads */
- int scp_nthrs_starting;
- /** # of stopping threads, reserved for shrinking threads */
- int scp_nthrs_stopping;
- /** # running threads */
- int scp_nthrs_running;
- /** service threads list */
- struct list_head scp_threads;
-
- /**
-  * serialize the following fields, used for protecting
-  * rqbd list and incoming requests waiting for preprocess,
-  * threads starting & stopping are also protected by this lock.
-  */
- spinlock_t scp_lock __cfs_cacheline_aligned;
- /** total # req buffer descs allocated */
- int scp_nrqbds_total;
- /** # posted request buffers for receiving */
- int scp_nrqbds_posted;
- /** in progress of allocating rqbd */
- int scp_rqbd_allocating;
- /** # incoming reqs */
- int scp_nreqs_incoming;
- /** request buffers to be reposted */
- struct list_head scp_rqbd_idle;
- /** req buffers receiving */
- struct list_head scp_rqbd_posted;
- /** incoming reqs */
- struct list_head scp_req_incoming;
- /** timeout before re-posting reqs, in ticks */
- long scp_rqbd_timeout;
- /**
-  * all threads sleep on this. This wait-queue is signalled when a new
-  * incoming request arrives and when a difficult reply has to be handled.
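Each of the four locks just described is tagged __cfs_cacheline_aligned so that unrelated lock traffic within one partition does not false-share a cache line. A hedged userspace approximation with POSIX mutexes follows (compile with -pthread); the demo_part layout and names are illustrative, not the real structure:

#include <pthread.h>
#include <stdio.h>

#define DEMO_CACHELINE 64

/* One lock per concern, each on its own cache line, mirroring the
 * scp_lock / scp_req_lock / scp_at_lock / scp_rep_lock split above. */
struct demo_part {
	pthread_mutex_t lock     __attribute__((aligned(DEMO_CACHELINE)));
	pthread_mutex_t req_lock __attribute__((aligned(DEMO_CACHELINE)));
	pthread_mutex_t at_lock  __attribute__((aligned(DEMO_CACHELINE)));
	pthread_mutex_t rep_lock __attribute__((aligned(DEMO_CACHELINE)));
};

int main(void)
{
	struct demo_part p = {
		.lock     = PTHREAD_MUTEX_INITIALIZER,
		.req_lock = PTHREAD_MUTEX_INITIALIZER,
		.at_lock  = PTHREAD_MUTEX_INITIALIZER,
		.rep_lock = PTHREAD_MUTEX_INITIALIZER,
	};

	/* Each lock starts on its own 64-byte boundary. */
	printf("req_lock offset: %zu\n",
	       (size_t)((char *)&p.req_lock - (char *)&p));
	pthread_mutex_lock(&p.req_lock);	/* touch dispatch state only */
	pthread_mutex_unlock(&p.req_lock);
	return 0;
}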
- */
- wait_queue_head_t scp_waitq;
-
- /** request history */
- struct list_head scp_hist_reqs;
- /** request buffer history */
- struct list_head scp_hist_rqbds;
- /** # request buffers in history */
- int scp_hist_nrqbds;
- /** sequence number for request */
- __u64 scp_hist_seq;
- /** highest seq culled from history */
- __u64 scp_hist_seq_culled;
-
- /**
-  * serialize the following fields, used for processing requests
-  * sent to this portal
-  */
- spinlock_t scp_req_lock __cfs_cacheline_aligned;
- /** # reqs being served (in either of the NRS heads below) */
- int scp_nreqs_active;
- /** # HPreqs being served */
- int scp_nhreqs_active;
- /** # hp requests handled */
- int scp_hreq_count;
-
- /** NRS head for regular requests */
- struct ptlrpc_nrs scp_nrs_reg;
- /** NRS head for HP requests; this is only valid for services that can
-  * handle HP requests
-  */
- struct ptlrpc_nrs *scp_nrs_hp;
-
- /** AT stuff */
- /** @{ */
- /**
-  * serialize the following fields, used for changes on
-  * adaptive timeout
-  */
- spinlock_t scp_at_lock __cfs_cacheline_aligned;
- /** estimated rpc service time */
- struct adaptive_timeout scp_at_estimate;
- /** reqs waiting for replies */
- struct ptlrpc_at_array scp_at_array;
- /** early reply timer */
- struct timer_list scp_at_timer;
- /** debug */
- unsigned long scp_at_checktime;
- /** check early replies */
- unsigned scp_at_check;
- /** @} */
-
- /**
-  * serialize the following fields, used for processing
-  * replies for this portal
-  */
- spinlock_t scp_rep_lock __cfs_cacheline_aligned;
- /** all the active replies */
- struct list_head scp_rep_active;
- /** List of free reply_states */
- struct list_head scp_rep_idle;
- /** waitq to signal when adding entries to \a scp_rep_idle */
- wait_queue_head_t scp_rep_waitq;
- /** # 'difficult' replies */
- atomic_t scp_nreps_difficult;
-};
-
-#define ptlrpc_service_for_each_part(part, i, svc) \
- for (i = 0; \
-      i < (svc)->srv_ncpts && \
-      (svc)->srv_parts && \
-      ((part) = (svc)->srv_parts[i]); i++)
-
-/**
- * Declaration of ptlrpcd control structure
- */
-struct ptlrpcd_ctl {
- /**
-  * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE)
-  */
- unsigned long pc_flags;
- /**
-  * Thread lock protecting structure fields.
-  */
- spinlock_t pc_lock;
- /**
-  * Start completion.
-  */
- struct completion pc_starting;
- /**
-  * Stop completion.
-  */
- struct completion pc_finishing;
- /**
-  * Thread requests set.
-  */
- struct ptlrpc_request_set *pc_set;
- /**
-  * Thread name used in kthread_run()
-  */
- char pc_name[16];
- /**
-  * CPT the thread is bound on.
-  */
- int pc_cpt;
- /**
-  * Index of ptlrpcd thread in the array.
-  */
- int pc_index;
- /**
-  * Pointer to the array of partners' ptlrpcd_ctl structure.
-  */
- struct ptlrpcd_ctl **pc_partners;
- /**
-  * Number of the ptlrpcd's partners.
-  */
- int pc_npartners;
- /**
-  * Record the partner index to be processed next.
-  */
- int pc_cursor;
- /**
-  * Error code if the thread failed to fully start.
-  */
- int pc_error;
-};
-
-/* Bits for pc_flags */
-enum ptlrpcd_ctl_flags {
- /**
-  * Ptlrpc thread start flag.
-  */
- LIOD_START = 1 << 0,
- /**
-  * Ptlrpc thread stop flag.
-  */
- LIOD_STOP = 1 << 1,
- /**
-  * Ptlrpc thread force flag (only forced stop so far).
-  * When LIOD_STOP is also specified, this causes any in-flight RPCs
-  * handled by the thread to be aborted.
-  */
- LIOD_FORCE = 1 << 2,
- /**
-  * This is a recovery ptlrpc thread.
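ptlrpc_service_for_each_part() above iterates a service's per-CPT partition array, stopping early if the array or a slot is missing. A self-contained userspace sketch of the same macro shape; the demo_* names are invented for the example:

#include <stdio.h>

struct demo_part { int id; };
struct demo_svc { int srv_ncpts; struct demo_part **srv_parts; };

/* Same shape as ptlrpc_service_for_each_part(): bail out early when
 * the partition array is absent or a slot is NULL. */
#define demo_for_each_part(part, i, svc)			\
	for (i = 0;						\
	     i < (svc)->srv_ncpts && (svc)->srv_parts &&	\
	     ((part) = (svc)->srv_parts[i]); i++)

int main(void)
{
	struct demo_part a = { 0 }, b = { 1 };
	struct demo_part *parts[] = { &a, &b };
	struct demo_svc svc = { 2, parts };
	struct demo_part *part;
	int i;

	demo_for_each_part(part, i, &svc)
		printf("partition %d\n", part->id);
	return 0;
}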
- */ - LIOD_RECOVERY = 1 << 3, -}; - -/** - * \addtogroup nrs - * @{ - * - * Service compatibility function; the policy is compatible with all services. - * - * \param[in] svc The service the policy is attempting to register with. - * \param[in] desc The policy descriptor - * - * \retval true The policy is compatible with the service - * - * \see ptlrpc_nrs_pol_desc::pd_compat() - */ -static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, - const struct ptlrpc_nrs_pol_desc *desc) -{ - return true; -} - -/** - * Service compatibility function; the policy is compatible with only a specific - * service which is identified by its human-readable name at - * ptlrpc_service::srv_name. - * - * \param[in] svc The service the policy is attempting to register with. - * \param[in] desc The policy descriptor - * - * \retval false The policy is not compatible with the service - * \retval true The policy is compatible with the service - * - * \see ptlrpc_nrs_pol_desc::pd_compat() - */ -static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, - const struct ptlrpc_nrs_pol_desc *desc) -{ - return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; -} - -/** @} nrs */ - -/* ptlrpc/events.c */ -extern struct lnet_handle_eq ptlrpc_eq_h; -int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, - struct lnet_process_id *peer, lnet_nid_t *self); -/** - * These callbacks are invoked by LNet when something happened to - * underlying buffer - * @{ - */ -void request_out_callback(struct lnet_event *ev); -void reply_in_callback(struct lnet_event *ev); -void client_bulk_callback(struct lnet_event *ev); -void request_in_callback(struct lnet_event *ev); -void reply_out_callback(struct lnet_event *ev); -/** @} */ - -/* ptlrpc/connection.c */ -struct ptlrpc_connection *ptlrpc_connection_get(struct lnet_process_id peer, - lnet_nid_t self, - struct obd_uuid *uuid); -int ptlrpc_connection_put(struct ptlrpc_connection *c); -struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); -int ptlrpc_connection_init(void); -void ptlrpc_connection_fini(void); - -/* ptlrpc/niobuf.c */ -/** - * Actual interfacing with LNet to put/get/register/unregister stuff - * @{ - */ - -int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); - -static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) -{ - struct ptlrpc_bulk_desc *desc; - int rc; - - desc = req->rq_bulk; - - if (req->rq_bulk_deadline > ktime_get_real_seconds()) - return 1; - - if (!desc) - return 0; - - spin_lock(&desc->bd_lock); - rc = desc->bd_md_count; - spin_unlock(&desc->bd_lock); - return rc; -} - -#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 -#define PTLRPC_REPLY_EARLY 0x02 -int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); -int ptlrpc_reply(struct ptlrpc_request *req); -int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); -int ptlrpc_error(struct ptlrpc_request *req); -int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); -int ptl_send_rpc(struct ptlrpc_request *request, int noreply); -int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); -/** @} */ - -/* ptlrpc/client.c */ -/** - * Client-side portals API. Everything to send requests, receive replies, - * request queues, request management, etc. 
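ptlrpc_client_bulk_active() above shows a sampling idiom used throughout this header: honor a grace deadline first, then read the in-flight descriptor count under the same lock the completion callback takes. A userspace approximation with a POSIX mutex (compile with -pthread); the demo_bulk fields are illustrative stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <time.h>

/* Mirrors the shape of ptlrpc_client_bulk_active(): the deadline is
 * checked before the live MD count is sampled under the lock. */
struct demo_bulk {
	pthread_mutex_t bd_lock;
	int bd_md_count;	/* network descriptors still in flight */
};

static int demo_bulk_active(struct demo_bulk *desc, time_t deadline)
{
	int rc;

	if (deadline > time(NULL))
		return 1;	/* still inside the grace period */

	pthread_mutex_lock(&desc->bd_lock);
	rc = desc->bd_md_count;
	pthread_mutex_unlock(&desc->bd_lock);
	return rc;
}

int main(void)
{
	struct demo_bulk d = {
		.bd_lock = PTHREAD_MUTEX_INITIALIZER,
		.bd_md_count = 2,
	};

	printf("active: %d\n", demo_bulk_active(&d, 0));
	return 0;
}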
- * @{ - */ -void ptlrpc_request_committed(struct ptlrpc_request *req, int force); - -int ptlrpc_inc_ref(void); -void ptlrpc_dec_ref(void); - -void ptlrpc_init_client(int req_portal, int rep_portal, char *name, - struct ptlrpc_client *); -struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); - -int ptlrpc_queue_wait(struct ptlrpc_request *req); -int ptlrpc_replay_req(struct ptlrpc_request *req); -void ptlrpc_abort_inflight(struct obd_import *imp); -void ptlrpc_abort_set(struct ptlrpc_request_set *set); - -struct ptlrpc_request_set *ptlrpc_prep_set(void); -struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, - void *arg); -int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); -int ptlrpc_set_wait(struct ptlrpc_request_set *); -void ptlrpc_mark_interrupted(struct ptlrpc_request *req); -void ptlrpc_set_destroy(struct ptlrpc_request_set *); -void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); - -void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); -int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); - -struct ptlrpc_request_pool * -ptlrpc_init_rq_pool(int, int, - int (*populate_pool)(struct ptlrpc_request_pool *, int)); - -void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); -struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, - const struct req_format *format); -struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, - struct ptlrpc_request_pool *, - const struct req_format *); -void ptlrpc_request_free(struct ptlrpc_request *request); -int ptlrpc_request_pack(struct ptlrpc_request *request, - __u32 version, int opcode); -struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *, - const struct req_format *, - __u32, int); -int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, - __u32 version, int opcode, char **bufs, - struct ptlrpc_cli_ctx *ctx); -void ptlrpc_req_finished(struct ptlrpc_request *request); -struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); -struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, - unsigned int nfrags, - unsigned int max_brw, - unsigned int type, - unsigned int portal, - const struct ptlrpc_bulk_frag_ops *ops); - -int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, - void *frag, int len); -void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, - struct page *page, int pageoffset, int len, - int pin); -static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, - struct page *page, int pageoffset, - int len) -{ - __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); -} - -static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, - struct page *page, int pageoffset, - int len) -{ - __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); -} - -void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk); - -static inline void ptlrpc_release_bulk_page_pin(struct ptlrpc_bulk_desc *desc) -{ - int i; - - for (i = 0; i < desc->bd_iov_count ; i++) - put_page(BD_GET_KIOV(desc, i).bv_page); -} - -void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, - struct obd_import *imp); -__u64 ptlrpc_next_xid(void); -__u64 ptlrpc_sample_next_xid(void); -__u64 ptlrpc_req_xid(struct ptlrpc_request *request); - -/* Set of routines to run a function in ptlrpcd context */ -void *ptlrpcd_alloc_work(struct obd_import *imp, - int (*cb)(const struct lu_env *, void *), void *data); -void ptlrpcd_destroy_work(void 
*handler);
-int ptlrpcd_queue_work(void *handler);
-
-/** @} */
-struct ptlrpc_service_buf_conf {
- /* # of buffers to allocate when growing the pool */
- unsigned int bc_nbufs;
- /* buffer size to post */
- unsigned int bc_buf_size;
- /* portal to listen for requests on */
- unsigned int bc_req_portal;
- /* portal to send replies to */
- unsigned int bc_rep_portal;
- /* maximum request size to be accepted for this service */
- unsigned int bc_req_max_size;
- /* maximum reply size this service can ever send */
- unsigned int bc_rep_max_size;
-};
-
-struct ptlrpc_service_thr_conf {
- /* threadname should be 8 characters or less - 6 will be added on */
- char *tc_thr_name;
- /* factor by which the thread count is increased for each CPU */
- unsigned int tc_thr_factor;
- /* # of service threads to start on each partition at initialization */
- unsigned int tc_nthrs_init;
- /*
-  * low-water mark of the per-partition thread upper limit while
-  * running; service availability may be impacted if the number of
-  * threads is lower than this value. It can be ZERO if the service
-  * doesn't require CPU affinity or there is only one partition.
-  */
- unsigned int tc_nthrs_base;
- /* "soft" limit for the total number of threads */
- unsigned int tc_nthrs_max;
- /* user-specified number of threads; it will be validated against
-  * the other members of this structure.
-  */
- unsigned int tc_nthrs_user;
- /* set NUMA node affinity for service threads */
- unsigned int tc_cpu_affinity;
- /* Tags for lu_context associated with service thread */
- __u32 tc_ctx_tags;
-};
-
-struct ptlrpc_service_cpt_conf {
- struct cfs_cpt_table *cc_cptable;
- /* string pattern to describe CPTs for a service */
- char *cc_pattern;
-};
-
-struct ptlrpc_service_conf {
- /* service name */
- char *psc_name;
- /* soft watchdog timeout multiplier, used to print stuck service traces */
- unsigned int psc_watchdog_factor;
- /* buffer information */
- struct ptlrpc_service_buf_conf psc_buf;
- /* thread information */
- struct ptlrpc_service_thr_conf psc_thr;
- /* CPU partition information */
- struct ptlrpc_service_cpt_conf psc_cpt;
- /* function table */
- struct ptlrpc_service_ops psc_ops;
-};
-
-/* ptlrpc/service.c */
-/**
- * Server-side services API.
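The *_conf structures above are the input to service registration: a buffer/portal description, a thread policy, a CPT layout, and the ops table. Below is a hedged sketch of filling such a configuration using trimmed stand-in types; every value is invented for the example (the real ptlrpc_service_conf is what ptlrpc_register_service(), declared below, consumes):

#include <stdio.h>

/* Trimmed stand-ins for a few ptlrpc_service_conf members; the real
 * structures carry more fields. */
struct demo_buf_conf { unsigned nbufs, buf_size, req_portal, rep_portal; };
struct demo_thr_conf { const char *thr_name; unsigned nthrs_init, nthrs_max; };
struct demo_svc_conf {
	const char *name;
	unsigned watchdog_factor;
	struct demo_buf_conf buf;
	struct demo_thr_conf thr;
};

int main(void)
{
	struct demo_svc_conf conf = {
		.name = "demo_svc",		/* psc_name */
		.watchdog_factor = 2,		/* psc_watchdog_factor */
		.buf = { .nbufs = 64, .buf_size = 4096,
			 .req_portal = 1, .rep_portal = 2 },
		.thr = { .thr_name = "demo", .nthrs_init = 2,
			 .nthrs_max = 8 },
	};

	printf("%s: %u init threads, %u buffers\n",
	       conf.name, conf.thr.nthrs_init, conf.buf.nbufs);
	return 0;
}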
Register/unregister service, request state - * management, service thread management - * - * @{ - */ -void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); -void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); -struct ptlrpc_service *ptlrpc_register_service(struct ptlrpc_service_conf *conf, - struct kset *parent, - struct dentry *debugfs_entry); - -int ptlrpc_start_threads(struct ptlrpc_service *svc); -int ptlrpc_unregister_service(struct ptlrpc_service *service); - -int ptlrpc_hr_init(void); -void ptlrpc_hr_fini(void); - -/** @} */ - -/* ptlrpc/import.c */ -/** - * Import API - * @{ - */ -int ptlrpc_connect_import(struct obd_import *imp); -int ptlrpc_init_import(struct obd_import *imp); -int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); -int ptlrpc_import_recovery_state_machine(struct obd_import *imp); - -/* ptlrpc/pack_generic.c */ -int ptlrpc_reconnect_import(struct obd_import *imp); -/** @} */ - -/** - * ptlrpc msg buffer and swab interface - * - * @{ - */ -int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - u32 index); -void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, - u32 index); -int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); -int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); - -void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, - char **bufs); -int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, - __u32 *lens, char **bufs); -int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, - char **bufs); -int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - __u32 *lens, char **bufs, int flags); -#define LPRFL_EARLY_REPLY 1 -int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, - char **bufs, int flags); -int lustre_shrink_msg(struct lustre_msg *msg, int segment, - unsigned int newlen, int move_data); -void lustre_free_reply_state(struct ptlrpc_reply_state *rs); -int __lustre_unpack_msg(struct lustre_msg *m, int len); -u32 lustre_msg_hdr_size(__u32 magic, u32 count); -u32 lustre_msg_size(__u32 magic, int count, __u32 *lengths); -u32 lustre_msg_size_v2(int count, __u32 *lengths); -u32 lustre_packed_msg_size(struct lustre_msg *msg); -u32 lustre_msg_early_size(void); -void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, u32 n, u32 min_size); -void *lustre_msg_buf(struct lustre_msg *m, u32 n, u32 minlen); -u32 lustre_msg_buflen(struct lustre_msg *m, u32 n); -u32 lustre_msg_bufcount(struct lustre_msg *m); -char *lustre_msg_string(struct lustre_msg *m, u32 n, u32 max_len); -__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); -void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); -__u32 lustre_msg_get_flags(struct lustre_msg *msg); -void lustre_msg_add_flags(struct lustre_msg *msg, u32 flags); -void lustre_msg_set_flags(struct lustre_msg *msg, u32 flags); -void lustre_msg_clear_flags(struct lustre_msg *msg, u32 flags); -__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); -void lustre_msg_add_op_flags(struct lustre_msg *msg, u32 flags); -struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); -__u32 lustre_msg_get_type(struct lustre_msg *msg); -void lustre_msg_add_version(struct lustre_msg *msg, u32 version); -__u32 lustre_msg_get_opc(struct lustre_msg *msg); -__u16 lustre_msg_get_tag(struct lustre_msg *msg); -__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); -__u64 *lustre_msg_get_versions(struct lustre_msg *msg); -__u64 
lustre_msg_get_transno(struct lustre_msg *msg); -__u64 lustre_msg_get_slv(struct lustre_msg *msg); -__u32 lustre_msg_get_limit(struct lustre_msg *msg); -void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); -void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); -int lustre_msg_get_status(struct lustre_msg *msg); -__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); -__u32 lustre_msg_get_magic(struct lustre_msg *msg); -__u32 lustre_msg_get_timeout(struct lustre_msg *msg); -__u32 lustre_msg_get_service_time(struct lustre_msg *msg); -__u32 lustre_msg_get_cksum(struct lustre_msg *msg); -__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); -void lustre_msg_set_handle(struct lustre_msg *msg, - struct lustre_handle *handle); -void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); -void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); -void lustre_msg_set_last_xid(struct lustre_msg *msg, u64 last_xid); -void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag); -void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); -void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); -void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); -void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); -void ptlrpc_request_set_replen(struct ptlrpc_request *req); -void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); -void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); -void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); -void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); -void lustre_msg_set_mbits(struct lustre_msg *msg, u64 mbits); - -static inline void -lustre_shrink_reply(struct ptlrpc_request *req, int segment, - unsigned int newlen, int move_data) -{ - LASSERT(req->rq_reply_state); - LASSERT(req->rq_repmsg); - req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, - newlen, move_data); -} - -#ifdef CONFIG_LUSTRE_TRANSLATE_ERRNOS - -static inline int ptlrpc_status_hton(int h) -{ - /* - * Positive errnos must be network errnos, such as LUSTRE_EDEADLK, - * ELDLM_LOCK_ABORTED, etc. - */ - if (h < 0) - return -lustre_errno_hton(-h); - else - return h; -} - -static inline int ptlrpc_status_ntoh(int n) -{ - /* - * See the comment in ptlrpc_status_hton(). 
- */ - if (n < 0) - return -lustre_errno_ntoh(-n); - else - return n; -} - -#else - -#define ptlrpc_status_hton(h) (h) -#define ptlrpc_status_ntoh(n) (n) - -#endif -/** @} */ - -/** Change request phase of \a req to \a new_phase */ -static inline void -ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) -{ - if (req->rq_phase == new_phase) - return; - - if (new_phase == RQ_PHASE_UNREG_RPC || - new_phase == RQ_PHASE_UNREG_BULK) { - /* No embedded unregistering phases */ - if (req->rq_phase == RQ_PHASE_UNREG_RPC || - req->rq_phase == RQ_PHASE_UNREG_BULK) - return; - - req->rq_next_phase = req->rq_phase; - if (req->rq_import) - atomic_inc(&req->rq_import->imp_unregistering); - } - - if (req->rq_phase == RQ_PHASE_UNREG_RPC || - req->rq_phase == RQ_PHASE_UNREG_BULK) { - if (req->rq_import) - atomic_dec(&req->rq_import->imp_unregistering); - } - - DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", - ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); - - req->rq_phase = new_phase; -} - -/** - * Returns true if request \a req got early reply and hard deadline is not met - */ -static inline int -ptlrpc_client_early(struct ptlrpc_request *req) -{ - return req->rq_early; -} - -/** - * Returns true if we got real reply from server for this request - */ -static inline int -ptlrpc_client_replied(struct ptlrpc_request *req) -{ - if (req->rq_reply_deadline > ktime_get_real_seconds()) - return 0; - return req->rq_replied; -} - -/** Returns true if request \a req is in process of receiving server reply */ -static inline int -ptlrpc_client_recv(struct ptlrpc_request *req) -{ - if (req->rq_reply_deadline > ktime_get_real_seconds()) - return 1; - return req->rq_receiving_reply; -} - -static inline int -ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) -{ - int rc; - - spin_lock(&req->rq_lock); - if (req->rq_reply_deadline > ktime_get_real_seconds()) { - spin_unlock(&req->rq_lock); - return 1; - } - if (req->rq_req_deadline > ktime_get_real_seconds()) { - spin_unlock(&req->rq_lock); - return 1; - } - rc = !req->rq_req_unlinked || !req->rq_reply_unlinked || - req->rq_receiving_reply; - spin_unlock(&req->rq_lock); - return rc; -} - -static inline void -ptlrpc_client_wake_req(struct ptlrpc_request *req) -{ - if (!req->rq_set) - wake_up(&req->rq_reply_waitq); - else - wake_up(&req->rq_set->set_waitq); -} - -static inline void -ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) -{ - LASSERT(atomic_read(&rs->rs_refcount) > 0); - atomic_inc(&rs->rs_refcount); -} - -static inline void -ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) -{ - LASSERT(atomic_read(&rs->rs_refcount) > 0); - if (atomic_dec_and_test(&rs->rs_refcount)) - lustre_free_reply_state(rs); -} - -/* Should only be called once per req */ -static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) -{ - if (!req->rq_reply_state) - return; /* shouldn't occur */ - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - req->rq_repmsg = NULL; -} - -static inline __u32 lustre_request_magic(struct ptlrpc_request *req) -{ - return lustre_msg_get_magic(req->rq_reqmsg); -} - -static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) -{ - switch (req->rq_reqmsg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return req->rq_reqmsg->lm_repsize; - default: - LASSERTF(0, "incorrect message magic: %08x\n", - req->rq_reqmsg->lm_magic); - return -EFAULT; - } -} - -static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) -{ - if (req->rq_delay_limit != 0 && - time_before(req->rq_queued_time + 
req->rq_delay_limit * HZ, - jiffies)) { - return 1; - } - return 0; -} - -static inline int ptlrpc_no_resend(struct ptlrpc_request *req) -{ - if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { - spin_lock(&req->rq_lock); - req->rq_no_resend = 1; - spin_unlock(&req->rq_lock); - } - return req->rq_no_resend; -} - -static inline int -ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) -{ - int at = AT_OFF ? 0 : at_get(&svcpt->scp_at_estimate); - - return svcpt->scp_service->srv_watchdog_factor * - max_t(int, at, obd_timeout); -} - -static inline struct ptlrpc_service * -ptlrpc_req2svc(struct ptlrpc_request *req) -{ - return req->rq_rqbd->rqbd_svcpt->scp_service; -} - -/* ldlm/ldlm_lib.c */ -/** - * Target client logic - * @{ - */ -int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); -int client_obd_cleanup(struct obd_device *obddev); -int client_connect_import(const struct lu_env *env, - struct obd_export **exp, struct obd_device *obd, - struct obd_uuid *cluuid, struct obd_connect_data *, - void *localdata); -int client_disconnect_export(struct obd_export *exp); -int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, - int priority); -int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); -int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, - struct obd_uuid *uuid); -int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); -void client_destroy_import(struct obd_import *imp); -/** @} */ - -/* ptlrpc/pinger.c */ -/** - * Pinger API (client side only) - * @{ - */ -enum timeout_event { - TIMEOUT_GRANT = 1 -}; - -struct timeout_item; -typedef int (*timeout_cb_t)(struct timeout_item *, void *); -int ptlrpc_pinger_add_import(struct obd_import *imp); -int ptlrpc_pinger_del_import(struct obd_import *imp); -int ptlrpc_add_timeout_client(int time, enum timeout_event event, - timeout_cb_t cb, void *data, - struct list_head *obd_list); -int ptlrpc_del_timeout_client(struct list_head *obd_list, - enum timeout_event event); -struct ptlrpc_request *ptlrpc_prep_ping(struct obd_import *imp); -int ptlrpc_obd_ping(struct obd_device *obd); -void ptlrpc_pinger_ir_up(void); -void ptlrpc_pinger_ir_down(void); -/** @} */ -int ptlrpc_pinger_suppress_pings(void); - -/* ptlrpc/ptlrpcd.c */ -void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); -void ptlrpcd_free(struct ptlrpcd_ctl *pc); -void ptlrpcd_wake(struct ptlrpc_request *req); -void ptlrpcd_add_req(struct ptlrpc_request *req); -int ptlrpcd_addref(void); -void ptlrpcd_decref(void); - -/* ptlrpc/lproc_ptlrpc.c */ -/** - * procfs output related functions - * @{ - */ -const char *ll_opcode2str(__u32 opcode); -void ptlrpc_lprocfs_register_obd(struct obd_device *obd); -void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); -void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); -/** @} */ - -/* ptlrpc/llog_client.c */ -extern struct llog_operations llog_client_ops; -/** @} net */ - -#endif -/** @} PtlRPC */ diff --git a/drivers/staging/lustre/lustre/include/lustre_nrs.h b/drivers/staging/lustre/lustre/include/lustre_nrs.h deleted file mode 100644 index ffa7317da35b8cbb0ef3814713ffcbf2fd12af3a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_nrs.h +++ /dev/null @@ -1,718 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License version 2 for more details. - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2014, Intel Corporation. - * - * Copyright 2012 Xyratex Technology Limited - */ -/* - * - * Network Request Scheduler (NRS) - * - */ - -#ifndef _LUSTRE_NRS_H -#define _LUSTRE_NRS_H - -/** - * \defgroup nrs Network Request Scheduler - * @{ - */ -struct ptlrpc_nrs_policy; -struct ptlrpc_nrs_resource; -struct ptlrpc_nrs_request; - -/** - * NRS control operations. - * - * These are common for all policies. - */ -enum ptlrpc_nrs_ctl { - /** - * Not a valid opcode. - */ - PTLRPC_NRS_CTL_INVALID, - /** - * Activate the policy. - */ - PTLRPC_NRS_CTL_START, - /** - * Reserved for multiple primary policies, which may be a possibility - * in the future. - */ - PTLRPC_NRS_CTL_STOP, - /** - * Policies can start using opcodes from this value and onwards for - * their own purposes; the assigned value itself is arbitrary. - */ - PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, -}; - -/** - * NRS policy operations. - * - * These determine the behaviour of a policy, and are called in response to - * NRS core events. - */ -struct ptlrpc_nrs_pol_ops { - /** - * Called during policy registration; this operation is optional. - * - * \param[in,out] policy The policy being initialized - */ - int (*op_policy_init)(struct ptlrpc_nrs_policy *policy); - /** - * Called during policy unregistration; this operation is optional. - * - * \param[in,out] policy The policy being unregistered/finalized - */ - void (*op_policy_fini)(struct ptlrpc_nrs_policy *policy); - /** - * Called when activating a policy via lprocfs; policies allocate and - * initialize their resources here; this operation is optional. - * - * \param[in,out] policy The policy being started - * - * \see nrs_policy_start_locked() - */ - int (*op_policy_start)(struct ptlrpc_nrs_policy *policy); - /** - * Called when deactivating a policy via lprocfs; policies deallocate - * their resources here; this operation is optional - * - * \param[in,out] policy The policy being stopped - * - * \see nrs_policy_stop0() - */ - void (*op_policy_stop)(struct ptlrpc_nrs_policy *policy); - /** - * Used for policy-specific operations; i.e. not generic ones like - * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous - * to an ioctl; this operation is optional. - * - * \param[in,out] policy The policy carrying out operation \a opc - * \param[in] opc The command operation being carried out - * \param[in,out] arg An generic buffer for communication between the - * user and the control operation - * - * \retval -ve error - * \retval 0 success - * - * \see ptlrpc_nrs_policy_control() - */ - int (*op_policy_ctl)(struct ptlrpc_nrs_policy *policy, - enum ptlrpc_nrs_ctl opc, void *arg); - - /** - * Called when obtaining references to the resources of the resource - * hierarchy for a request that has arrived for handling at the PTLRPC - * service. 
Policies should return -ve for requests they do not wish
- * to handle. This operation is mandatory.
- *
- * \param[in,out] policy The policy we're getting resources for.
- * \param[in,out] nrq The request we are getting resources for.
- * \param[in] parent The parent resource of the resource being
- *   requested; set to NULL if none.
- * \param[out] resp The resource is to be returned here; the
- *   fallback policy in an NRS head should
- *   \e always return a non-NULL pointer value.
- * \param[in] moving_req When set, signifies that this is an attempt
- *   to obtain resources for a request being moved
- *   to the high-priority NRS head by
- *   ldlm_lock_reorder_req().
- *   This implies two things:
- *   1. We are under obd_export::exp_rpc_lock and
- *   so should not sleep.
- *   2. We should not perform non-idempotent operations, and
- *   may skip idempotent operations that were already carried
- *   out when resources were first taken for the request as it
- *   was initialized in ptlrpc_nrs_req_initialize().
- *
- * \retval 0, +ve The level of the returned resource in the resource
- *   hierarchy; currently only 0 (for a non-leaf resource)
- *   and 1 (for a leaf resource) are supported by the
- *   framework.
- * \retval -ve error
- *
- * \see ptlrpc_nrs_req_initialize()
- * \see ptlrpc_nrs_hpreq_add_nolock()
- * \see ptlrpc_nrs_req_hp_move()
- */
- int (*op_res_get)(struct ptlrpc_nrs_policy *policy,
-     struct ptlrpc_nrs_request *nrq,
-     const struct ptlrpc_nrs_resource *parent,
-     struct ptlrpc_nrs_resource **resp,
-     bool moving_req);
- /**
-  * Called when releasing references taken for resources in the resource
-  * hierarchy for the request; this operation is optional.
-  *
-  * \param[in,out] policy The policy the resource belongs to
-  * \param[in] res The resource to be freed
-  *
-  * \see ptlrpc_nrs_req_finalize()
-  * \see ptlrpc_nrs_hpreq_add_nolock()
-  * \see ptlrpc_nrs_req_hp_move()
-  */
- void (*op_res_put)(struct ptlrpc_nrs_policy *policy,
-      const struct ptlrpc_nrs_resource *res);
-
- /**
-  * Obtains a request for handling from the policy, and optionally
-  * removes the request from the policy; this operation is mandatory.
-  *
-  * \param[in,out] policy The policy to poll
-  * \param[in] peek When set, signifies that we just want to
-  *   examine the request, and not handle it, so the
-  *   request is not removed from the policy.
-  * \param[in] force When set, it will force a policy to return a
-  *   request if it has one queued.
-  *
-  * \retval NULL No request available for handling
-  * \retval valid-pointer The request polled for handling
-  *
-  * \see ptlrpc_nrs_req_get_nolock()
-  */
- struct ptlrpc_nrs_request *
- (*op_req_get)(struct ptlrpc_nrs_policy *policy, bool peek,
-        bool force);
- /**
-  * Called when attempting to add a request to a policy for later
-  * handling; this operation is mandatory.
-  *
-  * \param[in,out] policy The policy on which to enqueue \a nrq
-  * \param[in,out] nrq The request to enqueue
-  *
-  * \retval 0 success
-  * \retval != 0 error
-  *
-  * \see ptlrpc_nrs_req_add_nolock()
-  */
- int (*op_req_enqueue)(struct ptlrpc_nrs_policy *policy,
-         struct ptlrpc_nrs_request *nrq);
- /**
-  * Removes a request from the policy's set of pending requests. Normally
-  * called after a request has been polled successfully from the policy
-  * for handling; this operation is mandatory.
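The op_req_get() contract above distinguishes peeking from taking: with peek set, the request is examined but stays queued. A toy userspace model of that distinction; the array-backed queue and names are invented for the example:

#include <stdio.h>

struct demo_req { int id; };

static struct demo_req queue[3] = { {1}, {2}, {3} };
static int head, tail = 3;

/* "peek" returns the head of the queue without dequeueing it,
 * mirroring op_req_get()'s peek semantics. */
static struct demo_req *demo_req_get(int peek)
{
	struct demo_req *req;

	if (head == tail)
		return NULL;		/* nothing queued */
	req = &queue[head];
	if (!peek)
		head++;			/* actually hand it out */
	return req;
}

int main(void)
{
	printf("peek: %d\n", demo_req_get(1)->id);	/* stays queued */
	printf("take: %d\n", demo_req_get(0)->id);	/* dequeued */
	printf("take: %d\n", demo_req_get(0)->id);
	return 0;
}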
- *
- * \param[in,out] policy The policy the request \a nrq belongs to
- * \param[in,out] nrq The request to dequeue
- *
- * \see ptlrpc_nrs_req_del_nolock()
- */
- void (*op_req_dequeue)(struct ptlrpc_nrs_policy *policy,
-          struct ptlrpc_nrs_request *nrq);
- /**
-  * Called after the request has been carried out. Could be used for
-  * job/resource control; this operation is optional.
-  *
-  * \param[in,out] policy The policy which is finishing handling of
-  * request \a nrq
-  * \param[in,out] nrq The request
-  *
-  * \pre assert_spin_locked(&svcpt->scp_req_lock)
-  *
-  * \see ptlrpc_nrs_req_stop_nolock()
-  */
- void (*op_req_stop)(struct ptlrpc_nrs_policy *policy,
-       struct ptlrpc_nrs_request *nrq);
- /**
-  * Registers the policy's lprocfs interface with a PTLRPC service.
-  *
-  * \param[in] svc The service
-  *
-  * \retval 0 success
-  * \retval != 0 error
-  */
- int (*op_lprocfs_init)(struct ptlrpc_service *svc);
- /**
-  * Unregisters the policy's lprocfs interface from a PTLRPC service.
-  *
-  * In cases of failed policy registration in
-  * \e ptlrpc_nrs_policy_register(), this function may be called for a
-  * service which has not registered the policy successfully, so
-  * implementations of this method should make sure their operations are
-  * safe in such cases.
-  *
-  * \param[in] svc The service
-  */
- void (*op_lprocfs_fini)(struct ptlrpc_service *svc);
-};
-
-/**
- * Policy flags
- */
-enum nrs_policy_flags {
- /**
-  * Fallback policy, use this flag only on a single supported policy per
-  * service. The flag cannot be used on policies that use
-  * \e PTLRPC_NRS_FL_REG_EXTERN
-  */
- PTLRPC_NRS_FL_FALLBACK = BIT(0),
- /**
-  * Start policy immediately after registering.
-  */
- PTLRPC_NRS_FL_REG_START = BIT(1),
- /**
-  * This is a policy registering from a module different to the one NRS
-  * core ships in (currently ptlrpc).
-  */
- PTLRPC_NRS_FL_REG_EXTERN = BIT(2),
-};
-
-/**
- * NRS queue type.
- *
- * Denotes whether an NRS instance is for handling normal or high-priority
- * RPCs, or whether an operation pertains to one or both of the NRS instances
- * in a service.
- */
-enum ptlrpc_nrs_queue_type {
- PTLRPC_NRS_QUEUE_REG = BIT(0),
- PTLRPC_NRS_QUEUE_HP = BIT(1),
- PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP)
-};
-
-/**
- * NRS head
- *
- * A PTLRPC service has at least one NRS head instance for handling normal
- * priority RPCs, and may optionally have a second NRS head instance for
- * handling high-priority RPCs. Each NRS head maintains a list of available
- * policies, of which one and only one policy is acting as the fallback policy,
- * and optionally a different policy may be acting as the primary policy. For
- * all RPCs handled by this NRS head instance, NRS core will first attempt to
- * enqueue the RPC using the primary policy (if any). The fallback policy is
- * used in the following cases:
- * - when there was no primary policy in the
- *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request
- *   was initialized.
- * - when the primary policy that was at the
- *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
- *   RPC was initialized indicated (by returning a non-valid NRS resource
- *   reference) that it did not wish, or for some other reason was not able,
- *   to handle the request.
- * - when the primary policy that was at the
- *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the
- *   RPC was initialized fails later during the request enqueueing stage.
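A compact sketch of the selection rule just described: try the primary policy when one is started, and fall back when it declines or fails; the fallback must always accept. All names and the decline rule below are invented for illustration:

#include <stdio.h>

struct demo_req { int id; };

/* Pretend the primary policy declines odd-numbered requests. */
static int primary_enqueue(struct demo_req *req)
{
	return req->id % 2 ? -1 : 0;
}

static void fallback_enqueue(struct demo_req *req)
{
	printf("req %d -> fallback (FIFO)\n", req->id);
}

static void demo_nrs_enqueue(struct demo_req *req, int primary_started)
{
	if (primary_started && !primary_enqueue(req)) {
		printf("req %d -> primary\n", req->id);
		return;
	}
	fallback_enqueue(req);	/* the fallback always accepts */
}

int main(void)
{
	struct demo_req a = { 2 }, b = { 3 };

	demo_nrs_enqueue(&a, 1);
	demo_nrs_enqueue(&b, 1);	/* declined by the primary */
	demo_nrs_enqueue(&a, 0);	/* no primary started */
	return 0;
}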
- * - * \see nrs_resource_get_safe() - * \see nrs_request_enqueue() - */ -struct ptlrpc_nrs { - spinlock_t nrs_lock; - /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ - /** - * List of registered policies - */ - struct list_head nrs_policy_list; - /** - * List of policies with queued requests. Policies that have any - * outstanding requests are queued here, and this list is queried - * in a round-robin manner from NRS core when obtaining a request - * for handling. This ensures that requests from policies that at some - * point transition away from the - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. - */ - struct list_head nrs_policy_queued; - /** - * Service partition for this NRS head - */ - struct ptlrpc_service_part *nrs_svcpt; - /** - * Primary policy, which is the preferred policy for handling RPCs - */ - struct ptlrpc_nrs_policy *nrs_policy_primary; - /** - * Fallback policy, which is the backup policy for handling RPCs - */ - struct ptlrpc_nrs_policy *nrs_policy_fallback; - /** - * This NRS head handles either HP or regular requests - */ - enum ptlrpc_nrs_queue_type nrs_queue_type; - /** - * # queued requests from all policies in this NRS head - */ - unsigned long nrs_req_queued; - /** - * # scheduled requests from all policies in this NRS head - */ - unsigned long nrs_req_started; - /** - * # policies on this NRS - */ - unsigned int nrs_num_pols; - /** - * This NRS head is in progress of starting a policy - */ - unsigned int nrs_policy_starting:1; - /** - * In progress of shutting down the whole NRS head; used during - * unregistration - */ - unsigned int nrs_stopping:1; - /** - * NRS policy is throttling request - */ - unsigned int nrs_throttling:1; -}; - -#define NRS_POL_NAME_MAX 16 -#define NRS_POL_ARG_MAX 16 - -struct ptlrpc_nrs_pol_desc; - -/** - * Service compatibility predicate; this determines whether a policy is adequate - * for handling RPCs of a particular PTLRPC service. - * - * XXX:This should give the same result during policy registration and - * unregistration, and for all partitions of a service; so the result should not - * depend on temporal service or other properties, that may influence the - * result. - */ -typedef bool (*nrs_pol_desc_compat_t)(const struct ptlrpc_service *svc, - const struct ptlrpc_nrs_pol_desc *desc); - -struct ptlrpc_nrs_pol_conf { - /** - * Human-readable policy name - */ - char nc_name[NRS_POL_NAME_MAX]; - /** - * NRS operations for this policy - */ - const struct ptlrpc_nrs_pol_ops *nc_ops; - /** - * Service compatibility predicate - */ - nrs_pol_desc_compat_t nc_compat; - /** - * Set for policies that support a single ptlrpc service, i.e. ones that - * have \a pd_compat set to nrs_policy_compat_one(). The variable value - * depicts the name of the single service that such policies are - * compatible with. - */ - const char *nc_compat_svc_name; - /** - * Owner module for this policy descriptor; policies registering from a - * different module to the one the NRS framework is held within - * (currently ptlrpc), should set this field to THIS_MODULE. - */ - struct module *nc_owner; - /** - * Policy registration flags; a bitmask of \e nrs_policy_flags - */ - unsigned int nc_flags; -}; - -/** - * NRS policy registering descriptor - * - * Is used to hold a description of a policy that can be passed to NRS core in - * order to register the policy with NRS heads in different PTLRPC services. 
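Registering a policy therefore amounts to filling a ptlrpc_nrs_pol_conf with a name, an ops table, a compatibility predicate, and flags. The userspace sketch below mimics only that shape; demo_pol_conf is a trimmed stand-in and the flag value merely represents something like PTLRPC_NRS_FL_FALLBACK:

#include <stdio.h>
#include <string.h>

#define DEMO_POL_NAME_MAX 16	/* mirrors NRS_POL_NAME_MAX */

/* Trimmed stand-in for ptlrpc_nrs_pol_conf. */
struct demo_pol_conf {
	char nc_name[DEMO_POL_NAME_MAX];
	int (*nc_compat)(const char *svc_name);
	unsigned int nc_flags;
};

/* Analogue of nrs_policy_compat_all(): compatible with every service. */
static int demo_compat_all(const char *svc_name)
{
	(void)svc_name;
	return 1;
}

int main(void)
{
	struct demo_pol_conf conf = {
		.nc_compat = demo_compat_all,
		.nc_flags = 1,	/* stands in for a fallback flag */
	};

	strncpy(conf.nc_name, "demo_fifo", DEMO_POL_NAME_MAX - 1);
	printf("registering %s (compat=%d)\n",
	       conf.nc_name, conf.nc_compat("any_svc"));
	return 0;
}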
- */
-struct ptlrpc_nrs_pol_desc {
- /**
-  * Human-readable policy name
-  */
- char pd_name[NRS_POL_NAME_MAX];
- /**
-  * Link into nrs_core::nrs_policies
-  */
- struct list_head pd_list;
- /**
-  * NRS operations for this policy
-  */
- const struct ptlrpc_nrs_pol_ops *pd_ops;
- /**
-  * Service compatibility predicate
-  */
- nrs_pol_desc_compat_t pd_compat;
- /**
-  * Set for policies that are compatible with only one PTLRPC service.
-  *
-  * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name
-  */
- const char *pd_compat_svc_name;
- /**
-  * Owner module for this policy descriptor.
-  *
-  * We need to hold a reference to the module whenever we might make use
-  * of any of the module's contents, i.e.
-  * - If one or more instances of the policy are at a state where they
-  *   might be handling a request, i.e.
-  *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or
-  *   ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to
-  *   call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference
-  *   is taken on the module when
-  *   \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it
-  *   becomes 0, so that we hold only one reference to the module maximum
-  *   at any time.
-  *
-  *   We do not need to hold a reference to the module, even though we
-  *   might use code and data from the module, in the following cases:
-  * - During external policy registration, because this should happen in
-  *   the module's init() function, in which case the module is safe from
-  *   removal because a reference is being held on the module by the
-  *   kernel, and iirc kmod (and I guess module-init-tools also) will
-  *   serialize any racing processes properly anyway.
-  * - During external policy unregistration, because this should happen
-  *   in a module's exit() function, and any attempts to start a policy
-  *   instance would need to take a reference on the module, and this is
-  *   not possible once we have reached the point where the exit()
-  *   handler is called.
-  * - During service registration and unregistration, as service setup
-  *   and cleanup, and policy registration, unregistration and policy
-  *   instance starting, are serialized by \e nrs_core::nrs_mutex, so
-  *   as long as users adhere to the convention of registering policies
-  *   in init() and unregistering them in module exit() functions, there
-  *   should not be a race between these operations.
-  * - During any policy-specific lprocfs operations, because a reference
-  *   is held by the kernel on a proc entry that has been entered by a
-  *   syscall, so as long as proc entries are removed during
-  *   unregistration time, then unregistration and lprocfs operations
-  *   will be properly serialized.
-  */
- struct module *pd_owner;
- /**
-  * Bitmask of \e nrs_policy_flags
-  */
- unsigned int pd_flags;
- /**
-  * # of references on this descriptor
-  */
- atomic_t pd_refs;
-};
-
-/**
- * NRS policy state
- *
- * Policies transition from one state to the other during their lifetime
- */
-enum ptlrpc_nrs_pol_state {
- /**
-  * Not a valid policy state.
-  */
- NRS_POL_STATE_INVALID,
- /**
-  * Policies are at this state either at the start of their life, or
-  * transition here when the user selects a different policy to act
-  * as the primary one.
-  */
- NRS_POL_STATE_STOPPED,
- /**
-  * Policy is in the process of stopping
-  */
- NRS_POL_STATE_STOPPING,
- /**
-  * Policy is in the process of starting
-  */
- NRS_POL_STATE_STARTING,
- /**
-  * A policy is in this state in two cases:
-  * - it is the fallback policy, which is always in this state.
-  * - it has been activated by the user; i.e.
it is the primary policy, - */ - NRS_POL_STATE_STARTED, -}; - -/** - * NRS policy information - * - * Used for obtaining information for the status of a policy via lprocfs - */ -struct ptlrpc_nrs_pol_info { - /** - * Policy name - */ - char pi_name[NRS_POL_NAME_MAX]; - /** - * Policy argument - */ - char pi_arg[NRS_POL_ARG_MAX]; - /** - * Current policy state - */ - enum ptlrpc_nrs_pol_state pi_state; - /** - * # RPCs enqueued for later dispatching by the policy - */ - long pi_req_queued; - /** - * # RPCs started for dispatch by the policy - */ - long pi_req_started; - /** - * Is this a fallback policy? - */ - unsigned pi_fallback:1; -}; - -/** - * NRS policy - * - * There is one instance of this for each policy in each NRS head of each - * PTLRPC service partition. - */ -struct ptlrpc_nrs_policy { - /** - * Linkage into the NRS head's list of policies, - * ptlrpc_nrs:nrs_policy_list - */ - struct list_head pol_list; - /** - * Linkage into the NRS head's list of policies with enqueued - * requests ptlrpc_nrs:nrs_policy_queued - */ - struct list_head pol_list_queued; - /** - * Current state of this policy - */ - enum ptlrpc_nrs_pol_state pol_state; - /** - * Bitmask of nrs_policy_flags - */ - unsigned int pol_flags; - /** - * # RPCs enqueued for later dispatching by the policy - */ - long pol_req_queued; - /** - * # RPCs started for dispatch by the policy - */ - long pol_req_started; - /** - * Usage Reference count taken on the policy instance - */ - long pol_ref; - /** - * Human-readable policy argument - */ - char pol_arg[NRS_POL_ARG_MAX]; - /** - * The NRS head this policy has been created at - */ - struct ptlrpc_nrs *pol_nrs; - /** - * Private policy data; varies by policy type - */ - void *pol_private; - /** - * Policy descriptor for this policy instance. - */ - struct ptlrpc_nrs_pol_desc *pol_desc; -}; - -/** - * NRS resource - * - * Resources are embedded into two types of NRS entities: - * - Inside NRS policies, in the policy's private data in - * ptlrpc_nrs_policy::pol_private - * - In objects that act as prime-level scheduling entities in different NRS - * policies; e.g. on a policy that performs round robin or similar order - * scheduling across client NIDs, there would be one NRS resource per unique - * client NID. On a policy which performs round robin scheduling across - * backend filesystem objects, there would be one resource associated with - * each of the backend filesystem objects partaking in the scheduling - * performed by the policy. - * - * NRS resources share a parent-child relationship, in which resources embedded - * in policy instances are the parent entities, with all scheduling entities - * a policy schedules across being the children, thus forming a simple resource - * hierarchy. This hierarchy may be extended with one or more levels in the - * future if the ability to have more than one primary policy is added. - * - * Upon request initialization, references to the then active NRS policies are - * taken and used to later handle the dispatching of the request with one of - * these policies. - * - * \see nrs_resource_get_safe() - * \see ptlrpc_nrs_req_add() - */ -struct ptlrpc_nrs_resource { - /** - * This NRS resource's parent; is NULL for resources embedded in NRS - * policy instances; i.e. those are top-level ones. - */ - struct ptlrpc_nrs_resource *res_parent; - /** - * The policy associated with this resource. 
- */
- struct ptlrpc_nrs_policy *res_policy;
-};
-
-enum {
- NRS_RES_FALLBACK,
- NRS_RES_PRIMARY,
- NRS_RES_MAX
-};
-
-#include <lustre_nrs_fifo.h>
-
-/**
- * NRS request
- *
- * Instances of this object exist embedded within ptlrpc_request; the main
- * purpose of this object is to hold references to the request's resources
- * for the lifetime of the request, and to hold properties that policies
- * use for determining the request's scheduling priority.
- */
-struct ptlrpc_nrs_request {
- /**
-  * The request's resource hierarchy.
-  */
- struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX];
- /**
-  * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the
-  * policy that was used to enqueue the request.
-  *
-  * \see nrs_request_enqueue()
-  */
- unsigned int nr_res_idx;
- unsigned int nr_initialized:1;
- unsigned int nr_enqueued:1;
- unsigned int nr_started:1;
- unsigned int nr_finalized:1;
-
- /**
-  * Policy-specific fields, used for determining a request's scheduling
-  * priority, and other supporting functionality.
-  */
- union {
-  /**
-   * Fields for the FIFO policy
-   */
-  struct nrs_fifo_req fifo;
- } nr_u;
- /**
-  * Externally-registering policies may want to use this to allocate
-  * their own request properties.
-  */
- void *ext;
-};
-
-/** @} nrs */
-#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_nrs_fifo.h b/drivers/staging/lustre/lustre/include/lustre_nrs_fifo.h
deleted file mode 100644
index b70d97d4acbb6fddbab270a5195040c32bdb35fc..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/include/lustre_nrs_fifo.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License version 2 for more details.
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2014, Intel Corporation.
- *
- * Copyright 2012 Xyratex Technology Limited
- */
-/*
- *
- * Network Request Scheduler (NRS) First-in First-out (FIFO) policy
- *
- */
-
-#ifndef _LUSTRE_NRS_FIFO_H
-#define _LUSTRE_NRS_FIFO_H
-
-/* \name fifo
- *
- * FIFO policy
- *
- * This policy is a logical wrapper around previous, non-NRS functionality.
- * It dispatches RPCs in the same order as they arrive from the network. This
- * policy is currently used as the fallback policy, and the only enabled policy
- * on all NRS heads of all PTLRPC service partitions.
- * @{
- */
-
-/**
- * Private data structure for the FIFO policy
- */
-struct nrs_fifo_head {
- /**
-  * Resource object for policy instance.
-  */
- struct ptlrpc_nrs_resource fh_res;
- /**
-  * List of queued requests.
-  */
- struct list_head fh_list;
- /**
-  * For debugging purposes.
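For the FIFO policy this bookkeeping is minimal: each queued request is stamped from the head's monotonically increasing sequence counter. A userspace sketch of that stamping (list linkage omitted; the demo_* types are invented, and the stamping rule is an assumption based on the fields shown here):

#include <stdio.h>

struct demo_fifo_head { unsigned long long fh_sequence; };
struct demo_fifo_req  { unsigned long long fr_sequence; };

/* Stamp a request with the next sequence number, in arrival order. */
static void demo_fifo_enqueue(struct demo_fifo_head *head,
			      struct demo_fifo_req *req)
{
	req->fr_sequence = head->fh_sequence++;
}

int main(void)
{
	struct demo_fifo_head head = { 0 };
	struct demo_fifo_req a, b;

	demo_fifo_enqueue(&head, &a);
	demo_fifo_enqueue(&head, &b);
	printf("a=%llu b=%llu\n", a.fr_sequence, b.fr_sequence);
	return 0;
}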
- */
- __u64 fh_sequence;
-};
-
-struct nrs_fifo_req {
- struct list_head fr_list;
- __u64 fr_sequence;
-};
-
-/** @} fifo */
-#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_obdo.h b/drivers/staging/lustre/lustre/include/lustre_obdo.h
deleted file mode 100644
index d67dcbb84f184942338d91d57b9b80850bf842dd..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/include/lustre_obdo.h
+++ /dev/null
@@ -1,55 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2014, Intel Corporation.
- *
- * Copyright 2015 Cray Inc, all rights reserved.
- * Author: Ben Evans.
- *
- * Define obdo associated functions
- * obdo: OBject Device o...
- */
-
-#ifndef _LUSTRE_OBDO_H_
-#define _LUSTRE_OBDO_H_
-
-#include
-
-/**
- * Create an obdo to send over the wire
- */
-void lustre_set_wire_obdo(const struct obd_connect_data *ocd,
-     struct obdo *wobdo,
-     const struct obdo *lobdo);
-
-/**
- * Create a local obdo from a wire-based obdo
- */
-void lustre_get_wire_obdo(const struct obd_connect_data *ocd,
-     struct obdo *lobdo,
-     const struct obdo *wobdo);
-
-#endif
diff --git a/drivers/staging/lustre/lustre/include/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/lustre_patchless_compat.h
deleted file mode 100644
index 298476ea7557ff190121e376f4f9890f0277ea35..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/include/lustre_patchless_compat.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef LUSTRE_PATCHLESS_COMPAT_H -#define LUSTRE_PATCHLESS_COMPAT_H - -#include - -#include -#include -#include -#include - -#define ll_delete_from_page_cache(page) delete_from_page_cache(page) - -static inline void -truncate_complete_page(struct address_space *mapping, struct page *page) -{ - if (page->mapping != mapping) - return; - - if (PagePrivate(page)) - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - - cancel_dirty_page(page); - ClearPageMappedToDisk(page); - ll_delete_from_page_cache(page); -} - -#ifndef ATTR_CTIME_SET -/* - * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other - * ATTR_* attributes (see bug 13828) - */ -#define ATTR_CTIME_SET (1 << 28) -#endif - -#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h deleted file mode 100644 index 213d0a01adcf2b64d95b1f3df4e8c4285269d464..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_req_layout.h +++ /dev/null @@ -1,307 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/include/lustre_req_layout.h - * - * Lustre Metadata Target (mdt) request handler - * - * Author: Nikita Danilov - */ - -#ifndef _LUSTRE_REQ_LAYOUT_H__ -#define _LUSTRE_REQ_LAYOUT_H__ - -#include - -/** \defgroup req_layout req_layout - * - * @{ - */ - -struct req_msg_field; -struct req_format; -struct req_capsule; - -struct ptlrpc_request; - -enum req_location { - RCL_CLIENT, - RCL_SERVER, - RCL_NR -}; - -/* Maximal number of fields (buffers) in a request message. 
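
truncate_complete_page() above is meant to run under the page lock, since delete_from_page_cache() requires the page to be locked. A hedged sketch of a caller (the wrapper name is invented, not from the deleted header):

    static void ll_discard_page(struct address_space *mapping, struct page *page)
    {
            lock_page(page);
            /* becomes a no-op if the page no longer belongs to mapping */
            truncate_complete_page(mapping, page);
            unlock_page(page);
    }
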
*/ -#define REQ_MAX_FIELD_NR 9 - -struct req_capsule { - struct ptlrpc_request *rc_req; - const struct req_format *rc_fmt; - enum req_location rc_loc; - __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; -}; - -void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, - enum req_location location); -void req_capsule_fini(struct req_capsule *pill); - -void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); -size_t req_capsule_filled_sizes(struct req_capsule *pill, - enum req_location loc); -int req_capsule_server_pack(struct req_capsule *pill); - -void *req_capsule_client_get(struct req_capsule *pill, - const struct req_msg_field *field); -void *req_capsule_client_swab_get(struct req_capsule *pill, - const struct req_msg_field *field, - void *swabber); -void *req_capsule_client_sized_get(struct req_capsule *pill, - const struct req_msg_field *field, - u32 len); -void *req_capsule_server_get(struct req_capsule *pill, - const struct req_msg_field *field); -void *req_capsule_server_sized_get(struct req_capsule *pill, - const struct req_msg_field *field, - u32 len); -void *req_capsule_server_swab_get(struct req_capsule *pill, - const struct req_msg_field *field, - void *swabber); -void *req_capsule_server_sized_swab_get(struct req_capsule *pill, - const struct req_msg_field *field, - u32 len, void *swabber); - -void req_capsule_set_size(struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc, u32 size); -u32 req_capsule_get_size(const struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc); -u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); -u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, - enum req_location loc); -void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); - -int req_capsule_has_field(const struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc); -void req_capsule_shrink(struct req_capsule *pill, - const struct req_msg_field *field, - u32 newlen, enum req_location loc); -int req_layout_init(void); -void req_layout_fini(void); - -extern struct req_format RQF_OBD_PING; -extern struct req_format RQF_OBD_SET_INFO; -extern struct req_format RQF_SEC_CTX; -/* MGS req_format */ -extern struct req_format RQF_MGS_TARGET_REG; -extern struct req_format RQF_MGS_SET_INFO; -extern struct req_format RQF_MGS_CONFIG_READ; -/* fid/fld req_format */ -extern struct req_format RQF_SEQ_QUERY; -extern struct req_format RQF_FLD_QUERY; -extern struct req_format RQF_FLD_READ; -/* MDS req_format */ -extern struct req_format RQF_MDS_CONNECT; -extern struct req_format RQF_MDS_DISCONNECT; -extern struct req_format RQF_MDS_STATFS; -extern struct req_format RQF_MDS_GETSTATUS; -extern struct req_format RQF_MDS_SYNC; -extern struct req_format RQF_MDS_GETXATTR; -extern struct req_format RQF_MDS_GETATTR; - -/* - * This is format of direct (non-intent) MDS_GETATTR_NAME request. 
- */ -extern struct req_format RQF_MDS_GETATTR_NAME; -extern struct req_format RQF_MDS_CLOSE; -extern struct req_format RQF_MDS_INTENT_CLOSE; -extern struct req_format RQF_MDS_CONNECT; -extern struct req_format RQF_MDS_DISCONNECT; -extern struct req_format RQF_MDS_GET_INFO; -extern struct req_format RQF_MDS_READPAGE; -extern struct req_format RQF_MDS_WRITEPAGE; -extern struct req_format RQF_MDS_REINT; -extern struct req_format RQF_MDS_REINT_CREATE; -extern struct req_format RQF_MDS_REINT_CREATE_ACL; -extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; -extern struct req_format RQF_MDS_REINT_CREATE_SYM; -extern struct req_format RQF_MDS_REINT_OPEN; -extern struct req_format RQF_MDS_REINT_UNLINK; -extern struct req_format RQF_MDS_REINT_LINK; -extern struct req_format RQF_MDS_REINT_RENAME; -extern struct req_format RQF_MDS_REINT_SETATTR; -extern struct req_format RQF_MDS_REINT_SETXATTR; -extern struct req_format RQF_MDS_QUOTACTL; -extern struct req_format RQF_MDS_SWAP_LAYOUTS; -extern struct req_format RQF_MDS_REINT_MIGRATE; -/* MDS hsm formats */ -extern struct req_format RQF_MDS_HSM_STATE_GET; -extern struct req_format RQF_MDS_HSM_STATE_SET; -extern struct req_format RQF_MDS_HSM_ACTION; -extern struct req_format RQF_MDS_HSM_PROGRESS; -extern struct req_format RQF_MDS_HSM_CT_REGISTER; -extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; -extern struct req_format RQF_MDS_HSM_REQUEST; -/* OST req_format */ -extern struct req_format RQF_OST_CONNECT; -extern struct req_format RQF_OST_DISCONNECT; -extern struct req_format RQF_OST_QUOTACTL; -extern struct req_format RQF_OST_GETATTR; -extern struct req_format RQF_OST_SETATTR; -extern struct req_format RQF_OST_CREATE; -extern struct req_format RQF_OST_PUNCH; -extern struct req_format RQF_OST_SYNC; -extern struct req_format RQF_OST_DESTROY; -extern struct req_format RQF_OST_BRW_READ; -extern struct req_format RQF_OST_BRW_WRITE; -extern struct req_format RQF_OST_STATFS; -extern struct req_format RQF_OST_SET_GRANT_INFO; -extern struct req_format RQF_OST_GET_INFO; -extern struct req_format RQF_OST_GET_INFO_LAST_ID; -extern struct req_format RQF_OST_GET_INFO_LAST_FID; -extern struct req_format RQF_OST_SET_INFO_LAST_FID; -extern struct req_format RQF_OST_GET_INFO_FIEMAP; - -/* LDLM req_format */ -extern struct req_format RQF_LDLM_ENQUEUE; -extern struct req_format RQF_LDLM_ENQUEUE_LVB; -extern struct req_format RQF_LDLM_CONVERT; -extern struct req_format RQF_LDLM_INTENT; -extern struct req_format RQF_LDLM_INTENT_BASIC; -extern struct req_format RQF_LDLM_INTENT_LAYOUT; -extern struct req_format RQF_LDLM_INTENT_GETATTR; -extern struct req_format RQF_LDLM_INTENT_OPEN; -extern struct req_format RQF_LDLM_INTENT_CREATE; -extern struct req_format RQF_LDLM_INTENT_UNLINK; -extern struct req_format RQF_LDLM_INTENT_GETXATTR; -extern struct req_format RQF_LDLM_CANCEL; -extern struct req_format RQF_LDLM_CALLBACK; -extern struct req_format RQF_LDLM_CP_CALLBACK; -extern struct req_format RQF_LDLM_BL_CALLBACK; -extern struct req_format RQF_LDLM_GL_CALLBACK; -extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; -/* LOG req_format */ -extern struct req_format RQF_LOG_CANCEL; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; -extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; -extern struct req_format RQF_LLOG_ORIGIN_CONNECT; - -extern struct req_format RQF_CONNECT; - -extern 
struct req_msg_field RMF_GENERIC_DATA; -extern struct req_msg_field RMF_PTLRPC_BODY; -extern struct req_msg_field RMF_MDT_BODY; -extern struct req_msg_field RMF_MDT_EPOCH; -extern struct req_msg_field RMF_OBD_STATFS; -extern struct req_msg_field RMF_NAME; -extern struct req_msg_field RMF_SYMTGT; -extern struct req_msg_field RMF_TGTUUID; -extern struct req_msg_field RMF_CLUUID; -extern struct req_msg_field RMF_SETINFO_VAL; -extern struct req_msg_field RMF_SETINFO_KEY; -extern struct req_msg_field RMF_GETINFO_VAL; -extern struct req_msg_field RMF_GETINFO_VALLEN; -extern struct req_msg_field RMF_GETINFO_KEY; -extern struct req_msg_field RMF_CLOSE_DATA; - -/* - * connection handle received in MDS_CONNECT request. - */ -extern struct req_msg_field RMF_CONN; -extern struct req_msg_field RMF_CONNECT_DATA; -extern struct req_msg_field RMF_DLM_REQ; -extern struct req_msg_field RMF_DLM_REP; -extern struct req_msg_field RMF_DLM_LVB; -extern struct req_msg_field RMF_DLM_GL_DESC; -extern struct req_msg_field RMF_LDLM_INTENT; -extern struct req_msg_field RMF_LAYOUT_INTENT; -extern struct req_msg_field RMF_MDT_MD; -extern struct req_msg_field RMF_REC_REINT; -extern struct req_msg_field RMF_EADATA; -extern struct req_msg_field RMF_EAVALS; -extern struct req_msg_field RMF_EAVALS_LENS; -extern struct req_msg_field RMF_ACL; -extern struct req_msg_field RMF_LOGCOOKIES; -extern struct req_msg_field RMF_CAPA1; -extern struct req_msg_field RMF_CAPA2; -extern struct req_msg_field RMF_OBD_QUOTACHECK; -extern struct req_msg_field RMF_OBD_QUOTACTL; -extern struct req_msg_field RMF_STRING; -extern struct req_msg_field RMF_SWAP_LAYOUTS; -extern struct req_msg_field RMF_MDS_HSM_PROGRESS; -extern struct req_msg_field RMF_MDS_HSM_REQUEST; -extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; -extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; -extern struct req_msg_field RMF_HSM_USER_STATE; -extern struct req_msg_field RMF_HSM_STATE_SET; -extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; -extern struct req_msg_field RMF_MDS_HSM_REQUEST; - -/* seq-mgr fields */ -extern struct req_msg_field RMF_SEQ_OPC; -extern struct req_msg_field RMF_SEQ_RANGE; -extern struct req_msg_field RMF_FID_SPACE; - -/* FLD fields */ -extern struct req_msg_field RMF_FLD_OPC; -extern struct req_msg_field RMF_FLD_MDFLD; - -extern struct req_msg_field RMF_LLOGD_BODY; -extern struct req_msg_field RMF_LLOG_LOG_HDR; -extern struct req_msg_field RMF_LLOGD_CONN_BODY; - -extern struct req_msg_field RMF_MGS_TARGET_INFO; -extern struct req_msg_field RMF_MGS_SEND_PARAM; - -extern struct req_msg_field RMF_OST_BODY; -extern struct req_msg_field RMF_OBD_IOOBJ; -extern struct req_msg_field RMF_OBD_ID; -extern struct req_msg_field RMF_FID; -extern struct req_msg_field RMF_NIOBUF_REMOTE; -extern struct req_msg_field RMF_RCS; -extern struct req_msg_field RMF_FIEMAP_KEY; -extern struct req_msg_field RMF_FIEMAP_VAL; -extern struct req_msg_field RMF_OST_ID; - -/* MGS config read message format */ -extern struct req_msg_field RMF_MGS_CONFIG_BODY; -extern struct req_msg_field RMF_MGS_CONFIG_RES; - -/* generic uint32 */ -extern struct req_msg_field RMF_U32; - -/** @} req_layout */ - -#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h deleted file mode 100644 index d35bcbc98831b1967ff2253473994473758f43c3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_sec.h +++ /dev/null @@ -1,1072 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 
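
Before moving on to the next header: the req_format/req_msg_field tables of lustre_req_layout.h above are consumed through a fixed client-side pattern. A minimal sketch using only names declared there (rq_pill is the capsule embedded in every ptlrpc_request; error handling trimmed):

    static int mds_getattr_pack(struct ptlrpc_request *req)
    {
            struct mdt_body *body;

            /* bind the request's capsule to a message format ... */
            req_capsule_set(&req->rq_pill, &RQF_MDS_GETATTR);
            /* ... then obtain typed pointers to the individual fields */
            body = req_capsule_client_get(&req->rq_pill, &RMF_MDT_BODY);
            if (!body)
                    return -EPROTO;
            /* fill in *body before the request is sent */
            return 0;
    }
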
-/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _LUSTRE_SEC_H_ -#define _LUSTRE_SEC_H_ - -#include - -/** \defgroup sptlrpc sptlrpc - * - * @{ - */ - -/* - * to avoid includes - */ -struct obd_import; -struct obd_export; -struct ptlrpc_request; -struct ptlrpc_reply_state; -struct ptlrpc_bulk_desc; -struct brw_page; -/* Linux specific */ -struct key; -struct seq_file; -struct lustre_cfg; - -/* - * forward declarations - */ -struct ptlrpc_sec_policy; -struct ptlrpc_sec_cops; -struct ptlrpc_sec_sops; -struct ptlrpc_sec; -struct ptlrpc_svc_ctx; -struct ptlrpc_cli_ctx; -struct ptlrpc_ctx_ops; - -/** - * \addtogroup flavor flavor - * - * An RPC flavor is represented by a 32-bit integer. Currently the high 12 bits - * are unused and must be set to 0 for future expansion. - *
- * ------------------------------------------------------------------------
- * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech)  | 4b (policy) |
- * ------------------------------------------------------------------------
- * 
- * - * @{ - */ - -/* - * flavor constants - */ -enum sptlrpc_policy { - SPTLRPC_POLICY_NULL = 0, - SPTLRPC_POLICY_PLAIN = 1, - SPTLRPC_POLICY_GSS = 2, - SPTLRPC_POLICY_MAX, -}; - -enum sptlrpc_mech_null { - SPTLRPC_MECH_NULL = 0, - SPTLRPC_MECH_NULL_MAX, -}; - -enum sptlrpc_mech_plain { - SPTLRPC_MECH_PLAIN = 0, - SPTLRPC_MECH_PLAIN_MAX, -}; - -enum sptlrpc_mech_gss { - SPTLRPC_MECH_GSS_NULL = 0, - SPTLRPC_MECH_GSS_KRB5 = 1, - SPTLRPC_MECH_GSS_MAX, -}; - -enum sptlrpc_service_type { - SPTLRPC_SVC_NULL = 0, /**< no security */ - SPTLRPC_SVC_AUTH = 1, /**< authentication only */ - SPTLRPC_SVC_INTG = 2, /**< integrity */ - SPTLRPC_SVC_PRIV = 3, /**< privacy */ - SPTLRPC_SVC_MAX, -}; - -enum sptlrpc_bulk_type { - SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ - SPTLRPC_BULK_HASH = 1, /**< hash integrity */ - SPTLRPC_BULK_MAX, -}; - -enum sptlrpc_bulk_service { - SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ - SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ - SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ - SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ - SPTLRPC_BULK_SVC_MAX, -}; - -/* - * compose/extract macros - */ -#define FLVR_POLICY_OFFSET (0) -#define FLVR_MECH_OFFSET (4) -#define FLVR_SVC_OFFSET (8) -#define FLVR_BULK_TYPE_OFFSET (12) -#define FLVR_BULK_SVC_OFFSET (16) - -#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ - (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ - ((__u32)(mech) << FLVR_MECH_OFFSET) | \ - ((__u32)(svc) << FLVR_SVC_OFFSET) | \ - ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ - ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) - -/* - * extraction - */ -#define SPTLRPC_FLVR_POLICY(flavor) \ - ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) -#define SPTLRPC_FLVR_MECH(flavor) \ - ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) -#define SPTLRPC_FLVR_SVC(flavor) \ - ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) -#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ - ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) -#define SPTLRPC_FLVR_BULK_SVC(flavor) \ - ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) - -#define SPTLRPC_FLVR_BASE(flavor) \ - ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) -#define SPTLRPC_FLVR_BASE_SUB(flavor) \ - ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) - -/* - * gss subflavors - */ -#define MAKE_BASE_SUBFLVR(mech, svc) \ - ((__u32)(mech) | \ - ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) - -#define SPTLRPC_SUBFLVR_KRB5N \ - MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) -#define SPTLRPC_SUBFLVR_KRB5A \ - MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) -#define SPTLRPC_SUBFLVR_KRB5I \ - MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) -#define SPTLRPC_SUBFLVR_KRB5P \ - MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) - -/* - * "end user" flavors - */ -#define SPTLRPC_FLVR_NULL \ - MAKE_FLVR(SPTLRPC_POLICY_NULL, \ - SPTLRPC_MECH_NULL, \ - SPTLRPC_SVC_NULL, \ - SPTLRPC_BULK_DEFAULT, \ - SPTLRPC_BULK_SVC_NULL) -#define SPTLRPC_FLVR_PLAIN \ - MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ - SPTLRPC_MECH_PLAIN, \ - SPTLRPC_SVC_NULL, \ - SPTLRPC_BULK_HASH, \ - SPTLRPC_BULK_SVC_INTG) -#define SPTLRPC_FLVR_KRB5N \ - MAKE_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_NULL, \ - SPTLRPC_BULK_DEFAULT, \ - SPTLRPC_BULK_SVC_NULL) -#define SPTLRPC_FLVR_KRB5A \ - MAKE_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_AUTH, \ - SPTLRPC_BULK_DEFAULT, \ - SPTLRPC_BULK_SVC_NULL) -#define SPTLRPC_FLVR_KRB5I \ - MAKE_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - 
SPTLRPC_SVC_INTG, \ - SPTLRPC_BULK_DEFAULT, \ - SPTLRPC_BULK_SVC_INTG) -#define SPTLRPC_FLVR_KRB5P \ - MAKE_FLVR(SPTLRPC_POLICY_GSS, \ - SPTLRPC_MECH_GSS_KRB5, \ - SPTLRPC_SVC_PRIV, \ - SPTLRPC_BULK_DEFAULT, \ - SPTLRPC_BULK_SVC_PRIV) - -#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL - -#define SPTLRPC_FLVR_INVALID ((__u32)0xFFFFFFFF) -#define SPTLRPC_FLVR_ANY ((__u32)0xFFF00000) - -/** - * extract the useful part from wire flavor - */ -#define WIRE_FLVR(wflvr) (((__u32)(wflvr)) & 0x000FFFFF) - -/** @} flavor */ - -static inline void flvr_set_svc(__u32 *flvr, __u32 svc) -{ - LASSERT(svc < SPTLRPC_SVC_MAX); - *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), - SPTLRPC_FLVR_MECH(*flvr), - svc, - SPTLRPC_FLVR_BULK_TYPE(*flvr), - SPTLRPC_FLVR_BULK_SVC(*flvr)); -} - -static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) -{ - LASSERT(svc < SPTLRPC_BULK_SVC_MAX); - *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), - SPTLRPC_FLVR_MECH(*flvr), - SPTLRPC_FLVR_SVC(*flvr), - SPTLRPC_FLVR_BULK_TYPE(*flvr), - svc); -} - -struct bulk_spec_hash { - __u8 hash_alg; -}; - -/** - * Full description of flavors being used on a ptlrpc connection, include - * both regular RPC and bulk transfer parts. - */ -struct sptlrpc_flavor { - /** - * wire flavor, should be renamed to sf_wire. - */ - __u32 sf_rpc; - /** - * general flags of PTLRPC_SEC_FL_* - */ - __u32 sf_flags; - /** - * rpc flavor specification - */ - union { - /* nothing for now */ - } u_rpc; - /** - * bulk flavor specification - */ - union { - struct bulk_spec_hash hash; - } u_bulk; -}; - -/** - * identify the RPC is generated from what part of Lustre. It's encoded into - * RPC requests and to be checked by ptlrpc service. - */ -enum lustre_sec_part { - LUSTRE_SP_CLI = 0, - LUSTRE_SP_MDT, - LUSTRE_SP_OST, - LUSTRE_SP_MGC, - LUSTRE_SP_MGS, - LUSTRE_SP_ANY = 0xFF -}; - -enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); - -/** - * A rule specifies a flavor to be used by a ptlrpc connection between - * two Lustre parts. - */ -struct sptlrpc_rule { - __u32 sr_netid; /* LNET network ID */ - __u8 sr_from; /* sec_part */ - __u8 sr_to; /* sec_part */ - __u16 sr_padding; - struct sptlrpc_flavor sr_flvr; -}; - -/** - * A set of rules in memory. - * - * Rules are generated and stored on MGS, and propagated to MDT, OST, - * and client when needed. - */ -struct sptlrpc_rule_set { - int srs_nslot; - int srs_nrule; - struct sptlrpc_rule *srs_rules; -}; - -int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); -bool sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); - -static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) -{ - memset(set, 0, sizeof(*set)); -} - -int sptlrpc_process_config(struct lustre_cfg *lcfg); -void sptlrpc_conf_log_start(const char *logname); -void sptlrpc_conf_log_stop(const char *logname); -void sptlrpc_conf_log_update_begin(const char *logname); -void sptlrpc_conf_log_update_end(const char *logname); -void sptlrpc_conf_client_adapt(struct obd_device *obd); - -/* The maximum length of security payload. 1024 is enough for Kerberos 5, - * and should be enough for other future mechanisms but not sure. - * Only used by pre-allocated request/reply pool. - */ -#define SPTLRPC_MAX_PAYLOAD (1024) - -struct vfs_cred { - u32 vc_uid; - u32 vc_gid; -}; - -struct ptlrpc_ctx_ops { - /** - * To determine whether it's suitable to use the \a ctx for \a vcred. - */ - int (*match)(struct ptlrpc_cli_ctx *ctx, struct vfs_cred *vcred); - - /** - * To bring the \a ctx uptodate. 
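
The compose/extract macros above are exact inverses over the 4-bit fields of the layout diagram. Spot checks for the Kerberos privacy flavor (illustrative only, not code from this header):

    static void flavor_spot_check(void)
    {
            __u32 flvr = SPTLRPC_FLVR_KRB5P;

            LASSERT(SPTLRPC_FLVR_POLICY(flvr) == SPTLRPC_POLICY_GSS);
            LASSERT(SPTLRPC_FLVR_MECH(flvr) == SPTLRPC_MECH_GSS_KRB5);
            LASSERT(SPTLRPC_FLVR_SVC(flvr) == SPTLRPC_SVC_PRIV);
            LASSERT(SPTLRPC_FLVR_BULK_SVC(flvr) == SPTLRPC_BULK_SVC_PRIV);
    }
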
- */ - int (*refresh)(struct ptlrpc_cli_ctx *ctx); - - /** - * Validate the \a ctx. - */ - int (*validate)(struct ptlrpc_cli_ctx *ctx); - - /** - * Force the \a ctx to die. - */ - void (*force_die)(struct ptlrpc_cli_ctx *ctx, int grace); - int (*display)(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); - - /** - * Sign the request message using \a ctx. - * - * \pre req->rq_reqmsg points to the request message. - * \pre req->rq_reqlen is the request message length. - * \post req->rq_reqbuf points to the request message with signature. - * \post req->rq_reqdata_len is set to the final request message size. - * - * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). - */ - int (*sign)(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); - - /** - * Verify the reply message using \a ctx. - * - * \pre req->rq_repdata points to the reply message with signature. - * \pre req->rq_repdata_len is the total reply message length. - * \post req->rq_repmsg points to the reply message without signature. - * \post req->rq_replen is the reply message length. - * - * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). - */ - int (*verify)(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); - - /** - * Encrypt the request message using \a ctx. - * - * \pre req->rq_reqmsg points to the request message in clear text. - * \pre req->rq_reqlen is the request message length. - * \post req->rq_reqbuf points to the request message. - * \post req->rq_reqdata_len is set to the final request message size. - * - * \see gss_cli_ctx_seal(). - */ - int (*seal)(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); - - /** - * Decrypt the reply message using \a ctx. - * - * \pre req->rq_repdata points to the encrypted reply message. - * \pre req->rq_repdata_len is the total cipher text length. - * \post req->rq_repmsg points to the reply message in clear text. - * \post req->rq_replen is the reply message length in clear text. - * - * \see gss_cli_ctx_unseal(). - */ - int (*unseal)(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req); - - /** - * Wrap bulk request data. This is called before wrapping the RPC - * request message. - * - * \pre bulk buffer is described by desc->bd_iov and - * desc->bd_iov_count. Note that for read it's just a buffer, no data - * needs to be sent; for write it contains data in clear text. - * \post when necessary, ptlrpc_bulk_sec_desc has been properly prepared - * (usually inside of the RPC request message). - * - encryption: cipher text bulk buffer is described by - * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov - * count remains the same). - * - otherwise: bulk buffer is still desc->bd_iov and - * desc->bd_iov_count. - * - * \return 0: success. - * \return -ve: error code. - * - * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). - */ - int (*wrap_bulk)(struct ptlrpc_cli_ctx *ctx, - struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); - - /** - * Unwrap bulk reply data. This is called after unwrapping the RPC - * reply message. - * - * \pre bulk buffer is described by desc->bd_iov/desc->bd_enc_iov and - * desc->bd_iov_count, according to wrap_bulk(). - * \post final bulk data in clear text is placed in buffer described - * by desc->bd_iov and desc->bd_iov_count. - * \return +ve nob of actual bulk data in clear text. - * \return -ve error code. - * - * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
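
The sign/seal split above is selected by the service type of the request's flavor. A condensed sketch of the dispatch performed by sptlrpc_cli_wrap_request(), declared later in this header (the wrapper name here is invented):

    static int ctx_wrap_one(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req)
    {
            /* the privacy service encrypts the whole message; the null,
             * auth and integrity services only sign it */
            if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) == SPTLRPC_SVC_PRIV)
                    return ctx->cc_ops->seal(ctx, req);
            return ctx->cc_ops->sign(ctx, req);
    }
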
- */ - int (*unwrap_bulk)(struct ptlrpc_cli_ctx *ctx, - struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); -}; - -#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ -#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ -#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ -#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ -#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ -#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ - -#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) -#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) -#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) -#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) -#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) -#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) - -#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ - PTLRPC_CTX_UPTODATE | \ - PTLRPC_CTX_DEAD | \ - PTLRPC_CTX_ERROR) - -struct ptlrpc_cli_ctx { - struct hlist_node cc_cache; /* linked into ctx cache */ - atomic_t cc_refcount; - struct ptlrpc_sec *cc_sec; - struct ptlrpc_ctx_ops *cc_ops; - unsigned long cc_expire; /* in seconds */ - unsigned int cc_early_expire:1; - unsigned long cc_flags; - struct vfs_cred cc_vcred; - spinlock_t cc_lock; - struct list_head cc_req_list; /* waiting reqs linked here */ - struct list_head cc_gc_chain; /* linked to gc chain */ -}; - -/** - * client side policy operation vector. - */ -struct ptlrpc_sec_cops { - /** - * Given an \a imp, create and initialize a ptlrpc_sec structure. - * \param ctx service context: - * - regular import: \a ctx should be NULL; - * - reverse import: \a ctx is obtained from incoming request. - * \param flavor specify what flavor to use. - * - * When necessary, the policy module is responsible for taking a - * reference on the import. - * - * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr(). - */ - struct ptlrpc_sec *(*create_sec)(struct obd_import *imp, - struct ptlrpc_svc_ctx *ctx, - struct sptlrpc_flavor *flavor); - - /** - * Destructor of ptlrpc_sec. When called, refcount has been dropped - * to 0 and all contexts have been destroyed. - * - * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). - */ - void (*destroy_sec)(struct ptlrpc_sec *sec); - - /** - * Notify that this ptlrpc_sec is going to die. Optionally, the policy - * module is supposed to set sec->ps_dying and take whatever actions - * are necessary. - * - * \see plain_kill_sec(), gss_sec_kill(). - */ - void (*kill_sec)(struct ptlrpc_sec *sec); - - /** - * Given \a vcred, look up and/or create its context. The policy module - * is supposed to maintain its own context cache. - * XXX currently \a create and \a remove_dead are always 1, so perhaps - * they should be removed completely. - * - * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). - */ - struct ptlrpc_cli_ctx *(*lookup_ctx)(struct ptlrpc_sec *sec, - struct vfs_cred *vcred, - int create, int remove_dead); - - /** - * Called when the reference count of \a ctx drops to 0. The policy - * module is supposed to destroy this context or do whatever else its - * cache maintenance mechanism requires. - * - * \param sync if zero, we shouldn't wait for the context to be - * destroyed completely. - * - * \see plain_release_ctx(), gss_sec_release_ctx_kr(). - */ - void (*release_ctx)(struct ptlrpc_sec *sec, struct ptlrpc_cli_ctx *ctx, - int sync); - - /** - * Flush the context cache. - * - * \param uid user whose contexts should be flushed; -1 means all contexts.
- * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected - * contexts should be cleared immediately. - * \param force if zero, only idle contexts will be flushed. - * - * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr(). - */ - int (*flush_ctx_cache)(struct ptlrpc_sec *sec, uid_t uid, - int grace, int force); - - /** - * Called periodically by the garbage collector to remove dead contexts - * from the cache. - * - * \see gss_sec_gc_ctx_kr(). - */ - void (*gc_ctx)(struct ptlrpc_sec *sec); - - /** - * Given a context \a ctx, install a corresponding reverse service - * context on the client side. - * XXX currently it's only used by the GSS module; maybe we should - * remove this from the general API. - */ - int (*install_rctx)(struct obd_import *imp, struct ptlrpc_sec *sec, - struct ptlrpc_cli_ctx *ctx); - - /** - * To allocate the request buffer for \a req. - * - * \pre req->rq_reqmsg == NULL. - * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated and - * we are not supposed to free it. - * \post on success, req->rq_reqmsg points to a buffer with size - * at least \a lustre_msg_size. - * - * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). - */ - int (*alloc_reqbuf)(struct ptlrpc_sec *sec, struct ptlrpc_request *req, - int lustre_msg_size); - - /** - * To free the request buffer for \a req. - * - * \pre req->rq_reqbuf != NULL. - * - * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). - */ - void (*free_reqbuf)(struct ptlrpc_sec *sec, struct ptlrpc_request *req); - - /** - * To allocate the reply buffer for \a req. - * - * \pre req->rq_repbuf == NULL. - * \post on success, req->rq_repbuf points to a buffer of size - * req->rq_repbuf_len; the size should be large enough to receive a - * reply transformed from \a lustre_msg_size of clear text. - * - * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). - */ - int (*alloc_repbuf)(struct ptlrpc_sec *sec, struct ptlrpc_request *req, - int lustre_msg_size); - - /** - * To free the reply buffer for \a req. - * - * \pre req->rq_repbuf != NULL. - * \post req->rq_repbuf == NULL. - * \post req->rq_repbuf_len == 0. - * - * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf(). - */ - void (*free_repbuf)(struct ptlrpc_sec *sec, struct ptlrpc_request *req); - - /** - * To expand the request buffer of \a req, so that the \a segment in - * the request message pointed to by req->rq_reqmsg can accommodate - * at least \a newsize bytes of data. - * - * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. - * - * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), - * gss_enlarge_reqbuf(). - */ - int (*enlarge_reqbuf)(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int segment, int newsize); - /* - * misc - */ - int (*display)(struct ptlrpc_sec *sec, struct seq_file *seq); -}; - -/** - * server side policy operation vector. - */ -struct ptlrpc_sec_sops { - /** - * Verify an incoming request. - * - * \pre the request message is pointed to by req->rq_reqbuf, size is - * req->rq_reqdata_len; and the message has been unpacked to - * host byte order. - * - * \retval SECSVC_OK success, req->rq_reqmsg points to the request - * message in clear text, size is req->rq_reqlen; req->rq_svc_ctx is - * set; req->rq_sp_from is decoded from the request. - * \retval SECSVC_COMPLETE success, the request has been fully - * processed, and the reply message has been prepared; req->rq_sp_from - * is decoded from the request. - * \retval SECSVC_DROP failed, this request should be dropped. - * - * \see null_accept(), plain_accept(), gss_svc_accept_kr().
- */ - int (*accept)(struct ptlrpc_request *req); - - /** - * Perform security transformation upon reply message. - * - * \pre reply message is pointed by req->rq_reply_state->rs_msg, size - * is req->rq_replen. - * \post req->rs_repdata_len is the final message size. - * \post req->rq_reply_off is set. - * - * \see null_authorize(), plain_authorize(), gss_svc_authorize(). - */ - int (*authorize)(struct ptlrpc_request *req); - - /** - * Invalidate server context \a ctx. - * - * \see gss_svc_invalidate_ctx(). - */ - void (*invalidate_ctx)(struct ptlrpc_svc_ctx *ctx); - - /** - * Allocate a ptlrpc_reply_state. - * - * \param msgsize size of the reply message in clear text. - * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we - * should simply use it; otherwise we'll responsible for allocating - * a new one. - * \post req->rq_reply_state != NULL; - * \post req->rq_reply_state->rs_msg != NULL; - * - * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). - */ - int (*alloc_rs)(struct ptlrpc_request *req, int msgsize); - - /** - * Free a ptlrpc_reply_state. - */ - void (*free_rs)(struct ptlrpc_reply_state *rs); - - /** - * Release the server context \a ctx. - * - * \see gss_svc_free_ctx(). - */ - void (*free_ctx)(struct ptlrpc_svc_ctx *ctx); - - /** - * Install a reverse context based on the server context \a ctx. - * - * \see gss_svc_install_rctx_kr(). - */ - int (*install_rctx)(struct obd_import *imp, struct ptlrpc_svc_ctx *ctx); - - /** - * Prepare buffer for incoming bulk write. - * - * \pre desc->bd_iov and desc->bd_iov_count describes the buffer - * intended to receive the write. - * - * \see gss_svc_prep_bulk(). - */ - int (*prep_bulk)(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); - - /** - * Unwrap the bulk write data. - * - * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). - */ - int (*unwrap_bulk)(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); - - /** - * Wrap the bulk read data. - * - * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). - */ - int (*wrap_bulk)(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); -}; - -struct ptlrpc_sec_policy { - struct module *sp_owner; - char *sp_name; - __u16 sp_policy; /* policy number */ - struct ptlrpc_sec_cops *sp_cops; /* client ops */ - struct ptlrpc_sec_sops *sp_sops; /* server ops */ -}; - -#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ -#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ -#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ -#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ -#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ - -/** - * The ptlrpc_sec represents the client side ptlrpc security facilities, - * each obd_import (both regular and reverse import) must associate with - * a ptlrpc_sec. - * - * \see sptlrpc_import_sec_adapt(). 
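
A policy module ties a client vector and a server vector together in a ptlrpc_sec_policy and registers it with sptlrpc_register_policy(), declared further down in this header. A sketch modeled on the in-tree plain policy (the ops structure names are assumptions, not verbatim from the tree):

    static struct ptlrpc_sec_policy plain_policy = {
            .sp_owner  = THIS_MODULE,
            .sp_name   = "plain",
            .sp_policy = SPTLRPC_POLICY_PLAIN,
            .sp_cops   = &plain_sec_cops,   /* client-side vector */
            .sp_sops   = &plain_sec_sops,   /* server-side vector */
    };

    static int __init sptlrpc_plain_init(void)
    {
            return sptlrpc_register_policy(&plain_policy);
    }
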
- */ -struct ptlrpc_sec { - struct ptlrpc_sec_policy *ps_policy; - atomic_t ps_refcount; - /** statistic only */ - atomic_t ps_nctx; - /** unique identifier */ - int ps_id; - struct sptlrpc_flavor ps_flvr; - enum lustre_sec_part ps_part; - /** after set, no more new context will be created */ - unsigned int ps_dying:1; - /** owning import */ - struct obd_import *ps_import; - spinlock_t ps_lock; - - /* - * garbage collection - */ - struct list_head ps_gc_list; - unsigned long ps_gc_interval; /* in seconds */ - time64_t ps_gc_next; /* in seconds */ -}; - -static inline int sec_is_reverse(struct ptlrpc_sec *sec) -{ - return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); -} - -static inline int sec_is_rootonly(struct ptlrpc_sec *sec) -{ - return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); -} - -struct ptlrpc_svc_ctx { - atomic_t sc_refcount; - struct ptlrpc_sec_policy *sc_policy; -}; - -/* - * user identity descriptor - */ -#define LUSTRE_MAX_GROUPS (128) - -struct ptlrpc_user_desc { - __u32 pud_uid; - __u32 pud_gid; - __u32 pud_fsuid; - __u32 pud_fsgid; - __u32 pud_cap; - __u32 pud_ngroups; - __u32 pud_groups[0]; -}; - -/* - * bulk flavors - */ -enum sptlrpc_bulk_hash_alg { - BULK_HASH_ALG_NULL = 0, - BULK_HASH_ALG_ADLER32, - BULK_HASH_ALG_CRC32, - BULK_HASH_ALG_MD5, - BULK_HASH_ALG_SHA1, - BULK_HASH_ALG_SHA256, - BULK_HASH_ALG_SHA384, - BULK_HASH_ALG_SHA512, - BULK_HASH_ALG_MAX -}; - -const char *sptlrpc_get_hash_name(__u8 hash_alg); -__u8 sptlrpc_get_hash_alg(const char *algname); - -enum { - BSD_FL_ERR = 1, -}; - -struct ptlrpc_bulk_sec_desc { - __u8 bsd_version; /* 0 */ - __u8 bsd_type; /* SPTLRPC_BULK_XXX */ - __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ - __u8 bsd_flags; /* flags */ - __u32 bsd_nob; /* nob of bulk data */ - __u8 bsd_data[0]; /* policy-specific token */ -}; - -/* - * round size up to next power of 2, for slab allocation. 
- * @size must be sane (can't overflow after round up) - */ -static inline int size_roundup_power2(int size) -{ - size--; - size |= size >> 1; - size |= size >> 2; - size |= size >> 4; - size |= size >> 8; - size |= size >> 16; - size++; - return size; -} - -/* - * internal support libraries - */ -void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, - int segment, int newsize); - -/* - * security policies - */ -int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); -int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); - -__u32 sptlrpc_name2flavor_base(const char *name); -const char *sptlrpc_flavor2name_base(__u32 flvr); -char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, - char *buf, int bufsize); -char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); - -static inline -struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) -{ - __module_get(policy->sp_owner); - return policy; -} - -static inline -void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) -{ - module_put(policy->sp_owner); -} - -/* - * client credential - */ -static inline -unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) -{ - return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); -} - -static inline -int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) -{ - return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); -} - -static inline -int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) -{ - return (cli_ctx_status(ctx) != 0); -} - -static inline -int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) -{ - return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); -} - -static inline -int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) -{ - return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); -} - -static inline -int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) -{ - return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); -} - -static inline -int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) -{ - return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); -} - -/* - * sec get/put - */ -void sptlrpc_sec_put(struct ptlrpc_sec *sec); - -/* - * internal apis which only used by policy implementation - */ -int sptlrpc_get_next_secid(void); - -/* - * exported client context api - */ -struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); -void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); - -/* - * exported client context wrap/buffers - */ -int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); -int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req); -int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); -void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); -int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); -void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); -int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, - int segment, int newsize); -int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, - struct ptlrpc_request **req_ret); -void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); - -void sptlrpc_request_out_callback(struct ptlrpc_request *req); - -/* - * exported higher interface of import & request - */ -int sptlrpc_import_sec_adapt(struct obd_import *imp, - struct ptlrpc_svc_ctx *ctx, - struct sptlrpc_flavor *flvr); -struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); -void sptlrpc_import_sec_put(struct obd_import *imp); - -int sptlrpc_import_check_ctx(struct obd_import *imp); -void sptlrpc_import_flush_root_ctx(struct obd_import 
*imp); -void sptlrpc_import_flush_my_ctx(struct obd_import *imp); -void sptlrpc_import_flush_all_ctx(struct obd_import *imp); -int sptlrpc_req_get_ctx(struct ptlrpc_request *req); -void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); -int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); -void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); - -/* gc */ -void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); -void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); - -/* misc */ -const char *sec2target_str(struct ptlrpc_sec *sec); -/* - * lprocfs - */ -int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); - -/* - * server side - */ -enum secsvc_accept_res { - SECSVC_OK = 0, - SECSVC_COMPLETE, - SECSVC_DROP, -}; - -int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); -int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); -int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); -void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); -void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); -void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req); - -int sptlrpc_target_export_check(struct obd_export *exp, - struct ptlrpc_request *req); - -/* bulk security api */ -void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); -int get_free_pages_in_pool(void); -int pool_is_at_full_capacity(void); - -int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); -int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc, - int nob); -int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc); - -/* bulk helpers (internal use only by policies) */ -int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, - void *buf, int buflen); - -int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); - -/* user descriptor helpers */ -static inline int sptlrpc_user_desc_size(int ngroups) -{ - return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); -} - -int sptlrpc_current_user_desc_size(void); -int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); -int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); - -enum { - LUSTRE_SEC_NONE = 0, - LUSTRE_SEC_REMOTE = 1, - LUSTRE_SEC_SPECIFY = 2, - LUSTRE_SEC_ALL = 3 -}; - -/** @} sptlrpc */ - -#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_swab.h b/drivers/staging/lustre/lustre/include/lustre_swab.h deleted file mode 100644 index 9d786bbe7f3f7b0b98fee1598a5b99c8bbe2774f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/lustre_swab.h +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
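
Looking back at size_roundup_power2(), defined a few lines up: it returns the next power of two at or above its argument, so that pre-allocated security payload buffers land exactly on kmalloc slab sizes. A few spot checks (illustrative only):

    static void roundup_spot_check(void)
    {
            LASSERT(size_roundup_power2(1000) == 1024);
            LASSERT(size_roundup_power2(1024) == 1024);
            LASSERT(size_roundup_power2(1025) == 2048);
    }
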
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2014, Intel Corporation. - * - * Copyright 2015 Cray Inc, all rights reserved. - * Author: Ben Evans. - * - * We assume all nodes are either little-endian or big-endian, and we - * always send messages in the sender's native format. The receiver - * detects the message format by checking the 'magic' field of the message - * (see lustre_msg_swabbed() below). - * - * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines - * are implemented in ptlrpc/lustre_swab.c. These 'swabbers' convert the - * type from "other" endian, in-place in the message buffer. - * - * A swabber takes a single pointer argument. The caller must already have - * verified that the length of the message buffer >= sizeof (type). - * - * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine - * may be defined that swabs just the variable part, after the caller has - * verified that the message buffer is large enough. - */ - -#ifndef _LUSTRE_SWAB_H_ -#define _LUSTRE_SWAB_H_ - -#include - -void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); -void lustre_swab_connect(struct obd_connect_data *ocd); -void lustre_swab_hsm_user_state(struct hsm_user_state *hus); -void lustre_swab_hsm_state_set(struct hsm_state_set *hss); -void lustre_swab_obd_statfs(struct obd_statfs *os); -void lustre_swab_obd_ioobj(struct obd_ioobj *ioo); -void lustre_swab_niobuf_remote(struct niobuf_remote *nbr); -void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); -void lustre_swab_ost_lvb(struct ost_lvb *lvb); -void lustre_swab_obd_quotactl(struct obd_quotactl *q); -void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); -void lustre_swab_generic_32s(__u32 *val); -void lustre_swab_mdt_body(struct mdt_body *b); -void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b); -void lustre_swab_mdt_rec_setattr(struct mdt_rec_setattr *sa); -void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); -void lustre_swab_lmv_desc(struct lmv_desc *ld); -void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm); -void lustre_swab_lov_desc(struct lov_desc *ld); -void lustre_swab_gl_desc(union ldlm_gl_desc *desc); -void lustre_swab_ldlm_intent(struct ldlm_intent *i); -void lustre_swab_ldlm_request(struct ldlm_request *rq); -void lustre_swab_ldlm_reply(struct ldlm_reply *r); -void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); -void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); -void lustre_swab_mgs_config_body(struct mgs_config_body *body); -void lustre_swab_mgs_config_res(struct mgs_config_res *body); -void lustre_swab_ost_body(struct ost_body *b); -void lustre_swab_ost_last_id(__u64 *id); -void lustre_swab_fiemap(struct fiemap *fiemap); -void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); -void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); -void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, - int stripe_count); -void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); -void lustre_swab_lustre_capa(struct lustre_capa *c); -void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); -void lustre_swab_fid2path(struct getinfo_fid2path *gf); -void lustre_swab_layout_intent(struct layout_intent *li); -void lustre_swab_hsm_user_state(struct hsm_user_state 
*hus); -void lustre_swab_hsm_current_action(struct hsm_current_action *action); -void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); -void lustre_swab_hsm_user_state(struct hsm_user_state *hus); -void lustre_swab_hsm_user_item(struct hsm_user_item *hui); -void lustre_swab_hsm_request(struct hsm_request *hr); -void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); -void lustre_swab_close_data(struct close_data *data); -void lustre_swab_lmv_user_md(struct lmv_user_md *lum); - -/* Functions for dumping PTLRPC fields */ -void dump_rniobuf(struct niobuf_remote *rnb); -void dump_ioo(struct obd_ioobj *nb); -void dump_ost_body(struct ost_body *ob); -void dump_rcs(__u32 *rc); - -#endif diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h deleted file mode 100644 index b1907bbffb19baf0bc3fc35f2288708aacc62162..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/obd.h +++ /dev/null @@ -1,1114 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __OBD_H -#define __OBD_H - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#define MAX_OBD_DEVICES 8192 - -struct osc_async_rc { - int ar_rc; - int ar_force_sync; - __u64 ar_min_xid; -}; - -struct lov_oinfo { /* per-stripe data structure */ - struct ost_id loi_oi; /* object ID/Sequence on the target OST */ - int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ - int loi_ost_gen; /* generation of this loi_ost_idx */ - - unsigned long loi_kms_valid:1; - __u64 loi_kms; /* known minimum size */ - struct ost_lvb loi_lvb; - struct osc_async_rc loi_ar; -}; - -static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) -{ - oinfo->loi_kms = kms; - oinfo->loi_kms_valid = 1; -} - -static inline void loi_init(struct lov_oinfo *loi) -{ -} - -struct lov_stripe_md; -struct obd_info; - -int lov_read_and_clear_async_rc(struct cl_object *clob); - -typedef int (*obd_enqueue_update_f)(void *cookie, int rc); - -/* obd info for a particular level (lov, osc). */ -struct obd_info { - /* OBD_STATFS_* flags */ - __u64 oi_flags; - /* lsm data specific for every OSC. */ - struct lov_stripe_md *oi_md; - /* statfs data specific for every OSC, if needed at all. */ - struct obd_statfs *oi_osfs; - /* An update callback which is called to update some data on upper - * level. E.g. 
it is used for update lsm->lsm_oinfo at every received - * request in osc level for enqueue requests. It is also possible to - * update some caller data from LOV layer if needed. - */ - obd_enqueue_update_f oi_cb_up; -}; - -struct obd_type { - struct list_head typ_chain; - struct obd_ops *typ_dt_ops; - struct md_ops *typ_md_ops; - struct dentry *typ_debugfs_entry; - char *typ_name; - int typ_refcnt; - struct lu_device_type *typ_lu; - spinlock_t obd_type_lock; - struct kobject *typ_kobj; -}; - -struct brw_page { - u64 off; - struct page *pg; - unsigned int count; - u32 flag; -}; - -struct timeout_item { - enum timeout_event ti_event; - unsigned long ti_timeout; - timeout_cb_t ti_cb; - void *ti_cb_data; - struct list_head ti_obd_list; - struct list_head ti_chain; -}; - -#define OBD_MAX_RIF_DEFAULT 8 -#define OBD_MAX_RIF_MAX 512 -#define OSC_MAX_RIF_MAX 256 -#define OSC_MAX_DIRTY_DEFAULT (OBD_MAX_RIF_DEFAULT * 4) -#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ -#define OSC_DEFAULT_RESENDS 10 - -/* possible values for fo_sync_lock_cancel */ -enum { - NEVER_SYNC_ON_CANCEL = 0, - BLOCKING_SYNC_ON_CANCEL = 1, - ALWAYS_SYNC_ON_CANCEL = 2, - NUM_SYNC_ON_CANCEL_STATES -}; - -enum obd_cl_sem_lock_class { - OBD_CLI_SEM_NORMAL, - OBD_CLI_SEM_MGC, - OBD_CLI_SEM_MDCOSC, -}; - -/* - * Limit reply buffer size for striping data to one x86_64 page. This - * value is chosen to fit the striping data for common use cases while - * staying well below the limit at which the buffer must be backed by - * vmalloc(). Excessive use of vmalloc() may cause spinlock contention - * on the MDS. - */ -#define OBD_MAX_DEFAULT_EA_SIZE 4096 - -struct mdc_rpc_lock; -struct obd_import; -struct client_obd { - struct rw_semaphore cl_sem; - struct obd_uuid cl_target_uuid; - struct obd_import *cl_import; /* ptlrpc connection state */ - size_t cl_conn_count; - /* - * Cache maximum and default values for easize. This is - * strictly a performance optimization to minimize calls to - * obd_size_diskmd(). The default values are used to calculate the - * initial size of a request buffer. The ptlrpc layer will resize the - * buffer as needed to accommodate a larger reply from the - * server. The default values should be small enough to avoid wasted - * memory and excessive use of vmalloc(), yet large enough to avoid - * reallocating the buffer in the common use case. - */ - /* - * Default EA size for striping attributes. It is initialized at - * mount-time based on the default stripe width of the filesystem, - * then it tracks the largest observed EA size advertised by - * the MDT, up to a maximum value of OBD_MAX_DEFAULT_EA_SIZE. - */ - u32 cl_default_mds_easize; - /* Maximum possible EA size computed at mount-time based on - * the number of OSTs in the filesystem. May be increased at - * run-time if a larger observed size is advertised by the MDT. - */ - u32 cl_max_mds_easize; - - enum lustre_sec_part cl_sp_me; - enum lustre_sec_part cl_sp_to; - struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ - - /* the grant values are protected by loi_list_lock below */ - unsigned long cl_dirty_pages; /* all _dirty_ in pages */ - unsigned long cl_dirty_max_pages; /* allowed w/o rpc */ - unsigned long cl_dirty_transit; /* dirty synchronous */ - unsigned long cl_avail_grant; /* bytes of credit for ost */ - unsigned long cl_lost_grant; /* lost credits (trunc) */ - - /* since we allocate grant by blocks, we don't know how many grant will - * be used to add a page into cache. 
As a solution, we reserve maximum - * grant before trying to dirty a page and unreserve the rest. - * See osc_{reserve|unreserve}_grant for details. - */ - long cl_reserved_grant; - struct list_head cl_cache_waiters; /* waiting for cache/grant */ - unsigned long cl_next_shrink_grant; /* jiffies */ - struct list_head cl_grant_shrink_list; /* Timeout event list */ - int cl_grant_shrink_interval; /* seconds */ - - /* A chunk is an optimal size used by osc_extent to determine - * the extent size. A chunk is max(PAGE_SIZE, OST block size) - */ - int cl_chunkbits; - unsigned int cl_extent_tax; /* extent overhead, by bytes */ - - /* keep track of objects that have lois that contain pages which - * have been queued for async brw. this lock also protects the - * lists of osc_client_pages that hang off of the loi - */ - /* - * ->cl_loi_list_lock protects consistency of - * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and - * ->ap_completion() call-backs are executed under this lock. As we - * cannot guarantee that these call-backs never block on all platforms - * (as a matter of fact they do block on Mac OS X), type of - * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux - * and blocking mutex on Mac OS X. (Alternative is to make this lock - * blocking everywhere, but we don't want to slow down fast-path of - * our main platform.) - * - * NB by Jinshan: though field names are still _loi_, but actually - * osc_object{}s are in the list. - */ - spinlock_t cl_loi_list_lock; - struct list_head cl_loi_ready_list; - struct list_head cl_loi_hp_ready_list; - struct list_head cl_loi_write_list; - struct list_head cl_loi_read_list; - __u32 cl_r_in_flight; - __u32 cl_w_in_flight; - /* just a sum of the loi/lop pending numbers to be exported by sysfs */ - atomic_t cl_pending_w_pages; - atomic_t cl_pending_r_pages; - __u32 cl_max_pages_per_rpc; - __u32 cl_max_rpcs_in_flight; - struct obd_histogram cl_read_rpc_hist; - struct obd_histogram cl_write_rpc_hist; - struct obd_histogram cl_read_page_hist; - struct obd_histogram cl_write_page_hist; - struct obd_histogram cl_read_offset_hist; - struct obd_histogram cl_write_offset_hist; - - /* LRU for osc caching pages */ - struct cl_client_cache *cl_cache; - /** member of cl_cache->ccc_lru */ - struct list_head cl_lru_osc; - /** # of available LRU slots left in the per-OSC cache. - * Available LRU slots are shared by all OSCs of the same file system, - * therefore this is a pointer to cl_client_cache::ccc_lru_left. - */ - atomic_long_t *cl_lru_left; - /** # of busy LRU pages. A page is considered busy if it's in writeback - * queue, or in transfer. Busy pages can't be discarded so they are not - * in LRU cache. - */ - atomic_long_t cl_lru_busy; - /** # of LRU pages in the cache for this client_obd */ - atomic_long_t cl_lru_in_list; - /** # of threads are shrinking LRU cache. To avoid contention, it's not - * allowed to have multiple threads shrinking LRU cache. - */ - atomic_t cl_lru_shrinkers; - /** The time when this LRU cache was last used. */ - time64_t cl_lru_last_used; - /** stats: how many reclaims have happened for this client_obd. - * reclaim and shrink - shrink is async, voluntarily rebalancing; - * reclaim is sync, initiated by IO thread when the LRU slots are - * in shortage. - */ - u64 cl_lru_reclaim; - /** List of LRU pages for this client_obd */ - struct list_head cl_lru_list; - /** Lock for LRU page list */ - spinlock_t cl_lru_list_lock; - /** # of unstable pages in this client_obd. 
- * An unstable page is a page state that WRITE RPC has finished but - * the transaction has NOT yet committed. - */ - atomic_long_t cl_unstable_count; - /** Link to osc_shrinker_list */ - struct list_head cl_shrink_list; - - /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ - atomic_t cl_destroy_in_flight; - wait_queue_head_t cl_destroy_waitq; - - struct mdc_rpc_lock *cl_rpc_lock; - - /* modify rpcs in flight - * currently used for metadata only - */ - spinlock_t cl_mod_rpcs_lock; - u16 cl_max_mod_rpcs_in_flight; - u16 cl_mod_rpcs_in_flight; - u16 cl_close_rpcs_in_flight; - wait_queue_head_t cl_mod_rpcs_waitq; - unsigned long *cl_mod_tag_bitmap; - struct obd_histogram cl_mod_rpcs_hist; - - /* mgc datastruct */ - atomic_t cl_mgc_refcount; - struct obd_export *cl_mgc_mgsexp; - - /* checksumming for data sent over the network */ - unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */ - /* supported checksum types that are worked out at connect time */ - __u32 cl_supp_cksum_types; - /* checksum algorithm to be used */ - enum cksum_type cl_cksum_type; - - /* also protected by the poorly named _loi_list_lock lock above */ - struct osc_async_rc cl_ar; - - /* sequence manager */ - struct lu_client_seq *cl_seq; - - atomic_t cl_resends; /* resend count */ - - /* ptlrpc work for writeback in ptlrpcd context */ - void *cl_writeback_work; - void *cl_lru_work; - /* hash tables for osc_quota_info */ - struct rhashtable cl_quota_hash[MAXQUOTAS]; -}; - -#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) - -struct obd_id_info { - __u32 idx; - u64 *data; -}; - -struct echo_client_obd { - struct obd_export *ec_exp; /* the local connection to osc/lov */ - spinlock_t ec_lock; - struct list_head ec_objects; - struct list_head ec_locks; - __u64 ec_unique; -}; - -/* Generic subset of OSTs */ -struct ost_pool { - __u32 *op_array; /* array of index of lov_obd->lov_tgts */ - unsigned int op_count; /* number of OSTs in the array */ - unsigned int op_size; /* allocated size of lp_array */ - struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ -}; - -/* allow statfs data caching for 1 second */ -#define OBD_STATFS_CACHE_SECONDS 1 - -struct lov_tgt_desc { - struct list_head ltd_kill; - struct obd_uuid ltd_uuid; - struct obd_device *ltd_obd; - struct obd_export *ltd_exp; - __u32 ltd_gen; - __u32 ltd_index; /* index in lov_obd->tgts */ - unsigned long ltd_active:1,/* is this target up for requests */ - ltd_activate:1,/* should target be activated */ - ltd_reap:1; /* should this target be deleted */ -}; - -struct lov_obd { - struct lov_desc desc; - struct lov_tgt_desc **lov_tgts; /* sparse array */ - struct ost_pool lov_packed; /* all OSTs in a packed array */ - struct mutex lov_lock; - struct obd_connect_data lov_ocd; - atomic_t lov_refcount; - __u32 lov_death_row;/* tgts scheduled to be deleted */ - __u32 lov_tgt_size; /* size of tgts array */ - int lov_connects; - int lov_pool_count; - struct rhashtable lov_pools_hash_body; /* used for key access */ - struct list_head lov_pool_list; /* used for sequential access */ - struct dentry *lov_pool_debugfs_entry; - enum lustre_sec_part lov_sp_me; - - /* Cached LRU and unstable data from upper layer */ - struct cl_client_cache *lov_cache; - - struct rw_semaphore lov_notify_lock; - - struct kobject *lov_tgts_kobj; -}; - -struct lmv_tgt_desc { - struct obd_uuid ltd_uuid; - struct obd_export *ltd_exp; - u32 ltd_idx; - struct mutex ltd_fid_mutex; - unsigned long ltd_active:1; /* target up for requests */ -}; - -struct lmv_obd { - 
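- /*
- * LMV is the metadata analogue of struct lov_obd above: it
- * aggregates one target (an MDC) per MDT behind a single
- * interface.
- */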
struct lu_client_fld lmv_fld; - spinlock_t lmv_lock; - struct lmv_desc desc; - struct obd_uuid cluuid; - - struct mutex lmv_init_mutex; - int connected; - int max_easize; - int max_def_easize; - - u32 tgts_size; /* size of tgts array */ - struct lmv_tgt_desc **tgts; - - struct obd_connect_data conn_data; - struct kobject *lmv_tgts_kobj; -}; - -struct niobuf_local { - __u64 lnb_file_offset; - __u32 lnb_page_offset; - __u32 lnb_len; - __u32 lnb_flags; - int lnb_rc; - struct page *lnb_page; - void *lnb_data; -}; - -#define LUSTRE_FLD_NAME "fld" -#define LUSTRE_SEQ_NAME "seq" - -#define LUSTRE_MDD_NAME "mdd" -#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" -#define LUSTRE_OSD_ZFS_NAME "osd-zfs" -#define LUSTRE_VVP_NAME "vvp" -#define LUSTRE_LMV_NAME "lmv" -#define LUSTRE_SLP_NAME "slp" -#define LUSTRE_LOD_NAME "lod" -#define LUSTRE_OSP_NAME "osp" -#define LUSTRE_LWP_NAME "lwp" - -/* obd device type names */ - /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ -#define LUSTRE_MDS_NAME "mds" -#define LUSTRE_MDT_NAME "mdt" -#define LUSTRE_MDC_NAME "mdc" -#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ -#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ -#define LUSTRE_OSC_NAME "osc" -#define LUSTRE_LOV_NAME "lov" -#define LUSTRE_MGS_NAME "mgs" -#define LUSTRE_MGC_NAME "mgc" - -#define LUSTRE_ECHO_NAME "obdecho" -#define LUSTRE_ECHO_CLIENT_NAME "echo_client" -#define LUSTRE_QMT_NAME "qmt" - -/* Constant obd names (post-rename) */ -#define LUSTRE_MDS_OBDNAME "MDS" -#define LUSTRE_OSS_OBDNAME "OSS" -#define LUSTRE_MGS_OBDNAME "MGS" -#define LUSTRE_MGC_OBDNAME "MGC" - -/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ -#define N_LOCAL_TEMP_PAGE 0x10000000 - -/* - * Events signalled through obd_notify() upcall-chain. - */ -enum obd_notify_event { - /* Device connect start */ - OBD_NOTIFY_CONNECT, - /* Device activated */ - OBD_NOTIFY_ACTIVE, - /* Device deactivated */ - OBD_NOTIFY_INACTIVE, - /* Connect data for import were changed */ - OBD_NOTIFY_OCD, - /* Sync request */ - OBD_NOTIFY_SYNC_NONBLOCK, - OBD_NOTIFY_SYNC, - /* Configuration event */ - OBD_NOTIFY_CONFIG, - /* Administratively deactivate/activate event */ - OBD_NOTIFY_DEACTIVATE, - OBD_NOTIFY_ACTIVATE -}; - -/* - * Data structure used to pass obd_notify()-event to non-obd listeners (llite - * being main example). 
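- * The listener registers onu_upcall and receives onu_owner back
- * verbatim on every event, so it can recover its private state
- * without a global lookup.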
- */ -struct obd_notify_upcall { - int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, - enum obd_notify_event ev, void *owner, void *data); - /* Opaque datum supplied by upper layer listener */ - void *onu_owner; -}; - -struct target_recovery_data { - svc_handler_t trd_recovery_handler; - pid_t trd_processing_task; - struct completion trd_starting; - struct completion trd_finishing; -}; - -struct obd_llog_group { - struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; - wait_queue_head_t olg_waitq; - spinlock_t olg_lock; - struct mutex olg_cat_processing; -}; - -/* corresponds to one of the obd's */ -#define OBD_DEVICE_MAGIC 0XAB5CD6EF - -struct lvfs_run_ctxt { - struct dt_device *dt; -}; - -struct obd_device { - struct obd_type *obd_type; - u32 obd_magic; /* OBD_DEVICE_MAGIC */ - int obd_minor; /* device number: lctl dl */ - struct lu_device *obd_lu_dev; - - /* common and UUID name of this device */ - struct obd_uuid obd_uuid; - char obd_name[MAX_OBD_NAME]; - - /* bitfield modification is protected by obd_dev_lock */ - unsigned long obd_attached:1, /* finished attach */ - obd_set_up:1, /* finished setup */ - obd_version_recov:1, /* obd uses version checking */ - obd_replayable:1,/* recovery is enabled; inform clients */ - obd_no_transno:1, /* no committed-transno notification */ - obd_no_recov:1, /* fail instead of retry messages */ - obd_stopping:1, /* started cleanup */ - obd_starting:1, /* started setup */ - obd_force:1, /* cleanup with > 0 obd refcount */ - obd_fail:1, /* cleanup with failover */ - obd_no_conn:1, /* deny new connections */ - obd_inactive:1, /* device active/inactive - * (for sysfs status only!!) - */ - obd_no_ir:1, /* no imperative recovery. */ - obd_process_conf:1; /* device is processing mgs config */ - /* use separate field as it is set in interrupt to don't mess with - * protection of other bits using _bh lock - */ - unsigned long obd_recovery_expired:1; - /* uuid-export hash body */ - struct rhashtable obd_uuid_hash; - wait_queue_head_t obd_refcount_waitq; - struct list_head obd_exports; - struct list_head obd_unlinked_exports; - struct list_head obd_delayed_exports; - atomic_t obd_refcount; - int obd_num_exports; - spinlock_t obd_nid_lock; - struct ldlm_namespace *obd_namespace; - struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ - /* a spinlock is OK for what we do now, may need a semaphore later */ - spinlock_t obd_dev_lock; /* protect OBD bitfield above */ - spinlock_t obd_osfs_lock; - struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ - __u64 obd_osfs_age; - u64 obd_last_committed; - struct mutex obd_dev_mutex; - struct lvfs_run_ctxt obd_lvfs_ctxt; - struct obd_llog_group obd_olg; /* default llog group */ - struct obd_device *obd_observer; - struct rw_semaphore obd_observer_link_sem; - struct obd_notify_upcall obd_upcall; - struct obd_export *obd_self_export; - - union { - struct client_obd cli; - struct echo_client_obd echo_client; - struct lov_obd lov; - struct lmv_obd lmv; - } u; - - /* Fields used by LProcFS */ - struct lprocfs_stats *obd_stats; - unsigned int obd_cntr_base; - - struct lprocfs_stats *md_stats; - unsigned int md_cntr_base; - - struct dentry *obd_debugfs_entry; - struct dentry *obd_svc_debugfs_entry; - struct lprocfs_stats *obd_svc_stats; - atomic_t obd_evict_inprogress; - wait_queue_head_t obd_evict_inprogress_waitq; - struct list_head obd_evict_list; /* protected with pet_lock */ - - /** - * Ldlm pool part. Save last calculated SLV and Limit. 
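- * (SLV is the "server lock volume" that the ldlm pool code uses
- * to pace client lock cancellation; both values are refreshed
- * periodically and protected by obd_pool_lock below.)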
- */ - rwlock_t obd_pool_lock; - u64 obd_pool_slv; - int obd_pool_limit; - - int obd_conn_inprogress; - - /** - * A list of outstanding class_incref()'s against this obd. For - * debugging. - */ - struct lu_ref obd_reference; - - struct kobject obd_kobj; /* sysfs object */ - struct completion obd_kobj_unregister; -}; - -int obd_uuid_add(struct obd_device *obd, struct obd_export *export); -void obd_uuid_del(struct obd_device *obd, struct obd_export *export); - -/* get/set_info keys */ -#define KEY_ASYNC "async" -#define KEY_CHANGELOG_CLEAR "changelog_clear" -#define KEY_FID2PATH "fid2path" -#define KEY_CHECKSUM "checksum" -#define KEY_CLEAR_FS "clear_fs" -#define KEY_CONN_DATA "conn_data" -#define KEY_EVICT_BY_NID "evict_by_nid" -#define KEY_FIEMAP "fiemap" -#define KEY_FLUSH_CTX "flush_ctx" -#define KEY_GRANT_SHRINK "grant_shrink" -#define KEY_HSM_COPYTOOL_SEND "hsm_send" -#define KEY_INIT_RECOV_BACKUP "init_recov_bk" -#define KEY_INTERMDS "inter_mds" -#define KEY_LAST_ID "last_id" -#define KEY_LAST_FID "last_fid" -#define KEY_MAX_EASIZE "max_easize" -#define KEY_DEFAULT_EASIZE "default_easize" -#define KEY_MGSSEC "mgssec" -#define KEY_READ_ONLY "read-only" -#define KEY_REGISTER_TARGET "register_target" -#define KEY_SET_FS "set_fs" -#define KEY_TGT_COUNT "tgt_count" -/* KEY_SET_INFO in lustre_idl.h */ -#define KEY_SPTLRPC_CONF "sptlrpc_conf" - -#define KEY_CACHE_SET "cache_set" -#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" - -struct lu_context; - -static inline int it_to_lock_mode(struct lookup_intent *it) -{ - /* CREAT needs to be tested before open (both could be set) */ - if (it->it_op & IT_CREAT) - return LCK_CW; - else if (it->it_op & (IT_GETATTR | IT_OPEN | IT_LOOKUP | - IT_LAYOUT)) - return LCK_CR; - else if (it->it_op & IT_READDIR) - return LCK_PR; - else if (it->it_op & IT_GETXATTR) - return LCK_PR; - else if (it->it_op & IT_SETXATTR) - return LCK_PW; - - LASSERTF(0, "Invalid it_op: %d\n", it->it_op); - return -EINVAL; -} - -enum md_op_flags { - MF_MDC_CANCEL_FID1 = BIT(0), - MF_MDC_CANCEL_FID2 = BIT(1), - MF_MDC_CANCEL_FID3 = BIT(2), - MF_MDC_CANCEL_FID4 = BIT(3), - MF_GET_MDT_IDX = BIT(4), -}; - -enum md_cli_flags { - CLI_SET_MEA = BIT(0), - CLI_RM_ENTRY = BIT(1), - CLI_HASH64 = BIT(2), - CLI_API32 = BIT(3), - CLI_MIGRATE = BIT(4), -}; - -/** - * GETXATTR is not included as only a couple of fields in the reply body - * is filled, but not FID which is needed for common intent handling in - * mdc_finish_intent_lock() - */ -static inline bool it_has_reply_body(const struct lookup_intent *it) -{ - return it->it_op & (IT_OPEN | IT_UNLINK | IT_LOOKUP | IT_GETATTR); -} - -struct md_op_data { - struct lu_fid op_fid1; /* operation fid1 (usually parent) */ - struct lu_fid op_fid2; /* operation fid2 (usually child) */ - struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ - struct lu_fid op_fid4; /* to the operation locks. */ - u32 op_mds; /* what mds server open will go to */ - struct lustre_handle op_handle; - s64 op_mod_time; - const char *op_name; - size_t op_namelen; - __u32 op_mode; - struct lmv_stripe_md *op_mea1; - struct lmv_stripe_md *op_mea2; - __u32 op_suppgids[2]; - __u32 op_fsuid; - __u32 op_fsgid; - kernel_cap_t op_cap; - void *op_data; - size_t op_data_size; - - /* iattr fields and blocks. */ - struct iattr op_attr; - unsigned int op_attr_flags; - __u64 op_valid; - loff_t op_attr_blocks; - - __u32 op_flags; - - /* Various operation flags. 
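- * Biases select variant server-side semantics for a single
- * request, e.g. an HSM-release close (cf. op_data_version below).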
*/ - enum mds_op_bias op_bias; - - /* Used by readdir */ - __u64 op_offset; - - /* Used by readdir */ - __u32 op_max_pages; - - /* used to transfer info between the stacks of MD client - * see enum op_cli_flags - */ - enum md_cli_flags op_cli_flags; - - /* File object data version for HSM release, on client */ - __u64 op_data_version; - struct lustre_handle op_lease_handle; - - /* default stripe offset */ - __u32 op_default_stripe_offset; -}; - -struct md_callback { - int (*md_blocking_ast)(struct ldlm_lock *lock, - struct ldlm_lock_desc *desc, - void *data, int flag); -}; - -struct md_enqueue_info; -/* metadata stat-ahead */ - -struct md_enqueue_info { - struct md_op_data mi_data; - struct lookup_intent mi_it; - struct lustre_handle mi_lockh; - struct inode *mi_dir; - struct ldlm_enqueue_info mi_einfo; - int (*mi_cb)(struct ptlrpc_request *req, - struct md_enqueue_info *minfo, int rc); - void *mi_cbdata; -}; - -struct obd_ops { - struct module *owner; - int (*iocontrol)(unsigned int cmd, struct obd_export *exp, int len, - void *karg, void __user *uarg); - int (*get_info)(const struct lu_env *env, struct obd_export *, - __u32 keylen, void *key, __u32 *vallen, void *val); - int (*set_info_async)(const struct lu_env *, struct obd_export *, - __u32 keylen, void *key, - __u32 vallen, void *val, - struct ptlrpc_request_set *set); - int (*setup)(struct obd_device *dev, struct lustre_cfg *cfg); - int (*precleanup)(struct obd_device *dev); - int (*cleanup)(struct obd_device *dev); - int (*process_config)(struct obd_device *dev, u32 len, void *data); - int (*postrecov)(struct obd_device *dev); - int (*add_conn)(struct obd_import *imp, struct obd_uuid *uuid, - int priority); - int (*del_conn)(struct obd_import *imp, struct obd_uuid *uuid); - /* connect to the target device with given connection - * data. @ocd->ocd_connect_flags is modified to reflect flags actually - * granted by the target, which are guaranteed to be a subset of flags - * asked for. If @ocd == NULL, use default parameters. - */ - int (*connect)(const struct lu_env *env, - struct obd_export **exp, struct obd_device *src, - struct obd_uuid *cluuid, struct obd_connect_data *ocd, - void *localdata); - int (*reconnect)(const struct lu_env *env, - struct obd_export *exp, struct obd_device *src, - struct obd_uuid *cluuid, - struct obd_connect_data *ocd, - void *localdata); - int (*disconnect)(struct obd_export *exp); - - /* Initialize/finalize fids infrastructure. */ - int (*fid_init)(struct obd_device *obd, - struct obd_export *exp, enum lu_cli_type type); - int (*fid_fini)(struct obd_device *obd); - - /* Allocate new fid according to passed @hint. */ - int (*fid_alloc)(const struct lu_env *env, struct obd_export *exp, - struct lu_fid *fid, struct md_op_data *op_data); - - /* - * Object with @fid is getting deleted, we may want to do something - * about this. 
- */ - int (*statfs)(const struct lu_env *, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags); - int (*statfs_async)(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *set); - int (*create)(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa); - int (*destroy)(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa); - int (*setattr)(const struct lu_env *, struct obd_export *exp, - struct obdo *oa); - int (*getattr)(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa); - int (*preprw)(const struct lu_env *env, int cmd, - struct obd_export *exp, struct obdo *oa, int objcount, - struct obd_ioobj *obj, struct niobuf_remote *remote, - int *nr_pages, struct niobuf_local *local); - int (*commitrw)(const struct lu_env *env, int cmd, - struct obd_export *exp, struct obdo *oa, - int objcount, struct obd_ioobj *obj, - struct niobuf_remote *remote, int pages, - struct niobuf_local *local, int rc); - int (*init_export)(struct obd_export *exp); - int (*destroy_export)(struct obd_export *exp); - - /* metadata-only methods */ - int (*import_event)(struct obd_device *, struct obd_import *, - enum obd_import_event); - - int (*notify)(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev, void *data); - - int (*health_check)(const struct lu_env *env, struct obd_device *); - struct obd_uuid *(*get_uuid)(struct obd_export *exp); - - /* quota methods */ - int (*quotactl)(struct obd_device *, struct obd_export *, - struct obd_quotactl *); - - /* pools methods */ - int (*pool_new)(struct obd_device *obd, char *poolname); - int (*pool_del)(struct obd_device *obd, char *poolname); - int (*pool_add)(struct obd_device *obd, char *poolname, - char *ostname); - int (*pool_rem)(struct obd_device *obd, char *poolname, - char *ostname); - void (*getref)(struct obd_device *obd); - void (*putref)(struct obd_device *obd); - /* - * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line - * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. - * Also, add a wrapper function in include/linux/obd_class.h. 
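- * Illustrative sketch for a hypothetical new op
- * int (*foo)(struct obd_export *exp): add
- *     LPROCFS_OBD_OP_INIT(num_private_stats, stats, foo);
- * to lprocfs_alloc_obd_stats(), plus a static inline obd_foo()
- * wrapper following the EXP_CHECK_DT_OP()/EXP_COUNTER_INCREMENT()
- * pattern used throughout obd_class.h.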
- */ -}; - -/* lmv structures */ -struct lustre_md { - struct mdt_body *body; - struct lu_buf layout; - struct lmv_stripe_md *lmv; -#ifdef CONFIG_FS_POSIX_ACL - struct posix_acl *posix_acl; -#endif - struct mdt_remote_perm *remote_perm; -}; - -struct md_open_data { - struct obd_client_handle *mod_och; - struct ptlrpc_request *mod_open_req; - struct ptlrpc_request *mod_close_req; - atomic_t mod_refcount; - bool mod_is_create; -}; - -struct obd_client_handle { - struct lustre_handle och_fh; - struct lu_fid och_fid; - struct md_open_data *och_mod; - struct lustre_handle och_lease_handle; /* open lock for lease */ - __u32 och_magic; - fmode_t och_flags; -}; - -#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed - -struct lookup_intent; -struct cl_attr; - -struct md_ops { - int (*getstatus)(struct obd_export *, struct lu_fid *); - int (*null_inode)(struct obd_export *, const struct lu_fid *); - int (*close)(struct obd_export *, struct md_op_data *, - struct md_open_data *, struct ptlrpc_request **); - int (*create)(struct obd_export *, struct md_op_data *, - const void *, size_t, umode_t, uid_t, gid_t, - kernel_cap_t, __u64, struct ptlrpc_request **); - int (*enqueue)(struct obd_export *, struct ldlm_enqueue_info *, - const union ldlm_policy_data *, struct md_op_data *, - struct lustre_handle *, __u64); - int (*getattr)(struct obd_export *, struct md_op_data *, - struct ptlrpc_request **); - int (*getattr_name)(struct obd_export *, struct md_op_data *, - struct ptlrpc_request **); - int (*intent_lock)(struct obd_export *, struct md_op_data *, - struct lookup_intent *, - struct ptlrpc_request **, - ldlm_blocking_callback, __u64); - int (*link)(struct obd_export *, struct md_op_data *, - struct ptlrpc_request **); - int (*rename)(struct obd_export *, struct md_op_data *, - const char *, size_t, const char *, size_t, - struct ptlrpc_request **); - int (*setattr)(struct obd_export *, struct md_op_data *, void *, - size_t, struct ptlrpc_request **); - int (*sync)(struct obd_export *, const struct lu_fid *, - struct ptlrpc_request **); - int (*read_page)(struct obd_export *, struct md_op_data *, - struct md_callback *cb_op, __u64 hash_offset, - struct page **ppage); - int (*unlink)(struct obd_export *, struct md_op_data *, - struct ptlrpc_request **); - - int (*setxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, const void *, size_t, unsigned int, - u32, struct ptlrpc_request **); - - int (*getxattr)(struct obd_export *, const struct lu_fid *, - u64, const char *, size_t, struct ptlrpc_request **); - - int (*init_ea_size)(struct obd_export *, u32, u32); - - int (*get_lustre_md)(struct obd_export *, struct ptlrpc_request *, - struct obd_export *, struct obd_export *, - struct lustre_md *); - - int (*free_lustre_md)(struct obd_export *, struct lustre_md *); - - int (*merge_attr)(struct obd_export *, - const struct lmv_stripe_md *lsm, - struct cl_attr *attr, ldlm_blocking_callback); - - int (*set_open_replay_data)(struct obd_export *, - struct obd_client_handle *, - struct lookup_intent *); - int (*clear_open_replay_data)(struct obd_export *, - struct obd_client_handle *); - int (*set_lock_data)(struct obd_export *, const struct lustre_handle *, - void *, __u64 *); - - enum ldlm_mode (*lock_match)(struct obd_export *, __u64, - const struct lu_fid *, enum ldlm_type, - union ldlm_policy_data *, enum ldlm_mode, - struct lustre_handle *); - - int (*cancel_unused)(struct obd_export *, const struct lu_fid *, - union ldlm_policy_data *, enum ldlm_mode, - enum ldlm_cancel_flags flags, void 
*opaque); - - int (*get_fid_from_lsm)(struct obd_export *, - const struct lmv_stripe_md *, - const char *name, int namelen, - struct lu_fid *fid); - - int (*intent_getattr_async)(struct obd_export *, - struct md_enqueue_info *); - - int (*revalidate_lock)(struct obd_export *, struct lookup_intent *, - struct lu_fid *, __u64 *bits); - - int (*unpackmd)(struct obd_export *exp, struct lmv_stripe_md **plsm, - const union lmv_mds_md *lmv, size_t lmv_size); - /* - * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to - * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a - * wrapper function in include/linux/obd_class.h. - */ -}; - -static inline struct md_open_data *obd_mod_alloc(void) -{ - struct md_open_data *mod; - - mod = kzalloc(sizeof(*mod), GFP_NOFS); - if (!mod) - return NULL; - atomic_set(&mod->mod_refcount, 1); - return mod; -} - -#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) -#define obd_mod_put(mod) \ -({ \ - if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ - if ((mod)->mod_open_req) \ - ptlrpc_req_finished((mod)->mod_open_req); \ - kfree(mod); \ - } \ -}) - -void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid); -void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); - -/* return 1 if client should be resend request */ -static inline int client_should_resend(int resend, struct client_obd *cli) -{ - return atomic_read(&cli->cl_resends) ? - atomic_read(&cli->cl_resends) > resend : 1; -} - -/** - * Return device name for this device - * - * XXX: lu_device is declared before obd_device, while a pointer pointing - * back to obd_device in lu_device, so this helper function defines here - * instead of in lu_object.h - */ -static inline const char *lu_dev_name(const struct lu_device *lu_dev) -{ - return lu_dev->ld_obd->obd_name; -} - -static inline bool filename_is_volatile(const char *name, size_t namelen, - int *idx) -{ - const char *start; - char *end; - - if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) - return false; - - /* caller does not care of idx */ - if (!idx) - return true; - - /* volatile file, the MDT can be set from name */ - /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ - /* if no MDT is specified, use std way */ - if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) - goto bad_format; - /* test for no MDT idx case */ - if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && - (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { - *idx = -1; - return true; - } - /* we have an idx, read it */ - start = name + LUSTRE_VOLATILE_HDR_LEN + 1; - *idx = simple_strtoul(start, &end, 0); - /* error cases: - * no digit, no trailing :, negative value - */ - if (((*idx == 0) && (end == start)) || - (*end != ':') || (*idx < 0)) - goto bad_format; - - return true; -bad_format: - /* bad format of mdt idx, we cannot return an error - * to caller so we use hash algo - */ - CERROR("Bad volatile file name format: %s\n", - name + LUSTRE_VOLATILE_HDR_LEN); - return false; -} - -static inline int cli_brw_size(struct obd_device *obd) -{ - return obd->u.cli.cl_max_pages_per_rpc << PAGE_SHIFT; -} - -/* - * when RPC size or the max RPCs in flight is increased, the max dirty pages - * of the client should be increased accordingly to avoid sending fragmented - * RPCs over the network when the client runs out of the maximum dirty space - * when so many RPCs are being generated. 
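- * For example, with cl_max_rpcs_in_flight = 8 and
- * cl_max_pages_per_rpc = 256, the floor becomes 8 * 256 = 2048
- * dirty pages (8 MiB with 4 KiB pages), still subject to the
- * totalram_pages / 8 cap applied below.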
- */ -static inline void client_adjust_max_dirty(struct client_obd *cli) -{ - /* initializing */ - if (cli->cl_dirty_max_pages <= 0) - cli->cl_dirty_max_pages = - (OSC_MAX_DIRTY_DEFAULT * 1024 * 1024) >> PAGE_SHIFT; - else { - unsigned long dirty_max = cli->cl_max_rpcs_in_flight * - cli->cl_max_pages_per_rpc; - - if (dirty_max > cli->cl_dirty_max_pages) - cli->cl_dirty_max_pages = dirty_max; - } - - if (cli->cl_dirty_max_pages > totalram_pages / 8) - cli->cl_dirty_max_pages = totalram_pages / 8; -} - -#endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h deleted file mode 100644 index e5f7bb20415db3a4e854d37b57a9fc3f20816790..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/obd_cksum.h +++ /dev/null @@ -1,153 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __OBD_CKSUM -#define __OBD_CKSUM -#include -#include -#include - -static inline unsigned char cksum_obd2cfs(enum cksum_type cksum_type) -{ - switch (cksum_type) { - case OBD_CKSUM_CRC32: - return CFS_HASH_ALG_CRC32; - case OBD_CKSUM_ADLER: - return CFS_HASH_ALG_ADLER32; - case OBD_CKSUM_CRC32C: - return CFS_HASH_ALG_CRC32C; - default: - CERROR("Unknown checksum type (%x)!!!\n", cksum_type); - LBUG(); - } - return 0; -} - -/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can - * only be a single checksum type per RPC. - * - * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask - * since they need to represent the full range of checksum algorithms that - * both the client and server can understand. - * - * In case of an unsupported types/flags we fall back to ADLER - * because that is supported by all clients since 1.8 - * - * In case multiple algorithms are supported the best one is used. 
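- * "Best" means fastest as measured by cfs_crypto_hash_speed() at
- * module load; e.g. when both CRC32C and ADLER are offered and the
- * CPU has hardware CRC32C support, OBD_FL_CKSUM_CRC32C wins.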
- */ -static inline u32 cksum_type_pack(enum cksum_type cksum_type) -{ - unsigned int performance = 0, tmp; - u32 flag = OBD_FL_CKSUM_ADLER; - - if (cksum_type & OBD_CKSUM_CRC32) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32; - } - } - if (cksum_type & OBD_CKSUM_CRC32C) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_CRC32C; - } - } - if (cksum_type & OBD_CKSUM_ADLER) { - tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); - if (tmp > performance) { - performance = tmp; - flag = OBD_FL_CKSUM_ADLER; - } - } - if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | - OBD_CKSUM_CRC32 | - OBD_CKSUM_ADLER)))) - CWARN("unknown cksum type %x\n", cksum_type); - - return flag; -} - -static inline enum cksum_type cksum_type_unpack(u32 o_flags) -{ - switch (o_flags & OBD_FL_CKSUM_ALL) { - case OBD_FL_CKSUM_CRC32C: - return OBD_CKSUM_CRC32C; - case OBD_FL_CKSUM_CRC32: - return OBD_CKSUM_CRC32; - default: - break; - } - - return OBD_CKSUM_ADLER; -} - -/* Return a bitmask of the checksum types supported on this system. - * 1.8 supported ADLER it is base and not depend on hw - * Client uses all available local algos - */ -static inline enum cksum_type cksum_types_supported_client(void) -{ - enum cksum_type ret = OBD_CKSUM_ADLER; - - CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), - cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); - - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) - ret |= OBD_CKSUM_CRC32C; - if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) - ret |= OBD_CKSUM_CRC32; - - return ret; -} - -/* Select the best checksum algorithm among those supplied in the cksum_types - * input. - * - * Currently, calling cksum_type_pack() with a mask will return the fastest - * checksum type due to its benchmarking at libcfs module load. - * Caution is advised, however, since what is fastest on a single client may - * not be the fastest or most efficient algorithm on the server. - */ -static inline enum cksum_type cksum_type_select(enum cksum_type cksum_types) -{ - return cksum_type_unpack(cksum_type_pack(cksum_types)); -} - -/* Checksum algorithm names. Must be defined in the same order as the - * OBD_CKSUM_* flags. - */ -#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} - -#endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h deleted file mode 100644 index fc9c7720fee03144fb11b030ca084f9b92c0a317..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/obd_class.h +++ /dev/null @@ -1,1603 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -#ifndef __CLASS_OBD_H -#define __CLASS_OBD_H - -#include -#include -#include -#include -#include -#include - -/* requests should be send without delay and resends for avoid deadlocks */ -#define OBD_STATFS_NODELAY 0x0001 -/* the statfs callback should not update obd_osfs_age */ -#define OBD_STATFS_FROM_CACHE 0x0002 -/* the statfs is only for retrieving information from MDT0 */ -#define OBD_STATFS_FOR_MDT0 0x0004 - -/* OBD Device Declarations */ -extern struct obd_device *obd_devs[MAX_OBD_DEVICES]; -extern rwlock_t obd_dev_lock; - -/* OBD Operations Declarations */ -struct obd_device *class_exp2obd(struct obd_export *exp); -int class_handle_ioctl(unsigned int cmd, unsigned long arg); -int lustre_get_jobid(char *jobid); - -struct lu_device_type; - -/* genops.c */ -extern struct list_head obd_types; -struct obd_export *class_conn2export(struct lustre_handle *conn); -int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, - const char *name, struct lu_device_type *ldt); -int class_unregister_type(const char *name); - -struct obd_device *class_newdev(const char *type_name, const char *name); -void class_release_dev(struct obd_device *obd); - -int class_name2dev(const char *name); -struct obd_device *class_name2obd(const char *name); -int class_uuid2dev(struct obd_uuid *uuid); -struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, - const char *typ_name, - struct obd_uuid *grp_uuid); -struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, - int *next); -struct obd_device *class_num2obd(int num); - -int class_notify_sptlrpc_conf(const char *fsname, int namelen); - -int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep); - -int obd_zombie_impexp_init(void); -void obd_zombie_impexp_stop(void); -void obd_zombie_barrier(void); - -int obd_get_request_slot(struct client_obd *cli); -void obd_put_request_slot(struct client_obd *cli); -__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli); -int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max); -int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, u16 max); -int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq); - -u16 obd_get_mod_rpc_slot(struct client_obd *cli, u32 opc, - struct lookup_intent *it); -void obd_put_mod_rpc_slot(struct client_obd *cli, u32 opc, - struct lookup_intent *it, u16 tag); - -struct llog_handle; -struct llog_rec_hdr; -typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *, - struct llog_rec_hdr *, void *); - -/* obd_config.c */ -char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index); -int class_process_config(struct lustre_cfg *lcfg); -int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, - struct lustre_cfg *lcfg, void *data); - -/* For interoperability */ -struct cfg_interop_param { - char *old_param; - char *new_param; -}; - -int class_find_param(char *buf, char *key, 
char **valp); -struct cfg_interop_param *class_find_old_param(const char *param, - struct cfg_interop_param *ptr); -int class_get_next_param(char **params, char *copy); -int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); -int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); -int class_parse_net(char *buf, u32 *net, char **endh); -int class_match_nid(char *buf, char *key, lnet_nid_t nid); -int class_match_net(char *buf, char *key, u32 net); - -struct obd_device *class_incref(struct obd_device *obd, - const char *scope, const void *source); -void class_decref(struct obd_device *obd, - const char *scope, const void *source); -int class_config_llog_handler(const struct lu_env *env, - struct llog_handle *handle, - struct llog_rec_hdr *rec, void *data); -int class_add_uuid(const char *uuid, __u64 nid); - -/* obdecho */ -void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars); - -#define CFG_F_START 0x01 /* Set when we start updating from a log */ -#define CFG_F_MARKER 0x02 /* We are within a maker */ -#define CFG_F_SKIP 0x04 /* We should ignore this cfg command */ -#define CFG_F_COMPAT146 0x08 /* Allow old-style logs */ -#define CFG_F_EXCLUDE 0x10 /* OST exclusion list */ - -/* Passed as data param to class_config_parse_llog */ -struct config_llog_instance { - char *cfg_obdname; - void *cfg_instance; - struct super_block *cfg_sb; - struct obd_uuid cfg_uuid; - llog_cb_t cfg_callback; - int cfg_last_idx; /* for partial llog processing */ - int cfg_flags; -}; - -int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, - char *name, struct config_llog_instance *cfg); -enum { - CONFIG_T_CONFIG = 0, - CONFIG_T_SPTLRPC = 1, - CONFIG_T_RECOVER = 2, - CONFIG_T_PARAMS = 3, - CONFIG_T_MAX = 4 -}; - -#define PARAMS_FILENAME "params" -#define LCTL_UPCALL "lctl" - -/* list of active configuration logs */ -struct config_llog_data { - struct ldlm_res_id cld_resid; - struct config_llog_instance cld_cfg; - struct list_head cld_list_chain; - atomic_t cld_refcount; - struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ - struct config_llog_data *cld_params; /* common parameters log */ - struct config_llog_data *cld_recover;/* imperative recover log */ - struct obd_export *cld_mgcexp; - struct mutex cld_lock; - int cld_type; - unsigned int cld_stopping:1, /* - * we were told to stop - * watching - */ - cld_lostlock:1; /* lock not requeued */ - char cld_logname[0]; -}; - -struct lustre_profile { - struct list_head lp_list; - char *lp_profile; - char *lp_dt; - char *lp_md; - int lp_refs; - bool lp_list_deleted; -}; - -struct lustre_profile *class_get_profile(const char *prof); -void class_del_profile(const char *prof); -void class_put_profile(struct lustre_profile *lprof); -void class_del_profiles(void); - -#if LUSTRE_TRACKS_LOCK_EXP_REFS - -void __class_export_add_lock_ref(struct obd_export *exp, - struct ldlm_lock *lock); -void __class_export_del_lock_ref(struct obd_export *exp, - struct ldlm_lock *lock); -extern void (*class_export_dump_hook)(struct obd_export *exp); - -#else - -#define __class_export_add_lock_ref(exp, lock) do {} while (0) -#define __class_export_del_lock_ref(exp, lock) do {} while (0) - -#endif - -/* genops.c */ -struct obd_export *class_export_get(struct obd_export *exp); -void class_export_put(struct obd_export *exp); -struct obd_export *class_new_export(struct obd_device *obddev, - struct obd_uuid *cluuid); -void class_unlink_export(struct obd_export *exp); - -struct obd_import *class_import_get(struct obd_import *imp); -void 
class_import_put(struct obd_import *imp); -struct obd_import *class_new_import(struct obd_device *obd); -void class_destroy_import(struct obd_import *exp); - -void class_put_type(struct obd_type *type); -int class_connect(struct lustre_handle *conn, struct obd_device *obd, - struct obd_uuid *cluuid); -int class_disconnect(struct obd_export *exp); -void class_fail_export(struct obd_export *exp); -int class_manual_cleanup(struct obd_device *obd); - -static inline void class_export_rpc_inc(struct obd_export *exp) -{ - atomic_inc(&(exp)->exp_rpc_count); - CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", - (exp), atomic_read(&(exp)->exp_rpc_count)); -} - -static inline void class_export_rpc_dec(struct obd_export *exp) -{ - LASSERT_ATOMIC_POS(&exp->exp_rpc_count); - atomic_dec(&(exp)->exp_rpc_count); - CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", - (exp), atomic_read(&(exp)->exp_rpc_count)); -} - -static inline struct obd_export *class_export_lock_get(struct obd_export *exp, - struct ldlm_lock *lock) -{ - atomic_inc(&(exp)->exp_locks_count); - __class_export_add_lock_ref(exp, lock); - CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", - (exp), atomic_read(&(exp)->exp_locks_count)); - return class_export_get(exp); -} - -static inline void class_export_lock_put(struct obd_export *exp, - struct ldlm_lock *lock) -{ - LASSERT_ATOMIC_POS(&exp->exp_locks_count); - atomic_dec(&(exp)->exp_locks_count); - __class_export_del_lock_ref(exp, lock); - CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", - (exp), atomic_read(&(exp)->exp_locks_count)); - class_export_put(exp); -} - -static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) -{ - return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | - (obd->obd_force ? 
OBD_OPT_FORCE : 0) | - 0); -} - -static inline int lprocfs_climp_check(struct obd_device *obd) -{ - down_read(&(obd)->u.cli.cl_sem); - if (!(obd)->u.cli.cl_import) { - up_read(&(obd)->u.cli.cl_sem); - return -ENODEV; - } - return 0; -} - -struct inode; -struct lu_attr; -struct obdo; - -void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj); - -#define OBT(dev) ((dev)->obd_type) -#define OBP(dev, op) ((dev)->obd_type->typ_dt_ops->op) -#define MDP(dev, op) ((dev)->obd_type->typ_md_ops->op) -#define CTXTP(ctxt, op) ((ctxt)->loc_logops->lop_##op) - -/* - * Ensure obd_setup: used for cleanup which must be called - * while obd is stopping - */ -static inline int obd_check_dev(struct obd_device *obd) -{ - if (!obd) { - CERROR("NULL device\n"); - return -ENODEV; - } - return 0; -} - -/* ensure obd_setup and !obd_stopping */ -static inline int obd_check_dev_active(struct obd_device *obd) -{ - int rc; - - rc = obd_check_dev(obd); - if (rc) - return rc; - if (!obd->obd_set_up || obd->obd_stopping) { - CERROR("Device %d not setup\n", obd->obd_minor); - return -ENODEV; - } - return rc; -} - -#define OBD_COUNTER_OFFSET(op) \ - ((offsetof(struct obd_ops, op) - \ - offsetof(struct obd_ops, iocontrol)) \ - / sizeof(((struct obd_ops *)(0))->iocontrol)) - -#define OBD_COUNTER_INCREMENT(obdx, op) \ -do { \ - if ((obdx)->obd_stats) { \ - unsigned int coffset; \ - coffset = (unsigned int)((obdx)->obd_cntr_base) + \ - OBD_COUNTER_OFFSET(op); \ - LASSERT(coffset < (obdx)->obd_stats->ls_num); \ - lprocfs_counter_incr((obdx)->obd_stats, coffset); \ - } \ -} while (0) - -#define EXP_COUNTER_INCREMENT(export, op) \ -do { \ - if ((export)->exp_obd->obd_stats) { \ - unsigned int coffset; \ - coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \ - OBD_COUNTER_OFFSET(op); \ - LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \ - lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \ - } \ -} while (0) - -#define MD_COUNTER_OFFSET(op) \ - ((offsetof(struct md_ops, op) - \ - offsetof(struct md_ops, getstatus)) \ - / sizeof(((struct md_ops *)(0))->getstatus)) - -#define MD_COUNTER_INCREMENT(obdx, op) \ -do { \ - if ((obd)->md_stats) { \ - unsigned int coffset; \ - coffset = (unsigned int)((obdx)->md_cntr_base) + \ - MD_COUNTER_OFFSET(op); \ - LASSERT(coffset < (obdx)->md_stats->ls_num); \ - lprocfs_counter_incr((obdx)->md_stats, coffset); \ - } \ -} while (0) - -#define EXP_MD_COUNTER_INCREMENT(export, op) \ -do { \ - if ((export)->exp_obd->obd_stats) { \ - unsigned int coffset; \ - coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \ - MD_COUNTER_OFFSET(op); \ - LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \ - lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \ - if ((export)->exp_md_stats) \ - lprocfs_counter_incr( \ - (export)->exp_md_stats, coffset); \ - } \ -} while (0) - -#define EXP_CHECK_MD_OP(exp, op) \ -do { \ - if (!(exp)) { \ - CERROR("obd_" #op ": NULL export\n"); \ - return -ENODEV; \ - } \ - if (!(exp)->exp_obd || !OBT((exp)->exp_obd)) { \ - CERROR("obd_" #op ": cleaned up obd\n"); \ - return -EOPNOTSUPP; \ - } \ - if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ - CERROR("obd_" #op ": dev %s/%d no operation\n", \ - (exp)->exp_obd->obd_name, \ - (exp)->exp_obd->obd_minor); \ - return -EOPNOTSUPP; \ - } \ -} while (0) - -#define OBD_CHECK_DT_OP(obd, op, err) \ -do { \ - if (!OBT(obd) || !OBP((obd), op)) { \ - if (err) \ - CERROR("obd_" #op ": dev %d no operation\n", \ - obd->obd_minor); \ - return err; \ - } \ -} while (0) - -#define 
EXP_CHECK_DT_OP(exp, op) \ -do { \ - if (!(exp)) { \ - CERROR("obd_" #op ": NULL export\n"); \ - return -ENODEV; \ - } \ - if (!(exp)->exp_obd || !OBT((exp)->exp_obd)) { \ - CERROR("obd_" #op ": cleaned up obd\n"); \ - return -EOPNOTSUPP; \ - } \ - if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \ - CERROR("obd_" #op ": dev %d no operation\n", \ - (exp)->exp_obd->obd_minor); \ - return -EOPNOTSUPP; \ - } \ -} while (0) - -#define CTXT_CHECK_OP(ctxt, op, err) \ -do { \ - if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ - if (err) \ - CERROR("lop_" #op ": dev %d no operation\n", \ - ctxt->loc_obd->obd_minor); \ - return err; \ - } \ -} while (0) - -static inline int class_devno_max(void) -{ - return MAX_OBD_DEVICES; -} - -static inline int obd_get_info(const struct lu_env *env, - struct obd_export *exp, __u32 keylen, - void *key, __u32 *vallen, void *val) -{ - int rc; - - EXP_CHECK_DT_OP(exp, get_info); - EXP_COUNTER_INCREMENT(exp, get_info); - - rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val); - return rc; -} - -static inline int obd_set_info_async(const struct lu_env *env, - struct obd_export *exp, u32 keylen, - void *key, u32 vallen, void *val, - struct ptlrpc_request_set *set) -{ - int rc; - - EXP_CHECK_DT_OP(exp, set_info_async); - EXP_COUNTER_INCREMENT(exp, set_info_async); - - rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, - val, set); - return rc; -} - -/* - * obd-lu integration. - * - * Functionality is being moved into new lu_device-based layering, but some - * pieces of configuration process are still based on obd devices. - * - * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully - * subsume ->o_setup() methods of obd devices they replace. The same for - * lu_device_operations::ldo_process_config() and ->o_process_config(). As a - * result, obd_setup() and obd_process_config() branch and call one XOR - * another. - * - * Yet neither lu_device_type_operations::ldto_device_fini() nor - * lu_device_type_operations::ldto_device_free() fully implement the - * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, - * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
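- * In short: for a device whose obd_type->typ_lu is set, obd_setup()
- * below dispatches to ldto_device_alloc(); otherwise it falls back
- * to the legacy ->o_setup() method. obd_cleanup() runs both paths.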
- */ - -static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) -{ - int rc; - struct lu_device_type *ldt; - struct lu_device *d; - - ldt = obd->obd_type->typ_lu; - if (ldt) { - struct lu_context session_ctx; - struct lu_env env; - - lu_context_init(&session_ctx, LCT_SESSION | LCT_SERVER_SESSION); - session_ctx.lc_thread = NULL; - lu_context_enter(&session_ctx); - - rc = lu_env_init(&env, ldt->ldt_ctx_tags); - if (rc == 0) { - env.le_ses = &session_ctx; - d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); - lu_env_fini(&env); - if (!IS_ERR(d)) { - obd->obd_lu_dev = d; - d->ld_obd = obd; - rc = 0; - } else { - rc = PTR_ERR(d); - } - } - lu_context_exit(&session_ctx); - lu_context_fini(&session_ctx); - - } else { - OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, setup); - rc = OBP(obd, setup)(obd, cfg); - } - return rc; -} - -static inline int obd_precleanup(struct obd_device *obd) -{ - int rc; - struct lu_device_type *ldt; - struct lu_device *d; - - rc = obd_check_dev(obd); - if (rc) - return rc; - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; - if (ldt && d) { - struct lu_env env; - - rc = lu_env_init(&env, ldt->ldt_ctx_tags); - if (!rc) { - ldt->ldt_ops->ldto_device_fini(&env, d); - lu_env_fini(&env); - } - } - OBD_CHECK_DT_OP(obd, precleanup, 0); - OBD_COUNTER_INCREMENT(obd, precleanup); - - rc = OBP(obd, precleanup)(obd); - return rc; -} - -static inline int obd_cleanup(struct obd_device *obd) -{ - int rc; - struct lu_device_type *ldt; - struct lu_device *d; - - rc = obd_check_dev(obd); - if (rc) - return rc; - - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; - if (ldt && d) { - struct lu_env env; - - rc = lu_env_init(&env, ldt->ldt_ctx_tags); - if (rc == 0) { - ldt->ldt_ops->ldto_device_free(&env, d); - lu_env_fini(&env); - obd->obd_lu_dev = NULL; - } - } - OBD_CHECK_DT_OP(obd, cleanup, 0); - OBD_COUNTER_INCREMENT(obd, cleanup); - - rc = OBP(obd, cleanup)(obd); - return rc; -} - -static inline void obd_cleanup_client_import(struct obd_device *obd) -{ - /* - * If we set up but never connected, the - * client import will not have been cleaned. 
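- * Tear it down here, under cl_sem, so nothing can race with a
- * late connection attempt.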
- */ - down_write(&obd->u.cli.cl_sem); - if (obd->u.cli.cl_import) { - struct obd_import *imp; - - imp = obd->u.cli.cl_import; - CDEBUG(D_CONFIG, "%s: client import never connected\n", - obd->obd_name); - ptlrpc_invalidate_import(imp); - client_destroy_import(imp); - obd->u.cli.cl_import = NULL; - } - up_write(&obd->u.cli.cl_sem); -} - -static inline int -obd_process_config(struct obd_device *obd, int datalen, void *data) -{ - int rc; - struct lu_device_type *ldt; - struct lu_device *d; - - rc = obd_check_dev(obd); - if (rc) - return rc; - - obd->obd_process_conf = 1; - ldt = obd->obd_type->typ_lu; - d = obd->obd_lu_dev; - if (ldt && d) { - struct lu_env env; - - rc = lu_env_init(&env, ldt->ldt_ctx_tags); - if (rc == 0) { - rc = d->ld_ops->ldo_process_config(&env, d, data); - lu_env_fini(&env); - } - } else { - OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); - rc = OBP(obd, process_config)(obd, datalen, data); - } - OBD_COUNTER_INCREMENT(obd, process_config); - obd->obd_process_conf = 0; - - return rc; -} - -static inline int obd_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *obdo) -{ - int rc; - - EXP_CHECK_DT_OP(exp, create); - EXP_COUNTER_INCREMENT(exp, create); - - rc = OBP(exp->exp_obd, create)(env, exp, obdo); - return rc; -} - -static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *obdo) -{ - int rc; - - EXP_CHECK_DT_OP(exp, destroy); - EXP_COUNTER_INCREMENT(exp, destroy); - - rc = OBP(exp->exp_obd, destroy)(env, exp, obdo); - return rc; -} - -static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - int rc; - - EXP_CHECK_DT_OP(exp, getattr); - EXP_COUNTER_INCREMENT(exp, getattr); - - rc = OBP(exp->exp_obd, getattr)(env, exp, oa); - return rc; -} - -static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - int rc; - - EXP_CHECK_DT_OP(exp, setattr); - EXP_COUNTER_INCREMENT(exp, setattr); - - rc = OBP(exp->exp_obd, setattr)(env, exp, oa); - return rc; -} - -static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, - int priority) -{ - struct obd_device *obd = imp->imp_obd; - int rc; - - rc = obd_check_dev_active(obd); - if (rc) - return rc; - OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, add_conn); - - rc = OBP(obd, add_conn)(imp, uuid, priority); - return rc; -} - -static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) -{ - struct obd_device *obd = imp->imp_obd; - int rc; - - rc = obd_check_dev_active(obd); - if (rc) - return rc; - OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, del_conn); - - rc = OBP(obd, del_conn)(imp, uuid); - return rc; -} - -static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) -{ - struct obd_uuid *uuid; - - OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); - EXP_COUNTER_INCREMENT(exp, get_uuid); - - uuid = OBP(exp->exp_obd, get_uuid)(exp); - return uuid; -} - -/* - * Create a new /a exp on device /a obd for the uuid /a cluuid - * @param exp New export handle - * @param d Connect data, supported flags are set, flags also understood - * by obd are returned. - */ -static inline int obd_connect(const struct lu_env *env, - struct obd_export **exp, struct obd_device *obd, - struct obd_uuid *cluuid, - struct obd_connect_data *data, - void *localdata) -{ - int rc; - __u64 ocf = data ? 
data->ocd_connect_flags : 0; /* - * for post-condition - * check - */ - - rc = obd_check_dev_active(obd); - if (rc) - return rc; - OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, connect); - - rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); - /* check that only subset is granted */ - LASSERT(ergo(data, (data->ocd_connect_flags & ocf) == - data->ocd_connect_flags)); - return rc; -} - -static inline int obd_reconnect(const struct lu_env *env, - struct obd_export *exp, - struct obd_device *obd, - struct obd_uuid *cluuid, - struct obd_connect_data *d, - void *localdata) -{ - int rc; - __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition check */ - - rc = obd_check_dev_active(obd); - if (rc) - return rc; - OBD_CHECK_DT_OP(obd, reconnect, 0); - OBD_COUNTER_INCREMENT(obd, reconnect); - - rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); - /* check that only subset is granted */ - LASSERT(ergo(d, (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); - return rc; -} - -static inline int obd_disconnect(struct obd_export *exp) -{ - int rc; - - EXP_CHECK_DT_OP(exp, disconnect); - EXP_COUNTER_INCREMENT(exp, disconnect); - - rc = OBP(exp->exp_obd, disconnect)(exp); - return rc; -} - -static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, - enum lu_cli_type type) -{ - int rc; - - OBD_CHECK_DT_OP(obd, fid_init, 0); - OBD_COUNTER_INCREMENT(obd, fid_init); - - rc = OBP(obd, fid_init)(obd, exp, type); - return rc; -} - -static inline int obd_fid_fini(struct obd_device *obd) -{ - int rc; - - OBD_CHECK_DT_OP(obd, fid_fini, 0); - OBD_COUNTER_INCREMENT(obd, fid_fini); - - rc = OBP(obd, fid_fini)(obd); - return rc; -} - -static inline int obd_fid_alloc(const struct lu_env *env, - struct obd_export *exp, - struct lu_fid *fid, - struct md_op_data *op_data) -{ - int rc; - - EXP_CHECK_DT_OP(exp, fid_alloc); - EXP_COUNTER_INCREMENT(exp, fid_alloc); - - rc = OBP(exp->exp_obd, fid_alloc)(env, exp, fid, op_data); - return rc; -} - -static inline int obd_pool_new(struct obd_device *obd, char *poolname) -{ - int rc; - - OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_new); - - rc = OBP(obd, pool_new)(obd, poolname); - return rc; -} - -static inline int obd_pool_del(struct obd_device *obd, char *poolname) -{ - int rc; - - OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_del); - - rc = OBP(obd, pool_del)(obd, poolname); - return rc; -} - -static inline int obd_pool_add(struct obd_device *obd, - char *poolname, - char *ostname) -{ - int rc; - - OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_add); - - rc = OBP(obd, pool_add)(obd, poolname, ostname); - return rc; -} - -static inline int obd_pool_rem(struct obd_device *obd, - char *poolname, - char *ostname) -{ - int rc; - - OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, pool_rem); - - rc = OBP(obd, pool_rem)(obd, poolname, ostname); - return rc; -} - -static inline void obd_getref(struct obd_device *obd) -{ - if (OBT(obd) && OBP(obd, getref)) { - OBD_COUNTER_INCREMENT(obd, getref); - OBP(obd, getref)(obd); - } -} - -static inline void obd_putref(struct obd_device *obd) -{ - if (OBT(obd) && OBP(obd, putref)) { - OBD_COUNTER_INCREMENT(obd, putref); - OBP(obd, putref)(obd); - } -} - -static inline int obd_init_export(struct obd_export *exp) -{ - int rc = 0; - - if ((exp)->exp_obd && OBT((exp)->exp_obd) && - OBP((exp)->exp_obd, init_export)) - rc = OBP(exp->exp_obd, 
init_export)(exp); - return rc; -} - -static inline int obd_destroy_export(struct obd_export *exp) -{ - if ((exp)->exp_obd && OBT((exp)->exp_obd) && - OBP((exp)->exp_obd, destroy_export)) - OBP(exp->exp_obd, destroy_export)(exp); - return 0; -} - -/* - * @max_age is the oldest time in jiffies that we accept using a cached data. - * If the cache is older than @max_age we will get a new value from the - * target. Use a value of "jiffies + HZ" to guarantee freshness. - */ -static inline int obd_statfs_async(struct obd_export *exp, - struct obd_info *oinfo, - __u64 max_age, - struct ptlrpc_request_set *rqset) -{ - int rc = 0; - struct obd_device *obd; - - if (!exp || !exp->exp_obd) - return -EINVAL; - - obd = exp->exp_obd; - OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, statfs); - - CDEBUG(D_SUPER, "%s: osfs %p age %llu, max_age %llu\n", - obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); - if (time_before64(obd->obd_osfs_age, max_age)) { - rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); - } else { - CDEBUG(D_SUPER, - "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, - obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); - spin_lock(&obd->obd_osfs_lock); - memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); - spin_unlock(&obd->obd_osfs_lock); - oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; - if (oinfo->oi_cb_up) - oinfo->oi_cb_up(oinfo, 0); - } - return rc; -} - -static inline int obd_statfs_rqset(struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, - __u32 flags) -{ - struct ptlrpc_request_set *set = NULL; - struct obd_info oinfo = { - .oi_osfs = osfs, - .oi_flags = flags, - }; - int rc = 0; - - set = ptlrpc_prep_set(); - if (!set) - return -ENOMEM; - - rc = obd_statfs_async(exp, &oinfo, max_age, set); - if (rc == 0) - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); - return rc; -} - -/* - * @max_age is the oldest time in jiffies that we accept using a cached data. - * If the cache is older than @max_age we will get a new value from the - * target. Use a value of "jiffies + HZ" to guarantee freshness. 
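- * (Callers that tolerate slightly stale numbers typically derive
- * max_age from OBD_STATFS_CACHE_SECONDS, i.e. roughly the current
- * 64-bit jiffies minus OBD_STATFS_CACHE_SECONDS * HZ, so repeated
- * statfs calls inside that window are served from obd_osfs.)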
- */ -static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, - __u32 flags) -{ - int rc = 0; - struct obd_device *obd = exp->exp_obd; - - if (!obd) - return -EINVAL; - - OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); - OBD_COUNTER_INCREMENT(obd, statfs); - - CDEBUG(D_SUPER, "osfs %llu, max_age %llu\n", - obd->obd_osfs_age, max_age); - if (time_before64(obd->obd_osfs_age, max_age)) { - rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); - if (rc == 0) { - spin_lock(&obd->obd_osfs_lock); - memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); - obd->obd_osfs_age = get_jiffies_64(); - spin_unlock(&obd->obd_osfs_lock); - } - } else { - CDEBUG(D_SUPER, - "%s: use %p cache blocks %llu/%llu objects %llu/%llu\n", - obd->obd_name, &obd->obd_osfs, - obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, - obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); - spin_lock(&obd->obd_osfs_lock); - memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); - spin_unlock(&obd->obd_osfs_lock); - } - return rc; -} - -static inline int obd_preprw(const struct lu_env *env, int cmd, - struct obd_export *exp, struct obdo *oa, - int objcount, struct obd_ioobj *obj, - struct niobuf_remote *remote, int *pages, - struct niobuf_local *local) -{ - int rc; - - EXP_CHECK_DT_OP(exp, preprw); - EXP_COUNTER_INCREMENT(exp, preprw); - - rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, - pages, local); - return rc; -} - -static inline int obd_commitrw(const struct lu_env *env, int cmd, - struct obd_export *exp, struct obdo *oa, - int objcount, struct obd_ioobj *obj, - struct niobuf_remote *rnb, int pages, - struct niobuf_local *local, int rc) -{ - EXP_CHECK_DT_OP(exp, commitrw); - EXP_COUNTER_INCREMENT(exp, commitrw); - - rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, - rnb, pages, local, rc); - return rc; -} - -static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, - int len, void *karg, void __user *uarg) -{ - int rc; - - EXP_CHECK_DT_OP(exp, iocontrol); - EXP_COUNTER_INCREMENT(exp, iocontrol); - - rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); - return rc; -} - -static inline void obd_import_event(struct obd_device *obd, - struct obd_import *imp, - enum obd_import_event event) -{ - if (!obd) { - CERROR("NULL device\n"); - return; - } - if (obd->obd_set_up && OBP(obd, import_event)) { - OBD_COUNTER_INCREMENT(obd, import_event); - OBP(obd, import_event)(obd, imp, event); - } -} - -static inline int obd_notify(struct obd_device *obd, - struct obd_device *watched, - enum obd_notify_event ev, - void *data) -{ - int rc; - - rc = obd_check_dev(obd); - if (rc) - return rc; - - if (!obd->obd_set_up) { - CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); - return -EINVAL; - } - - if (!OBP(obd, notify)) { - CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); - return -ENOSYS; - } - - OBD_COUNTER_INCREMENT(obd, notify); - rc = OBP(obd, notify)(obd, watched, ev, data); - return rc; -} - -static inline int obd_notify_observer(struct obd_device *observer, - struct obd_device *observed, - enum obd_notify_event ev, - void *data) -{ - int rc1; - int rc2; - - struct obd_notify_upcall *onu; - - if (observer->obd_observer) - rc1 = obd_notify(observer->obd_observer, observed, ev, data); - else - rc1 = 0; - /* - * Also, call non-obd listener, if any - */ - onu = &observer->obd_upcall; - if (onu->onu_upcall) - rc2 = onu->onu_upcall(observer, observed, ev, - onu->onu_owner, NULL); - else - rc2 = 0; - - return rc1 ? 
rc1 : rc2; -} - -static inline int obd_quotactl(struct obd_export *exp, - struct obd_quotactl *oqctl) -{ - int rc; - - EXP_CHECK_DT_OP(exp, quotactl); - EXP_COUNTER_INCREMENT(exp, quotactl); - - rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); - return rc; -} - -static inline int obd_health_check(const struct lu_env *env, - struct obd_device *obd) -{ - /* - * returns: 0 on healthy - * >0 on unhealthy + reason code/flag - * however the only supported reason == 1 right now - * We'll need to define some better reasons - * or flags in the future. - * <0 on error - */ - int rc; - - /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ - if (!obd || !OBT(obd)) { - CERROR("cleaned up obd\n"); - return -EOPNOTSUPP; - } - if (!obd->obd_set_up || obd->obd_stopping) - return 0; - if (!OBP(obd, health_check)) - return 0; - - rc = OBP(obd, health_check)(env, obd); - return rc; -} - -static inline int obd_register_observer(struct obd_device *obd, - struct obd_device *observer) -{ - int rc; - - rc = obd_check_dev(obd); - if (rc) - return rc; - down_write(&obd->obd_observer_link_sem); - if (obd->obd_observer && observer) { - up_write(&obd->obd_observer_link_sem); - return -EALREADY; - } - obd->obd_observer = observer; - up_write(&obd->obd_observer_link_sem); - return 0; -} - -/* metadata helpers */ -static inline int md_getstatus(struct obd_export *exp, struct lu_fid *fid) -{ - int rc; - - EXP_CHECK_MD_OP(exp, getstatus); - EXP_MD_COUNTER_INCREMENT(exp, getstatus); - rc = MDP(exp->exp_obd, getstatus)(exp, fid); - return rc; -} - -static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, getattr); - EXP_MD_COUNTER_INCREMENT(exp, getattr); - rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); - return rc; -} - -static inline int md_null_inode(struct obd_export *exp, - const struct lu_fid *fid) -{ - int rc; - - EXP_CHECK_MD_OP(exp, null_inode); - EXP_MD_COUNTER_INCREMENT(exp, null_inode); - rc = MDP(exp->exp_obd, null_inode)(exp, fid); - return rc; -} - -static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, - struct md_open_data *mod, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, close); - EXP_MD_COUNTER_INCREMENT(exp, close); - rc = MDP(exp->exp_obd, close)(exp, op_data, mod, request); - return rc; -} - -static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, size_t datalen, umode_t mode, - uid_t uid, gid_t gid, kernel_cap_t cap_effective, - __u64 rdev, struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, create); - EXP_MD_COUNTER_INCREMENT(exp, create); - rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, - uid, gid, cap_effective, rdev, request); - return rc; -} - -static inline int md_enqueue(struct obd_export *exp, - struct ldlm_enqueue_info *einfo, - const union ldlm_policy_data *policy, - struct md_op_data *op_data, - struct lustre_handle *lockh, - __u64 extra_lock_flags) -{ - int rc; - - EXP_CHECK_MD_OP(exp, enqueue); - EXP_MD_COUNTER_INCREMENT(exp, enqueue); - rc = MDP(exp->exp_obd, enqueue)(exp, einfo, policy, op_data, lockh, - extra_lock_flags); - return rc; -} - -static inline int md_getattr_name(struct obd_export *exp, - struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, getattr_name); - EXP_MD_COUNTER_INCREMENT(exp, getattr_name); - rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); - 
return rc; -} - -static inline int md_intent_lock(struct obd_export *exp, - struct md_op_data *op_data, - struct lookup_intent *it, - struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags) -{ - int rc; - - EXP_CHECK_MD_OP(exp, intent_lock); - EXP_MD_COUNTER_INCREMENT(exp, intent_lock); - rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, it, reqp, - cb_blocking, extra_lock_flags); - return rc; -} - -static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, link); - EXP_MD_COUNTER_INCREMENT(exp, link); - rc = MDP(exp->exp_obd, link)(exp, op_data, request); - return rc; -} - -static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, const char *new, - size_t newlen, struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, rename); - EXP_MD_COUNTER_INCREMENT(exp, rename); - rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, - newlen, request); - return rc; -} - -static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, size_t ealen, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, setattr); - EXP_MD_COUNTER_INCREMENT(exp, setattr); - rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, request); - return rc; -} - -static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, sync); - EXP_MD_COUNTER_INCREMENT(exp, sync); - rc = MDP(exp->exp_obd, sync)(exp, fid, request); - return rc; -} - -static inline int md_read_page(struct obd_export *exp, - struct md_op_data *op_data, - struct md_callback *cb_op, - __u64 hash_offset, - struct page **ppage) -{ - int rc; - - EXP_CHECK_MD_OP(exp, read_page); - EXP_MD_COUNTER_INCREMENT(exp, read_page); - rc = MDP(exp->exp_obd, read_page)(exp, op_data, cb_op, hash_offset, - ppage); - return rc; -} - -static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - int rc; - - EXP_CHECK_MD_OP(exp, unlink); - EXP_MD_COUNTER_INCREMENT(exp, unlink); - rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); - return rc; -} - -static inline int md_get_lustre_md(struct obd_export *exp, - struct ptlrpc_request *req, - struct obd_export *dt_exp, - struct obd_export *md_exp, - struct lustre_md *md) -{ - EXP_CHECK_MD_OP(exp, get_lustre_md); - EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); - return MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md); -} - -static inline int md_free_lustre_md(struct obd_export *exp, - struct lustre_md *md) -{ - EXP_CHECK_MD_OP(exp, free_lustre_md); - EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); - return MDP(exp->exp_obd, free_lustre_md)(exp, md); -} - -static inline int md_merge_attr(struct obd_export *exp, - const struct lmv_stripe_md *lsm, - struct cl_attr *attr, - ldlm_blocking_callback cb) -{ - EXP_CHECK_MD_OP(exp, merge_attr); - EXP_MD_COUNTER_INCREMENT(exp, merge_attr); - return MDP(exp->exp_obd, merge_attr)(exp, lsm, attr, cb); -} - -static inline int md_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - const char *value, size_t value_size, - unsigned int xattr_flags, u32 suppgid, - struct ptlrpc_request **request) -{ - EXP_CHECK_MD_OP(exp, setxattr); - EXP_MD_COUNTER_INCREMENT(exp, setxattr); - return MDP(exp->exp_obd, setxattr)(exp, fid, obd_md_valid, name, - 
value, value_size, xattr_flags, - suppgid, request); -} - -static inline int md_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - size_t buf_size, struct ptlrpc_request **req) -{ - EXP_CHECK_MD_OP(exp, getxattr); - EXP_MD_COUNTER_INCREMENT(exp, getxattr); - return MDP(exp->exp_obd, getxattr)(exp, fid, obd_md_valid, name, - buf_size, req); -} - -static inline int md_set_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och, - struct lookup_intent *it) -{ - EXP_CHECK_MD_OP(exp, set_open_replay_data); - EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); - return MDP(exp->exp_obd, set_open_replay_data)(exp, och, it); -} - -static inline int md_clear_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och) -{ - EXP_CHECK_MD_OP(exp, clear_open_replay_data); - EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); - return MDP(exp->exp_obd, clear_open_replay_data)(exp, och); -} - -static inline int md_set_lock_data(struct obd_export *exp, - const struct lustre_handle *lockh, - void *data, __u64 *bits) -{ - EXP_CHECK_MD_OP(exp, set_lock_data); - EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); - return MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits); -} - -static inline int md_cancel_unused(struct obd_export *exp, - const struct lu_fid *fid, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - enum ldlm_cancel_flags flags, - void *opaque) -{ - int rc; - - EXP_CHECK_MD_OP(exp, cancel_unused); - EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); - - rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, - flags, opaque); - return rc; -} - -static inline enum ldlm_mode md_lock_match(struct obd_export *exp, __u64 flags, - const struct lu_fid *fid, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh) -{ - EXP_CHECK_MD_OP(exp, lock_match); - EXP_MD_COUNTER_INCREMENT(exp, lock_match); - return MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, - policy, mode, lockh); -} - -static inline int md_init_ea_size(struct obd_export *exp, u32 easize, - u32 def_asize) -{ - EXP_CHECK_MD_OP(exp, init_ea_size); - EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); - return MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize); -} - -static inline int md_intent_getattr_async(struct obd_export *exp, - struct md_enqueue_info *minfo) -{ - int rc; - - EXP_CHECK_MD_OP(exp, intent_getattr_async); - EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); - rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo); - return rc; -} - -static inline int md_revalidate_lock(struct obd_export *exp, - struct lookup_intent *it, - struct lu_fid *fid, __u64 *bits) -{ - int rc; - - EXP_CHECK_MD_OP(exp, revalidate_lock); - EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); - rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); - return rc; -} - -static inline int md_get_fid_from_lsm(struct obd_export *exp, - const struct lmv_stripe_md *lsm, - const char *name, int namelen, - struct lu_fid *fid) -{ - int rc; - - EXP_CHECK_MD_OP(exp, get_fid_from_lsm); - EXP_MD_COUNTER_INCREMENT(exp, get_fid_from_lsm); - rc = MDP(exp->exp_obd, get_fid_from_lsm)(exp, lsm, name, namelen, fid); - return rc; -} - -/* - * Unpack an MD struct from disk to in-memory format. - * Returns +ve size of unpacked MD (0 for free), or -ve error. - * - * If *plsm != NULL and lmm == NULL then *lsm will be freed. - * If *plsm == NULL then it will be allocated. 
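- *
- * A hypothetical caller sketch (identifiers are illustrative only):
- *
- *	struct lmv_stripe_md *lsm = NULL;
- *	int rc;
- *
- *	rc = md_unpackmd(exp, &lsm, lmm, lmm_size);	(allocates *plsm)
- *	...
- *	rc = md_unpackmd(exp, &lsm, NULL, 0);		(frees *plsm)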
- */ -static inline int md_unpackmd(struct obd_export *exp, - struct lmv_stripe_md **plsm, - const union lmv_mds_md *lmm, size_t lmm_size) -{ - int rc; - - EXP_CHECK_MD_OP(exp, unpackmd); - EXP_MD_COUNTER_INCREMENT(exp, unpackmd); - rc = MDP(exp->exp_obd, unpackmd)(exp, plsm, lmm, lmm_size); - return rc; -} - -/* OBD Metadata Support */ - -int obd_init_caches(void); -void obd_cleanup_caches(void); - -/* support routines */ -extern struct kmem_cache *obdo_cachep; - -typedef int (*register_lwp_cb)(void *data); - -struct lwp_register_item { - struct obd_export **lri_exp; - register_lwp_cb lri_cb_func; - void *lri_cb_data; - struct list_head lri_list; - char lri_name[MTI_NAME_MAXLEN]; -}; - -/* - * I'm as embarrassed about this as you are. - * - * // XXX do not look into _superhack with remaining eye - * // XXX if this were any uglier, I'd get my own show on MTV - */ -extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); - -/* obd_mount.c */ -int lustre_unregister_fs(void); -int lustre_register_fs(void); -int lustre_check_exclusion(struct super_block *sb, char *svname); - -/* sysctl.c */ -int obd_sysctl_init(void); - -/* uuid.c */ -typedef __u8 class_uuid_t[16]; -void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); - -/* lustre_peer.c */ -int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); -int class_add_uuid(const char *uuid, __u64 nid); -int class_del_uuid(const char *uuid); -int class_check_uuid(struct obd_uuid *uuid, __u64 nid); -void class_init_uuidlist(void); -void class_exit_uuidlist(void); - -/* class_obd.c */ -extern char obd_jobid_node[]; -extern struct miscdevice obd_psdev; -extern spinlock_t obd_types_lock; -int class_procfs_init(void); -int class_procfs_clean(void); - -/* prng.c */ -#define ll_generate_random_uuid(uuid_out) \ - get_random_bytes(uuid_out, sizeof(class_uuid_t)) - -/* statfs_pack.c */ -struct kstatfs; -void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); -void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); - -/* root squash info */ -struct rw_semaphore; -struct root_squash_info { - uid_t rsi_uid; - gid_t rsi_gid; - struct list_head rsi_nosquash_nids; - struct rw_semaphore rsi_sem; -}; - -/* linux-module.c */ -int obd_ioctl_getdata(char **buf, int *len, void __user *arg); - -#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h deleted file mode 100644 index 9e41633823f701a9c9b124c3585869e4dfedccca..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/obd_support.h +++ /dev/null @@ -1,517 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _OBD_SUPPORT -#define _OBD_SUPPORT - -#include -#include - -#include -#include - -/* global variables */ -extern unsigned int obd_debug_peer_on_timeout; -extern unsigned int obd_dump_on_timeout; -extern unsigned int obd_dump_on_eviction; -/* obd_timeout should only be used for recovery, not for - * networking / disk / timings affected by load (use Adaptive Timeouts) - */ -extern unsigned int obd_timeout; /* seconds */ -extern unsigned int obd_timeout_set; -extern unsigned int at_min; -extern unsigned int at_max; -extern unsigned int at_history; -extern int at_early_margin; -extern int at_extra; -extern unsigned long obd_max_dirty_pages; -extern atomic_long_t obd_dirty_pages; -extern atomic_long_t obd_dirty_transit_pages; -extern char obd_jobid_var[]; - -/* Some hash init argument constants */ -/* Timeout definitions */ -#define OBD_TIMEOUT_DEFAULT 100 -/* Time to wait for all clients to reconnect during recovery (hard limit) */ -#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) -/* Time to wait for all clients to reconnect during recovery (soft limit) */ -/* Should be very conservative; must catch the first reconnect after reboot */ -#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) -/* Change recovery-small 26b time if you change this */ -#define PING_INTERVAL max(obd_timeout / 4, 1U) -/* a bit more than maximal journal commit time in seconds */ -#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) -/* Client may skip 1 ping; we must wait at least 2.5. But for multiple - * failover targets the client only pings one server at a time, and pings - * can be lost on a loaded network. Since eviction has serious consequences, - * and there's no urgent need to evict a client just because it's idle, we - * should be very conservative here. - */ -#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) -#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ -#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ -/* Max connect interval for nonresponsive servers; ~50s to avoid building up - * connect requests in the LND queues, but within obd_timeout so we don't - * miss the recovery window - */ -#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN, obd_timeout)) -#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ -/* In general this should be low to have quick detection of a system - * running on a backup server. (If it's too low, import_select_connection - * will increase the timeout anyhow.) 
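- *
- * Worked example, assuming obd_timeout keeps the 100s default above:
- * INITIAL_CONNECT_TIMEOUT = max(5, 100 / 20) = 5s and
- * CONNECTION_SWITCH_MAX = min(50, max(5, 100)) = 50s, so the
- * RECONNECT_DELAY_MAX defined below is 50 + 5 + 5 = 60s and
- * OBD_RECOVERY_TIME_MIN is 2 * 60 = 120s.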
- */ -#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN, obd_timeout / 20) -/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ -#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ - INITIAL_CONNECT_TIMEOUT) -/* The min time a target should wait for clients to reconnect in recovery */ -#define OBD_RECOVERY_TIME_MIN (2 * RECONNECT_DELAY_MAX) -#define OBD_IR_FACTOR_MIN 1 -#define OBD_IR_FACTOR_MAX 10 -#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX / 2) -/* default timeout for the MGS to become IR_FULL */ -#define OBD_IR_MGS_TIMEOUT (4 * obd_timeout) -#define LONG_UNLINK 300 /* Unlink should happen before now */ - -/** - * Time interval of shrink, if the client is "idle" more than this interval, - * then the ll_grant thread will return the requested grant space to filter - */ -#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ - -#define OBD_FAIL_MDS 0x100 -#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 -#define OBD_FAIL_MDS_GETATTR_NET 0x102 -#define OBD_FAIL_MDS_GETATTR_PACK 0x103 -#define OBD_FAIL_MDS_READPAGE_NET 0x104 -#define OBD_FAIL_MDS_READPAGE_PACK 0x105 -#define OBD_FAIL_MDS_SENDPAGE 0x106 -#define OBD_FAIL_MDS_REINT_NET 0x107 -#define OBD_FAIL_MDS_REINT_UNPACK 0x108 -#define OBD_FAIL_MDS_REINT_SETATTR 0x109 -#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a -#define OBD_FAIL_MDS_REINT_CREATE 0x10b -#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c -#define OBD_FAIL_MDS_REINT_UNLINK 0x10d -#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e -#define OBD_FAIL_MDS_REINT_LINK 0x10f -#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 -#define OBD_FAIL_MDS_REINT_RENAME 0x111 -#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 -#define OBD_FAIL_MDS_OPEN_NET 0x113 -#define OBD_FAIL_MDS_OPEN_PACK 0x114 -#define OBD_FAIL_MDS_CLOSE_NET 0x115 -#define OBD_FAIL_MDS_CLOSE_PACK 0x116 -#define OBD_FAIL_MDS_CONNECT_NET 0x117 -#define OBD_FAIL_MDS_CONNECT_PACK 0x118 -#define OBD_FAIL_MDS_REINT_NET_REP 0x119 -#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a -#define OBD_FAIL_MDS_GETSTATUS_NET 0x11b -#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c -#define OBD_FAIL_MDS_STATFS_PACK 0x11d -#define OBD_FAIL_MDS_STATFS_NET 0x11e -#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f -#define OBD_FAIL_MDS_PIN_NET 0x120 -#define OBD_FAIL_MDS_UNPIN_NET 0x121 -#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 -#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 -#define OBD_FAIL_MDS_SYNC_NET 0x124 -#define OBD_FAIL_MDS_SYNC_PACK 0x125 -/* OBD_FAIL_MDS_DONE_WRITING_NET 0x126 obsolete since 2.8.0 */ -/* OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 obsolete since 2.8.0 */ -#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 -#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 -#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a -#define OBD_FAIL_MDS_OPEN_CREATE 0x12b -#define OBD_FAIL_MDS_OST_SETATTR 0x12c -/* OBD_FAIL_MDS_QUOTACHECK_NET 0x12d obsolete since 2.4 */ -#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e -#define OBD_FAIL_MDS_CLIENT_ADD 0x12f -#define OBD_FAIL_MDS_GETXATTR_NET 0x130 -#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 -#define OBD_FAIL_MDS_SETXATTR_NET 0x132 -#define OBD_FAIL_MDS_SETXATTR 0x133 -#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 -#define OBD_FAIL_MDS_FS_SETUP 0x135 -#define OBD_FAIL_MDS_RESEND 0x136 -#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 -#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 -#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 -#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a -#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b -#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c -#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d -#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e -#define 
OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f -#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 -#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 -#define OBD_FAIL_MDS_REINT_DELAY 0x142 -#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 -#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 -#define OBD_FAIL_MDS_PDO_LOCK 0x145 -#define OBD_FAIL_MDS_PDO_LOCK2 0x146 -#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 -#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 -#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 -#define OBD_FAIL_MDS_HSM_STATE_SET_NET 0x14a -#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b -#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c -#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d -#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e -#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f -#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 -#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 - -/* layout lock */ -#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 -#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 -#define OBD_FAIL_MDS_LL_BLOCK 0x172 - -/* CMD */ -#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 -#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 -#define OBD_FAIL_MDS_SET_INFO_NET 0x182 -#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 -#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 -#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 -#define OBD_FAIL_MDS_GET_INFO_NET 0x186 -#define OBD_FAIL_MDS_DQACQ_NET 0x187 - -/* OI scrub */ -#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 -#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 -#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 -#define OBD_FAIL_OSD_FID_MAPPING 0x193 -#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 -#define OBD_FAIL_OSD_COMPAT_INVALID_ENTRY 0x195 - -#define OBD_FAIL_OST 0x200 -#define OBD_FAIL_OST_CONNECT_NET 0x201 -#define OBD_FAIL_OST_DISCONNECT_NET 0x202 -#define OBD_FAIL_OST_GET_INFO_NET 0x203 -#define OBD_FAIL_OST_CREATE_NET 0x204 -#define OBD_FAIL_OST_DESTROY_NET 0x205 -#define OBD_FAIL_OST_GETATTR_NET 0x206 -#define OBD_FAIL_OST_SETATTR_NET 0x207 -#define OBD_FAIL_OST_OPEN_NET 0x208 -#define OBD_FAIL_OST_CLOSE_NET 0x209 -#define OBD_FAIL_OST_BRW_NET 0x20a -#define OBD_FAIL_OST_PUNCH_NET 0x20b -#define OBD_FAIL_OST_STATFS_NET 0x20c -#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d -#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e -#define OBD_FAIL_OST_BRW_READ_BULK 0x20f -#define OBD_FAIL_OST_SYNC_NET 0x210 -#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 -#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 -#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 -#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 -#define OBD_FAIL_OST_ENOSPC 0x215 -#define OBD_FAIL_OST_EROFS 0x216 -#define OBD_FAIL_OST_ENOENT 0x217 -/* OBD_FAIL_OST_QUOTACHECK_NET 0x218 obsolete since 2.4 */ -#define OBD_FAIL_OST_QUOTACTL_NET 0x219 -#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a -#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b -#define OBD_FAIL_OST_BRW_SIZE 0x21c -#define OBD_FAIL_OST_DROP_REQ 0x21d -#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e -#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f -#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 -#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 -#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 -#define OBD_FAIL_OST_PAUSE_CREATE 0x223 -#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 -#define OBD_FAIL_OST_CONNECT_NET2 0x225 -#define OBD_FAIL_OST_NOMEM 0x226 -#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 -#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 -#define OBD_FAIL_OST_ENOINO 0x229 -#define OBD_FAIL_OST_DQACQ_NET 0x230 -#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 -#define OBD_FAIL_OST_SET_INFO_NET 0x232 - -#define OBD_FAIL_LDLM 0x300 -#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 -#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 
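-/*
- * These OBD_FAIL_* codes are consumed by the OBD_FAIL_CHECK() and
- * OBD_FAIL_TIMEOUT() wrappers defined at the end of this header.
- * For example (exact tooling may vary by release), arming a fail
- * code with "lctl set_param fail_loc=0x302" makes
- * OBD_FAIL_CHECK(OBD_FAIL_LDLM_ENQUEUE_NET) fire, so the enqueue
- * path behaves as if the request were lost on the network.
- */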
-#define OBD_FAIL_LDLM_CONVERT_NET 0x303 -#define OBD_FAIL_LDLM_CANCEL_NET 0x304 -#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 -#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 -#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 -#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 -#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 -#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a -#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b -#define OBD_FAIL_LDLM_REPLY 0x30c -#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d -#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e -#define OBD_FAIL_LDLM_GLIMPSE 0x30f -#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 -#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 -#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 -#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 -#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 -#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 -#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 -#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 -#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 -#define OBD_FAIL_LDLM_NEW_LOCK 0x319 -#define OBD_FAIL_LDLM_AGL_DELAY 0x31a -#define OBD_FAIL_LDLM_AGL_NOLOCK 0x31b -#define OBD_FAIL_LDLM_OST_LVB 0x31c -#define OBD_FAIL_LDLM_ENQUEUE_HANG 0x31d -#define OBD_FAIL_LDLM_PAUSE_CANCEL2 0x31f -#define OBD_FAIL_LDLM_CP_CB_WAIT2 0x320 -#define OBD_FAIL_LDLM_CP_CB_WAIT3 0x321 -#define OBD_FAIL_LDLM_CP_CB_WAIT4 0x322 -#define OBD_FAIL_LDLM_CP_CB_WAIT5 0x323 - -#define OBD_FAIL_LDLM_GRANT_CHECK 0x32a - -/* LOCKLESS IO */ -#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 - -#define OBD_FAIL_OSC 0x400 -#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 -#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 -#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 -#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 -#define OBD_FAIL_OSC_MATCH 0x405 -#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 -#define OBD_FAIL_OSC_SHUTDOWN 0x407 -#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 -#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 -#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a -#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b -#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c -#define OBD_FAIL_OSC_DIO_PAUSE 0x40d -#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e -#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f -#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 -#define OBD_FAIL_OSC_NO_GRANT 0x411 -#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 -#define OBD_FAIL_OSC_DELAY_IO 0x414 - -#define OBD_FAIL_PTLRPC 0x500 -#define OBD_FAIL_PTLRPC_ACK 0x501 -#define OBD_FAIL_PTLRPC_RQBD 0x502 -#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 -#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 -#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 -#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 -#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 -#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 -#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a -#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c -#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d -#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e -#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f -#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 -#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 -#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 -#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 -#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 -#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 -#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 -#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 -#define OBD_FAIL_PTLRPC_DROP_BULK 0x51a -#define OBD_FAIL_PTLRPC_LONG_REQ_UNLINK 0x51b -#define OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK 0x51c - -#define OBD_FAIL_OBD_PING_NET 0x600 -#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 -#define OBD_FAIL_OBD_LOGD_NET 0x602 -/* OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 obsolete since 2.4 */ -#define OBD_FAIL_OBD_DQACQ 
0x604 -#define OBD_FAIL_OBD_LLOG_SETUP 0x605 -#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 -#define OBD_FAIL_OBD_IDX_READ_NET 0x607 -#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 -#define OBD_FAIL_OBD_NO_LRU 0x609 - -#define OBD_FAIL_TGT_REPLY_NET 0x700 -#define OBD_FAIL_TGT_CONN_RACE 0x701 -#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 -#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 -#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 -#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 -#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 -#define OBD_FAIL_TGT_REPLAY_DROP 0x707 -#define OBD_FAIL_TGT_FAKE_EXP 0x708 -#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 -#define OBD_FAIL_TGT_LAST_REPLAY 0x710 -#define OBD_FAIL_TGT_CLIENT_ADD 0x711 -#define OBD_FAIL_TGT_RCVG_FLAG 0x712 -#define OBD_FAIL_TGT_DELAY_CONDITIONAL 0x713 - -#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 -#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 -#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 -#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 -#define OBD_FAIL_MDC_RPCS_SEM 0x804 -#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 -#define OBD_FAIL_MDC_CLOSE 0x806 - -#define OBD_FAIL_MGS 0x900 -#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 -#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 -#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 -#define OBD_FAIL_MGS_PAUSE_REQ 0x904 -#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 -#define OBD_FAIL_MGS_CONNECT_NET 0x906 -#define OBD_FAIL_MGS_DISCONNECT_NET 0x907 -#define OBD_FAIL_MGS_SET_INFO_NET 0x908 -#define OBD_FAIL_MGS_EXCEPTION_NET 0x909 -#define OBD_FAIL_MGS_TARGET_REG_NET 0x90a -#define OBD_FAIL_MGS_TARGET_DEL_NET 0x90b -#define OBD_FAIL_MGS_CONFIG_READ_NET 0x90c - -#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 -#define OBD_FAIL_QUOTA_EDQUOT 0xA02 -#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 -#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 - -#define OBD_FAIL_LPROC_REMOVE 0xB00 - -#define OBD_FAIL_SEQ 0x1000 -#define OBD_FAIL_SEQ_QUERY_NET 0x1001 -#define OBD_FAIL_SEQ_EXHAUST 0x1002 - -#define OBD_FAIL_FLD 0x1100 -#define OBD_FAIL_FLD_QUERY_NET 0x1101 -#define OBD_FAIL_FLD_READ_NET 0x1102 - -#define OBD_FAIL_SEC_CTX 0x1200 -#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 -#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 -#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 -#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 - -#define OBD_FAIL_LLOG 0x1300 -#define OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 -#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 -#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 -#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 -#define OBD_FAIL_SEQ_ALLOC 0x1311 - -#define OBD_FAIL_LLITE 0x1400 -#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 -#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 -#define OBD_FAIL_LOV_INIT 0x1403 -#define OBD_FAIL_GLIMPSE_DELAY 0x1404 -#define OBD_FAIL_LLITE_XATTR_ENOMEM 0x1405 -#define OBD_FAIL_MAKE_LOVEA_HOLE 0x1406 -#define OBD_FAIL_LLITE_LOST_LAYOUT 0x1407 -#define OBD_FAIL_GETATTR_DELAY 0x1409 - -#define OBD_FAIL_FID_INDIR 0x1501 -#define OBD_FAIL_FID_INLMA 0x1502 -#define OBD_FAIL_FID_IGIF 0x1504 -#define OBD_FAIL_FID_LOOKUP 0x1505 -#define OBD_FAIL_FID_NOLMA 0x1506 - -/* LFSCK */ -#define OBD_FAIL_LFSCK_DELAY1 0x1600 -#define OBD_FAIL_LFSCK_DELAY2 0x1601 -#define OBD_FAIL_LFSCK_DELAY3 0x1602 -#define OBD_FAIL_LFSCK_LINKEA_CRASH 
0x1603 -#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 -#define OBD_FAIL_LFSCK_LINKEA_MORE2 0x1605 -#define OBD_FAIL_LFSCK_FATAL1 0x1608 -#define OBD_FAIL_LFSCK_FATAL2 0x1609 -#define OBD_FAIL_LFSCK_CRASH 0x160a -#define OBD_FAIL_LFSCK_NO_AUTO 0x160b -#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c -#define OBD_FAIL_LFSCK_INVALID_PFID 0x1619 -#define OBD_FAIL_LFSCK_BAD_NAME_HASH 0x1628 - -/* UPDATE */ -#define OBD_FAIL_UPDATE_OBJ_NET 0x1700 -#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 - -/* LMV */ -#define OBD_FAIL_UNKNOWN_LMV_STRIPE 0x1901 - -/* Assign references to moved code to reduce code changes */ -#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) -#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) -#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) -#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) -#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) -#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) -#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) -#define OBD_FAIL_TIMEOUT_MS(id, ms) CFS_FAIL_TIMEOUT_MS(id, ms) -#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) -#define OBD_RACE(id) CFS_RACE(id) -#define OBD_FAIL_ONCE CFS_FAIL_ONCE -#define OBD_FAILED CFS_FAILED - -#ifdef CONFIG_DEBUG_SLAB -#define POISON(ptr, c, s) do {} while (0) -#define POISON_PTR(ptr) ((void)0) -#else -#define POISON(ptr, c, s) memset(ptr, c, s) -#define POISON_PTR(ptr) ((ptr) = (void *)0xdeadbeef) -#endif - -#ifdef POISON_BULK -#define POISON_PAGE(page, val) do { \ - memset(kmap(page), val, PAGE_SIZE); \ - kunmap(page); \ -} while (0) -#else -#define POISON_PAGE(page, val) do { } while (0) -#endif - -#define OBD_FREE_RCU(ptr, size, handle) \ -do { \ - struct portals_handle *__h = (handle); \ - \ - __h->h_cookie = (unsigned long)(ptr); \ - __h->h_size = (size); \ - call_rcu(&__h->h_rcu, class_handle_free_cb); \ - POISON_PTR(ptr); \ -} while (0) - -#define KEY_IS(str) \ - (keylen >= (sizeof(str) - 1) && \ - memcmp(key, str, (sizeof(str) - 1)) == 0) - -#endif diff --git a/drivers/staging/lustre/lustre/include/seq_range.h b/drivers/staging/lustre/lustre/include/seq_range.h deleted file mode 100644 index 9450da728160c34ac29d1e8806be121932997537..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/include/seq_range.h +++ /dev/null @@ -1,200 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2014, Intel Corporation. - * - * Copyright 2015 Cray Inc, all rights reserved. - * Author: Ben Evans. 
- * - * Define lu_seq_range associated functions - */ - -#ifndef _SEQ_RANGE_H_ -#define _SEQ_RANGE_H_ - -#include - -/** - * computes the sequence range type \a range - */ - -static inline unsigned int fld_range_type(const struct lu_seq_range *range) -{ - return range->lsr_flags & LU_SEQ_RANGE_MASK; -} - -/** - * Is this sequence range an OST? \a range - */ - -static inline bool fld_range_is_ost(const struct lu_seq_range *range) -{ - return fld_range_type(range) == LU_SEQ_RANGE_OST; -} - -/** - * Is this sequence range an MDT? \a range - */ - -static inline bool fld_range_is_mdt(const struct lu_seq_range *range) -{ - return fld_range_type(range) == LU_SEQ_RANGE_MDT; -} - -/** - * ANY range is only used when the fld client sends a fld query request, - * but it does not know whether the seq is an MDT or OST, so it will send the - * request with ANY type, which means any seq type from the lookup can be - * expected. /a range - */ -static inline unsigned int fld_range_is_any(const struct lu_seq_range *range) -{ - return fld_range_type(range) == LU_SEQ_RANGE_ANY; -} - -/** - * Apply flags to range \a range \a flags - */ - -static inline void fld_range_set_type(struct lu_seq_range *range, - unsigned int flags) -{ - range->lsr_flags |= flags; -} - -/** - * Add MDT to range type \a range - */ - -static inline void fld_range_set_mdt(struct lu_seq_range *range) -{ - fld_range_set_type(range, LU_SEQ_RANGE_MDT); -} - -/** - * Add OST to range type \a range - */ - -static inline void fld_range_set_ost(struct lu_seq_range *range) -{ - fld_range_set_type(range, LU_SEQ_RANGE_OST); -} - -/** - * Add ANY to range type \a range - */ - -static inline void fld_range_set_any(struct lu_seq_range *range) -{ - fld_range_set_type(range, LU_SEQ_RANGE_ANY); -} - -/** - * computes width of given sequence range \a range - */ - -static inline u64 lu_seq_range_space(const struct lu_seq_range *range) -{ - return range->lsr_end - range->lsr_start; -} - -/** - * initialize range to zero \a range - */ - -static inline void lu_seq_range_init(struct lu_seq_range *range) -{ - memset(range, 0, sizeof(*range)); -} - -/** - * check if given seq id \a s is within given range \a range - */ - -static inline bool lu_seq_range_within(const struct lu_seq_range *range, - u64 seq) -{ - return seq >= range->lsr_start && seq < range->lsr_end; -} - -/** - * Is the range sane? Is the end after the beginning? \a range - */ - -static inline bool lu_seq_range_is_sane(const struct lu_seq_range *range) -{ - return range->lsr_end >= range->lsr_start; -} - -/** - * Is the range 0? \a range - */ - -static inline bool lu_seq_range_is_zero(const struct lu_seq_range *range) -{ - return range->lsr_start == 0 && range->lsr_end == 0; -} - -/** - * Is the range out of space? 
\a range - */ - -static inline bool lu_seq_range_is_exhausted(const struct lu_seq_range *range) -{ - return lu_seq_range_space(range) == 0; -} - -/** - * return 0 if two ranges have the same location, nonzero if they are - * different \a r1 \a r2 - */ - -static inline int lu_seq_range_compare_loc(const struct lu_seq_range *r1, - const struct lu_seq_range *r2) -{ - return r1->lsr_index != r2->lsr_index || - r1->lsr_flags != r2->lsr_flags; -} - -#if !defined(__REQ_LAYOUT_USER__) -/** - * byte swap range structure \a range - */ - -void lustre_swab_lu_seq_range(struct lu_seq_range *range); -#endif -/** - * printf string and argument list for sequence range - */ -#define DRANGE "[%#16.16llx-%#16.16llx]:%x:%s" - -#define PRANGE(range) \ - (range)->lsr_start, \ - (range)->lsr_end, \ - (range)->lsr_index, \ - fld_range_is_mdt(range) ? "mdt" : "ost" - -#endif diff --git a/drivers/staging/lustre/lustre/ldlm/interval_tree.c b/drivers/staging/lustre/lustre/ldlm/interval_tree.c deleted file mode 100644 index 8df7a4463c21795a15b60cd37e4f88b573c28134..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/interval_tree.c +++ /dev/null @@ -1,599 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/ldlm/interval_tree.c - * - * Interval tree library used by ldlm extent lock code - * - * Author: Huang Wei - * Author: Jay Xiong - */ -#include -#include -#include - -enum { - INTERVAL_RED = 0, - INTERVAL_BLACK = 1 -}; - -static inline int node_is_left_child(struct interval_node *node) -{ - return node == node->in_parent->in_left; -} - -static inline int node_is_right_child(struct interval_node *node) -{ - return node == node->in_parent->in_right; -} - -static inline int node_is_red(struct interval_node *node) -{ - return node->in_color == INTERVAL_RED; -} - -static inline int node_is_black(struct interval_node *node) -{ - return node->in_color == INTERVAL_BLACK; -} - -static inline int extent_compare(struct interval_node_extent *e1, - struct interval_node_extent *e2) -{ - int rc; - - if (e1->start == e2->start) { - if (e1->end < e2->end) - rc = -1; - else if (e1->end > e2->end) - rc = 1; - else - rc = 0; - } else { - if (e1->start < e2->start) - rc = -1; - else - rc = 1; - } - return rc; -} - -static inline int extent_equal(struct interval_node_extent *e1, - struct interval_node_extent *e2) -{ - return (e1->start == e2->start) && (e1->end == e2->end); -} - -static inline int extent_overlapped(struct interval_node_extent *e1, - struct interval_node_extent *e2) -{ - return (e1->start <= e2->end) && (e2->start <= e1->end); -} - -static inline int node_equal(struct interval_node *n1, struct interval_node *n2) -{ - return extent_equal(&n1->in_extent, &n2->in_extent); -} - -static struct interval_node *interval_first(struct interval_node *node) -{ - if (!node) - return NULL; - while (node->in_left) - node = node->in_left; - return node; -} - -static struct interval_node *interval_last(struct interval_node *node) -{ - if (!node) - return NULL; - while (node->in_right) - node = node->in_right; - return node; -} - -static struct interval_node *interval_next(struct interval_node *node) -{ - if (!node) - return NULL; - if (node->in_right) - return interval_first(node->in_right); - while (node->in_parent && node_is_right_child(node)) - node = node->in_parent; - return node->in_parent; -} - -static struct interval_node *interval_prev(struct interval_node *node) -{ - if (!node) - return NULL; - - if (node->in_left) - return interval_last(node->in_left); - - while (node->in_parent && node_is_left_child(node)) - node = node->in_parent; - - return node->in_parent; -} - -enum interval_iter interval_iterate_reverse(struct interval_node *root, - interval_callback_t func, - void *data) -{ - enum interval_iter rc = INTERVAL_ITER_CONT; - struct interval_node *node; - - for (node = interval_last(root); node; node = interval_prev(node)) { - rc = func(node, data); - if (rc == INTERVAL_ITER_STOP) - break; - } - - return rc; -} -EXPORT_SYMBOL(interval_iterate_reverse); - -static void __rotate_change_maxhigh(struct interval_node *node, - struct interval_node *rotate) -{ - __u64 left_max, right_max; - - rotate->in_max_high = node->in_max_high; - left_max = node->in_left ? node->in_left->in_max_high : 0; - right_max = node->in_right ? node->in_right->in_max_high : 0; - node->in_max_high = max(interval_high(node), - max(left_max, right_max)); -} - -/* The left rotation "pivots" around the link from node to node->right, and - * - node will be linked to node->right's left child, and - * - node->right's left child will be linked to node's right child. 
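- *
- * Schematically (R = node->in_right; subtrees a, b, c are unchanged):
- *
- *        node                    R
- *        /  \                   / \
- *       a    R      ==>     node   c
- *           / \             /  \
- *          b   c           a    b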
- */ -static void __rotate_left(struct interval_node *node, - struct interval_node **root) -{ - struct interval_node *right = node->in_right; - struct interval_node *parent = node->in_parent; - - node->in_right = right->in_left; - if (node->in_right) - right->in_left->in_parent = node; - - right->in_left = node; - right->in_parent = parent; - if (parent) { - if (node_is_left_child(node)) - parent->in_left = right; - else - parent->in_right = right; - } else { - *root = right; - } - node->in_parent = right; - - /* update max_high for node and right */ - __rotate_change_maxhigh(node, right); -} - -/* The right rotation "pivots" around the link from node to node->left, and - * - node will be linked to node->left's right child, and - * - node->left's right child will be linked to node's left child. - */ -static void __rotate_right(struct interval_node *node, - struct interval_node **root) -{ - struct interval_node *left = node->in_left; - struct interval_node *parent = node->in_parent; - - node->in_left = left->in_right; - if (node->in_left) - left->in_right->in_parent = node; - left->in_right = node; - - left->in_parent = parent; - if (parent) { - if (node_is_right_child(node)) - parent->in_right = left; - else - parent->in_left = left; - } else { - *root = left; - } - node->in_parent = left; - - /* update max_high for node and left */ - __rotate_change_maxhigh(node, left); -} - -#define interval_swap(a, b) do { \ - struct interval_node *c = a; a = b; b = c; \ -} while (0) - -/* - * Operations INSERT and DELETE, when run on a tree with n keys, - * take O(log n) time. Because they modify the tree, the result - * may violate the red-black properties. To restore these properties, - * we must change the colors of some of the nodes in the tree - * and also change the pointer structure.
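- *
- * (In the classic red-black scheme this rebalancing is cheap: an
- * insertion needs at most two rotations and a deletion at most three,
- * plus O(log n) recolorings.)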
- */ -static void interval_insert_color(struct interval_node *node, - struct interval_node **root) -{ - struct interval_node *parent, *gparent; - - while ((parent = node->in_parent) && node_is_red(parent)) { - gparent = parent->in_parent; - /* Parent is RED, so gparent must not be NULL */ - if (node_is_left_child(parent)) { - struct interval_node *uncle; - - uncle = gparent->in_right; - if (uncle && node_is_red(uncle)) { - uncle->in_color = INTERVAL_BLACK; - parent->in_color = INTERVAL_BLACK; - gparent->in_color = INTERVAL_RED; - node = gparent; - continue; - } - - if (parent->in_right == node) { - __rotate_left(parent, root); - interval_swap(node, parent); - } - - parent->in_color = INTERVAL_BLACK; - gparent->in_color = INTERVAL_RED; - __rotate_right(gparent, root); - } else { - struct interval_node *uncle; - - uncle = gparent->in_left; - if (uncle && node_is_red(uncle)) { - uncle->in_color = INTERVAL_BLACK; - parent->in_color = INTERVAL_BLACK; - gparent->in_color = INTERVAL_RED; - node = gparent; - continue; - } - - if (node_is_left_child(node)) { - __rotate_right(parent, root); - interval_swap(node, parent); - } - - parent->in_color = INTERVAL_BLACK; - gparent->in_color = INTERVAL_RED; - __rotate_left(gparent, root); - } - } - - (*root)->in_color = INTERVAL_BLACK; -} - -struct interval_node *interval_insert(struct interval_node *node, - struct interval_node **root) - -{ - struct interval_node **p, *parent = NULL; - - LASSERT(!interval_is_intree(node)); - p = root; - while (*p) { - parent = *p; - if (node_equal(parent, node)) - return parent; - - /* max_high field must be updated after each iteration */ - if (parent->in_max_high < interval_high(node)) - parent->in_max_high = interval_high(node); - - if (extent_compare(&node->in_extent, &parent->in_extent) < 0) - p = &parent->in_left; - else - p = &parent->in_right; - } - - /* link node into the tree */ - node->in_parent = parent; - node->in_color = INTERVAL_RED; - node->in_left = NULL; - node->in_right = NULL; - *p = node; - - interval_insert_color(node, root); - node->in_intree = 1; - - return NULL; -} -EXPORT_SYMBOL(interval_insert); - -static inline int node_is_black_or_0(struct interval_node *node) -{ - return !node || node_is_black(node); -} - -static void interval_erase_color(struct interval_node *node, - struct interval_node *parent, - struct interval_node **root) -{ - struct interval_node *tmp; - - while (node_is_black_or_0(node) && node != *root) { - if (parent->in_left == node) { - tmp = parent->in_right; - if (node_is_red(tmp)) { - tmp->in_color = INTERVAL_BLACK; - parent->in_color = INTERVAL_RED; - __rotate_left(parent, root); - tmp = parent->in_right; - } - if (node_is_black_or_0(tmp->in_left) && - node_is_black_or_0(tmp->in_right)) { - tmp->in_color = INTERVAL_RED; - node = parent; - parent = node->in_parent; - } else { - if (node_is_black_or_0(tmp->in_right)) { - struct interval_node *o_left; - - o_left = tmp->in_left; - if (o_left) - o_left->in_color = INTERVAL_BLACK; - tmp->in_color = INTERVAL_RED; - __rotate_right(tmp, root); - tmp = parent->in_right; - } - tmp->in_color = parent->in_color; - parent->in_color = INTERVAL_BLACK; - if (tmp->in_right) - tmp->in_right->in_color = INTERVAL_BLACK; - __rotate_left(parent, root); - node = *root; - break; - } - } else { - tmp = parent->in_left; - if (node_is_red(tmp)) { - tmp->in_color = INTERVAL_BLACK; - parent->in_color = INTERVAL_RED; - __rotate_right(parent, root); - tmp = parent->in_left; - } - if (node_is_black_or_0(tmp->in_left) && - node_is_black_or_0(tmp->in_right)) { - 
tmp->in_color = INTERVAL_RED; - node = parent; - parent = node->in_parent; - } else { - if (node_is_black_or_0(tmp->in_left)) { - struct interval_node *o_right; - - o_right = tmp->in_right; - if (o_right) - o_right->in_color = INTERVAL_BLACK; - tmp->in_color = INTERVAL_RED; - __rotate_left(tmp, root); - tmp = parent->in_left; - } - tmp->in_color = parent->in_color; - parent->in_color = INTERVAL_BLACK; - if (tmp->in_left) - tmp->in_left->in_color = INTERVAL_BLACK; - __rotate_right(parent, root); - node = *root; - break; - } - } - } - if (node) - node->in_color = INTERVAL_BLACK; -} - -/* - * if the @max_high value of @node is changed, this function traverse a path - * from node up to the root to update max_high for the whole tree. - */ -static void update_maxhigh(struct interval_node *node, - __u64 old_maxhigh) -{ - __u64 left_max, right_max; - - while (node) { - left_max = node->in_left ? node->in_left->in_max_high : 0; - right_max = node->in_right ? node->in_right->in_max_high : 0; - node->in_max_high = max(interval_high(node), - max(left_max, right_max)); - - if (node->in_max_high >= old_maxhigh) - break; - node = node->in_parent; - } -} - -void interval_erase(struct interval_node *node, - struct interval_node **root) -{ - struct interval_node *child, *parent; - int color; - - LASSERT(interval_is_intree(node)); - node->in_intree = 0; - if (!node->in_left) { - child = node->in_right; - } else if (!node->in_right) { - child = node->in_left; - } else { /* Both left and right child are not NULL */ - struct interval_node *old = node; - - node = interval_next(node); - child = node->in_right; - parent = node->in_parent; - color = node->in_color; - - if (child) - child->in_parent = parent; - if (parent == old) - parent->in_right = child; - else - parent->in_left = child; - - node->in_color = old->in_color; - node->in_right = old->in_right; - node->in_left = old->in_left; - node->in_parent = old->in_parent; - - if (old->in_parent) { - if (node_is_left_child(old)) - old->in_parent->in_left = node; - else - old->in_parent->in_right = node; - } else { - *root = node; - } - - old->in_left->in_parent = node; - if (old->in_right) - old->in_right->in_parent = node; - update_maxhigh(child ? : parent, node->in_max_high); - update_maxhigh(node, old->in_max_high); - if (parent == old) - parent = node; - goto color; - } - parent = node->in_parent; - color = node->in_color; - - if (child) - child->in_parent = parent; - if (parent) { - if (node_is_left_child(node)) - parent->in_left = child; - else - parent->in_right = child; - } else { - *root = child; - } - - update_maxhigh(child ? : parent, node->in_max_high); - -color: - if (color == INTERVAL_BLACK) - interval_erase_color(child, parent, root); -} -EXPORT_SYMBOL(interval_erase); - -static inline int interval_may_overlap(struct interval_node *node, - struct interval_node_extent *ext) -{ - return (ext->start <= node->in_max_high && - ext->end >= interval_low(node)); -} - -/* - * This function finds all intervals that overlap interval ext, - * and calls func to handle resulted intervals one by one. - * in lustre, this function will find all conflicting locks in - * the granted queue and add these locks to the ast work list. 
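- *
- * The recursive sketch below is equivalent to the iterative body of
- * interval_search() that follows it; the real code walks the tree
- * iteratively, presumably to keep kernel stack usage bounded: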
- * - * { - * if (!node) - * return 0; - * if (ext->end < interval_low(node)) { - * interval_search(node->in_left, ext, func, data); - * } else if (interval_may_overlap(node, ext)) { - * if (extent_overlapped(ext, &node->in_extent)) - * func(node, data); - * interval_search(node->in_left, ext, func, data); - * interval_search(node->in_right, ext, func, data); - * } - * return 0; - * } - * - */ -enum interval_iter interval_search(struct interval_node *node, - struct interval_node_extent *ext, - interval_callback_t func, - void *data) -{ - enum interval_iter rc = INTERVAL_ITER_CONT; - struct interval_node *parent; - - LASSERT(ext); - LASSERT(func); - - while (node) { - if (ext->end < interval_low(node)) { - if (node->in_left) { - node = node->in_left; - continue; - } - } else if (interval_may_overlap(node, ext)) { - if (extent_overlapped(ext, &node->in_extent)) { - rc = func(node, data); - if (rc == INTERVAL_ITER_STOP) - break; - } - - if (node->in_left) { - node = node->in_left; - continue; - } - if (node->in_right) { - node = node->in_right; - continue; - } - } - - parent = node->in_parent; - while (parent) { - if (node_is_left_child(node) && - parent->in_right) { - /* - * If we ever got the left, it means that the - * parent's right subtree has not been visited - * yet; the overlap check below decides whether - * it is worth searching. - */ - node = parent->in_right; - break; - } - node = parent; - parent = parent->in_parent; - } - if (!parent || !interval_may_overlap(parent, ext)) - break; - } - - return rc; -} -EXPORT_SYMBOL(interval_search); diff --git a/drivers/staging/lustre/lustre/ldlm/l_lock.c b/drivers/staging/lustre/lustre/ldlm/l_lock.c deleted file mode 100644 index 296259aa51e6b1a68f043710165e28795ffccad5..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/l_lock.c +++ /dev/null @@ -1,73 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include - -/** - * Lock a lock and its resource. - * - * LDLM locking uses resource to serialize access to locks - * but there is a case when we change resource of lock upon - * enqueue reply. We rely on lock->l_resource = new_res - * being an atomic operation.
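- *
- * Typical pairing, as a sketch:
- *
- *	struct ldlm_resource *res = lock_res_and_lock(lock);
- *
- *	(inspect or update lock and res under both spinlocks)
- *
- *	unlock_res_and_lock(lock);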
- */ -struct ldlm_resource *lock_res_and_lock(struct ldlm_lock *lock) - __acquires(&lock->l_lock) - __acquires(&lock->l_resource->lr_lock) -{ - spin_lock(&lock->l_lock); - - lock_res(lock->l_resource); - - ldlm_set_res_locked(lock); - return lock->l_resource; -} -EXPORT_SYMBOL(lock_res_and_lock); - -/** - * Unlock a lock and its resource previously locked with lock_res_and_lock - */ -void unlock_res_and_lock(struct ldlm_lock *lock) - __releases(&lock->l_resource->lr_lock) - __releases(&lock->l_lock) -{ - /* on server-side resource of lock doesn't change */ - ldlm_clear_res_locked(lock); - - unlock_res(lock->l_resource); - spin_unlock(&lock->l_lock); -} -EXPORT_SYMBOL(unlock_res_and_lock); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c b/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c deleted file mode 100644 index 4da23ade2bb34b49bab3cd579c2a975d8f19da57..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_extent.c +++ /dev/null @@ -1,258 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_extent.c - * - * Author: Peter Braam - * Author: Phil Schwan - */ - -/** - * This file contains implementation of EXTENT lock type - * - * EXTENT lock type is for locking a contiguous range of values, represented - * by 64-bit starting and ending offsets (inclusive). There are several extent - * lock modes, some of which may be mutually incompatible. Extent locks are - * considered incompatible if their modes are incompatible and their extents - * intersect. See the lock mode compatibility matrix in lustre_dlm.h. - */ - -#define DEBUG_SUBSYSTEM S_LDLM -#include -#include -#include -#include -#include -#include "ldlm_internal.h" - -/* When a lock is cancelled by a client, the KMS may undergo change if this - * is the "highest lock". This function returns the new KMS value. - * Caller must hold lr_lock already. - * - * NB: A lock on [x,y] protects a KMS of up to y + 1 bytes! 
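A usage note on the lock_res_and_lock()/unlock_res_and_lock() pair above: any state protected by both the per-lock and the per-resource spinlocks is bracketed by the two calls, and the locks are released in the reverse order of acquisition. A hedged sketch of the calling pattern (the caller shown is hypothetical, not from the removed code):

    static void lock_state_update(struct ldlm_lock *lock)
    {
        struct ldlm_resource *res;

        res = lock_res_and_lock(lock);  /* spin_lock(l_lock), then lock_res() */

        /* ... read or modify lock/resource state atomically here ... */

        unlock_res_and_lock(lock);      /* unlock_res(), then spin_unlock(l_lock) */
        (void)res;                      /* res is only safe to use while locked */
    }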
- */ -__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms) -{ - struct ldlm_resource *res = lock->l_resource; - struct ldlm_lock *lck; - __u64 kms = 0; - - /* don't let another thread in ldlm_extent_shift_kms race in - * just after we finish and take our lock into account in its - * calculation of the kms - */ - ldlm_set_kms_ignore(lock); - - list_for_each_entry(lck, &res->lr_granted, l_res_link) { - - if (ldlm_is_kms_ignore(lck)) - continue; - - if (lck->l_policy_data.l_extent.end >= old_kms) - return old_kms; - - /* This extent _has_ to be smaller than old_kms (checked above) - * so kms can only ever be smaller or the same as old_kms. - */ - if (lck->l_policy_data.l_extent.end + 1 > kms) - kms = lck->l_policy_data.l_extent.end + 1; - } - LASSERTF(kms <= old_kms, "kms %llu old_kms %llu\n", kms, old_kms); - - return kms; -} -EXPORT_SYMBOL(ldlm_extent_shift_kms); - -struct kmem_cache *ldlm_interval_slab; - -/* interval tree, for LDLM_EXTENT. */ -static void ldlm_interval_attach(struct ldlm_interval *n, struct ldlm_lock *l) -{ - LASSERT(!l->l_tree_node); - LASSERT(l->l_resource->lr_type == LDLM_EXTENT); - - list_add_tail(&l->l_sl_policy, &n->li_group); - l->l_tree_node = n; -} - -struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock) -{ - struct ldlm_interval *node; - - LASSERT(lock->l_resource->lr_type == LDLM_EXTENT); - node = kmem_cache_zalloc(ldlm_interval_slab, GFP_NOFS); - if (!node) - return NULL; - - INIT_LIST_HEAD(&node->li_group); - ldlm_interval_attach(node, lock); - return node; -} - -void ldlm_interval_free(struct ldlm_interval *node) -{ - if (node) { - LASSERT(list_empty(&node->li_group)); - LASSERT(!interval_is_intree(&node->li_node)); - kmem_cache_free(ldlm_interval_slab, node); - } -} - -struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l) -{ - struct ldlm_interval *n = l->l_tree_node; - - if (!n) - return NULL; - - LASSERT(!list_empty(&n->li_group)); - l->l_tree_node = NULL; - list_del_init(&l->l_sl_policy); - - return list_empty(&n->li_group) ? n : NULL; -} - -static inline int lock_mode_to_index(enum ldlm_mode mode) -{ - int index; - - LASSERT(mode != 0); - LASSERT(is_power_of_2(mode)); - for (index = -1; mode; index++) - mode >>= 1; - LASSERT(index < LCK_MODE_NUM); - return index; -} - -/** Add newly granted lock into interval tree for the resource. */ -void ldlm_extent_add_lock(struct ldlm_resource *res, - struct ldlm_lock *lock) -{ - struct interval_node *found, **root; - struct ldlm_interval *node; - struct ldlm_extent *extent; - int idx, rc; - - LASSERT(lock->l_granted_mode == lock->l_req_mode); - - node = lock->l_tree_node; - LASSERT(node); - LASSERT(!interval_is_intree(&node->li_node)); - - idx = lock_mode_to_index(lock->l_granted_mode); - LASSERT(lock->l_granted_mode == 1 << idx); - LASSERT(lock->l_granted_mode == res->lr_itree[idx].lit_mode); - - /* node extent initialize */ - extent = &lock->l_policy_data.l_extent; - rc = interval_set(&node->li_node, extent->start, extent->end); - LASSERT(!rc); - - root = &res->lr_itree[idx].lit_root; - found = interval_insert(&node->li_node, root); - if (found) { /* The policy group found. */ - struct ldlm_interval *tmp; - - tmp = ldlm_interval_detach(lock); - ldlm_interval_free(tmp); - ldlm_interval_attach(to_ldlm_interval(found), lock); - } - res->lr_itree[idx].lit_size++; - - /* even though we use interval tree to manage the extent lock, we also - * add the locks into grant list, for debug purpose, .. 
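To make the KMS rule above concrete: with remaining granted extents [0,99] and [100,149] and old_kms = 200, no remaining extent reaches 200, so the new KMS becomes 150 (the highest remaining end + 1). A standalone restatement of the computation in ldlm_extent_shift_kms(), simplified to a plain array and ignoring the kms_ignore flag handling:

    struct ext { __u64 start, end; };

    /* New known-minimum-size after a lock is dropped: highest remaining
     * end + 1, unless some remaining lock still reaches old_kms.
     */
    static __u64 kms_after_cancel(const struct ext *granted, int n, __u64 old_kms)
    {
        __u64 kms = 0;
        int i;

        for (i = 0; i < n; i++) {
            if (granted[i].end >= old_kms)
                return old_kms;         /* still covered up to old_kms */
            if (granted[i].end + 1 > kms)
                kms = granted[i].end + 1;
        }
        return kms;
    }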
- */ - ldlm_resource_add_lock(res, &res->lr_granted, lock); - - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GRANT_CHECK)) { - struct ldlm_lock *lck; - - list_for_each_entry_reverse(lck, &res->lr_granted, - l_res_link) { - if (lck == lock) - continue; - if (lockmode_compat(lck->l_granted_mode, - lock->l_granted_mode)) - continue; - if (ldlm_extent_overlap(&lck->l_req_extent, - &lock->l_req_extent)) { - CDEBUG(D_ERROR, - "granting conflicting lock %p %p\n", - lck, lock); - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } - } - } -} - -/** Remove cancelled lock from resource interval tree. */ -void ldlm_extent_unlink_lock(struct ldlm_lock *lock) -{ - struct ldlm_resource *res = lock->l_resource; - struct ldlm_interval *node = lock->l_tree_node; - struct ldlm_interval_tree *tree; - int idx; - - if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ - return; - - idx = lock_mode_to_index(lock->l_granted_mode); - LASSERT(lock->l_granted_mode == 1 << idx); - tree = &res->lr_itree[idx]; - - LASSERT(tree->lit_root); /* assure the tree is not null */ - - tree->lit_size--; - node = ldlm_interval_detach(lock); - if (node) { - interval_erase(&node->li_node, &tree->lit_root); - ldlm_interval_free(node); - } -} - -void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy) -{ - lpolicy->l_extent.start = wpolicy->l_extent.start; - lpolicy->l_extent.end = wpolicy->l_extent.end; - lpolicy->l_extent.gid = wpolicy->l_extent.gid; -} - -void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy) -{ - memset(wpolicy, 0, sizeof(*wpolicy)); - wpolicy->l_extent.start = lpolicy->l_extent.start; - wpolicy->l_extent.end = lpolicy->l_extent.end; - wpolicy->l_extent.gid = lpolicy->l_extent.gid; -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c deleted file mode 100644 index 94f3b1e49896baf3f4b39ab6a81c4e9703101c9c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_flock.c +++ /dev/null @@ -1,486 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003 Hewlett-Packard Development Company LP. - * Developed under the sponsorship of the US Government under - * Subcontract No. B514193 - * - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -/** - * This file implements POSIX lock type for Lustre. 
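One detail of the two extent policy converters above is worth noting: the local-to-wire direction memset()s the whole wire union before filling in the extent fields, so bytes belonging to the other lock types sharing the union are never sent uninitialized, while the wire-to-local direction copies fields only. The same shape in isolation (the struct layout here is illustrative, not the removed definition):

    #include <linux/string.h>           /* memset() */

    struct ext_policy { __u64 start, end, gid; };
    union wire_policy {
        struct ext_policy l_extent;
        /* ... other lock types share these bytes ... */
    };

    static void ext_local_to_wire(const struct ext_policy *l, union wire_policy *w)
    {
        memset(w, 0, sizeof(*w));       /* zero the unused union bytes */
        w->l_extent.start = l->start;
        w->l_extent.end = l->end;
        w->l_extent.gid = l->gid;
    }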
- * Its policy properties are start and end of extent and PID. - * - * These locks are only done through MDS due to POSIX semantics requiring - * e.g. that locks could be only partially released and as such split into - * two parts, and also that two adjacent locks from the same process may be - * merged into a single wider lock. - * - * Lock modes are mapped like this: - * PR and PW for READ and WRITE locks - * NL to request the release of a portion of the lock - * - * These flock locks never time out. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include -#include -#include "ldlm_internal.h" - -static inline int -ldlm_same_flock_owner(struct ldlm_lock *lock, struct ldlm_lock *new) -{ - return((new->l_policy_data.l_flock.owner == - lock->l_policy_data.l_flock.owner) && - (new->l_export == lock->l_export)); -} - -static inline int -ldlm_flocks_overlap(struct ldlm_lock *lock, struct ldlm_lock *new) -{ - return((new->l_policy_data.l_flock.start <= - lock->l_policy_data.l_flock.end) && - (new->l_policy_data.l_flock.end >= - lock->l_policy_data.l_flock.start)); -} - -static inline void -ldlm_flock_destroy(struct ldlm_lock *lock, enum ldlm_mode mode) -{ - LDLM_DEBUG(lock, "%s(mode: %d)", - __func__, mode); - - list_del_init(&lock->l_res_link); - - /* client side - set a flag to prevent sending a CANCEL */ - lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; - - /* when reaching here, it is under lock_res_and_lock(). Thus, - * we need to call the nolock version of ldlm_lock_decref_internal - */ - ldlm_lock_decref_internal_nolock(lock, mode); - - ldlm_lock_destroy_nolock(lock); -} - -/** - * Process a granting attempt for a flock lock. - * Must be called with the ns lock held. - * - * This function looks for any conflicts for \a lock in the granted or - * waiting queues. The lock is granted if no conflicts are found in - * either queue. - * - * It is also responsible for splitting a lock if a portion of the lock - * is released. - * - */ -static int ldlm_process_flock_lock(struct ldlm_lock *req) -{ - struct ldlm_resource *res = req->l_resource; - struct ldlm_namespace *ns = ldlm_res_to_ns(res); - struct ldlm_lock *tmp; - struct ldlm_lock *lock; - struct ldlm_lock *new = req; - struct ldlm_lock *new2 = NULL; - enum ldlm_mode mode = req->l_req_mode; - int added = (mode == LCK_NL); - int splitted = 0; - const struct ldlm_callback_suite null_cbs = { }; - - CDEBUG(D_DLMTRACE, - "owner %llu pid %u mode %u start %llu end %llu\n", - new->l_policy_data.l_flock.owner, - new->l_policy_data.l_flock.pid, mode, - req->l_policy_data.l_flock.start, - req->l_policy_data.l_flock.end); - - /* No blocking ASTs are sent to the clients for - * POSIX file & record locks - */ - req->l_blocking_ast = NULL; - -reprocess: - /* This loop determines where this process's locks start - * in the resource lr_granted list. - */ - list_for_each_entry(lock, &res->lr_granted, l_res_link) - if (ldlm_same_flock_owner(lock, req)) - break; - - /* Scan the locks owned by this process to find the insertion point - * (as locks are ordered), and to handle overlaps. - * We may have to merge or split existing locks. - */ - list_for_each_entry_safe_from(lock, tmp, &res->lr_granted, l_res_link) { - - if (!ldlm_same_flock_owner(lock, new)) - break; - - if (lock->l_granted_mode == mode) { - /* If the modes are the same then we need to process - * locks that overlap OR adjoin the new lock. The extra - * logic condition is necessary to deal with arithmetic - * overflow and underflow.
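The hunk that follows implements exactly the guard just described: end + 1 and start - 1 would wrap at the extremes of the offset space, so each adjacency test is paired with a check against the boundary value. Restated on plain unsigned integers (EOF_OFF is an illustrative stand-in for OBD_OBJECT_EOF):

    #include <stdbool.h>
    #include <stdint.h>

    #define EOF_OFF UINT64_MAX          /* stand-in for OBD_OBJECT_EOF */

    /* true if [a_start,a_end] overlaps or directly adjoins [b_start,b_end] */
    static bool overlaps_or_adjoins(uint64_t a_start, uint64_t a_end,
                                    uint64_t b_start, uint64_t b_end)
    {
        if (a_start > b_end + 1 && b_end != EOF_OFF)
            return false;               /* strictly right of b, not adjacent */
        if (a_end < b_start - 1 && b_start != 0)
            return false;               /* strictly left of b, not adjacent */
        return true;
    }

Without the two != checks, b_end + 1 wraps to 0 when b extends to EOF, and b_start - 1 wraps to UINT64_MAX when b starts at 0, silently inverting both tests.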
- */ - if ((new->l_policy_data.l_flock.start > - (lock->l_policy_data.l_flock.end + 1)) && - (lock->l_policy_data.l_flock.end != OBD_OBJECT_EOF)) - continue; - - if ((new->l_policy_data.l_flock.end < - (lock->l_policy_data.l_flock.start - 1)) && - (lock->l_policy_data.l_flock.start != 0)) - break; - - if (new->l_policy_data.l_flock.start < - lock->l_policy_data.l_flock.start) { - lock->l_policy_data.l_flock.start = - new->l_policy_data.l_flock.start; - } else { - new->l_policy_data.l_flock.start = - lock->l_policy_data.l_flock.start; - } - - if (new->l_policy_data.l_flock.end > - lock->l_policy_data.l_flock.end) { - lock->l_policy_data.l_flock.end = - new->l_policy_data.l_flock.end; - } else { - new->l_policy_data.l_flock.end = - lock->l_policy_data.l_flock.end; - } - - if (added) { - ldlm_flock_destroy(lock, mode); - } else { - new = lock; - added = 1; - } - continue; - } - - if (new->l_policy_data.l_flock.start > - lock->l_policy_data.l_flock.end) - continue; - - if (new->l_policy_data.l_flock.end < - lock->l_policy_data.l_flock.start) - break; - - if (new->l_policy_data.l_flock.start <= - lock->l_policy_data.l_flock.start) { - if (new->l_policy_data.l_flock.end < - lock->l_policy_data.l_flock.end) { - lock->l_policy_data.l_flock.start = - new->l_policy_data.l_flock.end + 1; - break; - } - ldlm_flock_destroy(lock, lock->l_req_mode); - continue; - } - if (new->l_policy_data.l_flock.end >= - lock->l_policy_data.l_flock.end) { - lock->l_policy_data.l_flock.end = - new->l_policy_data.l_flock.start - 1; - continue; - } - - /* split the existing lock into two locks */ - - /* if this is an F_UNLCK operation then we could avoid - * allocating a new lock and use the req lock passed in - * with the request but this would complicate the reply - * processing since updates to req get reflected in the - * reply. The client side replays the lock request so - * it must see the original lock data in the reply. - */ - - /* XXX - if ldlm_lock_new() can sleep we should - * release the lr_lock, allocate the new lock, - * and restart processing this lock. - */ - if (!new2) { - unlock_res_and_lock(req); - new2 = ldlm_lock_create(ns, &res->lr_name, LDLM_FLOCK, - lock->l_granted_mode, &null_cbs, - NULL, 0, LVB_T_NONE); - lock_res_and_lock(req); - if (IS_ERR(new2)) { - ldlm_flock_destroy(req, lock->l_granted_mode); - return LDLM_ITER_STOP; - } - goto reprocess; - } - - splitted = 1; - - new2->l_granted_mode = lock->l_granted_mode; - new2->l_policy_data.l_flock.pid = - new->l_policy_data.l_flock.pid; - new2->l_policy_data.l_flock.owner = - new->l_policy_data.l_flock.owner; - new2->l_policy_data.l_flock.start = - lock->l_policy_data.l_flock.start; - new2->l_policy_data.l_flock.end = - new->l_policy_data.l_flock.start - 1; - lock->l_policy_data.l_flock.start = - new->l_policy_data.l_flock.end + 1; - new2->l_conn_export = lock->l_conn_export; - if (lock->l_export) - new2->l_export = class_export_lock_get(lock->l_export, - new2); - ldlm_lock_addref_internal_nolock(new2, - lock->l_granted_mode); - - /* insert new2 at lock */ - ldlm_resource_add_lock(res, &lock->l_res_link, new2); - LDLM_LOCK_RELEASE(new2); - break; - } - - /* if new2 is created but never used, destroy it*/ - if (splitted == 0 && new2) - ldlm_lock_destroy_nolock(new2); - - /* At this point we're granting the lock request. 
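Worked through on concrete numbers, the split just performed behaves as follows: if the process holds [0,99] and unlocks [40,59], the existing lock is shrunk to [60,99] and the newly allocated new2 covers [0,39]. The range arithmetic in isolation (a sketch only; the removed code also clones mode, pid, owner and export references onto new2):

    struct range { __u64 start, end; };

    /* Split @held around @drop, which must lie strictly inside it. */
    static void split_range(struct range held, struct range drop,
                            struct range *lo, struct range *hi)
    {
        lo->start = held.start;         /* left remainder: [held.start, drop.start - 1] */
        lo->end = drop.start - 1;
        hi->start = drop.end + 1;       /* right remainder: [drop.end + 1, held.end] */
        hi->end = held.end;
    }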
*/ - req->l_granted_mode = req->l_req_mode; - - if (!added) { - list_del_init(&req->l_res_link); - /* insert new lock before "lock", which might be the - * next lock for this owner, or might be the first - * lock for the next owner, or might not be a lock at - * all, but instead points at the head of the list - */ - ldlm_resource_add_lock(res, &lock->l_res_link, req); - } - - /* In case we're reprocessing the requested lock we can't destroy - * it until after calling ldlm_add_ast_work_item() above so that laawi() - * can bump the reference count on \a req. Otherwise \a req - * could be freed before the completion AST can be sent. - */ - if (added) - ldlm_flock_destroy(req, mode); - - ldlm_resource_dump(D_INFO, res); - return LDLM_ITER_CONTINUE; -} - -/** - * Flock completion callback function. - * - * \param lock [in,out]: A lock to be handled - * \param flags [in]: flags - * \param *data [in]: ldlm_work_cp_ast_lock() will use ldlm_cb_set_arg - * - * \retval 0 : success - * \retval <0 : failure - */ -int -ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) -{ - struct file_lock *getlk = lock->l_ast_data; - int rc = 0; - - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT2, 4); - if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT3)) { - lock_res_and_lock(lock); - lock->l_flags |= LDLM_FL_FAIL_LOC; - unlock_res_and_lock(lock); - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT3, 4); - } - CDEBUG(D_DLMTRACE, "flags: 0x%llx data: %p getlk: %p\n", - flags, data, getlk); - - LASSERT(flags != LDLM_FL_WAIT_NOREPROC); - - if (flags & LDLM_FL_FAILED) - goto granted; - - if (!(flags & LDLM_FL_BLOCKED_MASK)) { - if (!data) - /* mds granted the lock in the reply */ - goto granted; - /* CP AST RPC: lock get granted, wake it up */ - wake_up(&lock->l_waitq); - return 0; - } - - LDLM_DEBUG(lock, - "client-side enqueue returned a blocked lock, sleeping"); - - /* Go to sleep until the lock is granted. */ - rc = l_wait_event_abortable(lock->l_waitq, is_granted_or_cancelled(lock)); - - if (rc) { - lock_res_and_lock(lock); - - /* client side - set flag to prevent lock from being put on LRU list */ - ldlm_set_cbpending(lock); - unlock_res_and_lock(lock); - - LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", - rc); - return rc; - } - -granted: - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT, 10); - - if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT4)) { - lock_res_and_lock(lock); - /* DEADLOCK is always set with CBPENDING */ - lock->l_flags |= LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; - unlock_res_and_lock(lock); - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT4, 4); - } - if (OBD_FAIL_PRECHECK(OBD_FAIL_LDLM_CP_CB_WAIT5)) { - lock_res_and_lock(lock); - /* DEADLOCK is always set with CBPENDING */ - lock->l_flags |= LDLM_FL_FAIL_LOC | - LDLM_FL_FLOCK_DEADLOCK | LDLM_FL_CBPENDING; - unlock_res_and_lock(lock); - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CP_CB_WAIT5, 4); - } - - lock_res_and_lock(lock); - - /* - * Protect against race where lock could have been just destroyed - * due to overlap in ldlm_process_flock_lock(). - */ - if (ldlm_is_destroyed(lock)) { - unlock_res_and_lock(lock); - LDLM_DEBUG(lock, "client-side enqueue waking up: destroyed"); - /* - * An error is still to be returned, to propagate it up to - * ldlm_cli_enqueue_fini() caller. - */ - return -EIO; - } - - /* ldlm_lock_enqueue() has already placed lock on the granted list. */ - ldlm_resource_unlink_lock(lock); - - /* - * Import invalidation. We need to actually release the lock - * references being held, so that it can go away. 
No point in - * holding the lock even if app still believes it has it, since - * server already dropped it anyway. Only for granted locks too. - */ - /* Do the same for DEADLOCK'ed locks. */ - if (ldlm_is_failed(lock) || ldlm_is_flock_deadlock(lock)) { - int mode; - - if (flags & LDLM_FL_TEST_LOCK) - LASSERT(ldlm_is_test_lock(lock)); - - if (ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) - mode = getlk->fl_type; - else - mode = lock->l_granted_mode; - - if (ldlm_is_flock_deadlock(lock)) { - LDLM_DEBUG(lock, - "client-side enqueue deadlock received"); - rc = -EDEADLK; - } - ldlm_flock_destroy(lock, mode); - unlock_res_and_lock(lock); - - /* Need to wake up the waiter if we were evicted */ - wake_up(&lock->l_waitq); - - /* - * An error is still to be returned, to propagate it up to - * ldlm_cli_enqueue_fini() caller. - */ - return rc ? : -EIO; - } - - LDLM_DEBUG(lock, "client-side enqueue granted"); - - if (flags & LDLM_FL_TEST_LOCK) { - /* fcntl(F_GETLK) request */ - /* The old mode was saved in getlk->fl_type so that if the mode - * in the lock changes we can decref the appropriate refcount. - */ - LASSERT(ldlm_is_test_lock(lock)); - ldlm_flock_destroy(lock, getlk->fl_type); - switch (lock->l_granted_mode) { - case LCK_PR: - getlk->fl_type = F_RDLCK; - break; - case LCK_PW: - getlk->fl_type = F_WRLCK; - break; - default: - getlk->fl_type = F_UNLCK; - } - getlk->fl_pid = -(pid_t)lock->l_policy_data.l_flock.pid; - getlk->fl_start = (loff_t)lock->l_policy_data.l_flock.start; - getlk->fl_end = (loff_t)lock->l_policy_data.l_flock.end; - } else { - /* We need to reprocess the lock to do merges or splits - * with existing locks owned by this process. - */ - ldlm_process_flock_lock(lock); - } - unlock_res_and_lock(lock); - return rc; -} -EXPORT_SYMBOL(ldlm_flock_completion_ast); - -void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy) -{ - lpolicy->l_flock.start = wpolicy->l_flock.lfw_start; - lpolicy->l_flock.end = wpolicy->l_flock.lfw_end; - lpolicy->l_flock.pid = wpolicy->l_flock.lfw_pid; - lpolicy->l_flock.owner = wpolicy->l_flock.lfw_owner; -} - -void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy) -{ - memset(wpolicy, 0, sizeof(*wpolicy)); - wpolicy->l_flock.lfw_start = lpolicy->l_flock.start; - wpolicy->l_flock.lfw_end = lpolicy->l_flock.end; - wpolicy->l_flock.lfw_pid = lpolicy->l_flock.pid; - wpolicy->l_flock.lfw_owner = lpolicy->l_flock.owner; -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c b/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c deleted file mode 100644 index 2926208cdfa15eaae9a73d8f3a74d75f09764716..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_inodebits.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_inodebits.c - * - * Author: Peter Braam - * Author: Phil Schwan - */ - -/** - * This file contains implementation of IBITS lock type - * - * IBITS lock type contains a bit mask determining various properties of an - * object. The meanings of specific bits are specific to the caller and are - * opaque to LDLM code. - * - * Locks with intersecting bitmasks and conflicting lock modes (e.g. LCK_PW) - * are considered conflicting. See the lock mode compatibility matrix - * in lustre_dlm.h. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include "ldlm_internal.h" - -void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy) -{ - lpolicy->l_inodebits.bits = wpolicy->l_inodebits.bits; -} - -void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy) -{ - memset(wpolicy, 0, sizeof(*wpolicy)); - wpolicy->l_inodebits.bits = lpolicy->l_inodebits.bits; -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h b/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h deleted file mode 100644 index bc33ca1006206d59cb9aa8e1d03644026873f5c1..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_internal.h +++ /dev/null @@ -1,342 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define MAX_STRING_SIZE 128 - -extern int ldlm_srv_namespace_nr; -extern int ldlm_cli_namespace_nr; -extern struct mutex ldlm_srv_namespace_lock; -extern struct list_head ldlm_srv_namespace_list; -extern struct mutex ldlm_cli_namespace_lock; -extern struct list_head ldlm_cli_active_namespace_list; - -static inline int ldlm_namespace_nr_read(enum ldlm_side client) -{ - return client == LDLM_NAMESPACE_SERVER ? 
- ldlm_srv_namespace_nr : ldlm_cli_namespace_nr; -} - -static inline void ldlm_namespace_nr_inc(enum ldlm_side client) -{ - if (client == LDLM_NAMESPACE_SERVER) - ldlm_srv_namespace_nr++; - else - ldlm_cli_namespace_nr++; -} - -static inline void ldlm_namespace_nr_dec(enum ldlm_side client) -{ - if (client == LDLM_NAMESPACE_SERVER) - ldlm_srv_namespace_nr--; - else - ldlm_cli_namespace_nr--; -} - -static inline struct list_head *ldlm_namespace_list(enum ldlm_side client) -{ - return client == LDLM_NAMESPACE_SERVER ? - &ldlm_srv_namespace_list : &ldlm_cli_active_namespace_list; -} - -static inline struct mutex *ldlm_namespace_lock(enum ldlm_side client) -{ - return client == LDLM_NAMESPACE_SERVER ? - &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; -} - -/* ns_bref is the number of resources in this namespace */ -static inline int ldlm_ns_empty(struct ldlm_namespace *ns) -{ - return atomic_read(&ns->ns_bref) == 0; -} - -void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, - enum ldlm_side client); -void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, - enum ldlm_side client); -struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client); - -/* ldlm_request.c */ -/* Cancel lru flag, it indicates we cancel aged locks. */ -enum { - LDLM_LRU_FLAG_AGED = BIT(0), /* Cancel old non-LRU resize locks */ - LDLM_LRU_FLAG_PASSED = BIT(1), /* Cancel passed number of locks. */ - LDLM_LRU_FLAG_SHRINK = BIT(2), /* Cancel locks from shrinker. */ - LDLM_LRU_FLAG_LRUR = BIT(3), /* Cancel locks from lru resize. */ - LDLM_LRU_FLAG_NO_WAIT = BIT(4), /* Cancel locks w/o blocking (neither - * sending nor waiting for any rpcs) - */ - LDLM_LRU_FLAG_LRUR_NO_WAIT = BIT(5), /* LRUR + NO_WAIT */ -}; - -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, - enum ldlm_cancel_flags sync, int flags); -int ldlm_cancel_lru_local(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, - enum ldlm_cancel_flags cancel_flags, int flags); -extern unsigned int ldlm_enqueue_min; -extern unsigned int ldlm_cancel_unused_locks_before_replay; - -/* ldlm_lock.c */ - -struct ldlm_cb_set_arg { - struct ptlrpc_request_set *set; - int type; /* LDLM_{CP,BL,GL}_CALLBACK */ - atomic_t restart; - struct list_head *list; - union ldlm_gl_desc *gl_desc; /* glimpse AST descriptor */ -}; - -enum ldlm_desc_ast_t { - LDLM_WORK_BL_AST, - LDLM_WORK_CP_AST, - LDLM_WORK_REVOKE_AST, - LDLM_WORK_GL_AST -}; - -void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); -int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, - enum req_location loc, void *data, int size); -struct ldlm_lock * -ldlm_lock_create(struct ldlm_namespace *ns, const struct ldlm_res_id *id, - enum ldlm_type type, enum ldlm_mode mode, - const struct ldlm_callback_suite *cbs, - void *data, __u32 lvb_len, enum lvb_type lvb_type); -enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, - struct ldlm_lock **lock, void *cookie, - __u64 *flags); -void ldlm_lock_addref_internal(struct ldlm_lock *lock, enum ldlm_mode mode); -void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, - enum ldlm_mode mode); -void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode); -void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, - enum ldlm_mode mode); -int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - enum ldlm_desc_ast_t ast_type); -int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, time_t last_use); -#define 
ldlm_lock_remove_from_lru(lock) ldlm_lock_remove_from_lru_check(lock, 0) -int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock); -void ldlm_lock_destroy_nolock(struct ldlm_lock *lock); - -/* ldlm_lockd.c */ -int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, - struct ldlm_lock *lock); -int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, - struct list_head *cancels, int count, - enum ldlm_cancel_flags cancel_flags); -int ldlm_bl_thread_wakeup(void); - -void ldlm_handle_bl_callback(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, struct ldlm_lock *lock); - -extern struct kmem_cache *ldlm_resource_slab; -extern struct kset *ldlm_ns_kset; - -/* ldlm_lockd.c & ldlm_lock.c */ -extern struct kmem_cache *ldlm_lock_slab; - -/* ldlm_extent.c */ -void ldlm_extent_add_lock(struct ldlm_resource *res, struct ldlm_lock *lock); -void ldlm_extent_unlink_lock(struct ldlm_lock *lock); - -/* l_lock.c */ -void l_check_ns_lock(struct ldlm_namespace *ns); -void l_check_no_ns_lock(struct ldlm_namespace *ns); - -extern struct dentry *ldlm_svc_debugfs_dir; - -struct ldlm_state { - struct ptlrpc_service *ldlm_cb_service; - struct ptlrpc_service *ldlm_cancel_service; - struct ptlrpc_client *ldlm_client; - struct ptlrpc_connection *ldlm_server_conn; - struct ldlm_bl_pool *ldlm_bl_pool; -}; - -/* ldlm_pool.c */ -__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); -void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); -__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); - -/* interval tree, for LDLM_EXTENT. */ -extern struct kmem_cache *ldlm_interval_slab; /* slab cache for ldlm_interval */ -struct ldlm_interval *ldlm_interval_detach(struct ldlm_lock *l); -struct ldlm_interval *ldlm_interval_alloc(struct ldlm_lock *lock); -void ldlm_interval_free(struct ldlm_interval *node); -/* this function must be called with res lock held */ -static inline struct ldlm_extent * -ldlm_interval_extent(struct ldlm_interval *node) -{ - struct ldlm_lock *lock; - - LASSERT(!list_empty(&node->li_group)); - - lock = list_entry(node->li_group.next, struct ldlm_lock, l_sl_policy); - return &lock->l_policy_data.l_extent; -} - -int ldlm_init(void); -void ldlm_exit(void); - -enum ldlm_policy_res { - LDLM_POLICY_CANCEL_LOCK, - LDLM_POLICY_KEEP_LOCK, - LDLM_POLICY_SKIP_LOCK -}; - -#define LDLM_POOL_SYSFS_PRINT_int(v) sprintf(buf, "%d\n", v) -#define LDLM_POOL_SYSFS_SET_int(a, b) { a = b; } -#define LDLM_POOL_SYSFS_PRINT_u64(v) sprintf(buf, "%lld\n", v) -#define LDLM_POOL_SYSFS_SET_u64(a, b) { a = b; } -#define LDLM_POOL_SYSFS_PRINT_atomic(v) sprintf(buf, "%d\n", atomic_read(&v)) -#define LDLM_POOL_SYSFS_SET_atomic(a, b) atomic_set(&a, b) - -#define LDLM_POOL_SYSFS_READER_SHOW(var, type) \ - static ssize_t var##_show(struct kobject *kobj, \ - struct attribute *attr, \ - char *buf) \ - { \ - struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, \ - pl_kobj); \ - type tmp; \ - \ - spin_lock(&pl->pl_lock); \ - tmp = pl->pl_##var; \ - spin_unlock(&pl->pl_lock); \ - \ - return LDLM_POOL_SYSFS_PRINT_##type(tmp); \ - } \ - struct __##var##__dummy_read {; } /* semicolon catcher */ - -#define LDLM_POOL_SYSFS_WRITER_STORE(var, type) \ - static ssize_t var##_store(struct kobject *kobj, \ - struct attribute *attr, \ - const char *buffer, \ - size_t count) \ - { \ - struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, \ - pl_kobj); \ - unsigned long tmp; \ - int rc; \ - \ - rc = kstrtoul(buffer, 10, &tmp); \ - if (rc < 0) { \ - return rc; \ - } \ - \ - spin_lock(&pl->pl_lock); \ - 
LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ - spin_unlock(&pl->pl_lock); \ - \ - return count; \ - } \ - struct __##var##__dummy_write {; } /* semicolon catcher */ - -#define LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(var, type) \ - static ssize_t var##_show(struct kobject *kobj, \ - struct attribute *attr, \ - char *buf) \ - { \ - struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, \ - pl_kobj); \ - \ - return LDLM_POOL_SYSFS_PRINT_##type(pl->pl_##var); \ - } \ - struct __##var##__dummy_read {; } /* semicolon catcher */ - -#define LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(var, type) \ - static ssize_t var##_store(struct kobject *kobj, \ - struct attribute *attr, \ - const char *buffer, \ - size_t count) \ - { \ - struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, \ - pl_kobj); \ - unsigned long tmp; \ - int rc; \ - \ - rc = kstrtoul(buffer, 10, &tmp); \ - if (rc < 0) { \ - return rc; \ - } \ - \ - LDLM_POOL_SYSFS_SET_##type(pl->pl_##var, tmp); \ - \ - return count; \ - } \ - struct __##var##__dummy_write {; } /* semicolon catcher */ - -static inline int is_granted_or_cancelled(struct ldlm_lock *lock) -{ - int ret = 0; - - lock_res_and_lock(lock); - if ((lock->l_req_mode == lock->l_granted_mode) && - !ldlm_is_cp_reqd(lock)) - ret = 1; - else if (ldlm_is_failed(lock) || ldlm_is_cancel(lock)) - ret = 1; - unlock_res_and_lock(lock); - - return ret; -} - -typedef void (*ldlm_policy_wire_to_local_t)(const union ldlm_wire_policy_data *, - union ldlm_policy_data *); - -typedef void (*ldlm_policy_local_to_wire_t)(const union ldlm_policy_data *, - union ldlm_wire_policy_data *); - -void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy); -void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy); -void ldlm_ibits_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy); -void ldlm_ibits_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy); -void ldlm_extent_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy); -void ldlm_extent_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy); -void ldlm_flock_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy); -void ldlm_flock_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy); - -static inline bool ldlm_res_eq(const struct ldlm_res_id *res0, - const struct ldlm_res_id *res1) -{ - return memcmp(res0, res1, sizeof(*res0)) == 0; -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c deleted file mode 100644 index 0aa4f234a4f453cab6b72c00b7609b9e01344b67..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lib.c +++ /dev/null @@ -1,842 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -/** - * This file deals with various client/target related logic including recovery. - * - * TODO: This code more logically belongs in the ptlrpc module than in ldlm and - * should be moved. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include -#include -#include "ldlm_internal.h" - -/* @priority: If non-zero, move the selected connection to the list head. - * @create: If zero, only search in existing connections. - */ -static int import_set_conn(struct obd_import *imp, struct obd_uuid *uuid, - int priority, int create) -{ - struct ptlrpc_connection *ptlrpc_conn; - struct obd_import_conn *imp_conn = NULL, *item; - int rc = 0; - - if (!create && !priority) { - CDEBUG(D_HA, "Nothing to do\n"); - return -EINVAL; - } - - ptlrpc_conn = ptlrpc_uuid_to_connection(uuid); - if (!ptlrpc_conn) { - CDEBUG(D_HA, "can't find connection %s\n", uuid->uuid); - return -ENOENT; - } - - if (create) { - imp_conn = kzalloc(sizeof(*imp_conn), GFP_NOFS); - if (!imp_conn) { - rc = -ENOMEM; - goto out_put; - } - } - - spin_lock(&imp->imp_lock); - list_for_each_entry(item, &imp->imp_conn_list, oic_item) { - if (obd_uuid_equals(uuid, &item->oic_uuid)) { - if (priority) { - list_del(&item->oic_item); - list_add(&item->oic_item, - &imp->imp_conn_list); - item->oic_last_attempt = 0; - } - CDEBUG(D_HA, "imp %p@%s: found existing conn %s%s\n", - imp, imp->imp_obd->obd_name, uuid->uuid, - (priority ? ", moved to head" : "")); - spin_unlock(&imp->imp_lock); - rc = 0; - goto out_free; - } - } - /* No existing import connection found for \a uuid. */ - if (create) { - imp_conn->oic_conn = ptlrpc_conn; - imp_conn->oic_uuid = *uuid; - imp_conn->oic_last_attempt = 0; - if (priority) - list_add(&imp_conn->oic_item, &imp->imp_conn_list); - else - list_add_tail(&imp_conn->oic_item, - &imp->imp_conn_list); - CDEBUG(D_HA, "imp %p@%s: add connection %s at %s\n", - imp, imp->imp_obd->obd_name, uuid->uuid, - (priority ? 
"head" : "tail")); - } else { - spin_unlock(&imp->imp_lock); - rc = -ENOENT; - goto out_free; - } - - spin_unlock(&imp->imp_lock); - return 0; -out_free: - kfree(imp_conn); -out_put: - ptlrpc_connection_put(ptlrpc_conn); - return rc; -} - -int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid) -{ - return import_set_conn(imp, uuid, 1, 0); -} - -int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, - int priority) -{ - return import_set_conn(imp, uuid, priority, 1); -} -EXPORT_SYMBOL(client_import_add_conn); - -int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid) -{ - struct obd_import_conn *imp_conn; - struct obd_export *dlmexp; - int rc = -ENOENT; - - spin_lock(&imp->imp_lock); - if (list_empty(&imp->imp_conn_list)) { - LASSERT(!imp->imp_connection); - goto out; - } - - list_for_each_entry(imp_conn, &imp->imp_conn_list, oic_item) { - if (!obd_uuid_equals(uuid, &imp_conn->oic_uuid)) - continue; - LASSERT(imp_conn->oic_conn); - - if (imp_conn == imp->imp_conn_current) { - LASSERT(imp_conn->oic_conn == imp->imp_connection); - - if (imp->imp_state != LUSTRE_IMP_CLOSED && - imp->imp_state != LUSTRE_IMP_DISCON) { - CERROR("can't remove current connection\n"); - rc = -EBUSY; - goto out; - } - - ptlrpc_connection_put(imp->imp_connection); - imp->imp_connection = NULL; - - dlmexp = class_conn2export(&imp->imp_dlm_handle); - if (dlmexp && dlmexp->exp_connection) { - LASSERT(dlmexp->exp_connection == - imp_conn->oic_conn); - ptlrpc_connection_put(dlmexp->exp_connection); - dlmexp->exp_connection = NULL; - } - - if (dlmexp) - class_export_put(dlmexp); - } - - list_del(&imp_conn->oic_item); - ptlrpc_connection_put(imp_conn->oic_conn); - kfree(imp_conn); - CDEBUG(D_HA, "imp %p@%s: remove connection %s\n", - imp, imp->imp_obd->obd_name, uuid->uuid); - rc = 0; - break; - } -out: - spin_unlock(&imp->imp_lock); - if (rc == -ENOENT) - CERROR("connection %s not found\n", uuid->uuid); - return rc; -} -EXPORT_SYMBOL(client_import_del_conn); - -/** - * Find conn UUID by peer NID. \a peer is a server NID. This function is used - * to find a conn uuid of \a imp which can reach \a peer. - */ -int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, - struct obd_uuid *uuid) -{ - struct obd_import_conn *conn; - int rc = -ENOENT; - - spin_lock(&imp->imp_lock); - list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { - /* Check if conn UUID does have this peer NID. */ - if (class_check_uuid(&conn->oic_uuid, peer)) { - *uuid = conn->oic_uuid; - rc = 0; - break; - } - } - spin_unlock(&imp->imp_lock); - return rc; -} -EXPORT_SYMBOL(client_import_find_conn); - -void client_destroy_import(struct obd_import *imp) -{ - /* Drop security policy instance after all RPCs have finished/aborted - * to let all busy contexts be released. - */ - class_import_get(imp); - class_destroy_import(imp); - sptlrpc_import_sec_put(imp); - class_import_put(imp); -} -EXPORT_SYMBOL(client_destroy_import); - -/* Configure an RPC client OBD device. 
- * - * lcfg parameters: - * 1 - client UUID - * 2 - server UUID - * 3 - inactive-on-startup - */ -int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) -{ - struct client_obd *cli = &obddev->u.cli; - struct obd_import *imp; - struct obd_uuid server_uuid; - int rq_portal, rp_portal, connect_op; - char *name = obddev->obd_type->typ_name; - enum ldlm_ns_type ns_type = LDLM_NS_TYPE_UNKNOWN; - int rc; - - /* In a more perfect world, we would hang a ptlrpc_client off of - * obd_type and just use the values from there. - */ - if (!strcmp(name, LUSTRE_OSC_NAME)) { - rq_portal = OST_REQUEST_PORTAL; - rp_portal = OSC_REPLY_PORTAL; - connect_op = OST_CONNECT; - cli->cl_sp_me = LUSTRE_SP_CLI; - cli->cl_sp_to = LUSTRE_SP_OST; - ns_type = LDLM_NS_TYPE_OSC; - } else if (!strcmp(name, LUSTRE_MDC_NAME) || - !strcmp(name, LUSTRE_LWP_NAME)) { - rq_portal = MDS_REQUEST_PORTAL; - rp_portal = MDC_REPLY_PORTAL; - connect_op = MDS_CONNECT; - cli->cl_sp_me = LUSTRE_SP_CLI; - cli->cl_sp_to = LUSTRE_SP_MDT; - ns_type = LDLM_NS_TYPE_MDC; - } else if (!strcmp(name, LUSTRE_MGC_NAME)) { - rq_portal = MGS_REQUEST_PORTAL; - rp_portal = MGC_REPLY_PORTAL; - connect_op = MGS_CONNECT; - cli->cl_sp_me = LUSTRE_SP_MGC; - cli->cl_sp_to = LUSTRE_SP_MGS; - cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; - ns_type = LDLM_NS_TYPE_MGC; - } else { - CERROR("unknown client OBD type \"%s\", can't setup\n", - name); - return -EINVAL; - } - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { - CERROR("requires a TARGET UUID\n"); - return -EINVAL; - } - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) > 37) { - CERROR("client UUID must be less than 38 characters\n"); - return -EINVAL; - } - - if (LUSTRE_CFG_BUFLEN(lcfg, 2) < 1) { - CERROR("setup requires a SERVER UUID\n"); - return -EINVAL; - } - - if (LUSTRE_CFG_BUFLEN(lcfg, 2) > 37) { - CERROR("target UUID must be less than 38 characters\n"); - return -EINVAL; - } - - init_rwsem(&cli->cl_sem); - cli->cl_conn_count = 0; - memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), - min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), - sizeof(server_uuid))); - - cli->cl_dirty_pages = 0; - cli->cl_avail_grant = 0; - /* FIXME: Should limit this for the sum of all cl_dirty_max_pages. */ - /* - * cl_dirty_max_pages may be changed at connect time in - * ptlrpc_connect_interpret(). - */ - client_adjust_max_dirty(cli); - INIT_LIST_HEAD(&cli->cl_cache_waiters); - INIT_LIST_HEAD(&cli->cl_loi_ready_list); - INIT_LIST_HEAD(&cli->cl_loi_hp_ready_list); - INIT_LIST_HEAD(&cli->cl_loi_write_list); - INIT_LIST_HEAD(&cli->cl_loi_read_list); - spin_lock_init(&cli->cl_loi_list_lock); - atomic_set(&cli->cl_pending_w_pages, 0); - atomic_set(&cli->cl_pending_r_pages, 0); - cli->cl_r_in_flight = 0; - cli->cl_w_in_flight = 0; - - spin_lock_init(&cli->cl_read_rpc_hist.oh_lock); - spin_lock_init(&cli->cl_write_rpc_hist.oh_lock); - spin_lock_init(&cli->cl_read_page_hist.oh_lock); - spin_lock_init(&cli->cl_write_page_hist.oh_lock); - spin_lock_init(&cli->cl_read_offset_hist.oh_lock); - spin_lock_init(&cli->cl_write_offset_hist.oh_lock); - - /* lru for osc. */ - INIT_LIST_HEAD(&cli->cl_lru_osc); - atomic_set(&cli->cl_lru_shrinkers, 0); - atomic_long_set(&cli->cl_lru_busy, 0); - atomic_long_set(&cli->cl_lru_in_list, 0); - INIT_LIST_HEAD(&cli->cl_lru_list); - spin_lock_init(&cli->cl_lru_list_lock); - atomic_long_set(&cli->cl_unstable_count, 0); - INIT_LIST_HEAD(&cli->cl_shrink_list); - - init_waitqueue_head(&cli->cl_destroy_waitq); - atomic_set(&cli->cl_destroy_in_flight, 0); - /* Turn on checksumming by default. 
*/ - cli->cl_checksum = 1; - /* - * The supported checksum types will be worked out at connect time - * Set cl_chksum* to CRC32 for now to avoid returning screwed info - * through procfs. - */ - cli->cl_cksum_type = OBD_CKSUM_CRC32; - cli->cl_supp_cksum_types = OBD_CKSUM_CRC32; - atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); - - /* - * Set it to possible maximum size. It may be reduced by ocd_brw_size - * from OFD after connecting. - */ - cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; - - /* - * set cl_chunkbits default value to PAGE_CACHE_SHIFT, - * it will be updated at OSC connection time. - */ - cli->cl_chunkbits = PAGE_SHIFT; - - if (!strcmp(name, LUSTRE_MDC_NAME)) - cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - else if (totalram_pages >> (20 - PAGE_SHIFT) <= 128 /* MB */) - cli->cl_max_rpcs_in_flight = 2; - else if (totalram_pages >> (20 - PAGE_SHIFT) <= 256 /* MB */) - cli->cl_max_rpcs_in_flight = 3; - else if (totalram_pages >> (20 - PAGE_SHIFT) <= 512 /* MB */) - cli->cl_max_rpcs_in_flight = 4; - else - cli->cl_max_rpcs_in_flight = OBD_MAX_RIF_DEFAULT; - - spin_lock_init(&cli->cl_mod_rpcs_lock); - spin_lock_init(&cli->cl_mod_rpcs_hist.oh_lock); - cli->cl_max_mod_rpcs_in_flight = 0; - cli->cl_mod_rpcs_in_flight = 0; - cli->cl_close_rpcs_in_flight = 0; - init_waitqueue_head(&cli->cl_mod_rpcs_waitq); - cli->cl_mod_tag_bitmap = NULL; - - if (connect_op == MDS_CONNECT) { - cli->cl_max_mod_rpcs_in_flight = cli->cl_max_rpcs_in_flight - 1; - cli->cl_mod_tag_bitmap = kcalloc(BITS_TO_LONGS(OBD_MAX_RIF_MAX), - sizeof(long), GFP_NOFS); - if (!cli->cl_mod_tag_bitmap) { - rc = -ENOMEM; - goto err; - } - } - - rc = ldlm_get_ref(); - if (rc) { - CERROR("ldlm_get_ref failed: %d\n", rc); - goto err; - } - - ptlrpc_init_client(rq_portal, rp_portal, name, - &obddev->obd_ldlm_client); - - imp = class_new_import(obddev); - if (!imp) { - rc = -ENOENT; - goto err_ldlm; - } - imp->imp_client = &obddev->obd_ldlm_client; - imp->imp_connect_op = connect_op; - memcpy(cli->cl_target_uuid.uuid, lustre_cfg_buf(lcfg, 1), - LUSTRE_CFG_BUFLEN(lcfg, 1)); - class_import_put(imp); - - rc = client_import_add_conn(imp, &server_uuid, 1); - if (rc) { - CERROR("can't add initial connection\n"); - goto err_import; - } - - cli->cl_import = imp; - /* cli->cl_max_mds_easize updated by mdc_init_ea_size() */ - cli->cl_max_mds_easize = sizeof(struct lov_mds_md_v3); - - if (LUSTRE_CFG_BUFLEN(lcfg, 3) > 0) { - if (!strcmp(lustre_cfg_string(lcfg, 3), "inactive")) { - CDEBUG(D_HA, "marking %s %s->%s as inactive\n", - name, obddev->obd_name, - cli->cl_target_uuid.uuid); - spin_lock(&imp->imp_lock); - imp->imp_deactive = 1; - spin_unlock(&imp->imp_lock); - } - } - - obddev->obd_namespace = ldlm_namespace_new(obddev, obddev->obd_name, - LDLM_NAMESPACE_CLIENT, - LDLM_NAMESPACE_GREEDY, - ns_type); - if (!obddev->obd_namespace) { - CERROR("Unable to create client namespace - %s\n", - obddev->obd_name); - rc = -ENOMEM; - goto err_import; - } - - return rc; - -err_import: - class_destroy_import(imp); -err_ldlm: - ldlm_put_ref(); -err: - kfree(cli->cl_mod_tag_bitmap); - cli->cl_mod_tag_bitmap = NULL; - return rc; -} -EXPORT_SYMBOL(client_obd_setup); - -int client_obd_cleanup(struct obd_device *obddev) -{ - struct client_obd *cli = &obddev->u.cli; - - ldlm_namespace_free_post(obddev->obd_namespace); - obddev->obd_namespace = NULL; - - obd_cleanup_client_import(obddev); - LASSERT(!obddev->u.cli.cl_import); - - ldlm_put_ref(); - - kfree(cli->cl_mod_tag_bitmap); - cli->cl_mod_tag_bitmap = NULL; - - return 0; -} 
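The error paths at the end of client_obd_setup() above follow the usual kernel unwind idiom: each later failure jumps to a label that releases only what was acquired before it, in reverse order, and client_obd_cleanup() releases the same resources on normal teardown. The shape, reduced to two hypothetical acquisition steps:

    int acquire_a(void);                /* hypothetical step 1 */
    int acquire_b(void);                /* hypothetical step 2 */
    void release_a(void);

    static int setup_sketch(void)
    {
        int rc;

        rc = acquire_a();
        if (rc)
            goto err;                   /* nothing to undo yet */
        rc = acquire_b();
        if (rc)
            goto err_a;                 /* undo step 1 only */
        return 0;

    err_a:
        release_a();
    err:
        return rc;
    }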
-EXPORT_SYMBOL(client_obd_cleanup); - -/* ->o_connect() method for client side (OSC and MDC and MGC) */ -int client_connect_import(const struct lu_env *env, - struct obd_export **exp, - struct obd_device *obd, struct obd_uuid *cluuid, - struct obd_connect_data *data, void *localdata) -{ - struct client_obd *cli = &obd->u.cli; - struct obd_import *imp = cli->cl_import; - struct obd_connect_data *ocd; - struct lustre_handle conn = { 0 }; - bool is_mdc = false; - int rc; - - *exp = NULL; - down_write(&cli->cl_sem); - if (cli->cl_conn_count > 0) { - rc = -EALREADY; - goto out_sem; - } - - rc = class_connect(&conn, obd, cluuid); - if (rc) - goto out_sem; - - cli->cl_conn_count++; - *exp = class_conn2export(&conn); - - LASSERT(obd->obd_namespace); - - imp->imp_dlm_handle = conn; - rc = ptlrpc_init_import(imp); - if (rc != 0) - goto out_ldlm; - - ocd = &imp->imp_connect_data; - if (data) { - *ocd = *data; - is_mdc = !strncmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MDC_NAME, 3); - if (is_mdc) - data->ocd_connect_flags |= OBD_CONNECT_MULTIMODRPCS; - imp->imp_connect_flags_orig = data->ocd_connect_flags; - } - - rc = ptlrpc_connect_import(imp); - if (rc != 0) { - if (data && is_mdc) - data->ocd_connect_flags &= ~OBD_CONNECT_MULTIMODRPCS; - LASSERT(imp->imp_state == LUSTRE_IMP_DISCON); - goto out_ldlm; - } - LASSERT(*exp && (*exp)->exp_connection); - - if (data) { - LASSERTF((ocd->ocd_connect_flags & data->ocd_connect_flags) == - ocd->ocd_connect_flags, "old %#llx, new %#llx\n", - data->ocd_connect_flags, ocd->ocd_connect_flags); - data->ocd_connect_flags = ocd->ocd_connect_flags; - /* clear the flag as it was not set and is not known - * by upper layers - */ - if (is_mdc) - data->ocd_connect_flags &= ~OBD_CONNECT_MULTIMODRPCS; - } - - ptlrpc_pinger_add_import(imp); - - if (rc) { -out_ldlm: - cli->cl_conn_count--; - class_disconnect(*exp); - *exp = NULL; - } -out_sem: - up_write(&cli->cl_sem); - - return rc; -} -EXPORT_SYMBOL(client_connect_import); - -int client_disconnect_export(struct obd_export *exp) -{ - struct obd_device *obd = class_exp2obd(exp); - struct client_obd *cli; - struct obd_import *imp; - int rc = 0, err; - - if (!obd) { - CERROR("invalid export for disconnect: exp %p cookie %#llx\n", - exp, exp ? exp->exp_handle.h_cookie : -1); - return -EINVAL; - } - - cli = &obd->u.cli; - imp = cli->cl_import; - - down_write(&cli->cl_sem); - CDEBUG(D_INFO, "disconnect %s - %zu\n", obd->obd_name, - cli->cl_conn_count); - - if (!cli->cl_conn_count) { - CERROR("disconnecting disconnected device (%s)\n", - obd->obd_name); - rc = -EINVAL; - goto out_disconnect; - } - - cli->cl_conn_count--; - if (cli->cl_conn_count) { - rc = 0; - goto out_disconnect; - } - - /* Mark import deactivated now, so we don't try to reconnect if any - * of the cleanup RPCs fails (e.g. LDLM cancel, etc). We don't - * fully deactivate the import, or that would drop all requests. - */ - spin_lock(&imp->imp_lock); - imp->imp_deactive = 1; - spin_unlock(&imp->imp_lock); - - /* Some non-replayable imports (MDS's OSCs) are pinged, so just - * delete it regardless. (It's safe to delete an import that was - * never added.) - */ - (void)ptlrpc_pinger_del_import(imp); - - if (obd->obd_namespace) { - /* obd_force == local only */ - ldlm_cli_cancel_unused(obd->obd_namespace, NULL, - obd->obd_force ? LCF_LOCAL : 0, NULL); - ldlm_namespace_free_prior(obd->obd_namespace, imp, - obd->obd_force); - } - - /* There's no need to hold sem while disconnecting an import, - * and it may actually cause deadlock in GSS. 
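client_connect_import() above admits a single connection per client OBD: a second attempt fails with -EALREADY, and client_disconnect_export() below only tears the import down once cl_conn_count returns to zero. The counting contract modelled on its own (a sketch, with the real lock-cancel and import cleanup steps elided):

    #include <linux/errno.h>

    struct conn_counter {
        unsigned int conn_count;        /* guarded by cl_sem in the real code */
    };

    static int model_connect(struct conn_counter *c)
    {
        if (c->conn_count > 0)
            return -EALREADY;           /* only one connection allowed */
        c->conn_count++;
        return 0;
    }

    static int model_disconnect(struct conn_counter *c)
    {
        if (!c->conn_count)
            return -EINVAL;             /* nothing to disconnect */
        if (--c->conn_count)
            return 0;                   /* other users remain */
        /* last user gone: the real code cancels locks and kills the import */
        return 0;
    }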
- */ - up_write(&cli->cl_sem); - rc = ptlrpc_disconnect_import(imp, 0); - down_write(&cli->cl_sem); - - ptlrpc_invalidate_import(imp); - -out_disconnect: - /* Use server style - class_disconnect should be always called for - * o_disconnect. - */ - err = class_disconnect(exp); - if (!rc && err) - rc = err; - - up_write(&cli->cl_sem); - - return rc; -} -EXPORT_SYMBOL(client_disconnect_export); - -/** - * Packs current SLV and Limit into \a req. - */ -int target_pack_pool_reply(struct ptlrpc_request *req) -{ - struct obd_device *obd; - - /* Check that we still have all structures alive as this may - * be some late RPC at shutdown time. - */ - if (unlikely(!req->rq_export || !req->rq_export->exp_obd || - !exp_connect_lru_resize(req->rq_export))) { - lustre_msg_set_slv(req->rq_repmsg, 0); - lustre_msg_set_limit(req->rq_repmsg, 0); - return 0; - } - - /* OBD is alive here as export is alive, which we checked above. */ - obd = req->rq_export->exp_obd; - - read_lock(&obd->obd_pool_lock); - lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); - lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); - read_unlock(&obd->obd_pool_lock); - - return 0; -} -EXPORT_SYMBOL(target_pack_pool_reply); - -static int -target_send_reply_msg(struct ptlrpc_request *req, int rc, int fail_id) -{ - if (OBD_FAIL_CHECK_ORSET(fail_id & ~OBD_FAIL_ONCE, OBD_FAIL_ONCE)) { - DEBUG_REQ(D_ERROR, req, "dropping reply"); - return -ECOMM; - } - - if (unlikely(rc)) { - DEBUG_REQ(D_NET, req, "processing error (%d)", rc); - req->rq_status = rc; - return ptlrpc_send_error(req, 1); - } - - DEBUG_REQ(D_NET, req, "sending reply"); - return ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT); -} - -void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) -{ - struct ptlrpc_service_part *svcpt; - int netrc; - struct ptlrpc_reply_state *rs; - struct obd_export *exp; - - if (req->rq_no_reply) - return; - - svcpt = req->rq_rqbd->rqbd_svcpt; - rs = req->rq_reply_state; - if (!rs || !rs->rs_difficult) { - /* no notifiers */ - target_send_reply_msg(req, rc, fail_id); - return; - } - - /* must be an export if locks saved */ - LASSERT(req->rq_export); - /* req/reply consistent */ - LASSERT(rs->rs_svcpt == svcpt); - - /* "fresh" reply */ - LASSERT(!rs->rs_scheduled); - LASSERT(!rs->rs_scheduled_ever); - LASSERT(!rs->rs_handled); - LASSERT(!rs->rs_on_net); - LASSERT(!rs->rs_export); - LASSERT(list_empty(&rs->rs_obd_list)); - LASSERT(list_empty(&rs->rs_exp_list)); - - exp = class_export_get(req->rq_export); - - /* disable reply scheduling while I'm setting up */ - rs->rs_scheduled = 1; - rs->rs_on_net = 1; - rs->rs_xid = req->rq_xid; - rs->rs_transno = req->rq_transno; - rs->rs_export = exp; - rs->rs_opc = lustre_msg_get_opc(req->rq_reqmsg); - - spin_lock(&exp->exp_uncommitted_replies_lock); - CDEBUG(D_NET, "rs transno = %llu, last committed = %llu\n", - rs->rs_transno, exp->exp_last_committed); - if (rs->rs_transno > exp->exp_last_committed) { - /* not committed already */ - list_add_tail(&rs->rs_obd_list, - &exp->exp_uncommitted_replies); - } - spin_unlock(&exp->exp_uncommitted_replies_lock); - - spin_lock(&exp->exp_lock); - list_add_tail(&rs->rs_exp_list, &exp->exp_outstanding_replies); - spin_unlock(&exp->exp_lock); - - netrc = target_send_reply_msg(req, rc, fail_id); - - spin_lock(&svcpt->scp_rep_lock); - - atomic_inc(&svcpt->scp_nreps_difficult); - - if (netrc != 0) { - /* error sending: reply is off the net. 
Also we need +1 - * reply ref until ptlrpc_handle_rs() is done - * with the reply state (if the send was successful, there - * would have been +1 ref for the net, which - * reply_out_callback leaves alone) - */ - rs->rs_on_net = 0; - ptlrpc_rs_addref(rs); - } - - spin_lock(&rs->rs_lock); - if (rs->rs_transno <= exp->exp_last_committed || - (!rs->rs_on_net && !rs->rs_no_ack) || - list_empty(&rs->rs_exp_list) || /* completed already */ - list_empty(&rs->rs_obd_list)) { - CDEBUG(D_HA, "Schedule reply immediately\n"); - ptlrpc_dispatch_difficult_reply(rs); - } else { - list_add(&rs->rs_list, &svcpt->scp_rep_active); - rs->rs_scheduled = 0; /* allow notifier to schedule */ - } - spin_unlock(&rs->rs_lock); - spin_unlock(&svcpt->scp_rep_lock); -} -EXPORT_SYMBOL(target_send_reply); - -enum ldlm_mode lck_compat_array[] = { - [LCK_EX] = LCK_COMPAT_EX, - [LCK_PW] = LCK_COMPAT_PW, - [LCK_PR] = LCK_COMPAT_PR, - [LCK_CW] = LCK_COMPAT_CW, - [LCK_CR] = LCK_COMPAT_CR, - [LCK_NL] = LCK_COMPAT_NL, - [LCK_GROUP] = LCK_COMPAT_GROUP, - [LCK_COS] = LCK_COMPAT_COS, -}; - -/** - * Rather arbitrary mapping from LDLM error codes to errno values. This should - * not escape to the user level. - */ -int ldlm_error2errno(enum ldlm_error error) -{ - int result; - - switch (error) { - case ELDLM_OK: - case ELDLM_LOCK_MATCHED: - result = 0; - break; - case ELDLM_LOCK_CHANGED: - result = -ESTALE; - break; - case ELDLM_LOCK_ABORTED: - result = -ENAVAIL; - break; - case ELDLM_LOCK_REPLACED: - result = -ESRCH; - break; - case ELDLM_NO_LOCK_DATA: - result = -ENOENT; - break; - case ELDLM_NAMESPACE_EXISTS: - result = -EEXIST; - break; - case ELDLM_BAD_NAMESPACE: - result = -EBADF; - break; - default: - if (((int)error) < 0) /* cast to signed type */ - result = error; /* as enum ldlm_error can be unsigned */ - else { - CERROR("Invalid DLM result code: %d\n", error); - result = -EPROTO; - } - } - return result; -} -EXPORT_SYMBOL(ldlm_error2errno); - -#if LUSTRE_TRACKS_LOCK_EXP_REFS -void ldlm_dump_export_locks(struct obd_export *exp) -{ - spin_lock(&exp->exp_locks_list_guard); - if (!list_empty(&exp->exp_locks_list)) { - struct ldlm_lock *lock; - - CERROR("dumping locks for export %p,ignore if the unmount doesn't hang\n", - exp); - list_for_each_entry(lock, &exp->exp_locks_list, - l_exp_refs_link) - LDLM_ERROR(lock, "lock:"); - } - spin_unlock(&exp->exp_locks_list_guard); -} -#endif diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c deleted file mode 100644 index a644d133063b6094e92109442cad2240f34123aa..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lock.c +++ /dev/null @@ -1,2135 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_lock.c - * - * Author: Peter Braam - * Author: Phil Schwan - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include "ldlm_internal.h" - -/* lock types */ -char *ldlm_lockname[] = { - [0] = "--", - [LCK_EX] = "EX", - [LCK_PW] = "PW", - [LCK_PR] = "PR", - [LCK_CW] = "CW", - [LCK_CR] = "CR", - [LCK_NL] = "NL", - [LCK_GROUP] = "GROUP", - [LCK_COS] = "COS", -}; -EXPORT_SYMBOL(ldlm_lockname); - -static char *ldlm_typename[] = { - [LDLM_PLAIN] = "PLN", - [LDLM_EXTENT] = "EXT", - [LDLM_FLOCK] = "FLK", - [LDLM_IBITS] = "IBT", -}; - -static ldlm_policy_wire_to_local_t ldlm_policy_wire_to_local[] = { - [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_wire_to_local, - [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_wire_to_local, - [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_wire_to_local, - [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_wire_to_local, -}; - -static ldlm_policy_local_to_wire_t ldlm_policy_local_to_wire[] = { - [LDLM_PLAIN - LDLM_MIN_TYPE] = ldlm_plain_policy_local_to_wire, - [LDLM_EXTENT - LDLM_MIN_TYPE] = ldlm_extent_policy_local_to_wire, - [LDLM_FLOCK - LDLM_MIN_TYPE] = ldlm_flock_policy_local_to_wire, - [LDLM_IBITS - LDLM_MIN_TYPE] = ldlm_ibits_policy_local_to_wire, -}; - -/** - * Converts lock policy from local format to on the wire lock_desc format - */ -static void ldlm_convert_policy_to_wire(enum ldlm_type type, - const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy) -{ - ldlm_policy_local_to_wire_t convert; - - convert = ldlm_policy_local_to_wire[type - LDLM_MIN_TYPE]; - - convert(lpolicy, wpolicy); -} - -/** - * Converts lock policy from on the wire lock_desc format to local format - */ -void ldlm_convert_policy_to_local(struct obd_export *exp, enum ldlm_type type, - const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy) -{ - ldlm_policy_wire_to_local_t convert; - - convert = ldlm_policy_wire_to_local[type - LDLM_MIN_TYPE]; - - convert(wpolicy, lpolicy); -} - -const char *ldlm_it2str(enum ldlm_intent_flags it) -{ - switch (it) { - case IT_OPEN: - return "open"; - case IT_CREAT: - return "creat"; - case (IT_OPEN | IT_CREAT): - return "open|creat"; - case IT_READDIR: - return "readdir"; - case IT_GETATTR: - return "getattr"; - case IT_LOOKUP: - return "lookup"; - case IT_UNLINK: - return "unlink"; - case IT_GETXATTR: - return "getxattr"; - case IT_LAYOUT: - return "layout"; - default: - CERROR("Unknown intent 0x%08x\n", it); - return "UNKNOWN"; - } -} -EXPORT_SYMBOL(ldlm_it2str); - -/* - * REFCOUNTED LOCK OBJECTS - */ - -/** - * Get a reference on a lock. - * - * Lock refcounts, during creation: - * - one special one for allocation, dec'd only once in destroy - * - one for being a lock that's in-use - * - one for the addref associated with a new lock - */ -struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock) -{ - atomic_inc(&lock->l_refc); - return lock; -} -EXPORT_SYMBOL(ldlm_lock_get); - -/** - * Release lock reference. - * - * Also frees the lock if it was last reference. 
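ldlm_lock_get() and the ldlm_lock_put() that follows are the standard kernel reference-counting idiom: every get pairs with a put, and only the caller whose atomic_dec_and_test() observes the count reaching zero runs the teardown. Stripped of the LDLM specifics (hypothetical demo_* names, <linux/atomic.h> primitives):

        struct demo_obj {
                atomic_t o_refc;
        };

        static struct demo_obj *demo_get(struct demo_obj *o)
        {
                atomic_inc(&o->o_refc);
                return o;
        }

        static void demo_put(struct demo_obj *o)
        {
                /* Exactly one caller sees the transition to zero. */
                if (atomic_dec_and_test(&o->o_refc))
                        kfree(o);
        }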
- */ -void ldlm_lock_put(struct ldlm_lock *lock) -{ - LASSERT(lock->l_resource != LP_POISON); - LASSERT(atomic_read(&lock->l_refc) > 0); - if (atomic_dec_and_test(&lock->l_refc)) { - struct ldlm_resource *res; - - LDLM_DEBUG(lock, - "final lock_put on destroyed lock, freeing it."); - - res = lock->l_resource; - LASSERT(ldlm_is_destroyed(lock)); - LASSERT(list_empty(&lock->l_res_link)); - LASSERT(list_empty(&lock->l_pending_chain)); - - lprocfs_counter_decr(ldlm_res_to_ns(res)->ns_stats, - LDLM_NSS_LOCKS); - lu_ref_del(&res->lr_reference, "lock", lock); - ldlm_resource_putref(res); - lock->l_resource = NULL; - if (lock->l_export) { - class_export_lock_put(lock->l_export, lock); - lock->l_export = NULL; - } - - kfree(lock->l_lvb_data); - - ldlm_interval_free(ldlm_interval_detach(lock)); - lu_ref_fini(&lock->l_reference); - OBD_FREE_RCU(lock, sizeof(*lock), &lock->l_handle); - } -} -EXPORT_SYMBOL(ldlm_lock_put); - -/** - * Removes LDLM lock \a lock from LRU. Assumes LRU is already locked. - */ -int ldlm_lock_remove_from_lru_nolock(struct ldlm_lock *lock) -{ - int rc = 0; - - if (!list_empty(&lock->l_lru)) { - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); - list_del_init(&lock->l_lru); - LASSERT(ns->ns_nr_unused > 0); - ns->ns_nr_unused--; - rc = 1; - } - return rc; -} - -/** - * Removes LDLM lock \a lock from LRU. Obtains the LRU lock first. - * - * If \a last_use is non-zero, it will remove the lock from LRU only if - * it matches lock's l_last_used. - * - * \retval 0 if \a last_use is set, the lock is not in LRU list or \a last_use - * doesn't match lock's l_last_used; - * otherwise, the lock hasn't been in the LRU list. - * \retval 1 the lock was in LRU list and removed. - */ -int ldlm_lock_remove_from_lru_check(struct ldlm_lock *lock, time_t last_use) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - int rc = 0; - - spin_lock(&ns->ns_lock); - if (last_use == 0 || last_use == lock->l_last_used) - rc = ldlm_lock_remove_from_lru_nolock(lock); - spin_unlock(&ns->ns_lock); - - return rc; -} - -/** - * Adds LDLM lock \a lock to namespace LRU. Assumes LRU is already locked. - */ -static void ldlm_lock_add_to_lru_nolock(struct ldlm_lock *lock) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - lock->l_last_used = jiffies; - LASSERT(list_empty(&lock->l_lru)); - LASSERT(lock->l_resource->lr_type != LDLM_FLOCK); - list_add_tail(&lock->l_lru, &ns->ns_unused_list); - ldlm_clear_skipped(lock); - LASSERT(ns->ns_nr_unused >= 0); - ns->ns_nr_unused++; -} - -/** - * Adds LDLM lock \a lock to namespace LRU. Obtains necessary LRU locks - * first. - */ -static void ldlm_lock_add_to_lru(struct ldlm_lock *lock) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - spin_lock(&ns->ns_lock); - ldlm_lock_add_to_lru_nolock(lock); - spin_unlock(&ns->ns_lock); -} - -/** - * Moves LDLM lock \a lock that is already in namespace LRU to the tail of - * the LRU. Performs necessary LRU locking - */ -static void ldlm_lock_touch_in_lru(struct ldlm_lock *lock) -{ - struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); - - spin_lock(&ns->ns_lock); - if (!list_empty(&lock->l_lru)) { - ldlm_lock_remove_from_lru_nolock(lock); - ldlm_lock_add_to_lru_nolock(lock); - } - spin_unlock(&ns->ns_lock); -} - -/** - * Helper to destroy a locked lock. - * - * Used by ldlm_lock_destroy and ldlm_lock_destroy_nolock - * Must be called with l_lock and lr_lock held. 
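All three LRU helpers above are list surgery under the per-namespace ns_lock, and "touching" a lock is just remove-plus-re-add so it lands at the tail. The kernel expresses that directly with list_move_tail(); a condensed sketch of the touch path:

        /* Move an entry to the MRU end of an LRU list, if it is on one.
         * Sketch only; the real code also maintains ns_nr_unused. */
        static void demo_lru_touch(spinlock_t *lru_lock,
                                   struct list_head *entry,
                                   struct list_head *lru)
        {
                spin_lock(lru_lock);
                if (!list_empty(entry))
                        list_move_tail(entry, lru);
                spin_unlock(lru_lock);
        }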
- * - * Does not actually free the lock data, but rather marks the lock as - * destroyed by setting l_destroyed field in the lock to 1. Destroys a - * handle->lock association too, so that the lock can no longer be found - * and removes the lock from LRU list. Actual lock freeing occurs when - * last lock reference goes away. - * - * Original comment (of some historical value): - * This used to have a 'strict' flag, which recovery would use to mark an - * in-use lock as needing-to-die. Lest I am ever tempted to put it back, I - * shall explain why it's gone: with the new hash table scheme, once you call - * ldlm_lock_destroy, you can never drop your final references on this lock. - * Because it's not in the hash table anymore. -phil - */ -static int ldlm_lock_destroy_internal(struct ldlm_lock *lock) -{ - if (lock->l_readers || lock->l_writers) { - LDLM_ERROR(lock, "lock still has references"); - LBUG(); - } - - if (!list_empty(&lock->l_res_link)) { - LDLM_ERROR(lock, "lock still on resource"); - LBUG(); - } - - if (ldlm_is_destroyed(lock)) { - LASSERT(list_empty(&lock->l_lru)); - return 0; - } - ldlm_set_destroyed(lock); - - ldlm_lock_remove_from_lru(lock); - class_handle_unhash(&lock->l_handle); - - return 1; -} - -/** - * Destroys a LDLM lock \a lock. Performs necessary locking first. - */ -static void ldlm_lock_destroy(struct ldlm_lock *lock) -{ - int first; - - lock_res_and_lock(lock); - first = ldlm_lock_destroy_internal(lock); - unlock_res_and_lock(lock); - - /* drop reference from hashtable only for first destroy */ - if (first) { - lu_ref_del(&lock->l_reference, "hash", lock); - LDLM_LOCK_RELEASE(lock); - } -} - -/** - * Destroys a LDLM lock \a lock that is already locked. - */ -void ldlm_lock_destroy_nolock(struct ldlm_lock *lock) -{ - int first; - - first = ldlm_lock_destroy_internal(lock); - /* drop reference from hashtable only for first destroy */ - if (first) { - lu_ref_del(&lock->l_reference, "hash", lock); - LDLM_LOCK_RELEASE(lock); - } -} - -/* this is called by portals_handle2object with the handle lock taken */ -static void lock_handle_addref(void *lock) -{ - LDLM_LOCK_GET((struct ldlm_lock *)lock); -} - -static void lock_handle_free(void *lock, int size) -{ - LASSERT(size == sizeof(struct ldlm_lock)); - kmem_cache_free(ldlm_lock_slab, lock); -} - -static struct portals_handle_ops lock_handle_ops = { - .hop_addref = lock_handle_addref, - .hop_free = lock_handle_free, -}; - -/** - * - * Allocate and initialize new lock structure. - * - * usage: pass in a resource on which you have done ldlm_resource_get - * new lock will take over the refcount. 
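ldlm_lock_destroy_internal() above is deliberately idempotent: only the call that actually flips the destroyed flag reports "first", and only that caller drops the hash-table reference, so a second destroy is harmless. The idiom in isolation (hypothetical demo_* names):

        struct demo_obj2 {
                spinlock_t o_lock;
                bool       o_destroyed;
        };

        /* Returns true only for the caller that performed the destroy. */
        static bool demo_destroy_once(struct demo_obj2 *o)
        {
                bool first;

                spin_lock(&o->o_lock);
                first = !o->o_destroyed;
                o->o_destroyed = true;
                spin_unlock(&o->o_lock);

                return first;   /* drop the hash reference iff true */
        }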
- * returns: lock with refcount 2 - one for current caller and one for remote - */ -static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) -{ - struct ldlm_lock *lock; - - LASSERT(resource); - - lock = kmem_cache_zalloc(ldlm_lock_slab, GFP_NOFS); - if (!lock) - return NULL; - - spin_lock_init(&lock->l_lock); - lock->l_resource = resource; - lu_ref_add(&resource->lr_reference, "lock", lock); - - atomic_set(&lock->l_refc, 2); - INIT_LIST_HEAD(&lock->l_res_link); - INIT_LIST_HEAD(&lock->l_lru); - INIT_LIST_HEAD(&lock->l_pending_chain); - INIT_LIST_HEAD(&lock->l_bl_ast); - INIT_LIST_HEAD(&lock->l_cp_ast); - INIT_LIST_HEAD(&lock->l_rk_ast); - init_waitqueue_head(&lock->l_waitq); - lock->l_blocking_lock = NULL; - INIT_LIST_HEAD(&lock->l_sl_mode); - INIT_LIST_HEAD(&lock->l_sl_policy); - - lprocfs_counter_incr(ldlm_res_to_ns(resource)->ns_stats, - LDLM_NSS_LOCKS); - INIT_LIST_HEAD(&lock->l_handle.h_link); - class_handle_hash(&lock->l_handle, &lock_handle_ops); - - lu_ref_init(&lock->l_reference); - lu_ref_add(&lock->l_reference, "hash", lock); - lock->l_callback_timeout = 0; - -#if LUSTRE_TRACKS_LOCK_EXP_REFS - INIT_LIST_HEAD(&lock->l_exp_refs_link); - lock->l_exp_refs_nr = 0; - lock->l_exp_refs_target = NULL; -#endif - - return lock; -} - -/** - * Moves LDLM lock \a lock to another resource. - * This is used on client when server returns some other lock than requested - * (typically as a result of intent operation) - */ -int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, - const struct ldlm_res_id *new_resid) -{ - struct ldlm_resource *oldres = lock->l_resource; - struct ldlm_resource *newres; - int type; - - lock_res_and_lock(lock); - if (memcmp(new_resid, &lock->l_resource->lr_name, - sizeof(lock->l_resource->lr_name)) == 0) { - /* Nothing to do */ - unlock_res_and_lock(lock); - return 0; - } - - LASSERT(new_resid->name[0] != 0); - - /* This function assumes that the lock isn't on any lists */ - LASSERT(list_empty(&lock->l_res_link)); - - type = oldres->lr_type; - unlock_res_and_lock(lock); - - newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); - if (IS_ERR(newres)) - return PTR_ERR(newres); - - lu_ref_add(&newres->lr_reference, "lock", lock); - /* - * To flip the lock from the old to the new resource, lock, oldres and - * newres have to be locked. Resource spin-locks are nested within - * lock->l_lock, and are taken in the memory address order to avoid - * dead-locks. - */ - spin_lock(&lock->l_lock); - oldres = lock->l_resource; - if (oldres < newres) { - lock_res(oldres); - lock_res_nested(newres, LRT_NEW); - } else { - lock_res(newres); - lock_res_nested(oldres, LRT_NEW); - } - LASSERT(memcmp(new_resid, &oldres->lr_name, - sizeof(oldres->lr_name)) != 0); - lock->l_resource = newres; - unlock_res(oldres); - unlock_res_and_lock(lock); - - /* ...and the flowers are still standing! */ - lu_ref_del(&oldres->lr_reference, "lock", lock); - ldlm_resource_putref(oldres); - - return 0; -} - -/** \defgroup ldlm_handles LDLM HANDLES - * Ways to get hold of locks without any addresses. - * @{ - */ - -/** - * Fills in handle for LDLM lock \a lock into supplied \a lockh - * Does not take any references. - */ -void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh) -{ - lockh->cookie = lock->l_handle.h_cookie; -} -EXPORT_SYMBOL(ldlm_lock2handle); - -/** - * Obtain a lock reference by handle. - * - * if \a flags: atomically get the lock and set the flags. 
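ldlm_lock_change_resource() above dodges an AB/BA deadlock by always taking the two resource locks in memory-address order, with a lockdep nesting annotation for the inner one. The same idiom written generically against <linux/spinlock.h> (the real code uses lock_res_nested() with its own subclass):

        static void demo_lock_pair(spinlock_t *a, spinlock_t *b)
        {
                if (a < b) {
                        spin_lock(a);
                        spin_lock_nested(b, SINGLE_DEPTH_NESTING);
                } else {
                        spin_lock(b);
                        spin_lock_nested(a, SINGLE_DEPTH_NESTING);
                }
        }

Any two CPUs moving locks between the same pair of resources then agree on acquisition order, which is the whole point of the address comparison.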
- * Return NULL if flag already set - */ -struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *handle, - __u64 flags) -{ - struct ldlm_lock *lock; - - LASSERT(handle); - - lock = class_handle2object(handle->cookie, NULL); - if (!lock) - return NULL; - - if (lock->l_export && lock->l_export->exp_failed) { - CDEBUG(D_INFO, "lock export failed: lock %p, exp %p\n", - lock, lock->l_export); - LDLM_LOCK_PUT(lock); - return NULL; - } - - /* It's unlikely but possible that someone marked the lock as - * destroyed after we did handle2object on it - */ - if (flags == 0 && !ldlm_is_destroyed(lock)) { - lu_ref_add(&lock->l_reference, "handle", current); - return lock; - } - - lock_res_and_lock(lock); - - LASSERT(lock->l_resource); - - lu_ref_add_atomic(&lock->l_reference, "handle", current); - if (unlikely(ldlm_is_destroyed(lock))) { - unlock_res_and_lock(lock); - CDEBUG(D_INFO, "lock already destroyed: lock %p\n", lock); - LDLM_LOCK_PUT(lock); - return NULL; - } - - if (flags) { - if (lock->l_flags & flags) { - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - return NULL; - } - - lock->l_flags |= flags; - } - - unlock_res_and_lock(lock); - return lock; -} -EXPORT_SYMBOL(__ldlm_handle2lock); -/** @} ldlm_handles */ - -/** - * Fill in "on the wire" representation for given LDLM lock into supplied - * lock descriptor \a desc structure. - */ -void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc) -{ - ldlm_res2desc(lock->l_resource, &desc->l_resource); - desc->l_req_mode = lock->l_req_mode; - desc->l_granted_mode = lock->l_granted_mode; - ldlm_convert_policy_to_wire(lock->l_resource->lr_type, - &lock->l_policy_data, - &desc->l_policy_data); -} - -/** - * Add a lock to list of conflicting locks to send AST to. - * - * Only add if we have not sent a blocking AST to the lock yet. - */ -static void ldlm_add_bl_work_item(struct ldlm_lock *lock, struct ldlm_lock *new, - struct list_head *work_list) -{ - if (!ldlm_is_ast_sent(lock)) { - LDLM_DEBUG(lock, "lock incompatible; sending blocking AST."); - ldlm_set_ast_sent(lock); - /* If the enqueuing client said so, tell the AST recipient to - * discard dirty data, rather than writing back. - */ - if (ldlm_is_ast_discard_data(new)) - ldlm_set_discard_data(lock); - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, work_list); - LDLM_LOCK_GET(lock); - LASSERT(!lock->l_blocking_lock); - lock->l_blocking_lock = LDLM_LOCK_GET(new); - } -} - -/** - * Add a lock to list of just granted locks to send completion AST to. - */ -static void ldlm_add_cp_work_item(struct ldlm_lock *lock, - struct list_head *work_list) -{ - if (!ldlm_is_cp_reqd(lock)) { - ldlm_set_cp_reqd(lock); - LDLM_DEBUG(lock, "lock granted; sending completion AST."); - LASSERT(list_empty(&lock->l_cp_ast)); - list_add(&lock->l_cp_ast, work_list); - LDLM_LOCK_GET(lock); - } -} - -/** - * Aggregator function to add AST work items into a list. Determines - * what sort of an AST work needs to be done and calls the proper - * adding function. - * Must be called with lr_lock held. - */ -static void ldlm_add_ast_work_item(struct ldlm_lock *lock, - struct ldlm_lock *new, - struct list_head *work_list) -{ - check_res_locked(lock->l_resource); - if (new) - ldlm_add_bl_work_item(lock, new, work_list); - else - ldlm_add_cp_work_item(lock, work_list); -} - -/** - * Add specified reader/writer reference to LDLM lock with handle \a lockh. - * r/w reference type is determined by \a mode - * Calls ldlm_lock_addref_internal. 
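ldlm_add_bl_work_item() and ldlm_add_cp_work_item() above share one shape: a flag ensures the lock is queued at most once, and the work list pins the lock with its own reference until the AST is actually sent. Schematically (hypothetical demo_* names; as check_res_locked() asserts, the list is only touched under the resource lock):

        struct demo_lock {
                bool             l_queued;
                struct list_head l_work_link;
        };

        static void demo_lock_get(struct demo_lock *l); /* hypothetical ref */

        static void demo_queue_once(struct demo_lock *l, struct list_head *work)
        {
                if (l->l_queued)        /* already queued for an AST */
                        return;
                l->l_queued = true;
                list_add(&l->l_work_link, work);
                demo_lock_get(l);       /* reference held by the work list */
        }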
- */ -void ldlm_lock_addref(const struct lustre_handle *lockh, enum ldlm_mode mode) -{ - struct ldlm_lock *lock; - - lock = ldlm_handle2lock(lockh); - LASSERTF(lock, "Non-existing lock: %llx\n", lockh->cookie); - ldlm_lock_addref_internal(lock, mode); - LDLM_LOCK_PUT(lock); -} -EXPORT_SYMBOL(ldlm_lock_addref); - -/** - * Helper function. - * Add specified reader/writer reference to LDLM lock \a lock. - * r/w reference type is determined by \a mode - * Removes lock from LRU if it is there. - * Assumes the LDLM lock is already locked. - */ -void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, - enum ldlm_mode mode) -{ - ldlm_lock_remove_from_lru(lock); - if (mode & (LCK_NL | LCK_CR | LCK_PR)) { - lock->l_readers++; - lu_ref_add_atomic(&lock->l_reference, "reader", lock); - } - if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { - lock->l_writers++; - lu_ref_add_atomic(&lock->l_reference, "writer", lock); - } - LDLM_LOCK_GET(lock); - lu_ref_add_atomic(&lock->l_reference, "user", lock); - LDLM_DEBUG(lock, "ldlm_lock_addref(%s)", ldlm_lockname[mode]); -} - -/** - * Attempts to add reader/writer reference to a lock with handle \a lockh, and - * fails if lock is already LDLM_FL_CBPENDING or destroyed. - * - * \retval 0 success, lock was addref-ed - * - * \retval -EAGAIN lock is being canceled. - */ -int ldlm_lock_addref_try(const struct lustre_handle *lockh, enum ldlm_mode mode) -{ - struct ldlm_lock *lock; - int result; - - result = -EAGAIN; - lock = ldlm_handle2lock(lockh); - if (lock) { - lock_res_and_lock(lock); - if (lock->l_readers != 0 || lock->l_writers != 0 || - !ldlm_is_cbpending(lock)) { - ldlm_lock_addref_internal_nolock(lock, mode); - result = 0; - } - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - } - return result; -} -EXPORT_SYMBOL(ldlm_lock_addref_try); - -/** - * Add specified reader/writer reference to LDLM lock \a lock. - * Locks LDLM lock and calls ldlm_lock_addref_internal_nolock to do the work. - * Only called for local locks. - */ -void ldlm_lock_addref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) -{ - lock_res_and_lock(lock); - ldlm_lock_addref_internal_nolock(lock, mode); - unlock_res_and_lock(lock); -} - -/** - * Removes reader/writer reference for LDLM lock \a lock. - * Assumes LDLM lock is already locked. - * only called in ldlm_flock_destroy and for local locks. - * Does NOT add lock to LRU if no r/w references left to accommodate flock locks - * that cannot be placed in LRU. - */ -void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, - enum ldlm_mode mode) -{ - LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); - if (mode & (LCK_NL | LCK_CR | LCK_PR)) { - LASSERT(lock->l_readers > 0); - lu_ref_del(&lock->l_reference, "reader", lock); - lock->l_readers--; - } - if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { - LASSERT(lock->l_writers > 0); - lu_ref_del(&lock->l_reference, "writer", lock); - lock->l_writers--; - } - - lu_ref_del(&lock->l_reference, "user", lock); - LDLM_LOCK_RELEASE(lock); /* matches the LDLM_LOCK_GET() in addref */ -} - -/** - * Removes reader/writer reference for LDLM lock \a lock. - * Locks LDLM lock first. - * If the lock is determined to be client lock on a client and r/w refcount - * drops to zero and the lock is not blocked, the lock is added to LRU lock - * on the namespace. - * For blocked LDLM locks if r/w count drops to zero, blocking_ast is called. 
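The addref/decref pair above classifies a mode into reader or writer references with two fixed masks; note that LCK_GROUP and LCK_COS count as writers. The mapping in one place (LCK_* values as defined by the LDLM headers; demo_* names are illustrative):

        #define DEMO_READER_MODES (LCK_NL | LCK_CR | LCK_PR)
        #define DEMO_WRITER_MODES (LCK_EX | LCK_CW | LCK_PW | \
                                   LCK_GROUP | LCK_COS)

        struct demo_refs {
                int l_readers;
                int l_writers;
        };

        static void demo_addref(struct demo_refs *l, int mode)
        {
                if (mode & DEMO_READER_MODES)
                        l->l_readers++;
                if (mode & DEMO_WRITER_MODES)
                        l->l_writers++;
        }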
- */ -void ldlm_lock_decref_internal(struct ldlm_lock *lock, enum ldlm_mode mode) -{ - struct ldlm_namespace *ns; - - lock_res_and_lock(lock); - - ns = ldlm_lock_to_ns(lock); - - ldlm_lock_decref_internal_nolock(lock, mode); - - if ((ldlm_is_local(lock) || lock->l_req_mode == LCK_GROUP) && - !lock->l_readers && !lock->l_writers) { - /* If this is a local lock on a server namespace and this was - * the last reference, cancel the lock. - * - * Group locks are special: - * They must not go in LRU, but they are not called back - * like non-group locks, instead they are manually released. - * They have an l_writers reference which they keep until - * they are manually released, so we remove them when they have - * no more reader or writer references. - LU-6368 - */ - ldlm_set_cbpending(lock); - } - - if (!lock->l_readers && !lock->l_writers && ldlm_is_cbpending(lock)) { - /* If we received a blocked AST and this was the last reference, - * run the callback. - */ - LDLM_DEBUG(lock, "final decref done on cbpending lock"); - - LDLM_LOCK_GET(lock); /* dropped by bl thread */ - ldlm_lock_remove_from_lru(lock); - unlock_res_and_lock(lock); - - if (ldlm_is_fail_loc(lock)) - OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); - - if (ldlm_is_atomic_cb(lock) || - ldlm_bl_to_thread_lock(ns, NULL, lock) != 0) - ldlm_handle_bl_callback(ns, NULL, lock); - } else if (!lock->l_readers && !lock->l_writers && - !ldlm_is_no_lru(lock) && !ldlm_is_bl_ast(lock)) { - LDLM_DEBUG(lock, "add lock into lru list"); - - /* If this is a client-side namespace and this was the last - * reference, put it on the LRU. - */ - ldlm_lock_add_to_lru(lock); - unlock_res_and_lock(lock); - - if (ldlm_is_fail_loc(lock)) - OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); - - /* Call ldlm_cancel_lru() only if EARLY_CANCEL and LRU RESIZE - * are not supported by the server, otherwise, it is done on - * enqueue. - */ - if (!exp_connect_cancelset(lock->l_conn_export) && - !ns_connect_lru_resize(ns)) - ldlm_cancel_lru(ns, 0, LCF_ASYNC, 0); - } else { - LDLM_DEBUG(lock, "do not add lock into lru list"); - unlock_res_and_lock(lock); - } -} - -/** - * Decrease reader/writer refcount for LDLM lock with handle \a lockh - */ -void ldlm_lock_decref(const struct lustre_handle *lockh, enum ldlm_mode mode) -{ - struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); - - LASSERTF(lock, "Non-existing lock: %#llx\n", lockh->cookie); - ldlm_lock_decref_internal(lock, mode); - LDLM_LOCK_PUT(lock); -} -EXPORT_SYMBOL(ldlm_lock_decref); - -/** - * Decrease reader/writer refcount for LDLM lock with handle - * \a lockh and mark it for subsequent cancellation once r/w refcount - * drops to zero instead of putting into LRU. - */ -void ldlm_lock_decref_and_cancel(const struct lustre_handle *lockh, - enum ldlm_mode mode) -{ - struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); - - LASSERT(lock); - - LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); - lock_res_and_lock(lock); - ldlm_set_cbpending(lock); - unlock_res_and_lock(lock); - ldlm_lock_decref_internal(lock, mode); - LDLM_LOCK_PUT(lock); -} -EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); - -struct sl_insert_point { - struct list_head *res_link; - struct list_head *mode_link; - struct list_head *policy_link; -}; - -/** - * Finds a position to insert the new lock into granted lock list. - * - * Used for locks eligible for skiplist optimization. 
- * - * Parameters: - * queue [input]: the granted list where search acts on; - * req [input]: the lock whose position to be located; - * prev [output]: positions within 3 lists to insert @req to - * Return Value: - * filled @prev - * NOTE: called by - * - ldlm_grant_lock_with_skiplist - */ -static void search_granted_lock(struct list_head *queue, - struct ldlm_lock *req, - struct sl_insert_point *prev) -{ - struct ldlm_lock *lock, *mode_end, *policy_end; - - list_for_each_entry(lock, queue, l_res_link) { - - mode_end = list_prev_entry(lock, l_sl_mode); - - if (lock->l_req_mode != req->l_req_mode) { - /* jump to last lock of mode group */ - lock = mode_end; - continue; - } - - /* suitable mode group is found */ - if (lock->l_resource->lr_type == LDLM_PLAIN) { - /* insert point is last lock of the mode group */ - prev->res_link = &mode_end->l_res_link; - prev->mode_link = &mode_end->l_sl_mode; - prev->policy_link = &req->l_sl_policy; - return; - } - - if (lock->l_resource->lr_type == LDLM_IBITS) { - for (;;) { - policy_end = - list_prev_entry(lock, l_sl_policy); - - if (lock->l_policy_data.l_inodebits.bits == - req->l_policy_data.l_inodebits.bits) { - /* insert point is last lock of - * the policy group - */ - prev->res_link = - &policy_end->l_res_link; - prev->mode_link = - &policy_end->l_sl_mode; - prev->policy_link = - &policy_end->l_sl_policy; - return; - } - - if (policy_end == mode_end) - /* done with mode group */ - break; - - /* go to next policy group within mode group */ - lock = list_next_entry(policy_end, l_res_link); - } /* loop over policy groups within the mode group */ - - /* insert point is last lock of the mode group, - * new policy group is started - */ - prev->res_link = &mode_end->l_res_link; - prev->mode_link = &mode_end->l_sl_mode; - prev->policy_link = &req->l_sl_policy; - return; - } - - LDLM_ERROR(lock, "is not LDLM_PLAIN or LDLM_IBITS lock"); - LBUG(); - } - - /* insert point is last lock on the queue, - * new mode group and new policy group are started - */ - prev->res_link = queue->prev; - prev->mode_link = &req->l_sl_mode; - prev->policy_link = &req->l_sl_policy; -} - -/** - * Add a lock into resource granted list after a position described by - * \a prev. - */ -static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, - struct sl_insert_point *prev) -{ - struct ldlm_resource *res = lock->l_resource; - - check_res_locked(res); - - ldlm_resource_dump(D_INFO, res); - LDLM_DEBUG(lock, "About to add lock:"); - - if (ldlm_is_destroyed(lock)) { - CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); - return; - } - - LASSERT(list_empty(&lock->l_res_link)); - LASSERT(list_empty(&lock->l_sl_mode)); - LASSERT(list_empty(&lock->l_sl_policy)); - - /* - * lock->link == prev->link means lock is first starting the group. - * Don't re-add to itself to suppress kernel warnings. - */ - if (&lock->l_res_link != prev->res_link) - list_add(&lock->l_res_link, prev->res_link); - if (&lock->l_sl_mode != prev->mode_link) - list_add(&lock->l_sl_mode, prev->mode_link); - if (&lock->l_sl_policy != prev->policy_link) - list_add(&lock->l_sl_policy, prev->policy_link); -} - -/** - * Add a lock to granted list on a resource maintaining skiplist - * correctness. - */ -static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) -{ - struct sl_insert_point prev; - - LASSERT(lock->l_req_mode == lock->l_granted_mode); - - search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); - ldlm_granted_list_add_lock(lock, &prev); -} - -/** - * Perform lock granting bookkeeping. 
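search_granted_lock() above leans on the skiplist invariant that l_sl_mode links the first and last lock of each same-mode run on the resource list, so an entire non-matching mode group is skipped by jumping straight to its last member. Condensed from the loop above:

        /* mode_end is the last lock of the run beginning at 'lock';
         * reassigning the cursor makes the group skip O(1). */
        list_for_each_entry(lock, queue, l_res_link) {
                struct ldlm_lock *mode_end =
                        list_prev_entry(lock, l_sl_mode);

                if (lock->l_req_mode != req->l_req_mode) {
                        lock = mode_end;        /* skip the whole group */
                        continue;
                }
                /* same-mode group found; refine by policy group ... */
        }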
- * - * Includes putting the lock into granted list and updating lock mode. - * NOTE: called by - * - ldlm_lock_enqueue - * - ldlm_reprocess_queue - * - ldlm_lock_convert - * - * must be called with lr_lock held - */ -void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) -{ - struct ldlm_resource *res = lock->l_resource; - - check_res_locked(res); - - lock->l_granted_mode = lock->l_req_mode; - - if (work_list && lock->l_completion_ast) - ldlm_add_ast_work_item(lock, NULL, work_list); - - if (res->lr_type == LDLM_PLAIN || res->lr_type == LDLM_IBITS) { - ldlm_grant_lock_with_skiplist(lock); - } else if (res->lr_type == LDLM_EXTENT) { - ldlm_extent_add_lock(res, lock); - } else if (res->lr_type == LDLM_FLOCK) { - /* - * We should not add locks to granted list in - * the following cases: - * - this is an UNLOCK but not a real lock; - * - this is a TEST lock; - * - this is a F_CANCELLK lock (async flock has req_mode == 0) - * - this is a deadlock (flock cannot be granted) - */ - if (!lock->l_req_mode || lock->l_req_mode == LCK_NL || - ldlm_is_test_lock(lock) || ldlm_is_flock_deadlock(lock)) - return; - ldlm_resource_add_lock(res, &res->lr_granted, lock); - } else { - LBUG(); - } - - ldlm_pool_add(&ldlm_res_to_ns(res)->ns_pool, lock); -} - -/** - * Describe the overlap between two locks. itree_overlap_cb data. - */ -struct lock_match_data { - struct ldlm_lock *lmd_old; - struct ldlm_lock *lmd_lock; - enum ldlm_mode *lmd_mode; - union ldlm_policy_data *lmd_policy; - __u64 lmd_flags; - int lmd_unref; -}; - -/** - * Check if the given @lock meets the criteria for a match. - * A reference on the lock is taken if matched. - * - * \param lock test-against this lock - * \param data parameters - */ -static int lock_matches(struct ldlm_lock *lock, struct lock_match_data *data) -{ - union ldlm_policy_data *lpol = &lock->l_policy_data; - enum ldlm_mode match; - - if (lock == data->lmd_old) - return INTERVAL_ITER_STOP; - - /* - * Check if this lock can be matched. - * Used by LU-2919(exclusive open) for open lease lock - */ - if (ldlm_is_excl(lock)) - return INTERVAL_ITER_CONT; - - /* - * llite sometimes wants to match locks that will be - * canceled when their users drop, but we allow it to match - * if it passes in CBPENDING and the lock still has users. - * this is generally only going to be used by children - * whose parents already hold a lock so forward progress - * can still happen. - */ - if (ldlm_is_cbpending(lock) && - !(data->lmd_flags & LDLM_FL_CBPENDING)) - return INTERVAL_ITER_CONT; - - if (!data->lmd_unref && ldlm_is_cbpending(lock) && - !lock->l_readers && !lock->l_writers) - return INTERVAL_ITER_CONT; - - if (!(lock->l_req_mode & *data->lmd_mode)) - return INTERVAL_ITER_CONT; - match = lock->l_req_mode; - - switch (lock->l_resource->lr_type) { - case LDLM_EXTENT: - if (lpol->l_extent.start > data->lmd_policy->l_extent.start || - lpol->l_extent.end < data->lmd_policy->l_extent.end) - return INTERVAL_ITER_CONT; - - if (unlikely(match == LCK_GROUP) && - data->lmd_policy->l_extent.gid != LDLM_GID_ANY && - lpol->l_extent.gid != data->lmd_policy->l_extent.gid) - return INTERVAL_ITER_CONT; - break; - case LDLM_IBITS: - /* - * We match if we have existing lock with same or wider set - * of bits. - */ - if ((lpol->l_inodebits.bits & - data->lmd_policy->l_inodebits.bits) != - data->lmd_policy->l_inodebits.bits) - return INTERVAL_ITER_CONT; - break; - default: - break; - } - /* - * We match if we have existing lock with same or wider set - * of bits. 
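The LDLM_IBITS arm of lock_matches() above is a plain subset test: a candidate lock matches only if it already covers every inode bit the request asks for. As a standalone predicate:

        /* (have & want) == want  <=>  every requested bit is held. */
        static bool demo_bits_cover(u64 have, u64 want)
        {
                return (have & want) == want;
        }

For example, an existing lock holding bits 0x7 satisfies a request for 0x5, while one holding only 0x4 does not.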
- */ - if (!data->lmd_unref && LDLM_HAVE_MASK(lock, GONE)) - return INTERVAL_ITER_CONT; - - if (!equi(data->lmd_flags & LDLM_FL_LOCAL_ONLY, ldlm_is_local(lock))) - return INTERVAL_ITER_CONT; - - if (data->lmd_flags & LDLM_FL_TEST_LOCK) { - LDLM_LOCK_GET(lock); - ldlm_lock_touch_in_lru(lock); - } else { - ldlm_lock_addref_internal_nolock(lock, match); - } - - *data->lmd_mode = match; - data->lmd_lock = lock; - - return INTERVAL_ITER_STOP; -} - -static enum interval_iter itree_overlap_cb(struct interval_node *in, void *args) -{ - struct ldlm_interval *node = to_ldlm_interval(in); - struct lock_match_data *data = args; - struct ldlm_lock *lock; - int rc; - - list_for_each_entry(lock, &node->li_group, l_sl_policy) { - rc = lock_matches(lock, data); - if (rc == INTERVAL_ITER_STOP) - return INTERVAL_ITER_STOP; - } - return INTERVAL_ITER_CONT; -} - -/** - * Search for a lock with given parameters in interval trees. - * - * \param res search for a lock in this resource - * \param data parameters - * - * \retval a referenced lock or NULL. - */ -static struct ldlm_lock *search_itree(struct ldlm_resource *res, - struct lock_match_data *data) -{ - struct interval_node_extent ext = { - .start = data->lmd_policy->l_extent.start, - .end = data->lmd_policy->l_extent.end - }; - int idx; - - for (idx = 0; idx < LCK_MODE_NUM; idx++) { - struct ldlm_interval_tree *tree = &res->lr_itree[idx]; - - if (!tree->lit_root) - continue; - - if (!(tree->lit_mode & *data->lmd_mode)) - continue; - - interval_search(tree->lit_root, &ext, - itree_overlap_cb, data); - } - return data->lmd_lock; -} - -/** - * Search for a lock with given properties in a queue. - * - * \param queue search for a lock in this queue - * \param data parameters - * - * \retval a referenced lock or NULL. - */ -static struct ldlm_lock *search_queue(struct list_head *queue, - struct lock_match_data *data) -{ - struct ldlm_lock *lock; - int rc; - - list_for_each_entry(lock, queue, l_res_link) { - rc = lock_matches(lock, data); - if (rc == INTERVAL_ITER_STOP) - return data->lmd_lock; - } - return NULL; -} - -void ldlm_lock_fail_match_locked(struct ldlm_lock *lock) -{ - if ((lock->l_flags & LDLM_FL_FAIL_NOTIFIED) == 0) { - lock->l_flags |= LDLM_FL_FAIL_NOTIFIED; - wake_up_all(&lock->l_waitq); - } -} - -/** - * Mark lock as "matchable" by OST. - * - * Used to prevent certain races in LOV/OSC where the lock is granted, but LVB - * is not yet valid. - * Assumes LDLM lock is already locked. - */ -void ldlm_lock_allow_match_locked(struct ldlm_lock *lock) -{ - ldlm_set_lvb_ready(lock); - wake_up_all(&lock->l_waitq); -} -EXPORT_SYMBOL(ldlm_lock_allow_match_locked); - -/** - * Mark lock as "matchable" by OST. - * Locks the lock and then \see ldlm_lock_allow_match_locked - */ -void ldlm_lock_allow_match(struct ldlm_lock *lock) -{ - lock_res_and_lock(lock); - ldlm_lock_allow_match_locked(lock); - unlock_res_and_lock(lock); -} -EXPORT_SYMBOL(ldlm_lock_allow_match); - -/** - * Attempt to find a lock with specified properties. - * - * Typically returns a reference to matched lock unless LDLM_FL_TEST_LOCK is - * set in \a flags - * - * Can be called in two ways: - * - * If 'ns' is NULL, then lockh describes an existing lock that we want to look - * for a duplicate of. - * - * Otherwise, all of the fields must be filled in, to match against. 
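Note that the LDLM_EXTENT arm earlier in lock_matches() demands full coverage, not mere overlap: the interval tree walked by search_itree() only narrows the candidate set, and the decisive test is that the candidate's range contains the requested one. As a standalone predicate (hypothetical demo_* name):

        /* A candidate extent matches only if it fully contains the
         * requested range; group locks additionally compare the gid. */
        static bool demo_extent_covers(u64 cand_start, u64 cand_end,
                                       u64 want_start, u64 want_end)
        {
                return cand_start <= want_start && cand_end >= want_end;
        }

So a PR lock on [0, ~0ULL] satisfies a read request for [0, 4095], while a lock on [4096, 8191] does not.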
- * - * If 'flags' contains LDLM_FL_LOCAL_ONLY, then only match local locks on the - * server (ie, connh is NULL) - * If 'flags' contains LDLM_FL_BLOCK_GRANTED, then only locks on the granted - * list will be considered - * If 'flags' contains LDLM_FL_CBPENDING, then locks that have been marked - * to be canceled can still be matched as long as they still have reader - * or writer referneces - * If 'flags' contains LDLM_FL_TEST_LOCK, then don't actually reference a lock, - * just tell us if we would have matched. - * - * \retval 1 if it finds an already-existing lock that is compatible; in this - * case, lockh is filled in with a addref()ed lock - * - * We also check security context, and if that fails we simply return 0 (to - * keep caller code unchanged), the context failure will be discovered by - * caller sometime later. - */ -enum ldlm_mode ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh, int unref) -{ - struct lock_match_data data = { - .lmd_old = NULL, - .lmd_lock = NULL, - .lmd_mode = &mode, - .lmd_policy = policy, - .lmd_flags = flags, - .lmd_unref = unref, - }; - struct ldlm_resource *res; - struct ldlm_lock *lock; - int rc = 0; - - if (!ns) { - data.lmd_old = ldlm_handle2lock(lockh); - LASSERT(data.lmd_old); - - ns = ldlm_lock_to_ns(data.lmd_old); - res_id = &data.lmd_old->l_resource->lr_name; - type = data.lmd_old->l_resource->lr_type; - *data.lmd_mode = data.lmd_old->l_req_mode; - } - - res = ldlm_resource_get(ns, NULL, res_id, type, 0); - if (IS_ERR(res)) { - LASSERT(!data.lmd_old); - return 0; - } - - LDLM_RESOURCE_ADDREF(res); - lock_res(res); - - if (res->lr_type == LDLM_EXTENT) - lock = search_itree(res, &data); - else - lock = search_queue(&res->lr_granted, &data); - if (lock) { - rc = 1; - goto out; - } - if (flags & LDLM_FL_BLOCK_GRANTED) { - rc = 0; - goto out; - } - lock = search_queue(&res->lr_waiting, &data); - if (lock) { - rc = 1; - goto out; - } -out: - unlock_res(res); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); - - if (lock) { - ldlm_lock2handle(lock, lockh); - if ((flags & LDLM_FL_LVB_READY) && !ldlm_is_lvb_ready(lock)) { - __u64 wait_flags = LDLM_FL_LVB_READY | - LDLM_FL_DESTROYED | LDLM_FL_FAIL_NOTIFIED; - - if (lock->l_completion_ast) { - int err = lock->l_completion_ast(lock, - LDLM_FL_WAIT_NOREPROC, - NULL); - if (err) { - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else - ldlm_lock_decref_internal(lock, - mode); - rc = 0; - goto out2; - } - } - - /* XXX FIXME see comment on CAN_MATCH in lustre_dlm.h */ - wait_event_idle_timeout(lock->l_waitq, - lock->l_flags & wait_flags, - obd_timeout * HZ); - if (!ldlm_is_lvb_ready(lock)) { - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - else - ldlm_lock_decref_internal(lock, mode); - rc = 0; - } - } - } - out2: - if (rc) { - LDLM_DEBUG(lock, "matched (%llu %llu)", - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] : policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? 
- res_id->name[3] : policy->l_extent.end); - - /* check user's security context */ - if (lock->l_conn_export && - sptlrpc_import_check_ctx( - class_exp2cliimp(lock->l_conn_export))) { - if (!(flags & LDLM_FL_TEST_LOCK)) - ldlm_lock_decref_internal(lock, mode); - rc = 0; - } - - if (flags & LDLM_FL_TEST_LOCK) - LDLM_LOCK_RELEASE(lock); - - } else if (!(flags & LDLM_FL_TEST_LOCK)) {/*less verbose for test-only*/ - LDLM_DEBUG_NOLOCK("not matched ns %p type %u mode %u res %llu/%llu (%llu %llu)", - ns, type, mode, res_id->name[0], - res_id->name[1], - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[2] : policy->l_extent.start, - (type == LDLM_PLAIN || type == LDLM_IBITS) ? - res_id->name[3] : policy->l_extent.end); - } - if (data.lmd_old) - LDLM_LOCK_PUT(data.lmd_old); - - return rc ? mode : 0; -} -EXPORT_SYMBOL(ldlm_lock_match); - -enum ldlm_mode ldlm_revalidate_lock_handle(const struct lustre_handle *lockh, - __u64 *bits) -{ - struct ldlm_lock *lock; - enum ldlm_mode mode = 0; - - lock = ldlm_handle2lock(lockh); - if (lock) { - lock_res_and_lock(lock); - if (LDLM_HAVE_MASK(lock, GONE)) - goto out; - - if (ldlm_is_cbpending(lock) && - lock->l_readers == 0 && lock->l_writers == 0) - goto out; - - if (bits) - *bits = lock->l_policy_data.l_inodebits.bits; - mode = lock->l_granted_mode; - ldlm_lock_addref_internal_nolock(lock, mode); - } - -out: - if (lock) { - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - } - return mode; -} -EXPORT_SYMBOL(ldlm_revalidate_lock_handle); - -/** The caller must guarantee that the buffer is large enough. */ -int ldlm_fill_lvb(struct ldlm_lock *lock, struct req_capsule *pill, - enum req_location loc, void *data, int size) -{ - void *lvb; - - LASSERT(data); - LASSERT(size >= 0); - - switch (lock->l_lvb_type) { - case LVB_T_OST: - if (size == sizeof(struct ost_lvb)) { - if (loc == RCL_CLIENT) - lvb = req_capsule_client_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_ost_lvb); - else - lvb = req_capsule_server_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_ost_lvb); - if (unlikely(!lvb)) { - LDLM_ERROR(lock, "no LVB"); - return -EPROTO; - } - - memcpy(data, lvb, size); - } else if (size == sizeof(struct ost_lvb_v1)) { - struct ost_lvb *olvb = data; - - if (loc == RCL_CLIENT) - lvb = req_capsule_client_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_ost_lvb_v1); - else - lvb = req_capsule_server_sized_swab_get(pill, - &RMF_DLM_LVB, size, - lustre_swab_ost_lvb_v1); - if (unlikely(!lvb)) { - LDLM_ERROR(lock, "no LVB"); - return -EPROTO; - } - - memcpy(data, lvb, size); - olvb->lvb_mtime_ns = 0; - olvb->lvb_atime_ns = 0; - olvb->lvb_ctime_ns = 0; - } else { - LDLM_ERROR(lock, "Replied unexpected ost LVB size %d", - size); - return -EINVAL; - } - break; - case LVB_T_LQUOTA: - if (size == sizeof(struct lquota_lvb)) { - if (loc == RCL_CLIENT) - lvb = req_capsule_client_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_lquota_lvb); - else - lvb = req_capsule_server_swab_get(pill, - &RMF_DLM_LVB, - lustre_swab_lquota_lvb); - if (unlikely(!lvb)) { - LDLM_ERROR(lock, "no LVB"); - return -EPROTO; - } - - memcpy(data, lvb, size); - } else { - LDLM_ERROR(lock, - "Replied unexpected lquota LVB size %d", - size); - return -EINVAL; - } - break; - case LVB_T_LAYOUT: - if (size == 0) - break; - - if (loc == RCL_CLIENT) - lvb = req_capsule_client_get(pill, &RMF_DLM_LVB); - else - lvb = req_capsule_server_get(pill, &RMF_DLM_LVB); - if (unlikely(!lvb)) { - LDLM_ERROR(lock, "no LVB"); - return -EPROTO; - } - - memcpy(data, lvb, size); - break; - default: - LDLM_ERROR(lock, "Unknown LVB 
type: %d", lock->l_lvb_type); - dump_stack(); - return -EINVAL; - } - - return 0; -} - -/** - * Create and fill in new LDLM lock with specified properties. - * Returns a referenced lock - */ -struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, - const struct ldlm_res_id *res_id, - enum ldlm_type type, - enum ldlm_mode mode, - const struct ldlm_callback_suite *cbs, - void *data, __u32 lvb_len, - enum lvb_type lvb_type) -{ - struct ldlm_lock *lock; - struct ldlm_resource *res; - int rc; - - res = ldlm_resource_get(ns, NULL, res_id, type, 1); - if (IS_ERR(res)) - return ERR_CAST(res); - - lock = ldlm_lock_new(res); - if (!lock) { - ldlm_resource_putref(res); - return ERR_PTR(-ENOMEM); - } - - lock->l_req_mode = mode; - lock->l_ast_data = data; - lock->l_pid = current->pid; - if (cbs) { - lock->l_blocking_ast = cbs->lcs_blocking; - lock->l_completion_ast = cbs->lcs_completion; - lock->l_glimpse_ast = cbs->lcs_glimpse; - } - - lock->l_tree_node = NULL; - /* if this is the extent lock, allocate the interval tree node */ - if (type == LDLM_EXTENT) { - if (!ldlm_interval_alloc(lock)) { - rc = -ENOMEM; - goto out; - } - } - - if (lvb_len) { - lock->l_lvb_len = lvb_len; - lock->l_lvb_data = kzalloc(lvb_len, GFP_NOFS); - if (!lock->l_lvb_data) { - rc = -ENOMEM; - goto out; - } - } - - lock->l_lvb_type = lvb_type; - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_NEW_LOCK)) { - rc = -ENOENT; - goto out; - } - - return lock; - -out: - ldlm_lock_destroy(lock); - LDLM_LOCK_RELEASE(lock); - return ERR_PTR(rc); -} - - - -/** - * Enqueue (request) a lock. - * On the client this is called from ldlm_cli_enqueue_fini - * after we already got an initial reply from the server with some status. - * - * Does not block. As a result of enqueue the lock would be put - * into granted or waiting list. - */ -enum ldlm_error ldlm_lock_enqueue(struct ldlm_namespace *ns, - struct ldlm_lock **lockp, - void *cookie, __u64 *flags) -{ - struct ldlm_lock *lock = *lockp; - struct ldlm_resource *res = lock->l_resource; - - lock_res_and_lock(lock); - if (lock->l_req_mode == lock->l_granted_mode) { - /* The server returned a blocked lock, but it was granted - * before we got a chance to actually enqueue it. We don't - * need to do anything else. - */ - *flags &= ~LDLM_FL_BLOCKED_MASK; - goto out; - } - - ldlm_resource_unlink_lock(lock); - - /* Cannot happen unless on the server */ - if (res->lr_type == LDLM_EXTENT && !lock->l_tree_node) - LBUG(); - - /* Some flags from the enqueue want to make it into the AST, via the - * lock's l_flags. - */ - if (*flags & LDLM_FL_AST_DISCARD_DATA) - ldlm_set_ast_discard_data(lock); - if (*flags & LDLM_FL_TEST_LOCK) - ldlm_set_test_lock(lock); - - /* - * This distinction between local lock trees is very important; a client - * namespace only has information about locks taken by that client, and - * thus doesn't have enough information to decide for itself if it can - * be granted (below). In this case, we do exactly what the server - * tells us to do, as dictated by the 'flags'. 
- */ - if (*flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED)) - ldlm_resource_add_lock(res, &res->lr_waiting, lock); - else - ldlm_grant_lock(lock, NULL); - -out: - unlock_res_and_lock(lock); - return ELDLM_OK; -} - -/** - * Process a call to blocking AST callback for a lock in ast_work list - */ -static int -ldlm_work_bl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_lock_desc d; - int rc; - struct ldlm_lock *lock; - - if (list_empty(arg->list)) - return -ENOENT; - - lock = list_first_entry(arg->list, struct ldlm_lock, l_bl_ast); - - /* nobody should touch l_bl_ast */ - lock_res_and_lock(lock); - list_del_init(&lock->l_bl_ast); - - LASSERT(ldlm_is_ast_sent(lock)); - LASSERT(lock->l_bl_ast_run == 0); - LASSERT(lock->l_blocking_lock); - lock->l_bl_ast_run++; - unlock_res_and_lock(lock); - - ldlm_lock2desc(lock->l_blocking_lock, &d); - - rc = lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; - LDLM_LOCK_RELEASE(lock); - - return rc; -} - -/** - * Process a call to completion AST callback for a lock in ast_work list - */ -static int -ldlm_work_cp_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - int rc = 0; - struct ldlm_lock *lock; - ldlm_completion_callback completion_callback; - - if (list_empty(arg->list)) - return -ENOENT; - - lock = list_first_entry(arg->list, struct ldlm_lock, l_cp_ast); - - /* It's possible to receive a completion AST before we've set - * the l_completion_ast pointer: either because the AST arrived - * before the reply, or simply because there's a small race - * window between receiving the reply and finishing the local - * enqueue. (bug 842) - * - * This can't happen with the blocking_ast, however, because we - * will never call the local blocking_ast until we drop our - * reader/writer reference, which we won't do until we get the - * reply and finish enqueueing. 
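Each ldlm_work_*_ast_lock() helper in this region follows the same consumer shape: detach the first work item while the lock guarding its linkage is held, invoke the potentially blocking callback with no lock held, then drop the reference the list was pinning. Schematically (hypothetical demo_* names and helpers):

        struct demo_item {
                spinlock_t       i_lock;
                struct list_head i_work_link;
        };

        static void demo_callback(struct demo_item *it); /* hypothetical */
        static void demo_item_put(struct demo_item *it); /* hypothetical */

        static int demo_run_one(struct list_head *work)
        {
                struct demo_item *it;

                if (list_empty(work))
                        return -ENOENT;

                it = list_first_entry(work, struct demo_item, i_work_link);
                spin_lock(&it->i_lock);
                list_del_init(&it->i_work_link);
                spin_unlock(&it->i_lock);

                demo_callback(it);      /* may block; no locks held */
                demo_item_put(it);      /* drop the work-list reference */
                return 0;
        }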
- */ - - /* nobody should touch l_cp_ast */ - lock_res_and_lock(lock); - list_del_init(&lock->l_cp_ast); - LASSERT(ldlm_is_cp_reqd(lock)); - /* save l_completion_ast since it can be changed by - * mds_intent_policy(), see bug 14225 - */ - completion_callback = lock->l_completion_ast; - ldlm_clear_cp_reqd(lock); - unlock_res_and_lock(lock); - - if (completion_callback) - rc = completion_callback(lock, 0, (void *)arg); - LDLM_LOCK_RELEASE(lock); - - return rc; -} - -/** - * Process a call to revocation AST callback for a lock in ast_work list - */ -static int -ldlm_work_revoke_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_lock_desc desc; - int rc; - struct ldlm_lock *lock; - - if (list_empty(arg->list)) - return -ENOENT; - - lock = list_first_entry(arg->list, struct ldlm_lock, l_rk_ast); - list_del_init(&lock->l_rk_ast); - - /* the desc just pretend to exclusive */ - ldlm_lock2desc(lock, &desc); - desc.l_req_mode = LCK_EX; - desc.l_granted_mode = 0; - - rc = lock->l_blocking_ast(lock, &desc, (void *)arg, LDLM_CB_BLOCKING); - LDLM_LOCK_RELEASE(lock); - - return rc; -} - -/** - * Process a call to glimpse AST callback for a lock in ast_work list - */ -static int ldlm_work_gl_ast_lock(struct ptlrpc_request_set *rqset, void *opaq) -{ - struct ldlm_cb_set_arg *arg = opaq; - struct ldlm_glimpse_work *gl_work; - struct ldlm_lock *lock; - int rc = 0; - - if (list_empty(arg->list)) - return -ENOENT; - - gl_work = list_first_entry(arg->list, struct ldlm_glimpse_work, - gl_list); - list_del_init(&gl_work->gl_list); - - lock = gl_work->gl_lock; - - /* transfer the glimpse descriptor to ldlm_cb_set_arg */ - arg->gl_desc = gl_work->gl_desc; - - /* invoke the actual glimpse callback */ - if (lock->l_glimpse_ast(lock, (void *)arg) == 0) - rc = 1; - - LDLM_LOCK_RELEASE(lock); - - if ((gl_work->gl_flags & LDLM_GL_WORK_NOFREE) == 0) - kfree(gl_work); - - return rc; -} - -/** - * Process list of locks in need of ASTs being sent. - * - * Used on server to send multiple ASTs together instead of sending one by - * one. - */ -int ldlm_run_ast_work(struct ldlm_namespace *ns, struct list_head *rpc_list, - enum ldlm_desc_ast_t ast_type) -{ - struct ldlm_cb_set_arg *arg; - set_producer_func work_ast_lock; - int rc; - - if (list_empty(rpc_list)) - return 0; - - arg = kzalloc(sizeof(*arg), GFP_NOFS); - if (!arg) - return -ENOMEM; - - atomic_set(&arg->restart, 0); - arg->list = rpc_list; - - switch (ast_type) { - case LDLM_WORK_BL_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_bl_ast_lock; - break; - case LDLM_WORK_CP_AST: - arg->type = LDLM_CP_CALLBACK; - work_ast_lock = ldlm_work_cp_ast_lock; - break; - case LDLM_WORK_REVOKE_AST: - arg->type = LDLM_BL_CALLBACK; - work_ast_lock = ldlm_work_revoke_ast_lock; - break; - case LDLM_WORK_GL_AST: - arg->type = LDLM_GL_CALLBACK; - work_ast_lock = ldlm_work_gl_ast_lock; - break; - default: - LBUG(); - } - - /* We create a ptlrpc request set with flow control extension. - * This request set will use the work_ast_lock function to produce new - * requests and will send a new request each time one completes in order - * to keep the number of requests in flight to ns_max_parallel_ast - */ - arg->set = ptlrpc_prep_fcset(ns->ns_max_parallel_ast ? : UINT_MAX, - work_ast_lock, arg); - if (!arg->set) { - rc = -ENOMEM; - goto out; - } - - ptlrpc_set_wait(arg->set); - ptlrpc_set_destroy(arg->set); - - rc = atomic_read(&arg->restart) ? 
-ERESTART : 0; - goto out; -out: - kfree(arg); - return rc; -} - -static bool is_bl_done(struct ldlm_lock *lock) -{ - bool bl_done = true; - - if (!ldlm_is_bl_done(lock)) { - lock_res_and_lock(lock); - bl_done = ldlm_is_bl_done(lock); - unlock_res_and_lock(lock); - } - - return bl_done; -} - -/** - * Helper function to call blocking AST for LDLM lock \a lock in a - * "cancelling" mode. - */ -void ldlm_cancel_callback(struct ldlm_lock *lock) -{ - check_res_locked(lock->l_resource); - if (!ldlm_is_cancel(lock)) { - ldlm_set_cancel(lock); - if (lock->l_blocking_ast) { - unlock_res_and_lock(lock); - lock->l_blocking_ast(lock, NULL, lock->l_ast_data, - LDLM_CB_CANCELING); - lock_res_and_lock(lock); - } else { - LDLM_DEBUG(lock, "no blocking ast"); - } - /* only canceller can set bl_done bit */ - ldlm_set_bl_done(lock); - wake_up_all(&lock->l_waitq); - } else if (!ldlm_is_bl_done(lock)) { - /* - * The lock is guaranteed to have been canceled once - * returning from this function. - */ - unlock_res_and_lock(lock); - wait_event_idle(lock->l_waitq, is_bl_done(lock)); - lock_res_and_lock(lock); - } -} - -/** - * Remove skiplist-enabled LDLM lock \a req from granted list - */ -void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) -{ - if (req->l_resource->lr_type != LDLM_PLAIN && - req->l_resource->lr_type != LDLM_IBITS) - return; - - list_del_init(&req->l_sl_policy); - list_del_init(&req->l_sl_mode); -} - -/** - * Attempts to cancel LDLM lock \a lock that has no reader/writer references. - */ -void ldlm_lock_cancel(struct ldlm_lock *lock) -{ - struct ldlm_resource *res; - struct ldlm_namespace *ns; - - lock_res_and_lock(lock); - - res = lock->l_resource; - ns = ldlm_res_to_ns(res); - - /* Please do not, no matter how tempting, remove this LBUG without - * talking to me first. -phik - */ - if (lock->l_readers || lock->l_writers) { - LDLM_ERROR(lock, "lock still has references"); - LBUG(); - } - - /* Releases cancel callback. */ - ldlm_cancel_callback(lock); - - ldlm_resource_unlink_lock(lock); - ldlm_lock_destroy_nolock(lock); - - if (lock->l_granted_mode == lock->l_req_mode) - ldlm_pool_del(&ns->ns_pool, lock); - - /* Make sure we will not be called again for same lock what is possible - * if not to zero out lock->l_granted_mode - */ - lock->l_granted_mode = LCK_MINMODE; - unlock_res_and_lock(lock); -} -EXPORT_SYMBOL(ldlm_lock_cancel); - -/** - * Set opaque data into the lock that only makes sense to upper layer. - */ -int ldlm_lock_set_data(const struct lustre_handle *lockh, void *data) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - int rc = -EINVAL; - - if (lock) { - if (!lock->l_ast_data) - lock->l_ast_data = data; - if (lock->l_ast_data == data) - rc = 0; - LDLM_LOCK_PUT(lock); - } - return rc; -} -EXPORT_SYMBOL(ldlm_lock_set_data); - -struct export_cl_data { - struct obd_export *ecl_exp; - int ecl_loop; -}; - -/** - * Print lock with lock handle \a lockh description into debug log. - * - * Used when printing all locks on a resource for debug purposes. - */ -void ldlm_lock_dump_handle(int level, const struct lustre_handle *lockh) -{ - struct ldlm_lock *lock; - - if (!((libcfs_debug | D_ERROR) & level)) - return; - - lock = ldlm_handle2lock(lockh); - if (!lock) - return; - - LDLM_DEBUG_LIMIT(level, lock, "###"); - - LDLM_LOCK_PUT(lock); -} -EXPORT_SYMBOL(ldlm_lock_dump_handle); - -/** - * Print lock information with custom message into debug log. - * Helper function. - */ -void _ldlm_lock_debug(struct ldlm_lock *lock, - struct libcfs_debug_msg_data *msgdata, - const char *fmt, ...) 
-{ - va_list args; - struct obd_export *exp = lock->l_export; - struct ldlm_resource *resource = lock->l_resource; - char *nid = "local"; - - va_start(args, fmt); - - if (exp && exp->exp_connection) { - nid = libcfs_nid2str(exp->exp_connection->c_peer.nid); - } else if (exp && exp->exp_obd) { - struct obd_import *imp = exp->exp_obd->u.cli.cl_import; - - nid = libcfs_nid2str(imp->imp_connection->c_peer.nid); - } - - if (!resource) { - libcfs_debug_vmsg2(msgdata, fmt, args, - " ns: \?\? lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: \?\? rrc=\?\? type: \?\?\? flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", - lock, - lock->l_handle.h_cookie, - atomic_read(&lock->l_refc), - lock->l_readers, lock->l_writers, - ldlm_lockname[lock->l_granted_mode], - ldlm_lockname[lock->l_req_mode], - lock->l_flags, nid, - lock->l_remote_handle.cookie, - exp ? atomic_read(&exp->exp_refcount) : -99, - lock->l_pid, lock->l_callback_timeout, - lock->l_lvb_type); - va_end(args); - return; - } - - switch (resource->lr_type) { - case LDLM_EXTENT: - libcfs_debug_vmsg2(msgdata, fmt, args, - " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s [%llu->%llu] (req %llu->%llu) flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", - ldlm_lock_to_ns_name(lock), lock, - lock->l_handle.h_cookie, - atomic_read(&lock->l_refc), - lock->l_readers, lock->l_writers, - ldlm_lockname[lock->l_granted_mode], - ldlm_lockname[lock->l_req_mode], - PLDLMRES(resource), - atomic_read(&resource->lr_refcount), - ldlm_typename[resource->lr_type], - lock->l_policy_data.l_extent.start, - lock->l_policy_data.l_extent.end, - lock->l_req_extent.start, - lock->l_req_extent.end, - lock->l_flags, nid, - lock->l_remote_handle.cookie, - exp ? atomic_read(&exp->exp_refcount) : -99, - lock->l_pid, lock->l_callback_timeout, - lock->l_lvb_type); - break; - - case LDLM_FLOCK: - libcfs_debug_vmsg2(msgdata, fmt, args, - " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s pid: %d [%llu->%llu] flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu\n", - ldlm_lock_to_ns_name(lock), lock, - lock->l_handle.h_cookie, - atomic_read(&lock->l_refc), - lock->l_readers, lock->l_writers, - ldlm_lockname[lock->l_granted_mode], - ldlm_lockname[lock->l_req_mode], - PLDLMRES(resource), - atomic_read(&resource->lr_refcount), - ldlm_typename[resource->lr_type], - lock->l_policy_data.l_flock.pid, - lock->l_policy_data.l_flock.start, - lock->l_policy_data.l_flock.end, - lock->l_flags, nid, - lock->l_remote_handle.cookie, - exp ? atomic_read(&exp->exp_refcount) : -99, - lock->l_pid, lock->l_callback_timeout); - break; - - case LDLM_IBITS: - libcfs_debug_vmsg2(msgdata, fmt, args, - " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " bits %#llx rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", - ldlm_lock_to_ns_name(lock), - lock, lock->l_handle.h_cookie, - atomic_read(&lock->l_refc), - lock->l_readers, lock->l_writers, - ldlm_lockname[lock->l_granted_mode], - ldlm_lockname[lock->l_req_mode], - PLDLMRES(resource), - lock->l_policy_data.l_inodebits.bits, - atomic_read(&resource->lr_refcount), - ldlm_typename[resource->lr_type], - lock->l_flags, nid, - lock->l_remote_handle.cookie, - exp ? 
atomic_read(&exp->exp_refcount) : -99, - lock->l_pid, lock->l_callback_timeout, - lock->l_lvb_type); - break; - - default: - libcfs_debug_vmsg2(msgdata, fmt, args, - " ns: %s lock: %p/%#llx lrc: %d/%d,%d mode: %s/%s res: " DLDLMRES " rrc: %d type: %s flags: %#llx nid: %s remote: %#llx expref: %d pid: %u timeout: %lu lvb_type: %d\n", - ldlm_lock_to_ns_name(lock), - lock, lock->l_handle.h_cookie, - atomic_read(&lock->l_refc), - lock->l_readers, lock->l_writers, - ldlm_lockname[lock->l_granted_mode], - ldlm_lockname[lock->l_req_mode], - PLDLMRES(resource), - atomic_read(&resource->lr_refcount), - ldlm_typename[resource->lr_type], - lock->l_flags, nid, - lock->l_remote_handle.cookie, - exp ? atomic_read(&exp->exp_refcount) : -99, - lock->l_pid, lock->l_callback_timeout, - lock->l_lvb_type); - break; - } - va_end(args); -} -EXPORT_SYMBOL(_ldlm_lock_debug); diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c b/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c deleted file mode 100644 index 5963e90d093833512bcb3390205fc811f69f2789..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_lockd.c +++ /dev/null @@ -1,1163 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_lockd.c - * - * Author: Peter Braam - * Author: Phil Schwan - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include -#include -#include "ldlm_internal.h" - -static int ldlm_num_threads; -module_param(ldlm_num_threads, int, 0444); -MODULE_PARM_DESC(ldlm_num_threads, "number of DLM service threads to start"); - -static char *ldlm_cpts; -module_param(ldlm_cpts, charp, 0444); -MODULE_PARM_DESC(ldlm_cpts, "CPU partitions ldlm threads should run on"); - -static struct mutex ldlm_ref_mutex; -static int ldlm_refcount; - -static struct kobject *ldlm_kobj; -struct kset *ldlm_ns_kset; -static struct kset *ldlm_svc_kset; - -struct ldlm_cb_async_args { - struct ldlm_cb_set_arg *ca_set_arg; - struct ldlm_lock *ca_lock; -}; - -/* LDLM state */ - -static struct ldlm_state *ldlm_state; - -#define ELT_STOPPED 0 -#define ELT_READY 1 -#define ELT_TERMINATE 2 - -struct ldlm_bl_pool { - spinlock_t blp_lock; - - /* - * blp_prio_list is used for callbacks that should be handled - * as a priority. It is used for LDLM_FL_DISCARD_DATA requests. 
- * see bug 13843 - */ - struct list_head blp_prio_list; - - /* - * blp_list is used for all other callbacks which are likely - * to take longer to process. - */ - struct list_head blp_list; - - wait_queue_head_t blp_waitq; - struct completion blp_comp; - atomic_t blp_num_threads; - atomic_t blp_busy_threads; - int blp_min_threads; - int blp_max_threads; -}; - -struct ldlm_bl_work_item { - struct list_head blwi_entry; - struct ldlm_namespace *blwi_ns; - struct ldlm_lock_desc blwi_ld; - struct ldlm_lock *blwi_lock; - struct list_head blwi_head; - int blwi_count; - struct completion blwi_comp; - enum ldlm_cancel_flags blwi_flags; - int blwi_mem_pressure; -}; - -/** - * Callback handler for receiving incoming blocking ASTs. - * - * This can only happen on client side. - */ -void ldlm_handle_bl_callback(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, struct ldlm_lock *lock) -{ - int do_ast; - - LDLM_DEBUG(lock, "client blocking AST callback handler"); - - lock_res_and_lock(lock); - ldlm_set_cbpending(lock); - - if (ldlm_is_cancel_on_block(lock)) - ldlm_set_cancel(lock); - - do_ast = !lock->l_readers && !lock->l_writers; - unlock_res_and_lock(lock); - - if (do_ast) { - CDEBUG(D_DLMTRACE, - "Lock %p already unused, calling callback (%p)\n", lock, - lock->l_blocking_ast); - if (lock->l_blocking_ast) - lock->l_blocking_ast(lock, ld, lock->l_ast_data, - LDLM_CB_BLOCKING); - } else { - CDEBUG(D_DLMTRACE, - "Lock %p is referenced, will be cancelled later\n", - lock); - } - - LDLM_DEBUG(lock, "client blocking callback handler END"); - LDLM_LOCK_RELEASE(lock); -} - -/** - * Callback handler for receiving incoming completion ASTs. - * - * This only can happen on client side. - */ -static void ldlm_handle_cp_callback(struct ptlrpc_request *req, - struct ldlm_namespace *ns, - struct ldlm_request *dlm_req, - struct ldlm_lock *lock) -{ - int lvb_len; - LIST_HEAD(ast_list); - int rc = 0; - - LDLM_DEBUG(lock, "client completion callback handler START"); - - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE)) { - int to = HZ; - - while (to > 0) { - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(to); - if (lock->l_granted_mode == lock->l_req_mode || - ldlm_is_destroyed(lock)) - break; - } - } - - lvb_len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, RCL_CLIENT); - if (lvb_len < 0) { - LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", lvb_len); - rc = lvb_len; - goto out; - } else if (lvb_len > 0) { - if (lock->l_lvb_len > 0) { - /* for extent lock, lvb contains ost_lvb{}. 
*/ - LASSERT(lock->l_lvb_data); - - if (unlikely(lock->l_lvb_len < lvb_len)) { - LDLM_ERROR(lock, - "Replied LVB is larger than expectation, expected = %d, replied = %d", - lock->l_lvb_len, lvb_len); - rc = -EINVAL; - goto out; - } - } else if (ldlm_has_layout(lock)) { /* for layout lock, lvb has - * variable length - */ - void *lvb_data; - - lvb_data = kzalloc(lvb_len, GFP_NOFS); - if (!lvb_data) { - LDLM_ERROR(lock, "No memory: %d.\n", lvb_len); - rc = -ENOMEM; - goto out; - } - - lock_res_and_lock(lock); - LASSERT(!lock->l_lvb_data); - lock->l_lvb_type = LVB_T_LAYOUT; - lock->l_lvb_data = lvb_data; - lock->l_lvb_len = lvb_len; - unlock_res_and_lock(lock); - } - } - - lock_res_and_lock(lock); - if (ldlm_is_destroyed(lock) || - lock->l_granted_mode == lock->l_req_mode) { - /* bug 11300: the lock has already been granted */ - unlock_res_and_lock(lock); - LDLM_DEBUG(lock, "Double grant race happened"); - rc = 0; - goto out; - } - - /* If we receive the completion AST before the actual enqueue returned, - * then we might need to switch lock modes, resources, or extents. - */ - if (dlm_req->lock_desc.l_granted_mode != lock->l_req_mode) { - lock->l_req_mode = dlm_req->lock_desc.l_granted_mode; - LDLM_DEBUG(lock, "completion AST, new lock mode"); - } - - if (lock->l_resource->lr_type != LDLM_PLAIN) { - ldlm_convert_policy_to_local(req->rq_export, - dlm_req->lock_desc.l_resource.lr_type, - &dlm_req->lock_desc.l_policy_data, - &lock->l_policy_data); - LDLM_DEBUG(lock, "completion AST, new policy data"); - } - - ldlm_resource_unlink_lock(lock); - if (memcmp(&dlm_req->lock_desc.l_resource.lr_name, - &lock->l_resource->lr_name, - sizeof(lock->l_resource->lr_name)) != 0) { - unlock_res_and_lock(lock); - rc = ldlm_lock_change_resource(ns, lock, - &dlm_req->lock_desc.l_resource.lr_name); - if (rc < 0) { - LDLM_ERROR(lock, "Failed to allocate resource"); - goto out; - } - LDLM_DEBUG(lock, "completion AST, new resource"); - CERROR("change resource!\n"); - lock_res_and_lock(lock); - } - - if (dlm_req->lock_flags & LDLM_FL_AST_SENT) { - /* BL_AST locks are not needed in LRU. - * Let ldlm_cancel_lru() be fast. - */ - ldlm_lock_remove_from_lru(lock); - lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; - LDLM_DEBUG(lock, "completion AST includes blocking AST"); - } - - if (lock->l_lvb_len > 0) { - rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_CLIENT, - lock->l_lvb_data, lvb_len); - if (rc < 0) { - unlock_res_and_lock(lock); - goto out; - } - } - - ldlm_grant_lock(lock, &ast_list); - unlock_res_and_lock(lock); - - LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); - - /* Let Enqueue to call osc_lock_upcall() and initialize l_ast_data */ - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 2); - - ldlm_run_ast_work(ns, &ast_list, LDLM_WORK_CP_AST); - - LDLM_DEBUG_NOLOCK("client completion callback handler END (lock %p)", - lock); - goto out; - -out: - if (rc < 0) { - lock_res_and_lock(lock); - ldlm_set_failed(lock); - unlock_res_and_lock(lock); - wake_up(&lock->l_waitq); - } - LDLM_LOCK_RELEASE(lock); -} - -/** - * Callback handler for receiving incoming glimpse ASTs. - * - * This only can happen on client side. After handling the glimpse AST - * we also consider dropping the lock here if it is unused locally for a - * long time. 
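The completion-AST code above folds three LVB cases into one ladder: a reply carrying no LVB is ignored; for fixed-size LVBs (extent locks) the replied length may not exceed the client's preallocated buffer; and layout locks, whose LVB is variable-length, get a buffer allocated to match the reply. A minimal userspace sketch of the same acceptance rules follows; struct lvb_state, lvb_accept_reply and the constants are illustrative names, not the kernel interfaces.

#include <errno.h>
#include <stdlib.h>

struct lvb_state {
	void	*data;		/* preallocated buffer, or NULL */
	int	 len;		/* its size in bytes */
	int	 is_layout;	/* lock type allows a variable-size LVB */
};

/* Return 0 if the replied LVB can be accepted, negative errno if not. */
static int lvb_accept_reply(struct lvb_state *s, int replied_len)
{
	if (replied_len < 0)
		return replied_len;	/* malformed reply */
	if (replied_len == 0)
		return 0;		/* nothing to copy */
	if (s->len > 0)			/* fixed-size (extent) case */
		return replied_len <= s->len ? 0 : -EINVAL;
	if (s->is_layout) {		/* variable-size (layout) case */
		s->data = calloc(1, replied_len);
		if (!s->data)
			return -ENOMEM;
		s->len = replied_len;
	}
	return 0;			/* no LVB expected: ignore it */
}

int main(void)
{
	struct lvb_state s = { NULL, 0, 1 };

	return lvb_accept_reply(&s, 64);	/* layout case: allocates */
}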
- */ -static void ldlm_handle_gl_callback(struct ptlrpc_request *req, - struct ldlm_namespace *ns, - struct ldlm_request *dlm_req, - struct ldlm_lock *lock) -{ - int rc = -ENOSYS; - - LDLM_DEBUG(lock, "client glimpse AST callback handler"); - - if (lock->l_glimpse_ast) - rc = lock->l_glimpse_ast(lock, req); - - if (req->rq_repmsg) { - ptlrpc_reply(req); - } else { - req->rq_status = rc; - ptlrpc_error(req); - } - - lock_res_and_lock(lock); - if (lock->l_granted_mode == LCK_PW && - !lock->l_readers && !lock->l_writers && - time_after(jiffies, - lock->l_last_used + 10 * HZ)) { - unlock_res_and_lock(lock); - if (ldlm_bl_to_thread_lock(ns, NULL, lock)) - ldlm_handle_bl_callback(ns, NULL, lock); - - return; - } - unlock_res_and_lock(lock); - LDLM_LOCK_RELEASE(lock); -} - -static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) -{ - if (req->rq_no_reply) - return 0; - - req->rq_status = rc; - if (!req->rq_packed_final) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - return ptlrpc_reply(req); -} - -static int __ldlm_bl_to_thread(struct ldlm_bl_work_item *blwi, - enum ldlm_cancel_flags cancel_flags) -{ - struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; - - spin_lock(&blp->blp_lock); - if (blwi->blwi_lock && ldlm_is_discard_data(blwi->blwi_lock)) { - /* add LDLM_FL_DISCARD_DATA requests to the priority list */ - list_add_tail(&blwi->blwi_entry, &blp->blp_prio_list); - } else { - /* other blocking callbacks are added to the regular list */ - list_add_tail(&blwi->blwi_entry, &blp->blp_list); - } - spin_unlock(&blp->blp_lock); - - wake_up(&blp->blp_waitq); - - /* can not check blwi->blwi_flags as blwi could be already freed in - * LCF_ASYNC mode - */ - if (!(cancel_flags & LCF_ASYNC)) - wait_for_completion(&blwi->blwi_comp); - - return 0; -} - -static inline void init_blwi(struct ldlm_bl_work_item *blwi, - struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, - struct list_head *cancels, int count, - struct ldlm_lock *lock, - enum ldlm_cancel_flags cancel_flags) -{ - init_completion(&blwi->blwi_comp); - INIT_LIST_HEAD(&blwi->blwi_head); - - if (current->flags & PF_MEMALLOC) - blwi->blwi_mem_pressure = 1; - - blwi->blwi_ns = ns; - blwi->blwi_flags = cancel_flags; - if (ld) - blwi->blwi_ld = *ld; - if (count) { - list_add(&blwi->blwi_head, cancels); - list_del_init(cancels); - blwi->blwi_count = count; - } else { - blwi->blwi_lock = lock; - } -} - -/** - * Queues a list of locks \a cancels containing \a count locks - * for later processing by a blocking thread. If \a count is zero, - * then the lock referenced as \a lock is queued instead. - * - * The blocking thread would then call ->l_blocking_ast callback in the lock. - * If list addition fails an error is returned and caller is supposed to - * call ->l_blocking_ast itself. 
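The doc-comment above, together with ldlm_bl_to_thread that follows it, describes a classic synchronous/asynchronous handoff: asynchronous callers (LCF_ASYNC) pass a heap-allocated work item that the worker frees when done, while synchronous callers, which may be running under the memory shrinker where allocation is unsafe, pass a stack item and block on a completion. A rough pthread analogue of those ownership rules, with every name invented for illustration (compile with -pthread):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct work_item {
	int		async;		/* heap item: the worker frees it */
	pthread_mutex_t	lock;
	pthread_cond_t	done_cv;
	int		done;
};

/* Worker side: free (async) or signal the waiter (sync) when finished. */
static void work_finish(struct work_item *w)
{
	if (w->async) {
		pthread_mutex_destroy(&w->lock);
		pthread_cond_destroy(&w->done_cv);
		free(w);		/* nobody is waiting on it */
		return;
	}
	pthread_mutex_lock(&w->lock);
	w->done = 1;
	pthread_cond_signal(&w->done_cv);
	pthread_mutex_unlock(&w->lock);
}

static void *worker(void *arg)
{
	work_finish(arg);		/* stand-in for the blocking AST */
	return NULL;
}

static void enqueue(struct work_item *w)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, w);
	pthread_detach(t);
}

/* Submitter side: heap item for async, stack item plus wait for sync. */
static int submit(int async)
{
	struct work_item stack_w, *w;

	if (async) {
		w = calloc(1, sizeof(*w));	/* freed by the worker */
		if (!w)
			return -1;
	} else {
		w = &stack_w;			/* no allocation here: safe */
		memset(w, 0, sizeof(*w));	/* to call under reclaim */
	}
	w->async = async;
	pthread_mutex_init(&w->lock, NULL);
	pthread_cond_init(&w->done_cv, NULL);

	enqueue(w);	/* an async item must not be touched after this */

	if (!async) {
		pthread_mutex_lock(&w->lock);
		while (!w->done)
			pthread_cond_wait(&w->done_cv, &w->lock);
		pthread_mutex_unlock(&w->lock);
		pthread_mutex_destroy(&w->lock);
		pthread_cond_destroy(&w->done_cv);
	}
	return 0;
}

int main(void)
{
	submit(0);	/* returns only after the work has run */
	submit(1);	/* returns immediately */
	return 0;
}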
- */ -static int ldlm_bl_to_thread(struct ldlm_namespace *ns, - struct ldlm_lock_desc *ld, - struct ldlm_lock *lock, - struct list_head *cancels, int count, - enum ldlm_cancel_flags cancel_flags) -{ - if (cancels && count == 0) - return 0; - - if (cancel_flags & LCF_ASYNC) { - struct ldlm_bl_work_item *blwi; - - blwi = kzalloc(sizeof(*blwi), GFP_NOFS); - if (!blwi) - return -ENOMEM; - init_blwi(blwi, ns, ld, cancels, count, lock, cancel_flags); - - return __ldlm_bl_to_thread(blwi, cancel_flags); - } else { - /* if it is synchronous call do minimum mem alloc, as it could - * be triggered from kernel shrinker - */ - struct ldlm_bl_work_item blwi; - - memset(&blwi, 0, sizeof(blwi)); - init_blwi(&blwi, ns, ld, cancels, count, lock, cancel_flags); - return __ldlm_bl_to_thread(&blwi, cancel_flags); - } -} - -int ldlm_bl_to_thread_lock(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, - struct ldlm_lock *lock) -{ - return ldlm_bl_to_thread(ns, ld, lock, NULL, 0, LCF_ASYNC); -} - -int ldlm_bl_to_thread_list(struct ldlm_namespace *ns, struct ldlm_lock_desc *ld, - struct list_head *cancels, int count, - enum ldlm_cancel_flags cancel_flags) -{ - return ldlm_bl_to_thread(ns, ld, NULL, cancels, count, cancel_flags); -} - -int ldlm_bl_thread_wakeup(void) -{ - wake_up(&ldlm_state->ldlm_bl_pool->blp_waitq); - return 0; -} - -/* Setinfo coming from Server (eg MDT) to Client (eg MDC)! */ -static int ldlm_handle_setinfo(struct ptlrpc_request *req) -{ - struct obd_device *obd = req->rq_export->exp_obd; - char *key; - void *val; - int keylen, vallen; - int rc = -ENOSYS; - - DEBUG_REQ(D_HSM, req, "%s: handle setinfo\n", obd->obd_name); - - req_capsule_set(&req->rq_pill, &RQF_OBD_SET_INFO); - - key = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); - if (!key) { - DEBUG_REQ(D_IOCTL, req, "no set_info key"); - return -EFAULT; - } - keylen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_KEY, - RCL_CLIENT); - val = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); - if (!val) { - DEBUG_REQ(D_IOCTL, req, "no set_info val"); - return -EFAULT; - } - vallen = req_capsule_get_size(&req->rq_pill, &RMF_SETINFO_VAL, - RCL_CLIENT); - - /* We are responsible for swabbing contents of val */ - - if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) - /* Pass it on to mdc (the "export" in this case) */ - rc = obd_set_info_async(req->rq_svc_thread->t_env, - req->rq_export, - sizeof(KEY_HSM_COPYTOOL_SEND), - KEY_HSM_COPYTOOL_SEND, - vallen, val, NULL); - else - DEBUG_REQ(D_WARNING, req, "ignoring unknown key %s", key); - - return rc; -} - -static inline void ldlm_callback_errmsg(struct ptlrpc_request *req, - const char *msg, int rc, - const struct lustre_handle *handle) -{ - DEBUG_REQ((req->rq_no_reply || rc) ? D_WARNING : D_DLMTRACE, req, - "%s: [nid %s] [rc %d] [lock %#llx]", - msg, libcfs_id2str(req->rq_peer), rc, - handle ? handle->cookie : 0); - if (req->rq_no_reply) - CWARN("No reply was sent, maybe cause bug 21636.\n"); - else if (rc) - CWARN("Send reply failed, maybe cause bug 21636.\n"); -} - -/* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ -static int ldlm_callback_handler(struct ptlrpc_request *req) -{ - struct ldlm_namespace *ns; - struct ldlm_request *dlm_req; - struct ldlm_lock *lock; - int rc; - - /* Requests arrive in sender's byte order. The ptlrpc service - * handler has already checked and, if necessary, byte-swapped the - * incoming request message body, but I am responsible for the - * message buffers. 
- */ - - /* do nothing for sec context finalize */ - if (lustre_msg_get_opc(req->rq_reqmsg) == SEC_CTX_FINI) - return 0; - - req_capsule_init(&req->rq_pill, req, RCL_SERVER); - - if (!req->rq_export) { - rc = ldlm_callback_reply(req, -ENOTCONN); - ldlm_callback_errmsg(req, "Operate on unconnected server", - rc, NULL); - return 0; - } - - LASSERT(req->rq_export->exp_obd); - - switch (lustre_msg_get_opc(req->rq_reqmsg)) { - case LDLM_BL_CALLBACK: - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) { - if (cfs_fail_err) - ldlm_callback_reply(req, -(int)cfs_fail_err); - return 0; - } - break; - case LDLM_CP_CALLBACK: - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET)) - return 0; - break; - case LDLM_GL_CALLBACK: - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_GL_CALLBACK_NET)) - return 0; - break; - case LDLM_SET_INFO: - rc = ldlm_handle_setinfo(req); - ldlm_callback_reply(req, rc); - return 0; - default: - CERROR("unknown opcode %u\n", - lustre_msg_get_opc(req->rq_reqmsg)); - ldlm_callback_reply(req, -EPROTO); - return 0; - } - - ns = req->rq_export->exp_obd->obd_namespace; - LASSERT(ns); - - req_capsule_set(&req->rq_pill, &RQF_LDLM_CALLBACK); - - dlm_req = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - if (!dlm_req) { - rc = ldlm_callback_reply(req, -EPROTO); - ldlm_callback_errmsg(req, "Operate without parameter", rc, - NULL); - return 0; - } - - /* Force a known safe race, send a cancel to the server for a lock - * which the server has already started a blocking callback on. - */ - if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_BL_CB_RACE) && - lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { - rc = ldlm_cli_cancel(&dlm_req->lock_handle[0], 0); - if (rc < 0) - CERROR("ldlm_cli_cancel: %d\n", rc); - } - - lock = ldlm_handle2lock_long(&dlm_req->lock_handle[0], 0); - if (!lock) { - CDEBUG(D_DLMTRACE, - "callback on lock %#llx - lock disappeared\n", - dlm_req->lock_handle[0].cookie); - rc = ldlm_callback_reply(req, -EINVAL); - ldlm_callback_errmsg(req, "Operate with invalid parameter", rc, - &dlm_req->lock_handle[0]); - return 0; - } - - if (ldlm_is_fail_loc(lock) && - lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) - OBD_RACE(OBD_FAIL_LDLM_CP_BL_RACE); - - /* Copy hints/flags (e.g. LDLM_FL_DISCARD_DATA) from AST. */ - lock_res_and_lock(lock); - lock->l_flags |= ldlm_flags_from_wire(dlm_req->lock_flags & - LDLM_FL_AST_MASK); - if (lustre_msg_get_opc(req->rq_reqmsg) == LDLM_BL_CALLBACK) { - /* If somebody cancels lock and cache is already dropped, - * or lock is failed before cp_ast received on client, - * we can tell the server we have no lock. Otherwise, we - * should send cancel after dropping the cache. - */ - if ((ldlm_is_canceling(lock) && ldlm_is_bl_done(lock)) || - ldlm_is_failed(lock)) { - LDLM_DEBUG(lock, - "callback on lock %#llx - lock disappeared", - dlm_req->lock_handle[0].cookie); - unlock_res_and_lock(lock); - LDLM_LOCK_RELEASE(lock); - rc = ldlm_callback_reply(req, -EINVAL); - ldlm_callback_errmsg(req, "Operate on stale lock", rc, - &dlm_req->lock_handle[0]); - return 0; - } - /* BL_AST locks are not needed in LRU. - * Let ldlm_cancel_lru() be fast. - */ - ldlm_lock_remove_from_lru(lock); - ldlm_set_bl_ast(lock); - } - unlock_res_and_lock(lock); - - /* We want the ost thread to get this reply so that it can respond - * to ost requests (write cache writeback) that might be triggered - * in the callback. 
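Worth noting in the handler above: flags arriving off the wire are not trusted. Only the AST hint bits survive the masking with LDLM_FL_AST_MASK before being ORed into l_flags, so a buggy or hostile peer cannot flip arbitrary bits of local lock state. The same whitelisting idiom in miniature (the bit values below are made up, not the real LDLM flag layout):

#include <stdint.h>
#include <stdio.h>

/* Hypothetical bit values, for illustration only. */
#define FL_AST_SENT	0x1ULL
#define FL_DISCARD_DATA	0x2ULL
#define FL_AST_MASK	(FL_AST_SENT | FL_DISCARD_DATA)

int main(void)
{
	uint64_t wire = ~0ULL;		/* a buggy peer sets everything */
	uint64_t local = 0;

	local |= wire & FL_AST_MASK;	/* only the hint bits get through */
	printf("accepted: %#llx\n", (unsigned long long)local);
	return 0;
}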
- * - * But we'd also like to be able to indicate in the reply that we're - * cancelling right now, because it's unused, or have an intent result - * in the reply, so we might have to push the responsibility for sending - * the reply down into the AST handlers, alas. - */ - - switch (lustre_msg_get_opc(req->rq_reqmsg)) { - case LDLM_BL_CALLBACK: - CDEBUG(D_INODE, "blocking ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_BL_CALLBACK); - if (!ldlm_is_cancel_on_block(lock)) { - rc = ldlm_callback_reply(req, 0); - if (req->rq_no_reply || rc) - ldlm_callback_errmsg(req, "Normal process", rc, - &dlm_req->lock_handle[0]); - } - if (ldlm_bl_to_thread_lock(ns, &dlm_req->lock_desc, lock)) - ldlm_handle_bl_callback(ns, &dlm_req->lock_desc, lock); - break; - case LDLM_CP_CALLBACK: - CDEBUG(D_INODE, "completion ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_CP_CALLBACK); - ldlm_callback_reply(req, 0); - ldlm_handle_cp_callback(req, ns, dlm_req, lock); - break; - case LDLM_GL_CALLBACK: - CDEBUG(D_INODE, "glimpse ast\n"); - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); - ldlm_handle_gl_callback(req, ns, dlm_req, lock); - break; - default: - LBUG(); /* checked above */ - } - - return 0; -} - -static int ldlm_bl_get_work(struct ldlm_bl_pool *blp, - struct ldlm_bl_work_item **p_blwi, - struct obd_export **p_exp) -{ - int num_th = atomic_read(&blp->blp_num_threads); - struct ldlm_bl_work_item *blwi = NULL; - static unsigned int num_bl; - - spin_lock(&blp->blp_lock); - /* process a request from the blp_list at least every blp_num_threads */ - if (!list_empty(&blp->blp_list) && - (list_empty(&blp->blp_prio_list) || num_bl == 0)) - blwi = list_first_entry(&blp->blp_list, - struct ldlm_bl_work_item, blwi_entry); - else - if (!list_empty(&blp->blp_prio_list)) - blwi = list_first_entry(&blp->blp_prio_list, - struct ldlm_bl_work_item, - blwi_entry); - - if (blwi) { - if (++num_bl >= num_th) - num_bl = 0; - list_del(&blwi->blwi_entry); - } - spin_unlock(&blp->blp_lock); - *p_blwi = blwi; - - return (*p_blwi || *p_exp) ? 
1 : 0; -} - -/* This only contains temporary data until the thread starts */ -struct ldlm_bl_thread_data { - struct ldlm_bl_pool *bltd_blp; - struct completion bltd_comp; - int bltd_num; -}; - -static int ldlm_bl_thread_main(void *arg); - -static int ldlm_bl_thread_start(struct ldlm_bl_pool *blp, bool check_busy) -{ - struct ldlm_bl_thread_data bltd = { .bltd_blp = blp }; - struct task_struct *task; - - init_completion(&bltd.bltd_comp); - - bltd.bltd_num = atomic_inc_return(&blp->blp_num_threads); - if (bltd.bltd_num >= blp->blp_max_threads) { - atomic_dec(&blp->blp_num_threads); - return 0; - } - - LASSERTF(bltd.bltd_num > 0, "thread num:%d\n", bltd.bltd_num); - if (check_busy && - atomic_read(&blp->blp_busy_threads) < (bltd.bltd_num - 1)) { - atomic_dec(&blp->blp_num_threads); - return 0; - } - - task = kthread_run(ldlm_bl_thread_main, &bltd, "ldlm_bl_%02d", - bltd.bltd_num); - if (IS_ERR(task)) { - CERROR("cannot start LDLM thread ldlm_bl_%02d: rc %ld\n", - bltd.bltd_num, PTR_ERR(task)); - atomic_dec(&blp->blp_num_threads); - return PTR_ERR(task); - } - wait_for_completion(&bltd.bltd_comp); - - return 0; -} - -/* Not fatal if racy and have a few too many threads */ -static int ldlm_bl_thread_need_create(struct ldlm_bl_pool *blp, - struct ldlm_bl_work_item *blwi) -{ - if (atomic_read(&blp->blp_num_threads) >= blp->blp_max_threads) - return 0; - - if (atomic_read(&blp->blp_busy_threads) < - atomic_read(&blp->blp_num_threads)) - return 0; - - if (blwi && (!blwi->blwi_ns || blwi->blwi_mem_pressure)) - return 0; - - return 1; -} - -static int ldlm_bl_thread_blwi(struct ldlm_bl_pool *blp, - struct ldlm_bl_work_item *blwi) -{ - unsigned int flags = 0; - - if (!blwi->blwi_ns) - /* added by ldlm_cleanup() */ - return LDLM_ITER_STOP; - - if (blwi->blwi_mem_pressure) - flags = memalloc_noreclaim_save(); - - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL2, 4); - - if (blwi->blwi_count) { - int count; - - /* - * The special case when we cancel locks in lru - * asynchronously, we pass the list of locks here. - * Thus locks are marked LDLM_FL_CANCELING, but NOT - * canceled locally yet. - */ - count = ldlm_cli_cancel_list_local(&blwi->blwi_head, - blwi->blwi_count, - LCF_BL_AST); - ldlm_cli_cancel_list(&blwi->blwi_head, count, NULL, - blwi->blwi_flags); - } else { - ldlm_handle_bl_callback(blwi->blwi_ns, &blwi->blwi_ld, - blwi->blwi_lock); - } - if (blwi->blwi_mem_pressure) - memalloc_noreclaim_restore(flags); - - if (blwi->blwi_flags & LCF_ASYNC) - kfree(blwi); - else - complete(&blwi->blwi_comp); - - return 0; -} - -/** - * Main blocking requests processing thread. - * - * Callers put locks into its queue by calling ldlm_bl_to_thread. - * This thread in the end ends up doing actual call to ->l_blocking_ast - * for queued locks. 
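ldlm_bl_thread_need_create and ldlm_bl_thread_start above grow the pool only when the thread cap has not been reached, every live thread is already busy, and the trigger is neither the cleanup sentinel (a NULL namespace) nor a memory-pressure item. The policy restated as a pure function; this is a sketch of the heuristic, not the kernel code:

#include <stdbool.h>
#include <stdio.h>

static bool need_new_thread(int nthreads, int nbusy, int max,
			    bool cleanup_item, bool mem_pressure)
{
	if (nthreads >= max)
		return false;		/* pool already at its cap */
	if (nbusy < nthreads)
		return false;		/* an idle thread can take the work */
	if (cleanup_item || mem_pressure)
		return false;		/* never grow on shutdown or reclaim */
	return true;
}

int main(void)
{
	printf("%d\n", need_new_thread(4, 4, 32, false, false));	/* 1 */
	printf("%d\n", need_new_thread(4, 3, 32, false, false));	/* 0 */
	printf("%d\n", need_new_thread(32, 32, 32, false, false));	/* 0 */
	return 0;
}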
- */ -static int ldlm_bl_thread_main(void *arg) -{ - struct ldlm_bl_pool *blp; - struct ldlm_bl_thread_data *bltd = arg; - - blp = bltd->bltd_blp; - - complete(&bltd->bltd_comp); - /* cannot use bltd after this, it is only on caller's stack */ - - while (1) { - struct ldlm_bl_work_item *blwi = NULL; - struct obd_export *exp = NULL; - int rc; - - rc = ldlm_bl_get_work(blp, &blwi, &exp); - if (!rc) - wait_event_idle_exclusive(blp->blp_waitq, - ldlm_bl_get_work(blp, &blwi, - &exp)); - atomic_inc(&blp->blp_busy_threads); - - if (ldlm_bl_thread_need_create(blp, blwi)) - /* discard the return value, we tried */ - ldlm_bl_thread_start(blp, true); - - if (blwi) - rc = ldlm_bl_thread_blwi(blp, blwi); - - atomic_dec(&blp->blp_busy_threads); - - if (rc == LDLM_ITER_STOP) - break; - } - - atomic_dec(&blp->blp_num_threads); - complete(&blp->blp_comp); - return 0; -} - -static int ldlm_setup(void); -static int ldlm_cleanup(void); - -int ldlm_get_ref(void) -{ - int rc = 0; - - rc = ptlrpc_inc_ref(); - if (rc) - return rc; - - mutex_lock(&ldlm_ref_mutex); - if (++ldlm_refcount == 1) { - rc = ldlm_setup(); - if (rc) - ldlm_refcount--; - } - mutex_unlock(&ldlm_ref_mutex); - - if (rc) - ptlrpc_dec_ref(); - - return rc; -} - -void ldlm_put_ref(void) -{ - int rc = 0; - mutex_lock(&ldlm_ref_mutex); - if (ldlm_refcount == 1) { - rc = ldlm_cleanup(); - - if (rc) - CERROR("ldlm_cleanup failed: %d\n", rc); - else - ldlm_refcount--; - } else { - ldlm_refcount--; - } - mutex_unlock(&ldlm_ref_mutex); - if (!rc) - ptlrpc_dec_ref(); -} - -static ssize_t cancel_unused_locks_before_replay_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", ldlm_cancel_unused_locks_before_replay); -} - -static ssize_t cancel_unused_locks_before_replay_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - ldlm_cancel_unused_locks_before_replay = val; - - return count; -} -LUSTRE_RW_ATTR(cancel_unused_locks_before_replay); - -/* These are for root of /sys/fs/lustre/ldlm */ -static struct attribute *ldlm_attrs[] = { - &lustre_attr_cancel_unused_locks_before_replay.attr, - NULL, -}; - -static const struct attribute_group ldlm_attr_group = { - .attrs = ldlm_attrs, -}; - -static int ldlm_setup(void) -{ - static struct ptlrpc_service_conf conf; - struct ldlm_bl_pool *blp = NULL; - int rc = 0; - int i; - - if (ldlm_state) - return -EALREADY; - - ldlm_state = kzalloc(sizeof(*ldlm_state), GFP_NOFS); - if (!ldlm_state) - return -ENOMEM; - - ldlm_kobj = kobject_create_and_add("ldlm", lustre_kobj); - if (!ldlm_kobj) { - rc = -ENOMEM; - goto out; - } - - rc = sysfs_create_group(ldlm_kobj, &ldlm_attr_group); - if (rc) - goto out; - - ldlm_ns_kset = kset_create_and_add("namespaces", NULL, ldlm_kobj); - if (!ldlm_ns_kset) { - rc = -ENOMEM; - goto out; - } - - ldlm_svc_kset = kset_create_and_add("services", NULL, ldlm_kobj); - if (!ldlm_svc_kset) { - rc = -ENOMEM; - goto out; - } - - ldlm_debugfs_setup(); - - memset(&conf, 0, sizeof(conf)); - conf = (typeof(conf)) { - .psc_name = "ldlm_cbd", - .psc_watchdog_factor = 2, - .psc_buf = { - .bc_nbufs = LDLM_CLIENT_NBUFS, - .bc_buf_size = LDLM_BUFSIZE, - .bc_req_max_size = LDLM_MAXREQSIZE, - .bc_rep_max_size = LDLM_MAXREPSIZE, - .bc_req_portal = LDLM_CB_REQUEST_PORTAL, - .bc_rep_portal = LDLM_CB_REPLY_PORTAL, - }, - .psc_thr = { - .tc_thr_name = "ldlm_cb", - .tc_thr_factor = LDLM_THR_FACTOR, - .tc_nthrs_init = LDLM_NTHRS_INIT, - 
.tc_nthrs_base = LDLM_NTHRS_BASE, - .tc_nthrs_max = LDLM_NTHRS_MAX, - .tc_nthrs_user = ldlm_num_threads, - .tc_cpu_affinity = 1, - .tc_ctx_tags = LCT_MD_THREAD | LCT_DT_THREAD, - }, - .psc_cpt = { - .cc_pattern = ldlm_cpts, - }, - .psc_ops = { - .so_req_handler = ldlm_callback_handler, - }, - }; - ldlm_state->ldlm_cb_service = - ptlrpc_register_service(&conf, ldlm_svc_kset, - ldlm_svc_debugfs_dir); - if (IS_ERR(ldlm_state->ldlm_cb_service)) { - CERROR("failed to start service\n"); - rc = PTR_ERR(ldlm_state->ldlm_cb_service); - ldlm_state->ldlm_cb_service = NULL; - goto out; - } - - blp = kzalloc(sizeof(*blp), GFP_NOFS); - if (!blp) { - rc = -ENOMEM; - goto out; - } - ldlm_state->ldlm_bl_pool = blp; - - spin_lock_init(&blp->blp_lock); - INIT_LIST_HEAD(&blp->blp_list); - INIT_LIST_HEAD(&blp->blp_prio_list); - init_waitqueue_head(&blp->blp_waitq); - atomic_set(&blp->blp_num_threads, 0); - atomic_set(&blp->blp_busy_threads, 0); - - if (ldlm_num_threads == 0) { - blp->blp_min_threads = LDLM_NTHRS_INIT; - blp->blp_max_threads = LDLM_NTHRS_MAX; - } else { - blp->blp_min_threads = min_t(int, LDLM_NTHRS_MAX, - max_t(int, LDLM_NTHRS_INIT, - ldlm_num_threads)); - - blp->blp_max_threads = blp->blp_min_threads; - } - - for (i = 0; i < blp->blp_min_threads; i++) { - rc = ldlm_bl_thread_start(blp, false); - if (rc < 0) - goto out; - } - - rc = ldlm_pools_init(); - if (rc) { - CERROR("Failed to initialize LDLM pools: %d\n", rc); - goto out; - } - return 0; - - out: - ldlm_cleanup(); - return rc; -} - -static int ldlm_cleanup(void) -{ - if (!list_empty(ldlm_namespace_list(LDLM_NAMESPACE_SERVER)) || - !list_empty(ldlm_namespace_list(LDLM_NAMESPACE_CLIENT))) { - CERROR("ldlm still has namespaces; clean these up first.\n"); - ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); - ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); - return -EBUSY; - } - - ldlm_pools_fini(); - - if (ldlm_state->ldlm_bl_pool) { - struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; - - while (atomic_read(&blp->blp_num_threads) > 0) { - struct ldlm_bl_work_item blwi = { .blwi_ns = NULL }; - - init_completion(&blp->blp_comp); - - spin_lock(&blp->blp_lock); - list_add_tail(&blwi.blwi_entry, &blp->blp_list); - wake_up(&blp->blp_waitq); - spin_unlock(&blp->blp_lock); - - wait_for_completion(&blp->blp_comp); - } - - kfree(blp); - } - - if (ldlm_state->ldlm_cb_service) - ptlrpc_unregister_service(ldlm_state->ldlm_cb_service); - - if (ldlm_ns_kset) - kset_unregister(ldlm_ns_kset); - if (ldlm_svc_kset) - kset_unregister(ldlm_svc_kset); - if (ldlm_kobj) { - sysfs_remove_group(ldlm_kobj, &ldlm_attr_group); - kobject_put(ldlm_kobj); - } - - ldlm_debugfs_cleanup(); - - kfree(ldlm_state); - ldlm_state = NULL; - - return 0; -} - -int ldlm_init(void) -{ - mutex_init(&ldlm_ref_mutex); - mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_SERVER)); - mutex_init(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); - ldlm_resource_slab = kmem_cache_create("ldlm_resources", - sizeof(struct ldlm_resource), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ldlm_resource_slab) - return -ENOMEM; - - ldlm_lock_slab = kmem_cache_create("ldlm_locks", - sizeof(struct ldlm_lock), 0, - SLAB_HWCACHE_ALIGN | - SLAB_TYPESAFE_BY_RCU, NULL); - if (!ldlm_lock_slab) { - kmem_cache_destroy(ldlm_resource_slab); - return -ENOMEM; - } - - ldlm_interval_slab = kmem_cache_create("interval_node", - sizeof(struct ldlm_interval), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!ldlm_interval_slab) { - kmem_cache_destroy(ldlm_resource_slab); - kmem_cache_destroy(ldlm_lock_slab); - return 
-ENOMEM; - } -#if LUSTRE_TRACKS_LOCK_EXP_REFS - class_export_dump_hook = ldlm_dump_export_locks; -#endif - return 0; -} - -void ldlm_exit(void) -{ - if (ldlm_refcount) - CERROR("ldlm_refcount is %d in %s!\n", ldlm_refcount, __func__); - kmem_cache_destroy(ldlm_resource_slab); - /* ldlm_lock_put() use RCU to call ldlm_lock_free, so need call - * synchronize_rcu() to wait a grace period elapsed, so that - * ldlm_lock_free() get a chance to be called. - */ - synchronize_rcu(); - kmem_cache_destroy(ldlm_lock_slab); - kmem_cache_destroy(ldlm_interval_slab); -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c b/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c deleted file mode 100644 index 33b5a3f96fcb0733311d2bb3a1affb7bf74b3cc1..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_plain.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_plain.c - * - * Author: Peter Braam - * Author: Phil Schwan - */ - -/** - * This file contains implementation of PLAIN lock type. - * - * PLAIN locks are the simplest form of LDLM locking, and are used when - * there only needs to be a single lock on a resource. This avoids some - * of the complexity of EXTENT and IBITS lock types, but doesn't allow - * different "parts" of a resource to be locked concurrently. Example - * use cases for PLAIN locks include locking of MGS configuration logs - * and (as of Lustre 2.4) quota records. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include - -#include "ldlm_internal.h" - -void ldlm_plain_policy_wire_to_local(const union ldlm_wire_policy_data *wpolicy, - union ldlm_policy_data *lpolicy) -{ - /* No policy for plain locks */ -} - -void ldlm_plain_policy_local_to_wire(const union ldlm_policy_data *lpolicy, - union ldlm_wire_policy_data *wpolicy) -{ - /* No policy for plain locks */ -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c b/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c deleted file mode 100644 index 36d14ee4e5b1845e9d3e4dbab4e93dc76290272d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_pool.c +++ /dev/null @@ -1,1013 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_pool.c - * - * Author: Yury Umanets - */ - -/* - * Idea of this code is rather simple. Each second, for each server namespace - * we have SLV - server lock volume which is calculated on current number of - * granted locks, grant speed for past period, etc - that is, locking load. - * This SLV number may be thought as a flow definition for simplicity. It is - * sent to clients with each occasion to let them know what is current load - * situation on the server. By default, at the beginning, SLV on server is - * set max value which is calculated as the following: allow to one client - * have all locks of limit ->pl_limit for 10h. - * - * Next, on clients, number of cached locks is not limited artificially in any - * way as it was before. Instead, client calculates CLV, that is, client lock - * volume for each lock and compares it with last SLV from the server. CLV is - * calculated as the number of locks in LRU * lock live time in seconds. If - * CLV > SLV - lock is canceled. - * - * Client has LVF, that is, lock volume factor which regulates how much - * sensitive client should be about last SLV from server. The higher LVF is the - * more locks will be canceled on client. Default value for it is 1. Setting LVF - * to 2 means that client will cancel locks 2 times faster. - * - * Locks on a client will be canceled more intensively in these cases: - * (1) if SLV is smaller, that is, load is higher on the server; - * (2) client has a lot of locks (the more locks are held by client, the bigger - * chances that some of them should be canceled); - * (3) client has old locks (taken some time ago); - * - * Thus, according to flow paradigm that we use for better understanding SLV, - * CLV is the volume of particle in flow described by SLV. According to this, - * if flow is getting thinner, more and more particles become outside of it and - * as particles are locks, they should be canceled. - * - * General idea of this belongs to Vitaly Fertman (vitaly@clusterfs.com). - * Andreas Dilger (adilger@clusterfs.com) proposed few nice ideas like using - * LVF and many cleanups. Flow definition to allow more easy understanding of - * the logic belongs to Nikita Danilov (nikita@clusterfs.com) as well as many - * cleanups and fixes. And design and implementation are done by Yury Umanets - * (umka@clusterfs.com). - * - * Glossary for terms used: - * - * pl_limit - Number of allowed locks in pool. 
Applies to server and client - * side (tunable); - * - * pl_granted - Number of granted locks (calculated); - * pl_grant_rate - Number of granted locks for last T (calculated); - * pl_cancel_rate - Number of canceled locks for last T (calculated); - * pl_grant_speed - Grant speed (GR - CR) for last T (calculated); - * pl_grant_plan - Planned number of granted locks for next T (calculated); - * pl_server_lock_volume - Current server lock volume (calculated); - * - * As it may be seen from list above, we have few possible tunables which may - * affect behavior much. They all may be modified via sysfs. However, they also - * give a possibility for constructing few pre-defined behavior policies. If - * none of predefines is suitable for a working pattern being used, new one may - * be "constructed" via sysfs tunables. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include -#include "ldlm_internal.h" - -/* - * 50 ldlm locks for 1MB of RAM. - */ -#define LDLM_POOL_HOST_L ((NUM_CACHEPAGES >> (20 - PAGE_SHIFT)) * 50) - -/* - * Maximal possible grant step plan in %. - */ -#define LDLM_POOL_MAX_GSP (30) - -/* - * Minimal possible grant step plan in %. - */ -#define LDLM_POOL_MIN_GSP (1) - -/* - * This controls the speed of reaching LDLM_POOL_MAX_GSP - * with increasing thread period. - */ -#define LDLM_POOL_GSP_STEP_SHIFT (2) - -/* - * LDLM_POOL_GSP% of all locks is default GP. - */ -#define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) - -/* - * Max age for locks on clients. - */ -#define LDLM_POOL_MAX_AGE (36000) - -/* - * The granularity of SLV calculation. - */ -#define LDLM_POOL_SLV_SHIFT (10) - -static inline __u64 dru(__u64 val, __u32 shift, int round_up) -{ - return (val + (round_up ? (1 << shift) - 1 : 0)) >> shift; -} - -static inline __u64 ldlm_pool_slv_max(__u32 L) -{ - /* - * Allow to have all locks for 1 client for 10 hrs. - * Formula is the following: limit * 10h / 1 client. - */ - __u64 lim = (__u64)L * LDLM_POOL_MAX_AGE / 1; - return lim; -} - -static inline __u64 ldlm_pool_slv_min(__u32 L) -{ - return 1; -} - -enum { - LDLM_POOL_FIRST_STAT = 0, - LDLM_POOL_GRANTED_STAT = LDLM_POOL_FIRST_STAT, - LDLM_POOL_GRANT_STAT, - LDLM_POOL_CANCEL_STAT, - LDLM_POOL_GRANT_RATE_STAT, - LDLM_POOL_CANCEL_RATE_STAT, - LDLM_POOL_GRANT_PLAN_STAT, - LDLM_POOL_SLV_STAT, - LDLM_POOL_SHRINK_REQTD_STAT, - LDLM_POOL_SHRINK_FREED_STAT, - LDLM_POOL_RECALC_STAT, - LDLM_POOL_TIMING_STAT, - LDLM_POOL_LAST_STAT -}; - -/** - * Calculates suggested grant_step in % of available locks for passed - * \a period. This is later used in grant_plan calculations. - */ -static inline int ldlm_pool_t2gsp(unsigned int t) -{ - /* - * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP - * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. - * - * How this will affect execution is the following: - * - * - for thread period 1s we will have grant_step 1% which good from - * pov of taking some load off from server and push it out to clients. - * This is like that because 1% for grant_step means that server will - * not allow clients to get lots of locks in short period of time and - * keep all old locks in their caches. Clients will always have to - * get some locks back if they want to take some new; - * - * - for thread period 10s (which is default) we will have 23% which - * means that clients will have enough of room to take some new locks - * without getting some back. All locks from this 23% which were not - * taken by clients in current period will contribute in SLV growing. 
- * SLV growing means more locks cached on clients until limit or grant - * plan is reached. - */ - return LDLM_POOL_MAX_GSP - - ((LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) >> - (t >> LDLM_POOL_GSP_STEP_SHIFT)); -} - -/** - * Recalculates next stats on passed \a pl. - * - * \pre ->pl_lock is locked. - */ -static void ldlm_pool_recalc_stats(struct ldlm_pool *pl) -{ - int grant_plan = pl->pl_grant_plan; - __u64 slv = pl->pl_server_lock_volume; - int granted = atomic_read(&pl->pl_granted); - int grant_rate = atomic_read(&pl->pl_grant_rate); - int cancel_rate = atomic_read(&pl->pl_cancel_rate); - - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, - slv); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, - granted); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, - grant_rate); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, - grant_plan); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, - cancel_rate); -} - -/** - * Sets SLV and Limit from container_of(pl, struct ldlm_namespace, - * ns_pool)->ns_obd tp passed \a pl. - */ -static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) -{ - struct obd_device *obd; - - /* - * Get new SLV and Limit from obd which is updated with coming - * RPCs. - */ - obd = container_of(pl, struct ldlm_namespace, - ns_pool)->ns_obd; - read_lock(&obd->obd_pool_lock); - pl->pl_server_lock_volume = obd->obd_pool_slv; - atomic_set(&pl->pl_limit, obd->obd_pool_limit); - read_unlock(&obd->obd_pool_lock); -} - -/** - * Recalculates client size pool \a pl according to current SLV and Limit. - */ -static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) -{ - time64_t recalc_interval_sec; - int ret; - - recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; - if (recalc_interval_sec < pl->pl_recalc_period) - return 0; - - spin_lock(&pl->pl_lock); - /* - * Check if we need to recalc lists now. - */ - recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; - if (recalc_interval_sec < pl->pl_recalc_period) { - spin_unlock(&pl->pl_lock); - return 0; - } - - /* - * Make sure that pool knows last SLV and Limit from obd. - */ - ldlm_cli_pool_pop_slv(pl); - - spin_unlock(&pl->pl_lock); - - /* - * Do not cancel locks in case lru resize is disabled for this ns. - */ - if (!ns_connect_lru_resize(container_of(pl, struct ldlm_namespace, - ns_pool))) { - ret = 0; - goto out; - } - - /* - * In the time of canceling locks on client we do not need to maintain - * sharp timing, we only want to cancel locks asap according to new SLV. - * It may be called when SLV has changed much, this is why we do not - * take into account pl->pl_recalc_time here. - */ - ret = ldlm_cancel_lru(container_of(pl, struct ldlm_namespace, ns_pool), - 0, LCF_ASYNC, LDLM_LRU_FLAG_LRUR); - -out: - spin_lock(&pl->pl_lock); - /* - * Time of LRU resizing might be longer than period, - * so update after LRU resizing rather than before it. - */ - pl->pl_recalc_time = ktime_get_real_seconds(); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, - recalc_interval_sec); - spin_unlock(&pl->pl_lock); - return ret; -} - -/** - * This function is main entry point for memory pressure handling on client - * side. Main goal of this function is to cancel some number of locks on - * passed \a pl according to \a nr and \a gfp_mask. 
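The 1% and 23% figures quoted in the comment above follow directly from the formula in ldlm_pool_t2gsp; a few lines of C confirm them, with macros mirroring the constants defined earlier in this file:

#include <stdio.h>

#define MAX_GSP		30	/* LDLM_POOL_MAX_GSP: max grant step, % */
#define MIN_GSP		1	/* LDLM_POOL_MIN_GSP: min grant step, % */
#define GSP_STEP_SHIFT	2	/* LDLM_POOL_GSP_STEP_SHIFT */

static int t2gsp(unsigned int t)
{
	return MAX_GSP - ((MAX_GSP - MIN_GSP) >> (t >> GSP_STEP_SHIFT));
}

int main(void)
{
	/* t=1s -> 1%, t=10s (the default) -> 23%, t>=20s saturates at 30% */
	printf("%d %d %d\n", t2gsp(1), t2gsp(10), t2gsp(20));
	return 0;
}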
- */ -static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, - int nr, gfp_t gfp_mask) -{ - struct ldlm_namespace *ns; - int unused; - - ns = container_of(pl, struct ldlm_namespace, ns_pool); - - /* - * Do not cancel locks in case lru resize is disabled for this ns. - */ - if (!ns_connect_lru_resize(ns)) - return 0; - - /* - * Make sure that pool knows last SLV and Limit from obd. - */ - ldlm_cli_pool_pop_slv(pl); - - spin_lock(&ns->ns_lock); - unused = ns->ns_nr_unused; - spin_unlock(&ns->ns_lock); - - if (nr == 0) - return (unused / 100) * sysctl_vfs_cache_pressure; - else - return ldlm_cancel_lru(ns, nr, LCF_ASYNC, LDLM_LRU_FLAG_SHRINK); -} - -static const struct ldlm_pool_ops ldlm_cli_pool_ops = { - .po_recalc = ldlm_cli_pool_recalc, - .po_shrink = ldlm_cli_pool_shrink -}; - -/** - * Pool recalc wrapper. Will call either client or server pool recalc callback - * depending what pool \a pl is used. - */ -static int ldlm_pool_recalc(struct ldlm_pool *pl) -{ - u32 recalc_interval_sec; - int count; - - recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; - if (recalc_interval_sec > 0) { - spin_lock(&pl->pl_lock); - recalc_interval_sec = ktime_get_real_seconds() - pl->pl_recalc_time; - - if (recalc_interval_sec > 0) { - /* - * Update pool statistics every 1s. - */ - ldlm_pool_recalc_stats(pl); - - /* - * Zero out all rates and speed for the last period. - */ - atomic_set(&pl->pl_grant_rate, 0); - atomic_set(&pl->pl_cancel_rate, 0); - } - spin_unlock(&pl->pl_lock); - } - - if (pl->pl_ops->po_recalc) { - count = pl->pl_ops->po_recalc(pl); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, - count); - } - - recalc_interval_sec = pl->pl_recalc_time - ktime_get_real_seconds() + - pl->pl_recalc_period; - if (recalc_interval_sec <= 0) { - /* DEBUG: should be re-removed after LU-4536 is fixed */ - CDEBUG(D_DLMTRACE, - "%s: Negative interval(%ld), too short period(%ld)\n", - pl->pl_name, (long)recalc_interval_sec, - (long)pl->pl_recalc_period); - - /* Prevent too frequent recalculation. */ - recalc_interval_sec = 1; - } - - return recalc_interval_sec; -} - -/* - * Pool shrink wrapper. Will call either client or server pool recalc callback - * depending what pool pl is used. When nr == 0, just return the number of - * freeable locks. Otherwise, return the number of canceled locks. 
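On the client paths above, the choice of which locks ldlm_cancel_lru actually drops reduces to the CLV-versus-SLV comparison laid out in the file-header comment: CLV grows with the LRU length and the lock's age, and a lock is cancelled once its CLV exceeds the server-published SLV. One plausible reading of that rule, with LVF folded in as a plain multiplier; the names are illustrative and the precise scaling lives in the LRU policy code, which is not shown here:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool should_cancel(uint64_t lru_len, uint64_t age_sec,
			  uint32_t lvf, uint64_t slv)
{
	uint64_t clv = lru_len * age_sec * lvf;	/* client lock volume */

	return clv > slv;	/* outside the flow: cancel the lock */
}

int main(void)
{
	/* A higher LVF cancels sooner under the same load. */
	printf("%d\n", should_cancel(1000, 30, 1, 50000));	/* 0 */
	printf("%d\n", should_cancel(1000, 30, 2, 50000));	/* 1 */
	return 0;
}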
- */ -static int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, gfp_t gfp_mask) -{ - int cancel = 0; - - if (pl->pl_ops->po_shrink) { - cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); - if (nr > 0) { - lprocfs_counter_add(pl->pl_stats, - LDLM_POOL_SHRINK_REQTD_STAT, - nr); - lprocfs_counter_add(pl->pl_stats, - LDLM_POOL_SHRINK_FREED_STAT, - cancel); - CDEBUG(D_DLMTRACE, - "%s: request to shrink %d locks, shrunk %d\n", - pl->pl_name, nr, cancel); - } - } - return cancel; -} - -static int lprocfs_pool_state_seq_show(struct seq_file *m, void *unused) -{ - int granted, grant_rate, cancel_rate; - int grant_speed, lvf; - struct ldlm_pool *pl = m->private; - __u64 slv, clv; - __u32 limit; - - spin_lock(&pl->pl_lock); - slv = pl->pl_server_lock_volume; - clv = pl->pl_client_lock_volume; - limit = atomic_read(&pl->pl_limit); - granted = atomic_read(&pl->pl_granted); - grant_rate = atomic_read(&pl->pl_grant_rate); - cancel_rate = atomic_read(&pl->pl_cancel_rate); - grant_speed = grant_rate - cancel_rate; - lvf = atomic_read(&pl->pl_lock_volume_factor); - spin_unlock(&pl->pl_lock); - - seq_printf(m, "LDLM pool state (%s):\n" - " SLV: %llu\n" - " CLV: %llu\n" - " LVF: %d\n", - pl->pl_name, slv, clv, lvf); - - seq_printf(m, " GR: %d\n CR: %d\n GS: %d\n" - " G: %d\n L: %d\n", - grant_rate, cancel_rate, grant_speed, - granted, limit); - - return 0; -} - -LPROC_SEQ_FOPS_RO(lprocfs_pool_state); - -static ssize_t grant_speed_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, - pl_kobj); - - int grant_speed; - - spin_lock(&pl->pl_lock); - /* serialize with ldlm_pool_recalc */ - grant_speed = atomic_read(&pl->pl_grant_rate) - - atomic_read(&pl->pl_cancel_rate); - spin_unlock(&pl->pl_lock); - return sprintf(buf, "%d\n", grant_speed); -} -LUSTRE_RO_ATTR(grant_speed); - -LDLM_POOL_SYSFS_READER_SHOW(grant_plan, int); -LUSTRE_RO_ATTR(grant_plan); - -LDLM_POOL_SYSFS_READER_SHOW(recalc_period, int); -LDLM_POOL_SYSFS_WRITER_STORE(recalc_period, int); -LUSTRE_RW_ATTR(recalc_period); - -LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(server_lock_volume, u64); -LUSTRE_RO_ATTR(server_lock_volume); - -LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(limit, atomic); -LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(limit, atomic); -LUSTRE_RW_ATTR(limit); - -LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(granted, atomic); -LUSTRE_RO_ATTR(granted); - -LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(cancel_rate, atomic); -LUSTRE_RO_ATTR(cancel_rate); - -LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(grant_rate, atomic); -LUSTRE_RO_ATTR(grant_rate); - -LDLM_POOL_SYSFS_READER_NOLOCK_SHOW(lock_volume_factor, atomic); -LDLM_POOL_SYSFS_WRITER_NOLOCK_STORE(lock_volume_factor, atomic); -LUSTRE_RW_ATTR(lock_volume_factor); - -#define LDLM_POOL_ADD_VAR(name, var, ops) \ - do { \ - snprintf(var_name, MAX_STRING_SIZE, #name); \ - pool_vars[0].data = var; \ - pool_vars[0].fops = ops; \ - ldebugfs_add_vars(pl->pl_debugfs_entry, pool_vars, NULL);\ - } while (0) - -/* These are for pools in /sys/fs/lustre/ldlm/namespaces/.../pool */ -static struct attribute *ldlm_pl_attrs[] = { - &lustre_attr_grant_speed.attr, - &lustre_attr_grant_plan.attr, - &lustre_attr_recalc_period.attr, - &lustre_attr_server_lock_volume.attr, - &lustre_attr_limit.attr, - &lustre_attr_granted.attr, - &lustre_attr_cancel_rate.attr, - &lustre_attr_grant_rate.attr, - &lustre_attr_lock_volume_factor.attr, - NULL, -}; - -static void ldlm_pl_release(struct kobject *kobj) -{ - struct ldlm_pool *pl = container_of(kobj, struct ldlm_pool, - pl_kobj); - 
complete(&pl->pl_kobj_unregister); -} - -static struct kobj_type ldlm_pl_ktype = { - .default_attrs = ldlm_pl_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = ldlm_pl_release, -}; - -static int ldlm_pool_sysfs_init(struct ldlm_pool *pl) -{ - struct ldlm_namespace *ns = container_of(pl, struct ldlm_namespace, - ns_pool); - int err; - - init_completion(&pl->pl_kobj_unregister); - err = kobject_init_and_add(&pl->pl_kobj, &ldlm_pl_ktype, &ns->ns_kobj, - "pool"); - - return err; -} - -static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) -{ - struct ldlm_namespace *ns = container_of(pl, struct ldlm_namespace, - ns_pool); - struct dentry *debugfs_ns_parent; - struct lprocfs_vars pool_vars[2]; - char *var_name = NULL; - int rc = 0; - - var_name = kzalloc(MAX_STRING_SIZE + 1, GFP_NOFS); - if (!var_name) - return -ENOMEM; - - debugfs_ns_parent = ns->ns_debugfs_entry; - if (IS_ERR_OR_NULL(debugfs_ns_parent)) { - CERROR("%s: debugfs entry is not initialized\n", - ldlm_ns_name(ns)); - rc = -EINVAL; - goto out_free_name; - } - pl->pl_debugfs_entry = debugfs_create_dir("pool", debugfs_ns_parent); - - var_name[MAX_STRING_SIZE] = '\0'; - memset(pool_vars, 0, sizeof(pool_vars)); - pool_vars[0].name = var_name; - - LDLM_POOL_ADD_VAR(state, pl, &lprocfs_pool_state_fops); - - pl->pl_stats = lprocfs_alloc_stats(LDLM_POOL_LAST_STAT - - LDLM_POOL_FIRST_STAT, 0); - if (!pl->pl_stats) { - rc = -ENOMEM; - goto out_free_name; - } - - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "granted", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "grant", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "cancel", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "grant_rate", "locks/s"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "cancel_rate", "locks/s"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "grant_plan", "locks/s"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "slv", "slv"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "shrink_request", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "shrink_freed", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "recalc_freed", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, - LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, - "recalc_timing", "sec"); - debugfs_create_file("stats", 0644, pl->pl_debugfs_entry, pl->pl_stats, - &lprocfs_stats_seq_fops); - -out_free_name: - kfree(var_name); - return rc; -} - -static void ldlm_pool_sysfs_fini(struct ldlm_pool *pl) -{ - kobject_put(&pl->pl_kobj); - wait_for_completion(&pl->pl_kobj_unregister); -} - -static void ldlm_pool_debugfs_fini(struct ldlm_pool *pl) -{ - if (pl->pl_stats) { - lprocfs_free_stats(&pl->pl_stats); - pl->pl_stats = NULL; - } - debugfs_remove_recursive(pl->pl_debugfs_entry); -} - -int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, - int idx, enum ldlm_side client) -{ - int rc; - - 
spin_lock_init(&pl->pl_lock); - atomic_set(&pl->pl_granted, 0); - pl->pl_recalc_time = ktime_get_real_seconds(); - atomic_set(&pl->pl_lock_volume_factor, 1); - - atomic_set(&pl->pl_grant_rate, 0); - atomic_set(&pl->pl_cancel_rate, 0); - pl->pl_grant_plan = LDLM_POOL_GP(LDLM_POOL_HOST_L); - - snprintf(pl->pl_name, sizeof(pl->pl_name), "ldlm-pool-%s-%d", - ldlm_ns_name(ns), idx); - - atomic_set(&pl->pl_limit, 1); - pl->pl_server_lock_volume = 0; - pl->pl_ops = &ldlm_cli_pool_ops; - pl->pl_recalc_period = LDLM_POOL_CLI_DEF_RECALC_PERIOD; - pl->pl_client_lock_volume = 0; - rc = ldlm_pool_debugfs_init(pl); - if (rc) - return rc; - - rc = ldlm_pool_sysfs_init(pl); - if (rc) - return rc; - - CDEBUG(D_DLMTRACE, "Lock pool %s is initialized\n", pl->pl_name); - - return rc; -} - -void ldlm_pool_fini(struct ldlm_pool *pl) -{ - ldlm_pool_sysfs_fini(pl); - ldlm_pool_debugfs_fini(pl); - - /* - * Pool should not be used after this point. We can't free it here as - * it lives in struct ldlm_namespace, but still interested in catching - * any abnormal using cases. - */ - POISON(pl, 0x5a, sizeof(*pl)); -} - -/** - * Add new taken ldlm lock \a lock into pool \a pl accounting. - */ -void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) -{ - /* - * FLOCK locks are special in a sense that they are almost never - * cancelled, instead special kind of lock is used to drop them. - * also there is no LRU for flock locks, so no point in tracking - * them anyway. - */ - if (lock->l_resource->lr_type == LDLM_FLOCK) - return; - - atomic_inc(&pl->pl_granted); - atomic_inc(&pl->pl_grant_rate); - lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); - /* - * Do not do pool recalc for client side as all locks which - * potentially may be canceled has already been packed into - * enqueue/cancel rpc. Also we do not want to run out of stack - * with too long call paths. - */ -} - -/** - * Remove ldlm lock \a lock from pool \a pl accounting. - */ -void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) -{ - /* - * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). - */ - if (lock->l_resource->lr_type == LDLM_FLOCK) - return; - - LASSERT(atomic_read(&pl->pl_granted) > 0); - atomic_dec(&pl->pl_granted); - atomic_inc(&pl->pl_cancel_rate); - - lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); -} - -/** - * Returns current \a pl SLV. - * - * \pre ->pl_lock is not locked. - */ -__u64 ldlm_pool_get_slv(struct ldlm_pool *pl) -{ - __u64 slv; - - spin_lock(&pl->pl_lock); - slv = pl->pl_server_lock_volume; - spin_unlock(&pl->pl_lock); - return slv; -} - -/** - * Sets passed \a clv to \a pl. - * - * \pre ->pl_lock is not locked. - */ -void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) -{ - spin_lock(&pl->pl_lock); - pl->pl_client_lock_volume = clv; - spin_unlock(&pl->pl_lock); -} - -/** - * Returns current LVF from \a pl. - */ -__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) -{ - return atomic_read(&pl->pl_lock_volume_factor); -} - -static int ldlm_pool_granted(struct ldlm_pool *pl) -{ - return atomic_read(&pl->pl_granted); -} - -/* - * count locks from all namespaces (if possible). Returns number of - * cached locks. 
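grant_rate and cancel_rate above are per-period counters: ldlm_pool_add() and ldlm_pool_del() bump them on every grant and cancel (FLOCK locks excepted), the recalc path zeroes them each period, and the grant_speed reported through sysfs is simply their difference. A compact single-threaded model of that accounting (a sketch, not the pool API):

#include <stdio.h>

struct pool_rates {
	int granted;		/* currently granted locks */
	int grant_rate;		/* grants seen this period */
	int cancel_rate;	/* cancels seen this period */
};

static void on_grant(struct pool_rates *p)  { p->granted++; p->grant_rate++; }
static void on_cancel(struct pool_rates *p) { p->granted--; p->cancel_rate++; }

/* At each recalc tick: report the speed, then restart the period. */
static int period_end(struct pool_rates *p)
{
	int grant_speed = p->grant_rate - p->cancel_rate;

	p->grant_rate = 0;
	p->cancel_rate = 0;
	return grant_speed;
}

int main(void)
{
	struct pool_rates p = { 0, 0, 0 };

	on_grant(&p); on_grant(&p); on_grant(&p);
	on_cancel(&p);
	/* prints grant_speed=2 granted=2 */
	printf("grant_speed=%d granted=%d\n", period_end(&p), p.granted);
	return 0;
}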
- */
-static unsigned long ldlm_pools_count(enum ldlm_side client, gfp_t gfp_mask)
-{
-	unsigned long total = 0;
-	int nr_ns;
-	struct ldlm_namespace *ns;
-	struct ldlm_namespace *ns_old = NULL; /* loop detection */
-
-	if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS))
-		return 0;
-
-	CDEBUG(D_DLMTRACE, "Request to count %s locks from all pools\n",
-	       client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
-
-	/*
-	 * Find out how many resources we may release.
-	 */
-	for (nr_ns = ldlm_namespace_nr_read(client);
-	     nr_ns > 0; nr_ns--) {
-		mutex_lock(ldlm_namespace_lock(client));
-		if (list_empty(ldlm_namespace_list(client))) {
-			mutex_unlock(ldlm_namespace_lock(client));
-			return 0;
-		}
-		ns = ldlm_namespace_first_locked(client);
-
-		if (ns == ns_old) {
-			mutex_unlock(ldlm_namespace_lock(client));
-			break;
-		}
-
-		if (ldlm_ns_empty(ns)) {
-			ldlm_namespace_move_to_inactive_locked(ns, client);
-			mutex_unlock(ldlm_namespace_lock(client));
-			continue;
-		}
-
-		if (!ns_old)
-			ns_old = ns;
-
-		ldlm_namespace_get(ns);
-		ldlm_namespace_move_to_active_locked(ns, client);
-		mutex_unlock(ldlm_namespace_lock(client));
-		total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
-		ldlm_namespace_put(ns);
-	}
-
-	return total;
-}
-
-static unsigned long ldlm_pools_scan(enum ldlm_side client, int nr,
-				     gfp_t gfp_mask)
-{
-	unsigned long freed = 0;
-	int tmp, nr_ns;
-	struct ldlm_namespace *ns;
-
-	if (client == LDLM_NAMESPACE_CLIENT && !(gfp_mask & __GFP_FS))
-		return -1;
-
-	/*
-	 * Shrink at least ldlm_namespace_nr_read(client) namespaces.
-	 */
-	for (tmp = nr_ns = ldlm_namespace_nr_read(client);
-	     tmp > 0; tmp--) {
-		int cancel, nr_locks;
-
-		/*
-		 * Do not call shrink under ldlm_namespace_lock(client).
-		 */
-		mutex_lock(ldlm_namespace_lock(client));
-		if (list_empty(ldlm_namespace_list(client))) {
-			mutex_unlock(ldlm_namespace_lock(client));
-			break;
-		}
-		ns = ldlm_namespace_first_locked(client);
-		ldlm_namespace_get(ns);
-		ldlm_namespace_move_to_active_locked(ns, client);
-		mutex_unlock(ldlm_namespace_lock(client));
-
-		nr_locks = ldlm_pool_granted(&ns->ns_pool);
-		/*
-		 * We used to shrink proportionally, but with the new
-		 * shrinker API we lost the total number of freeable locks.
-		 */
-		cancel = 1 + min_t(int, nr_locks, nr / nr_ns);
-		freed += ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
-		ldlm_namespace_put(ns);
-	}
-	/*
-	 * We only decrease the SLV in the server pools shrinker; return
-	 * SHRINK_STOP to the kernel to avoid a needless loop. LU-1128
-	 */
-	return freed;
-}
-
-static unsigned long ldlm_pools_cli_count(struct shrinker *s,
-					  struct shrink_control *sc)
-{
-	return ldlm_pools_count(LDLM_NAMESPACE_CLIENT, sc->gfp_mask);
-}
-
-static unsigned long ldlm_pools_cli_scan(struct shrinker *s,
-					 struct shrink_control *sc)
-{
-	return ldlm_pools_scan(LDLM_NAMESPACE_CLIENT, sc->nr_to_scan,
-			       sc->gfp_mask);
-}
-
-static void ldlm_pools_recalc(struct work_struct *ws);
-static DECLARE_DELAYED_WORK(ldlm_recalc_pools, ldlm_pools_recalc);
-
-static void ldlm_pools_recalc(struct work_struct *ws)
-{
-	enum ldlm_side client = LDLM_NAMESPACE_CLIENT;
-	struct ldlm_namespace *ns;
-	struct ldlm_namespace *ns_old = NULL;
-	/* seconds of sleep if no active namespaces */
-	int time = LDLM_POOL_CLI_DEF_RECALC_PERIOD;
-	int nr;
-
-	/*
-	 * Recalc at least ldlm_namespace_nr_read(client) namespaces.
-	 */
-	for (nr = ldlm_namespace_nr_read(client); nr > 0; nr--) {
-		int skip;
-		/*
-		 * Lock the list, get the first @ns in the list, getref, move
-		 * it to the tail, unlock and call pool recalc.
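The per-namespace quota computed in ldlm_pools_scan() above is simply 1 + min(nr_locks, nr / nr_ns): each namespace gives up at least one lock and at most its even share of the scan target. A standalone sketch of that arithmetic (illustrative values; min_int() stands in for the kernel's min_t()):

#include <stdio.h>

static int min_int(int a, int b)
{
	return a < b ? a : b;
}

int main(void)
{
	int nr = 128;	/* locks the shrinker asked us to scan */
	int nr_ns = 4;	/* namespaces to spread the work across */
	int nr_locks[4] = { 1000, 10, 0, 50 }; /* granted locks per ns */
	int i;

	for (i = 0; i < nr_ns; i++) {
		/* at least one lock per namespace, at most its share */
		int cancel = 1 + min_int(nr_locks[i], nr / nr_ns);

		printf("ns%d: cancel %d of %d\n", i, cancel, nr_locks[i]);
	}
	return 0;
}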
This way we avoid - * calling recalc under @ns lock what is really good as we get - * rid of potential deadlock on client nodes when canceling - * locks synchronously. - */ - mutex_lock(ldlm_namespace_lock(client)); - if (list_empty(ldlm_namespace_list(client))) { - mutex_unlock(ldlm_namespace_lock(client)); - break; - } - ns = ldlm_namespace_first_locked(client); - - if (ns_old == ns) { /* Full pass complete */ - mutex_unlock(ldlm_namespace_lock(client)); - break; - } - - /* We got an empty namespace, need to move it back to inactive - * list. - * The race with parallel resource creation is fine: - * - If they do namespace_get before our check, we fail the - * check and they move this item to the end of the list anyway - * - If we do the check and then they do namespace_get, then - * we move the namespace to inactive and they will move - * it back to active (synchronised by the lock, so no clash - * there). - */ - if (ldlm_ns_empty(ns)) { - ldlm_namespace_move_to_inactive_locked(ns, client); - mutex_unlock(ldlm_namespace_lock(client)); - continue; - } - - if (!ns_old) - ns_old = ns; - - spin_lock(&ns->ns_lock); - /* - * skip ns which is being freed, and we don't want to increase - * its refcount again, not even temporarily. bz21519 & LU-499. - */ - if (ns->ns_stopping) { - skip = 1; - } else { - skip = 0; - ldlm_namespace_get(ns); - } - spin_unlock(&ns->ns_lock); - - ldlm_namespace_move_to_active_locked(ns, client); - mutex_unlock(ldlm_namespace_lock(client)); - - /* - * After setup is done - recalc the pool. - */ - if (!skip) { - int ttime = ldlm_pool_recalc(&ns->ns_pool); - - if (ttime < time) - time = ttime; - - ldlm_namespace_put(ns); - } - } - - /* Wake up the blocking threads from time to time. */ - ldlm_bl_thread_wakeup(); - - schedule_delayed_work(&ldlm_recalc_pools, time * HZ); -} - -static int ldlm_pools_thread_start(void) -{ - schedule_delayed_work(&ldlm_recalc_pools, 0); - - return 0; -} - -static void ldlm_pools_thread_stop(void) -{ - cancel_delayed_work_sync(&ldlm_recalc_pools); -} - -static struct shrinker ldlm_pools_cli_shrinker = { - .count_objects = ldlm_pools_cli_count, - .scan_objects = ldlm_pools_cli_scan, - .seeks = DEFAULT_SEEKS, -}; - -int ldlm_pools_init(void) -{ - int rc; - - rc = ldlm_pools_thread_start(); - if (!rc) - rc = register_shrinker(&ldlm_pools_cli_shrinker); - - return rc; -} - -void ldlm_pools_fini(void) -{ - unregister_shrinker(&ldlm_pools_cli_shrinker); - - ldlm_pools_thread_stop(); -} diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c b/drivers/staging/lustre/lustre/ldlm/ldlm_request.c deleted file mode 100644 index cdc52eed6d85edbcabf03dec00c6f520141f9962..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ldlm/ldlm_request.c +++ /dev/null @@ -1,2033 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/** - * This file contains Asynchronous System Trap (AST) handlers and related - * LDLM request-processing routines. - * - * An AST is a callback issued on a lock when its state is changed. There are - * several different types of ASTs (callbacks) registered for each lock: - * - * - completion AST: when a lock is enqueued by some process, but cannot be - * granted immediately due to other conflicting locks on the same resource, - * the completion AST is sent to notify the caller when the lock is - * eventually granted - * - * - blocking AST: when a lock is granted to some process, if another process - * enqueues a conflicting (blocking) lock on a resource, a blocking AST is - * sent to notify the holder(s) of the lock(s) of the conflicting lock - * request. The lock holder(s) must release their lock(s) on that resource in - * a timely manner or be evicted by the server. - * - * - glimpse AST: this is used when a process wants information about a lock - * (i.e. the lock value block (LVB)) but does not necessarily require holding - * the lock. If the resource is locked, the lock holder(s) are sent glimpse - * ASTs and the LVB is returned to the caller, and lock holder(s) may CANCEL - * their lock(s) if they are idle. If the resource is not locked, the server - * may grant the lock. - */ - -#define DEBUG_SUBSYSTEM S_LDLM - -#include -#include -#include -#include -#include - -#include "ldlm_internal.h" - -unsigned int ldlm_enqueue_min = OBD_TIMEOUT_DEFAULT; -module_param(ldlm_enqueue_min, uint, 0644); -MODULE_PARM_DESC(ldlm_enqueue_min, "lock enqueue timeout minimum"); - -/* in client side, whether the cached locks will be canceled before replay */ -unsigned int ldlm_cancel_unused_locks_before_replay = 1; - -struct ldlm_async_args { - struct lustre_handle lock_handle; -}; - -/** - * ldlm_request_bufsize - * - * @count: number of ldlm handles - * @type: ldlm opcode - * - * If opcode=LDLM_ENQUEUE, 1 slot is already occupied, - * LDLM_LOCKREQ_HANDLE -1 slots are available. - * Otherwise, LDLM_LOCKREQ_HANDLE slots are available. 
- * - * Return: size of the request buffer - */ -static int ldlm_request_bufsize(int count, int type) -{ - int avail = LDLM_LOCKREQ_HANDLES; - - if (type == LDLM_ENQUEUE) - avail -= LDLM_ENQUEUE_CANCEL_OFF; - - if (count > avail) - avail = (count - avail) * sizeof(struct lustre_handle); - else - avail = 0; - - return sizeof(struct ldlm_request) + avail; -} - -static void ldlm_expired_completion_wait(struct ldlm_lock *lock, __u32 conn_cnt) -{ - struct obd_import *imp; - struct obd_device *obd; - - if (!lock->l_conn_export) { - static unsigned long next_dump, last_dump; - - LDLM_ERROR(lock, - "lock timed out (enqueued at %lld, %llds ago); not entering recovery in server code, just going back to sleep", - (s64)lock->l_last_activity, - (s64)(ktime_get_real_seconds() - - lock->l_last_activity)); - if (time_after(jiffies, next_dump)) { - last_dump = next_dump; - next_dump = jiffies + 300 * HZ; - ldlm_namespace_dump(D_DLMTRACE, - ldlm_lock_to_ns(lock)); - if (last_dump == 0) - libcfs_debug_dumplog(); - } - return; - } - - obd = lock->l_conn_export->exp_obd; - imp = obd->u.cli.cl_import; - ptlrpc_fail_import(imp, conn_cnt); - LDLM_ERROR(lock, - "lock timed out (enqueued at %lld, %llds ago), entering recovery for %s@%s", - (s64)lock->l_last_activity, - (s64)(ktime_get_real_seconds() - lock->l_last_activity), - obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid); -} - -/** - * Calculate the Completion timeout (covering enqueue, BL AST, data flush, - * lock cancel, and their replies). Used for lock completion timeout on the - * client side. - * - * \param[in] lock lock which is waiting the completion callback - * - * \retval timeout in seconds to wait for the server reply - */ -/* We use the same basis for both server side and client side functions - * from a single node. - */ -static unsigned int ldlm_cp_timeout(struct ldlm_lock *lock) -{ - unsigned int timeout; - - if (AT_OFF) - return obd_timeout; - - /* - * Wait a long time for enqueue - server may have to callback a - * lock from another client. Server will evict the other client if it - * doesn't respond reasonably, and then give us the lock. - */ - timeout = at_get(ldlm_lock_to_ns_at(lock)); - return max(3 * timeout, ldlm_enqueue_min); -} - -/** - * Helper function for ldlm_completion_ast(), updating timings when lock is - * actually granted. - */ -static int ldlm_completion_tail(struct ldlm_lock *lock, void *data) -{ - long delay; - int result = 0; - - if (ldlm_is_destroyed(lock) || ldlm_is_failed(lock)) { - LDLM_DEBUG(lock, "client-side enqueue: destroyed"); - result = -EIO; - } else if (!data) { - LDLM_DEBUG(lock, "client-side enqueue: granted"); - } else { - /* Take into AT only CP RPC, not immediately granted locks */ - delay = ktime_get_real_seconds() - lock->l_last_activity; - LDLM_DEBUG(lock, "client-side enqueue: granted after %lds", - delay); - - /* Update our time estimate */ - at_measured(ldlm_lock_to_ns_at(lock), delay); - } - return result; -} - -/** - * Generic LDLM "completion" AST. This is called in several cases: - * - * - when a reply to an ENQUEUE RPC is received from the server - * (ldlm_cli_enqueue_fini()). Lock might be granted or not granted at - * this point (determined by flags); - * - * - when LDLM_CP_CALLBACK RPC comes to client to notify it that lock has - * been granted; - * - * - when ldlm_lock_match(LDLM_FL_LVB_READY) is about to wait until lock - * gets correct lvb; - * - * - to force all locks when resource is destroyed (cleanup_resource()); - * - * - during lock conversion (not used currently). 
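ldlm_cp_timeout() above reduces to max(3 * at_estimate, ldlm_enqueue_min): trust the adaptive-timeout history, tripled to cover the enqueue, AST and cancel round trips, but never drop below the configured floor. A minimal sketch with made-up inputs (the real code obtains the estimate via at_get()):

#include <stdio.h>

static unsigned int cp_timeout(unsigned int at_estimate,
			       unsigned int enqueue_min)
{
	unsigned int timeout = 3 * at_estimate;

	/* never wait less than the configured minimum */
	return timeout > enqueue_min ? timeout : enqueue_min;
}

int main(void)
{
	printf("%u\n", cp_timeout(5, 100));	/* floor wins: 100 */
	printf("%u\n", cp_timeout(60, 100));	/* 3 * 60 = 180 */
	return 0;
}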
- * - * If lock is not granted in the first case, this function waits until second - * or penultimate cases happen in some other thread. - * - */ -int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data) -{ - /* XXX ALLOCATE - 160 bytes */ - struct obd_device *obd; - struct obd_import *imp = NULL; - __u32 timeout; - __u32 conn_cnt = 0; - int rc = 0; - - if (flags == LDLM_FL_WAIT_NOREPROC) { - LDLM_DEBUG(lock, "client-side enqueue waiting on pending lock"); - goto noreproc; - } - - if (!(flags & LDLM_FL_BLOCKED_MASK)) { - wake_up(&lock->l_waitq); - return 0; - } - - LDLM_DEBUG(lock, - "client-side enqueue returned a blocked lock, sleeping"); - -noreproc: - - obd = class_exp2obd(lock->l_conn_export); - - /* if this is a local lock, then there is no import */ - if (obd) - imp = obd->u.cli.cl_import; - - timeout = ldlm_cp_timeout(lock); - - lock->l_last_activity = ktime_get_real_seconds(); - - if (imp) { - spin_lock(&imp->imp_lock); - conn_cnt = imp->imp_conn_cnt; - spin_unlock(&imp->imp_lock); - } - if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LDLM_INTR_CP_AST, - OBD_FAIL_LDLM_CP_BL_RACE | OBD_FAIL_ONCE)) { - ldlm_set_fail_loc(lock); - rc = -EINTR; - } else { - /* Go to sleep until the lock is granted or canceled. */ - if (!ldlm_is_no_timeout(lock)) { - /* Wait uninterruptible for a while first */ - rc = wait_event_idle_timeout(lock->l_waitq, - is_granted_or_cancelled(lock), - timeout * HZ); - if (rc == 0) - ldlm_expired_completion_wait(lock, conn_cnt); - } - /* Now wait abortable */ - if (rc == 0) - rc = l_wait_event_abortable(lock->l_waitq, - is_granted_or_cancelled(lock)); - else - rc = 0; - } - - if (rc) { - LDLM_DEBUG(lock, "client-side enqueue waking up: failed (%d)", - rc); - return rc; - } - - return ldlm_completion_tail(lock, data); -} -EXPORT_SYMBOL(ldlm_completion_ast); - -static void failed_lock_cleanup(struct ldlm_namespace *ns, - struct ldlm_lock *lock, int mode) -{ - int need_cancel = 0; - - /* Set a flag to prevent us from sending a CANCEL (bug 407) */ - lock_res_and_lock(lock); - /* Check that lock is not granted or failed, we might race. */ - if ((lock->l_req_mode != lock->l_granted_mode) && - !ldlm_is_failed(lock)) { - /* Make sure that this lock will not be found by raced - * bl_ast and -EINVAL reply is sent to server anyways. - * bug 17645 - */ - lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_FAILED | - LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING; - need_cancel = 1; - } - unlock_res_and_lock(lock); - - if (need_cancel) - LDLM_DEBUG(lock, - "setting FL_LOCAL_ONLY | LDLM_FL_FAILED | LDLM_FL_ATOMIC_CB | LDLM_FL_CBPENDING"); - else - LDLM_DEBUG(lock, "lock was granted or failed in race"); - - /* XXX - HACK because we shouldn't call ldlm_lock_destroy() - * from llite/file.c/ll_file_flock(). - */ - /* This code makes for the fact that we do not have blocking handler on - * a client for flock locks. As such this is the place where we must - * completely kill failed locks. (interrupted and those that - * were waiting to be granted when server evicted us. - */ - if (lock->l_resource->lr_type == LDLM_FLOCK) { - lock_res_and_lock(lock); - if (!ldlm_is_destroyed(lock)) { - ldlm_resource_unlink_lock(lock); - ldlm_lock_decref_internal_nolock(lock, mode); - ldlm_lock_destroy_nolock(lock); - } - unlock_res_and_lock(lock); - } else { - ldlm_lock_decref_internal(lock, mode); - } -} - -/** - * Finishing portion of client lock enqueue code. - * - * Called after receiving reply from server. 
- */ -int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, - enum ldlm_type type, __u8 with_policy, - enum ldlm_mode mode, - __u64 *flags, void *lvb, __u32 lvb_len, - const struct lustre_handle *lockh, int rc) -{ - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - int is_replay = *flags & LDLM_FL_REPLAY; - struct ldlm_lock *lock; - struct ldlm_reply *reply; - int cleanup_phase = 1; - - lock = ldlm_handle2lock(lockh); - /* ldlm_cli_enqueue is holding a reference on this lock. */ - if (!lock) { - LASSERT(type == LDLM_FLOCK); - return -ENOLCK; - } - - LASSERTF(ergo(lvb_len != 0, lvb_len == lock->l_lvb_len), - "lvb_len = %d, l_lvb_len = %d\n", lvb_len, lock->l_lvb_len); - - if (rc != ELDLM_OK) { - LASSERT(!is_replay); - LDLM_DEBUG(lock, "client-side enqueue END (%s)", - rc == ELDLM_LOCK_ABORTED ? "ABORTED" : "FAILED"); - - if (rc != ELDLM_LOCK_ABORTED) - goto cleanup; - } - - /* Before we return, swab the reply */ - reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - if (!reply) { - rc = -EPROTO; - goto cleanup; - } - - if (lvb_len > 0) { - int size = 0; - - size = req_capsule_get_size(&req->rq_pill, &RMF_DLM_LVB, - RCL_SERVER); - if (size < 0) { - LDLM_ERROR(lock, "Fail to get lvb_len, rc = %d", size); - rc = size; - goto cleanup; - } else if (unlikely(size > lvb_len)) { - LDLM_ERROR(lock, - "Replied LVB is larger than expectation, expected = %d, replied = %d", - lvb_len, size); - rc = -EINVAL; - goto cleanup; - } - lvb_len = size; - } - - if (rc == ELDLM_LOCK_ABORTED) { - if (lvb_len > 0 && lvb) - rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, - lvb, lvb_len); - if (rc == 0) - rc = ELDLM_LOCK_ABORTED; - goto cleanup; - } - - /* lock enqueued on the server */ - cleanup_phase = 0; - - lock_res_and_lock(lock); - lock->l_remote_handle = reply->lock_handle; - - *flags = ldlm_flags_from_wire(reply->lock_flags); - lock->l_flags |= ldlm_flags_from_wire(reply->lock_flags & - LDLM_FL_INHERIT_MASK); - unlock_res_and_lock(lock); - - CDEBUG(D_INFO, "local: %p, remote cookie: %#llx, flags: 0x%llx\n", - lock, reply->lock_handle.cookie, *flags); - - /* If enqueue returned a blocked lock but the completion handler has - * already run, then it fixed up the resource and we don't need to do it - * again. 
- */ - if ((*flags) & LDLM_FL_LOCK_CHANGED) { - int newmode = reply->lock_desc.l_req_mode; - - LASSERT(!is_replay); - if (newmode && newmode != lock->l_req_mode) { - LDLM_DEBUG(lock, "server returned different mode %s", - ldlm_lockname[newmode]); - lock->l_req_mode = newmode; - } - - if (!ldlm_res_eq(&reply->lock_desc.l_resource.lr_name, - &lock->l_resource->lr_name)) { - CDEBUG(D_INFO, - "remote intent success, locking " DLDLMRES " instead of " DLDLMRES "\n", - PLDLMRES(&reply->lock_desc.l_resource), - PLDLMRES(lock->l_resource)); - - rc = ldlm_lock_change_resource(ns, lock, - &reply->lock_desc.l_resource.lr_name); - if (rc || !lock->l_resource) { - rc = -ENOMEM; - goto cleanup; - } - LDLM_DEBUG(lock, "client-side enqueue, new resource"); - } - if (with_policy) - if (!(type == LDLM_IBITS && - !(exp_connect_flags(exp) & OBD_CONNECT_IBITS))) - /* We assume lock type cannot change on server*/ - ldlm_convert_policy_to_local(exp, - lock->l_resource->lr_type, - &reply->lock_desc.l_policy_data, - &lock->l_policy_data); - if (type != LDLM_PLAIN) - LDLM_DEBUG(lock, - "client-side enqueue, new policy data"); - } - - if ((*flags) & LDLM_FL_AST_SENT) { - lock_res_and_lock(lock); - lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_BL_AST; - unlock_res_and_lock(lock); - LDLM_DEBUG(lock, "enqueue reply includes blocking AST"); - } - - /* If the lock has already been granted by a completion AST, don't - * clobber the LVB with an older one. - */ - if (lvb_len > 0) { - /* We must lock or a racing completion might update lvb without - * letting us know and we'll clobber the correct value. - * Cannot unlock after the check either, as that still leaves - * a tiny window for completion to get in - */ - lock_res_and_lock(lock); - if (lock->l_req_mode != lock->l_granted_mode) - rc = ldlm_fill_lvb(lock, &req->rq_pill, RCL_SERVER, - lock->l_lvb_data, lvb_len); - unlock_res_and_lock(lock); - if (rc < 0) { - cleanup_phase = 1; - goto cleanup; - } - } - - if (!is_replay) { - rc = ldlm_lock_enqueue(ns, &lock, NULL, flags); - if (lock->l_completion_ast) { - int err = lock->l_completion_ast(lock, *flags, NULL); - - if (!rc) - rc = err; - if (rc) - cleanup_phase = 1; - } - } - - if (lvb_len > 0 && lvb) { - /* Copy the LVB here, and not earlier, because the completion - * AST (if any) can override what we got in the reply - */ - memcpy(lvb, lock->l_lvb_data, lvb_len); - } - - LDLM_DEBUG(lock, "client-side enqueue END"); -cleanup: - if (cleanup_phase == 1 && rc) - failed_lock_cleanup(ns, lock, mode); - /* Put lock 2 times, the second reference is held by ldlm_cli_enqueue */ - LDLM_LOCK_PUT(lock); - LDLM_LOCK_RELEASE(lock); - return rc; -} -EXPORT_SYMBOL(ldlm_cli_enqueue_fini); - -/** - * Estimate number of lock handles that would fit into request of given - * size. PAGE_SIZE-512 is to allow TCP/IP and LNET headers to fit into - * a single page on the send/receive side. XXX: 512 should be changed to - * more adequate value. 
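The sizing rule described above, implemented next in ldlm_req_handles_avail(), is short integer arithmetic: clamp the request to min(LDLM_MAXREQSIZE, PAGE_SIZE - 512), divide the leftover bytes by the handle size, then add back the handles already embedded in the base request. A standalone sketch with assumed constants (the real values live in the lustre headers):

#include <stdio.h>

#define PAGE_SIZE		4096
#define LDLM_MAXREQSIZE		(5 * 1024)
#define LDLM_LOCKREQ_HANDLES	2
#define HANDLE_SIZE		8	/* sizeof(struct lustre_handle) */

static int req_handles_avail(int req_size, int off)
{
	int max = LDLM_MAXREQSIZE < PAGE_SIZE - 512 ?
		  LDLM_MAXREQSIZE : PAGE_SIZE - 512;
	int avail = max - req_size;

	avail = avail >= 0 ? avail / HANDLE_SIZE : 0;
	return avail + LDLM_LOCKREQ_HANDLES - off;
}

int main(void)
{
	/* a 1 KiB request leaves (3584 - 1024) / 8 + 2 = 322 handles */
	printf("%d\n", req_handles_avail(1024, 0));
	return 0;
}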
- */ -static inline int ldlm_req_handles_avail(int req_size, int off) -{ - int avail; - - avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512) - req_size; - if (likely(avail >= 0)) - avail /= (int)sizeof(struct lustre_handle); - else - avail = 0; - avail += LDLM_LOCKREQ_HANDLES - off; - - return avail; -} - -static inline int ldlm_capsule_handles_avail(struct req_capsule *pill, - enum req_location loc, - int off) -{ - u32 size = req_capsule_msg_size(pill, loc); - - return ldlm_req_handles_avail(size, off); -} - -static inline int ldlm_format_handles_avail(struct obd_import *imp, - const struct req_format *fmt, - enum req_location loc, int off) -{ - u32 size = req_capsule_fmt_size(imp->imp_msg_magic, fmt, loc); - - return ldlm_req_handles_avail(size, off); -} - -/** - * Cancel LRU locks and pack them into the enqueue request. Pack there the given - * \a count locks in \a cancels. - * - * This is to be called by functions preparing their own requests that - * might contain lists of locks to cancel in addition to actual operation - * that needs to be performed. - */ -int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, - int version, int opc, int canceloff, - struct list_head *cancels, int count) -{ - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - struct req_capsule *pill = &req->rq_pill; - struct ldlm_request *dlm = NULL; - int flags, avail, to_free, pack = 0; - LIST_HEAD(head); - int rc; - - if (!cancels) - cancels = &head; - if (ns_connect_cancelset(ns)) { - /* Estimate the amount of available space in the request. */ - req_capsule_filled_sizes(pill, RCL_CLIENT); - avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); - - flags = ns_connect_lru_resize(ns) ? - LDLM_LRU_FLAG_LRUR_NO_WAIT : LDLM_LRU_FLAG_AGED; - to_free = !ns_connect_lru_resize(ns) && - opc == LDLM_ENQUEUE ? 1 : 0; - - /* Cancel LRU locks here _only_ if the server supports - * EARLY_CANCEL. Otherwise we have to send extra CANCEL - * RPC, which will make us slower. - */ - if (avail > count) - count += ldlm_cancel_lru_local(ns, cancels, to_free, - avail - count, 0, flags); - if (avail > count) - pack = count; - else - pack = avail; - req_capsule_set_size(pill, &RMF_DLM_REQ, RCL_CLIENT, - ldlm_request_bufsize(pack, opc)); - } - - rc = ptlrpc_request_pack(req, version, opc); - if (rc) { - ldlm_lock_list_put(cancels, l_bl_ast, count); - return rc; - } - - if (ns_connect_cancelset(ns)) { - if (canceloff) { - dlm = req_capsule_client_get(pill, &RMF_DLM_REQ); - LASSERT(dlm); - /* Skip first lock handler in ldlm_request_pack(), - * this method will increment @lock_count according - * to the lock handle amount actually written to - * the buffer. - */ - dlm->lock_count = canceloff; - } - /* Pack into the request @pack lock handles. */ - ldlm_cli_cancel_list(cancels, pack, req, 0); - /* Prepare and send separate cancel RPC for others. 
*/ - ldlm_cli_cancel_list(cancels, count - pack, NULL, 0); - } else { - ldlm_lock_list_put(cancels, l_bl_ast, count); - } - return 0; -} -EXPORT_SYMBOL(ldlm_prep_elc_req); - -int ldlm_prep_enqueue_req(struct obd_export *exp, struct ptlrpc_request *req, - struct list_head *cancels, int count) -{ - return ldlm_prep_elc_req(exp, req, LUSTRE_DLM_VERSION, LDLM_ENQUEUE, - LDLM_ENQUEUE_CANCEL_OFF, cancels, count); -} -EXPORT_SYMBOL(ldlm_prep_enqueue_req); - -static struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, - int lvb_len) -{ - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); - if (!req) - return ERR_PTR(-ENOMEM); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); - ptlrpc_request_set_replen(req); - return req; -} - -/** - * Client-side lock enqueue. - * - * If a request has some specific initialisation it is passed in \a reqp, - * otherwise it is created in ldlm_cli_enqueue. - * - * Supports sync and async requests, pass \a async flag accordingly. If a - * request was created in ldlm_cli_enqueue and it is the async request, - * pass it to the caller in \a reqp. - */ -int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, - struct ldlm_enqueue_info *einfo, - const struct ldlm_res_id *res_id, - union ldlm_policy_data const *policy, __u64 *flags, - void *lvb, __u32 lvb_len, enum lvb_type lvb_type, - struct lustre_handle *lockh, int async) -{ - struct ldlm_namespace *ns; - struct ldlm_lock *lock; - struct ldlm_request *body; - int is_replay = *flags & LDLM_FL_REPLAY; - int req_passed_in = 1; - int rc, err; - struct ptlrpc_request *req; - - ns = exp->exp_obd->obd_namespace; - - /* If we're replaying this lock, just check some invariants. - * If we're creating a new lock, get everything all setup nicely. 
- */ - if (is_replay) { - lock = ldlm_handle2lock_long(lockh, 0); - LASSERT(lock); - LDLM_DEBUG(lock, "client-side enqueue START"); - LASSERT(exp == lock->l_conn_export); - } else { - const struct ldlm_callback_suite cbs = { - .lcs_completion = einfo->ei_cb_cp, - .lcs_blocking = einfo->ei_cb_bl, - .lcs_glimpse = einfo->ei_cb_gl - }; - lock = ldlm_lock_create(ns, res_id, einfo->ei_type, - einfo->ei_mode, &cbs, einfo->ei_cbdata, - lvb_len, lvb_type); - if (IS_ERR(lock)) - return PTR_ERR(lock); - /* for the local lock, add the reference */ - ldlm_lock_addref_internal(lock, einfo->ei_mode); - ldlm_lock2handle(lock, lockh); - if (policy) - lock->l_policy_data = *policy; - - if (einfo->ei_type == LDLM_EXTENT) { - /* extent lock without policy is a bug */ - if (!policy) - LBUG(); - - lock->l_req_extent = policy->l_extent; - } - LDLM_DEBUG(lock, "client-side enqueue START, flags %llx", - *flags); - } - - lock->l_conn_export = exp; - lock->l_export = NULL; - lock->l_blocking_ast = einfo->ei_cb_bl; - lock->l_flags |= (*flags & (LDLM_FL_NO_LRU | LDLM_FL_EXCL)); - lock->l_last_activity = ktime_get_real_seconds(); - - /* lock not sent to server yet */ - if (!reqp || !*reqp) { - req = ldlm_enqueue_pack(exp, lvb_len); - if (IS_ERR(req)) { - failed_lock_cleanup(ns, lock, einfo->ei_mode); - LDLM_LOCK_RELEASE(lock); - return PTR_ERR(req); - } - - req_passed_in = 0; - if (reqp) - *reqp = req; - } else { - int len; - - req = *reqp; - len = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, - RCL_CLIENT); - LASSERTF(len >= sizeof(*body), "buflen[%d] = %d, not %d\n", - DLM_LOCKREQ_OFF, len, (int)sizeof(*body)); - } - - /* Dump lock data into the request buffer */ - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - ldlm_lock2desc(lock, &body->lock_desc); - body->lock_flags = ldlm_flags_to_wire(*flags); - body->lock_handle[0] = *lockh; - - if (async) { - LASSERT(reqp); - return 0; - } - - LDLM_DEBUG(lock, "sending request"); - - rc = ptlrpc_queue_wait(req); - - err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0, - einfo->ei_mode, flags, lvb, lvb_len, - lockh, rc); - - /* If ldlm_cli_enqueue_fini did not find the lock, we need to free - * one reference that we took - */ - if (err == -ENOLCK) - LDLM_LOCK_RELEASE(lock); - else - rc = err; - - if (!req_passed_in && req) { - ptlrpc_req_finished(req); - if (reqp) - *reqp = NULL; - } - - return rc; -} -EXPORT_SYMBOL(ldlm_cli_enqueue); - -/** - * Cancel locks locally. - * Returns: - * \retval LDLM_FL_LOCAL_ONLY if there is no need for a CANCEL RPC to the server - * \retval LDLM_FL_CANCELING otherwise; - * \retval LDLM_FL_BL_AST if there is a need for a separate CANCEL RPC. - */ -static __u64 ldlm_cli_cancel_local(struct ldlm_lock *lock) -{ - __u64 rc = LDLM_FL_LOCAL_ONLY; - - if (lock->l_conn_export) { - bool local_only; - - LDLM_DEBUG(lock, "client-side cancel"); - /* Set this flag to prevent others from getting new references*/ - lock_res_and_lock(lock); - ldlm_set_cbpending(lock); - local_only = !!(lock->l_flags & - (LDLM_FL_LOCAL_ONLY | LDLM_FL_CANCEL_ON_BLOCK)); - ldlm_cancel_callback(lock); - rc = ldlm_is_bl_ast(lock) ? LDLM_FL_BL_AST : LDLM_FL_CANCELING; - unlock_res_and_lock(lock); - - if (local_only) { - CDEBUG(D_DLMTRACE, - "not sending request (at caller's instruction)\n"); - rc = LDLM_FL_LOCAL_ONLY; - } - ldlm_lock_cancel(lock); - } else { - LDLM_ERROR(lock, "Trying to cancel local lock"); - LBUG(); - } - - return rc; -} - -/** - * Pack \a count locks in \a head into ldlm_request buffer of request \a req. 
- */ -static void ldlm_cancel_pack(struct ptlrpc_request *req, - struct list_head *head, int count) -{ - struct ldlm_request *dlm; - struct ldlm_lock *lock; - int max, packed = 0; - - dlm = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - LASSERT(dlm); - - /* Check the room in the request buffer. */ - max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - - sizeof(struct ldlm_request); - max /= sizeof(struct lustre_handle); - max += LDLM_LOCKREQ_HANDLES; - LASSERT(max >= dlm->lock_count + count); - - /* XXX: it would be better to pack lock handles grouped by resource. - * so that the server cancel would call filter_lvbo_update() less - * frequently. - */ - list_for_each_entry(lock, head, l_bl_ast) { - if (!count--) - break; - LASSERT(lock->l_conn_export); - /* Pack the lock handle to the given request buffer. */ - LDLM_DEBUG(lock, "packing"); - dlm->lock_handle[dlm->lock_count++] = lock->l_remote_handle; - packed++; - } - CDEBUG(D_DLMTRACE, "%d locks packed\n", packed); -} - -/** - * Prepare and send a batched cancel RPC. It will include \a count lock - * handles of locks given in \a cancels list. - */ -static int ldlm_cli_cancel_req(struct obd_export *exp, - struct list_head *cancels, - int count, enum ldlm_cancel_flags flags) -{ - struct ptlrpc_request *req = NULL; - struct obd_import *imp; - int free, sent = 0; - int rc = 0; - - LASSERT(exp); - LASSERT(count > 0); - - CFS_FAIL_TIMEOUT(OBD_FAIL_LDLM_PAUSE_CANCEL, cfs_fail_val); - - if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_RACE)) - return count; - - free = ldlm_format_handles_avail(class_exp2cliimp(exp), - &RQF_LDLM_CANCEL, RCL_CLIENT, 0); - if (count > free) - count = free; - - while (1) { - imp = class_exp2cliimp(exp); - if (!imp || imp->imp_invalid) { - CDEBUG(D_DLMTRACE, - "skipping cancel on invalid import %p\n", imp); - return count; - } - - req = ptlrpc_request_alloc(imp, &RQF_LDLM_CANCEL); - if (!req) { - rc = -ENOMEM; - goto out; - } - - req_capsule_filled_sizes(&req->rq_pill, RCL_CLIENT); - req_capsule_set_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT, - ldlm_request_bufsize(count, LDLM_CANCEL)); - - rc = ptlrpc_request_pack(req, LUSTRE_DLM_VERSION, LDLM_CANCEL); - if (rc) { - ptlrpc_request_free(req); - goto out; - } - - req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL; - req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL; - ptlrpc_at_set_req_timeout(req); - - ldlm_cancel_pack(req, cancels, count); - - ptlrpc_request_set_replen(req); - if (flags & LCF_ASYNC) { - ptlrpcd_add_req(req); - sent = count; - goto out; - } - - rc = ptlrpc_queue_wait(req); - if (rc == LUSTRE_ESTALE) { - CDEBUG(D_DLMTRACE, - "client/server (nid %s) out of sync -- not fatal\n", - libcfs_nid2str(req->rq_import-> - imp_connection->c_peer.nid)); - rc = 0; - } else if (rc == -ETIMEDOUT && /* check there was no reconnect*/ - req->rq_import_generation == imp->imp_generation) { - ptlrpc_req_finished(req); - continue; - } else if (rc != ELDLM_OK) { - /* -ESHUTDOWN is common on umount */ - CDEBUG_LIMIT(rc == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, - "Got rc %d from cancel RPC: canceling anyway\n", - rc); - break; - } - sent = count; - break; - } - - ptlrpc_req_finished(req); -out: - return sent ? sent : rc; -} - -static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) -{ - return &imp->imp_obd->obd_namespace->ns_pool; -} - -/** - * Update client's OBD pool related fields with new SLV and Limit from \a req. 
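ldlm_cli_update_pool(), which follows, only publishes the SLV and limit from the reply under obd_pool_lock; the pool thread consumes them later. A userspace sketch of that publish step, with a pthread rwlock standing in for the kernel's and simplified field names:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct obd_pool {
	pthread_rwlock_t lock;
	uint64_t slv;	/* server lock volume from the last reply */
	uint32_t limit;	/* server-imposed lock count limit */
};

static void pool_update(struct obd_pool *p, uint64_t slv, uint32_t limit)
{
	/* ignore replies with zeroed fields: server lacks LRU resize */
	if (slv == 0 || limit == 0)
		return;
	pthread_rwlock_wrlock(&p->lock);
	p->slv = slv;
	p->limit = limit;
	pthread_rwlock_unlock(&p->lock);
}

int main(void)
{
	struct obd_pool p = { PTHREAD_RWLOCK_INITIALIZER, 0, 0 };

	pool_update(&p, 0, 100);	/* dropped: zero SLV */
	pool_update(&p, 1000000, 100);	/* accepted */
	printf("slv=%llu limit=%u\n", (unsigned long long)p.slv, p.limit);
	return 0;
}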
- */ -int ldlm_cli_update_pool(struct ptlrpc_request *req) -{ - struct obd_device *obd; - __u64 new_slv; - __u32 new_limit; - - if (unlikely(!req->rq_import || !req->rq_import->imp_obd || - !imp_connect_lru_resize(req->rq_import))) { - /* - * Do nothing for corner cases. - */ - return 0; - } - - /* In some cases RPC may contain SLV and limit zeroed out. This - * is the case when server does not support LRU resize feature. - * This is also possible in some recovery cases when server-side - * reqs have no reference to the OBD export and thus access to - * server-side namespace is not possible. - */ - if (lustre_msg_get_slv(req->rq_repmsg) == 0 || - lustre_msg_get_limit(req->rq_repmsg) == 0) { - DEBUG_REQ(D_HA, req, - "Zero SLV or Limit found (SLV: %llu, Limit: %u)", - lustre_msg_get_slv(req->rq_repmsg), - lustre_msg_get_limit(req->rq_repmsg)); - return 0; - } - - new_limit = lustre_msg_get_limit(req->rq_repmsg); - new_slv = lustre_msg_get_slv(req->rq_repmsg); - obd = req->rq_import->imp_obd; - - /* Set new SLV and limit in OBD fields to make them accessible - * to the pool thread. We do not access obd_namespace and pool - * directly here as there is no reliable way to make sure that - * they are still alive at cleanup time. Evil races are possible - * which may cause Oops at that time. - */ - write_lock(&obd->obd_pool_lock); - obd->obd_pool_slv = new_slv; - obd->obd_pool_limit = new_limit; - write_unlock(&obd->obd_pool_lock); - - return 0; -} - -/** - * Client side lock cancel. - * - * Lock must not have any readers or writers by this time. - */ -int ldlm_cli_cancel(const struct lustre_handle *lockh, - enum ldlm_cancel_flags cancel_flags) -{ - struct obd_export *exp; - int avail, flags, count = 1; - __u64 rc = 0; - struct ldlm_namespace *ns; - struct ldlm_lock *lock; - LIST_HEAD(cancels); - - lock = ldlm_handle2lock_long(lockh, 0); - if (!lock) { - LDLM_DEBUG_NOLOCK("lock is already being destroyed"); - return 0; - } - - lock_res_and_lock(lock); - /* Lock is being canceled and the caller doesn't want to wait */ - if (ldlm_is_canceling(lock) && (cancel_flags & LCF_ASYNC)) { - unlock_res_and_lock(lock); - LDLM_LOCK_RELEASE(lock); - return 0; - } - - ldlm_set_canceling(lock); - unlock_res_and_lock(lock); - - rc = ldlm_cli_cancel_local(lock); - if (rc == LDLM_FL_LOCAL_ONLY || cancel_flags & LCF_LOCAL) { - LDLM_LOCK_RELEASE(lock); - return 0; - } - /* Even if the lock is marked as LDLM_FL_BL_AST, this is a LDLM_CANCEL - * RPC which goes to canceld portal, so we can cancel other LRU locks - * here and send them all as one LDLM_CANCEL RPC. - */ - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, &cancels); - - exp = lock->l_conn_export; - if (exp_connect_cancelset(exp)) { - avail = ldlm_format_handles_avail(class_exp2cliimp(exp), - &RQF_LDLM_CANCEL, - RCL_CLIENT, 0); - LASSERT(avail > 0); - - ns = ldlm_lock_to_ns(lock); - flags = ns_connect_lru_resize(ns) ? - LDLM_LRU_FLAG_LRUR : LDLM_LRU_FLAG_AGED; - count += ldlm_cancel_lru_local(ns, &cancels, 0, avail - 1, - LCF_BL_AST, flags); - } - ldlm_cli_cancel_list(&cancels, count, NULL, cancel_flags); - return 0; -} -EXPORT_SYMBOL(ldlm_cli_cancel); - -/** - * Locally cancel up to \a count locks in list \a cancels. - * Return the number of cancelled locks. 
- */
-int ldlm_cli_cancel_list_local(struct list_head *cancels, int count,
-			       enum ldlm_cancel_flags flags)
-{
-	LIST_HEAD(head);
-	struct ldlm_lock *lock, *next;
-	int left = 0, bl_ast = 0;
-	__u64 rc;
-
-	left = count;
-	list_for_each_entry_safe(lock, next, cancels, l_bl_ast) {
-		if (left-- == 0)
-			break;
-
-		if (flags & LCF_LOCAL) {
-			rc = LDLM_FL_LOCAL_ONLY;
-			ldlm_lock_cancel(lock);
-		} else {
-			rc = ldlm_cli_cancel_local(lock);
-		}
-		/* Until we have compound requests and can send LDLM_CANCEL
-		 * requests batched with generic RPCs, we need to send cancels
-		 * with the LDLM_FL_BL_AST flag in a separate RPC from
-		 * the one being generated now.
-		 */
-		if (!(flags & LCF_BL_AST) && (rc == LDLM_FL_BL_AST)) {
-			LDLM_DEBUG(lock, "Cancel lock separately");
-			list_del_init(&lock->l_bl_ast);
-			list_add(&lock->l_bl_ast, &head);
-			bl_ast++;
-			continue;
-		}
-		if (rc == LDLM_FL_LOCAL_ONLY) {
-			/* CANCEL RPC should not be sent to server. */
-			list_del_init(&lock->l_bl_ast);
-			LDLM_LOCK_RELEASE(lock);
-			count--;
-		}
-	}
-	if (bl_ast > 0) {
-		count -= bl_ast;
-		ldlm_cli_cancel_list(&head, bl_ast, NULL, 0);
-	}
-
-	return count;
-}
-
-/**
- * Cancel as many locks as possible w/o sending any RPCs (e.g. to write back
- * dirty data, to close a file, ...) or waiting for any RPCs in-flight (e.g.
- * readahead requests, ...).
- */
-static enum ldlm_policy_res
-ldlm_cancel_no_wait_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock,
-			   int unused, int added, int count)
-{
-	enum ldlm_policy_res result = LDLM_POLICY_CANCEL_LOCK;
-
-	/* Don't check added & count since we want to process all locks
-	 * from the unused list.
-	 * It's fine not to take the lock to access lock->l_resource since
-	 * the lock has already been granted, so it won't change.
-	 */
-	switch (lock->l_resource->lr_type) {
-	case LDLM_EXTENT:
-	case LDLM_IBITS:
-		if (ns->ns_cancel && ns->ns_cancel(lock) != 0)
-			break;
-		/* fall through */
-	default:
-		result = LDLM_POLICY_SKIP_LOCK;
-		lock_res_and_lock(lock);
-		ldlm_set_skipped(lock);
-		unlock_res_and_lock(lock);
-		break;
-	}
-
-	return result;
-}
-
-/**
- * Callback function for LRU-resize policy. Decides whether to keep
- * \a lock in LRU for current \a LRU size \a unused, added in current
- * scan \a added and number of locks to be preferably canceled \a count.
- *
- * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU and stop scanning
- *
- * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU
- */
-static enum ldlm_policy_res ldlm_cancel_lrur_policy(struct ldlm_namespace *ns,
-						    struct ldlm_lock *lock,
-						    int unused, int added,
-						    int count)
-{
-	unsigned long cur = jiffies;
-	struct ldlm_pool *pl = &ns->ns_pool;
-	__u64 slv, lvf, lv;
-	unsigned long la;
-
-	/* Stop LRU processing when we reach past @count or have checked all
-	 * locks in LRU.
-	 */
-	if (count && added >= count)
-		return LDLM_POLICY_KEEP_LOCK;
-
-	/*
-	 * Regardless of the LV, it doesn't make sense to keep a lock
-	 * which has been unused for longer than ns_max_age.
-	 */
-	if (time_after(jiffies, lock->l_last_used + ns->ns_max_age))
-		return LDLM_POLICY_CANCEL_LOCK;
-
-	slv = ldlm_pool_get_slv(pl);
-	lvf = ldlm_pool_get_lvf(pl);
-	la = (cur - lock->l_last_used) / HZ;
-	lv = lvf * la * unused;
-
-	/* Inform pool about current CLV to see it via debugfs. */
-	ldlm_pool_set_clv(pl, lv);
-
-	/* Stop when the SLV has not yet been received from the server,
-	 * or lv is smaller than the SLV.
-	 */
-	if (slv == 0 || lv < slv)
-		return LDLM_POLICY_KEEP_LOCK;
-
-	return LDLM_POLICY_CANCEL_LOCK;
-}
-
-/**
- * Callback function for the 'passed' policy used via debugfs.
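The LRU-resize decision above reduces to comparing the client-computed volume lv = lvf * age * unused against the server's SLV, with an unconditional cut-off once the lock has idled past ns_max_age. A standalone model of that decision (all numbers illustrative, not tuned defaults):

#include <stdint.h>
#include <stdio.h>

enum policy { KEEP_LOCK, CANCEL_LOCK };

static enum policy lrur_policy(uint64_t slv, uint64_t lvf,
			       unsigned long age_sec, int unused,
			       unsigned long max_age_sec)
{
	uint64_t lv;

	if (age_sec > max_age_sec)	/* too old regardless of volume */
		return CANCEL_LOCK;

	lv = lvf * age_sec * unused;	/* client lock volume */
	if (slv == 0 || lv < slv)	/* no SLV yet, or still cheap */
		return KEEP_LOCK;
	return CANCEL_LOCK;
}

int main(void)
{
	/* young lock in a small LRU stays cached: prints 0 (KEEP_LOCK) */
	printf("%d\n", lrur_policy(1000000, 1, 10, 100, 3600));
	/* same lock under a shrunken SLV is cancelled: prints 1 */
	printf("%d\n", lrur_policy(500, 1, 10, 100, 3600));
	return 0;
}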
Makes decision whether to keep - * \a lock in LRU for current \a LRU size \a unused, added in current scan \a - * added and number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static enum ldlm_policy_res ldlm_cancel_passed_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - /* Stop LRU processing when we reach past @count or have checked all - * locks in LRU. - */ - return (added >= count) ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; -} - -/** - * Callback function for aged policy. Makes decision whether to keep \a lock in - * LRU for current LRU size \a unused, added in current scan \a added and - * number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static enum ldlm_policy_res ldlm_cancel_aged_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - if ((added >= count) && - time_before(jiffies, lock->l_last_used + ns->ns_max_age)) - return LDLM_POLICY_KEEP_LOCK; - - return LDLM_POLICY_CANCEL_LOCK; -} - -static enum ldlm_policy_res -ldlm_cancel_lrur_no_wait_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, - int count) -{ - enum ldlm_policy_res result; - - result = ldlm_cancel_lrur_policy(ns, lock, unused, added, count); - if (result == LDLM_POLICY_KEEP_LOCK) - return result; - - return ldlm_cancel_no_wait_policy(ns, lock, unused, added, count); -} - -/** - * Callback function for default policy. Makes decision whether to keep \a lock - * in LRU for current LRU size \a unused, added in current scan \a added and - * number of locks to be preferably canceled \a count. - * - * \retval LDLM_POLICY_KEEP_LOCK keep lock in LRU in stop scanning - * - * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU - */ -static enum ldlm_policy_res -ldlm_cancel_default_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, int count) -{ - /* Stop LRU processing when we reach past count or have checked all - * locks in LRU. - */ - return (added >= count) ? - LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; -} - -typedef enum ldlm_policy_res (*ldlm_cancel_lru_policy_t)( - struct ldlm_namespace *, - struct ldlm_lock *, int, - int, int); - -static ldlm_cancel_lru_policy_t -ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) -{ - if (flags & LDLM_LRU_FLAG_NO_WAIT) - return ldlm_cancel_no_wait_policy; - - if (ns_connect_lru_resize(ns)) { - if (flags & LDLM_LRU_FLAG_SHRINK) - /* We kill passed number of old locks. */ - return ldlm_cancel_passed_policy; - else if (flags & LDLM_LRU_FLAG_LRUR) - return ldlm_cancel_lrur_policy; - else if (flags & LDLM_LRU_FLAG_PASSED) - return ldlm_cancel_passed_policy; - else if (flags & LDLM_LRU_FLAG_LRUR_NO_WAIT) - return ldlm_cancel_lrur_no_wait_policy; - } else { - if (flags & LDLM_LRU_FLAG_AGED) - return ldlm_cancel_aged_policy; - } - - return ldlm_cancel_default_policy; -} - -/** - * - Free space in LRU for \a count new locks, - * redundant unused locks are canceled locally; - * - also cancel locally unused aged locks; - * - do not cancel more than \a max locks; - * - GET the found locks and add them into the \a cancels list. - * - * A client lock can be added to the l_bl_ast list only when it is - * marked LDLM_FL_CANCELING. 
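ldlm_cancel_lru_policy() above picks one of these callbacks once per scan, and ldlm_prepare_lru_list() then applies it to every candidate lock. A reduced sketch of that function-pointer dispatch, with made-up flag values and two toy policies:

#include <stdio.h>

#define FLAG_NO_WAIT	0x1
#define FLAG_AGED	0x2

typedef int (*lru_policy_t)(int unused, int added, int count);

static int no_wait_policy(int unused, int added, int count)
{
	(void)unused; (void)added; (void)count;
	return 1;	/* cancel everything we are allowed to touch */
}

static int aged_policy(int unused, int added, int count)
{
	(void)unused;
	return added < count;	/* cancel until the quota is met */
}

static lru_policy_t pick_policy(int flags)
{
	if (flags & FLAG_NO_WAIT)
		return no_wait_policy;
	return aged_policy;	/* default when LRU resize is off */
}

int main(void)
{
	lru_policy_t pf = pick_policy(FLAG_AGED);

	printf("cancel? %d\n", pf(50, 3, 10));	/* 3 < 10 -> 1 */
	return 0;
}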
Otherwise, somebody is already doing - * CANCEL. There are the following use cases: - * ldlm_cancel_resource_local(), ldlm_cancel_lru_local() and - * ldlm_cli_cancel(), which check and set this flag properly. As any - * attempt to cancel a lock rely on this flag, l_bl_ast list is accessed - * later without any special locking. - * - * Calling policies for enabled LRU resize: - * ---------------------------------------- - * flags & LDLM_LRU_FLAG_LRUR - use LRU resize policy (SLV from server) to - * cancel not more than \a count locks; - * - * flags & LDLM_LRU_FLAG_PASSED - cancel \a count number of old locks (located - * at the beginning of LRU list); - * - * flags & LDLM_LRU_FLAG_SHRINK - cancel not more than \a count locks according - * to memory pressure policy function; - * - * flags & LDLM_LRU_FLAG_AGED - cancel \a count locks according to - * "aged policy". - * - * flags & LDLM_LRU_FLAG_NO_WAIT - cancel as many unused locks as possible - * (typically before replaying locks) w/o - * sending any RPCs or waiting for any - * outstanding RPC to complete. - */ -static int ldlm_prepare_lru_list(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, - int flags) -{ - ldlm_cancel_lru_policy_t pf; - struct ldlm_lock *lock, *next; - int added = 0, unused, remained; - int no_wait = flags & - (LDLM_LRU_FLAG_NO_WAIT | LDLM_LRU_FLAG_LRUR_NO_WAIT); - - spin_lock(&ns->ns_lock); - unused = ns->ns_nr_unused; - remained = unused; - - if (!ns_connect_lru_resize(ns)) - count += unused - ns->ns_max_unused; - - pf = ldlm_cancel_lru_policy(ns, flags); - LASSERT(pf); - - while (!list_empty(&ns->ns_unused_list)) { - enum ldlm_policy_res result; - time_t last_use = 0; - - /* all unused locks */ - if (remained-- <= 0) - break; - - /* For any flags, stop scanning if @max is reached. */ - if (max && added >= max) - break; - - list_for_each_entry_safe(lock, next, &ns->ns_unused_list, - l_lru) { - /* No locks which got blocking requests. */ - LASSERT(!ldlm_is_bl_ast(lock)); - - if (no_wait && ldlm_is_skipped(lock)) - /* already processed */ - continue; - - last_use = lock->l_last_used; - if (last_use == jiffies) - continue; - - /* Somebody is already doing CANCEL. No need for this - * lock in LRU, do not traverse it again. - */ - if (!ldlm_is_canceling(lock)) - break; - - ldlm_lock_remove_from_lru_nolock(lock); - } - if (&lock->l_lru == &ns->ns_unused_list) - break; - - LDLM_LOCK_GET(lock); - spin_unlock(&ns->ns_lock); - lu_ref_add(&lock->l_reference, __func__, current); - - /* Pass the lock through the policy filter and see if it - * should stay in LRU. - * - * Even for shrinker policy we stop scanning if - * we find a lock that should stay in the cache. - * We should take into account lock age anyway - * as a new lock is a valuable resource even if - * it has a low weight. - * - * That is, for shrinker policy we drop only - * old locks, but additionally choose them by - * their weight. Big extent locks will stay in - * the cache. - */ - result = pf(ns, lock, unused, added, count); - if (result == LDLM_POLICY_KEEP_LOCK) { - lu_ref_del(&lock->l_reference, - __func__, current); - LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_lock); - break; - } - if (result == LDLM_POLICY_SKIP_LOCK) { - lu_ref_del(&lock->l_reference, - __func__, current); - LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_lock); - continue; - } - - lock_res_and_lock(lock); - /* Check flags again under the lock. 
*/ - if (ldlm_is_canceling(lock) || - (ldlm_lock_remove_from_lru_check(lock, last_use) == 0)) { - /* Another thread is removing lock from LRU, or - * somebody is already doing CANCEL, or there - * is a blocking request which will send cancel - * by itself, or the lock is no longer unused or - * the lock has been used since the pf() call and - * pages could be put under it. - */ - unlock_res_and_lock(lock); - lu_ref_del(&lock->l_reference, - __func__, current); - LDLM_LOCK_RELEASE(lock); - spin_lock(&ns->ns_lock); - continue; - } - LASSERT(!lock->l_readers && !lock->l_writers); - - /* If we have chosen to cancel this lock voluntarily, we - * better send cancel notification to server, so that it - * frees appropriate state. This might lead to a race - * where while we are doing cancel here, server is also - * silently cancelling this lock. - */ - ldlm_clear_cancel_on_block(lock); - - /* Setting the CBPENDING flag is a little misleading, - * but prevents an important race; namely, once - * CBPENDING is set, the lock can accumulate no more - * readers/writers. Since readers and writers are - * already zero here, ldlm_lock_decref() won't see - * this flag and call l_blocking_ast - */ - lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING; - - /* We can't re-add to l_lru as it confuses the - * refcounting in ldlm_lock_remove_from_lru() if an AST - * arrives after we drop lr_lock below. We use l_bl_ast - * and can't use l_pending_chain as it is used both on - * server and client nevertheless bug 5666 says it is - * used only on server - */ - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, cancels); - unlock_res_and_lock(lock); - lu_ref_del(&lock->l_reference, __func__, current); - spin_lock(&ns->ns_lock); - added++; - unused--; - } - spin_unlock(&ns->ns_lock); - return added; -} - -int ldlm_cancel_lru_local(struct ldlm_namespace *ns, - struct list_head *cancels, int count, int max, - enum ldlm_cancel_flags cancel_flags, int flags) -{ - int added; - - added = ldlm_prepare_lru_list(ns, cancels, count, max, flags); - if (added <= 0) - return added; - return ldlm_cli_cancel_list_local(cancels, added, cancel_flags); -} - -/** - * Cancel at least \a nr locks from given namespace LRU. - * - * When called with LCF_ASYNC the blocking callback will be handled - * in a thread and this function will return after the thread has been - * asked to call the callback. When called with LCF_ASYNC the blocking - * callback will be performed in this function. - */ -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, - enum ldlm_cancel_flags cancel_flags, - int flags) -{ - LIST_HEAD(cancels); - int count, rc; - - /* Just prepare the list of locks, do not actually cancel them yet. - * Locks are cancelled later in a separate thread. - */ - count = ldlm_prepare_lru_list(ns, &cancels, nr, 0, flags); - rc = ldlm_bl_to_thread_list(ns, NULL, &cancels, count, cancel_flags); - if (rc == 0) - return count; - - return 0; -} - -/** - * Find and cancel locally unused locks found on resource, matched to the - * given policy, mode. GET the found locks and add them into the \a cancels - * list. 
- */ -int ldlm_cancel_resource_local(struct ldlm_resource *res, - struct list_head *cancels, - union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 lock_flags, - enum ldlm_cancel_flags cancel_flags, - void *opaque) -{ - struct ldlm_lock *lock; - int count = 0; - - lock_res(res); - list_for_each_entry(lock, &res->lr_granted, l_res_link) { - if (opaque && lock->l_ast_data != opaque) { - LDLM_ERROR(lock, "data %p doesn't match opaque %p", - lock->l_ast_data, opaque); - continue; - } - - if (lock->l_readers || lock->l_writers) - continue; - - /* If somebody is already doing CANCEL, or blocking AST came, - * skip this lock. - */ - if (ldlm_is_bl_ast(lock) || ldlm_is_canceling(lock)) - continue; - - if (lockmode_compat(lock->l_granted_mode, mode)) - continue; - - /* If policy is given and this is IBITS lock, add to list only - * those locks that match by policy. - */ - if (policy && (lock->l_resource->lr_type == LDLM_IBITS) && - !(lock->l_policy_data.l_inodebits.bits & - policy->l_inodebits.bits)) - continue; - - /* See CBPENDING comment in ldlm_cancel_lru */ - lock->l_flags |= LDLM_FL_CBPENDING | LDLM_FL_CANCELING | - lock_flags; - - LASSERT(list_empty(&lock->l_bl_ast)); - list_add(&lock->l_bl_ast, cancels); - LDLM_LOCK_GET(lock); - count++; - } - unlock_res(res); - - return ldlm_cli_cancel_list_local(cancels, count, cancel_flags); -} -EXPORT_SYMBOL(ldlm_cancel_resource_local); - -/** - * Cancel client-side locks from a list and send/prepare cancel RPCs to the - * server. - * If \a req is NULL, send CANCEL request to server with handles of locks - * in the \a cancels. If EARLY_CANCEL is not supported, send CANCEL requests - * separately per lock. - * If \a req is not NULL, put handles of locks in \a cancels into the request - * buffer at the offset \a off. - * Destroy \a cancels at the end. - */ -int ldlm_cli_cancel_list(struct list_head *cancels, int count, - struct ptlrpc_request *req, - enum ldlm_cancel_flags flags) -{ - struct ldlm_lock *lock; - int res = 0; - - if (list_empty(cancels) || count == 0) - return 0; - - /* XXX: requests (both batched and not) could be sent in parallel. - * Usually it is enough to have just 1 RPC, but it is possible that - * there are too many locks to be cancelled in LRU or on a resource. - * It would also speed up the case when the server does not support - * the feature. - */ - while (count > 0) { - LASSERT(!list_empty(cancels)); - lock = list_first_entry(cancels, struct ldlm_lock, l_bl_ast); - LASSERT(lock->l_conn_export); - - if (exp_connect_cancelset(lock->l_conn_export)) { - res = count; - if (req) - ldlm_cancel_pack(req, cancels, count); - else - res = ldlm_cli_cancel_req(lock->l_conn_export, - cancels, count, - flags); - } else { - res = ldlm_cli_cancel_req(lock->l_conn_export, - cancels, 1, flags); - } - - if (res < 0) { - CDEBUG_LIMIT(res == -ESHUTDOWN ? D_DLMTRACE : D_ERROR, - "%s: %d\n", __func__, res); - res = count; - } - - count -= res; - ldlm_lock_list_put(cancels, l_bl_ast, res); - } - LASSERT(count == 0); - return 0; -} -EXPORT_SYMBOL(ldlm_cli_cancel_list); - -/** - * Cancel all locks on a resource that have 0 readers/writers. - * - * If flags & LDLM_FL_LOCAL_ONLY, throw the locks away without trying - * to notify the server. 
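The IBITS filter in ldlm_cancel_resource_local() above is a plain bitmask test: a lock joins the cancel list only if no policy was given or its inodebits overlap the policy mask. A small sketch (bit values are illustrative, not the real inodelock constants):

#include <stdint.h>
#include <stdio.h>

#define BIT_LOOKUP	0x1
#define BIT_UPDATE	0x2
#define BIT_OPEN	0x4

static int ibits_match(uint64_t lock_bits, uint64_t policy_bits)
{
	/* no policy given: everything matches */
	if (policy_bits == 0)
		return 1;
	return (lock_bits & policy_bits) != 0;
}

int main(void)
{
	printf("%d\n", ibits_match(BIT_LOOKUP | BIT_OPEN, BIT_UPDATE)); /* 0 */
	printf("%d\n", ibits_match(BIT_LOOKUP | BIT_OPEN, BIT_OPEN));   /* 1 */
	return 0;
}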
- */ -int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, - const struct ldlm_res_id *res_id, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - enum ldlm_cancel_flags flags, - void *opaque) -{ - struct ldlm_resource *res; - LIST_HEAD(cancels); - int count; - int rc; - - res = ldlm_resource_get(ns, NULL, res_id, 0, 0); - if (IS_ERR(res)) { - /* This is not a problem. */ - CDEBUG(D_INFO, "No resource %llu\n", res_id->name[0]); - return 0; - } - - LDLM_RESOURCE_ADDREF(res); - count = ldlm_cancel_resource_local(res, &cancels, policy, mode, - 0, flags | LCF_BL_AST, opaque); - rc = ldlm_cli_cancel_list(&cancels, count, NULL, flags); - if (rc != ELDLM_OK) - CERROR("canceling unused lock " DLDLMRES ": rc = %d\n", - PLDLMRES(res), rc); - - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); - return 0; -} -EXPORT_SYMBOL(ldlm_cli_cancel_unused_resource); - -struct ldlm_cli_cancel_arg { - int lc_flags; - void *lc_opaque; -}; - -static int ldlm_cli_hash_cancel_unused(struct cfs_hash *hs, - struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - struct ldlm_cli_cancel_arg *lc = arg; - - ldlm_cli_cancel_unused_resource(ldlm_res_to_ns(res), &res->lr_name, - NULL, LCK_MINMODE, - lc->lc_flags, lc->lc_opaque); - /* must return 0 for hash iteration */ - return 0; -} - -/** - * Cancel all locks on a namespace (or a specific resource, if given) - * that have 0 readers/writers. - * - * If flags & LCF_LOCAL, throw the locks away without trying - * to notify the server. - */ -int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, - const struct ldlm_res_id *res_id, - enum ldlm_cancel_flags flags, void *opaque) -{ - struct ldlm_cli_cancel_arg arg = { - .lc_flags = flags, - .lc_opaque = opaque, - }; - - if (!ns) - return ELDLM_OK; - - if (res_id) { - return ldlm_cli_cancel_unused_resource(ns, res_id, NULL, - LCK_MINMODE, flags, - opaque); - } else { - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_cli_hash_cancel_unused, &arg, 0); - return ELDLM_OK; - } -} -EXPORT_SYMBOL(ldlm_cli_cancel_unused); - -/* Lock iterators. 
*/ - -static int ldlm_resource_foreach(struct ldlm_resource *res, - ldlm_iterator_t iter, void *closure) -{ - struct ldlm_lock *tmp; - struct ldlm_lock *lock; - int rc = LDLM_ITER_CONTINUE; - - if (!res) - return LDLM_ITER_CONTINUE; - - lock_res(res); - list_for_each_entry_safe(lock, tmp, &res->lr_granted, l_res_link) { - if (iter(lock, closure) == LDLM_ITER_STOP) { - rc = LDLM_ITER_STOP; - goto out; - } - } - - list_for_each_entry_safe(lock, tmp, &res->lr_waiting, l_res_link) { - if (iter(lock, closure) == LDLM_ITER_STOP) { - rc = LDLM_ITER_STOP; - goto out; - } - } - out: - unlock_res(res); - return rc; -} - -struct iter_helper_data { - ldlm_iterator_t iter; - void *closure; -}; - -static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) -{ - struct iter_helper_data *helper = closure; - - return helper->iter(lock, helper->closure); -} - -static int ldlm_res_iter_helper(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) - -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - - return ldlm_resource_foreach(res, ldlm_iter_helper, arg) == - LDLM_ITER_STOP; -} - -static void ldlm_namespace_foreach(struct ldlm_namespace *ns, - ldlm_iterator_t iter, void *closure) - -{ - struct iter_helper_data helper = { - .iter = iter, - .closure = closure, - }; - - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_res_iter_helper, &helper, 0); -} - -/* non-blocking function to manipulate a lock whose cb_data is being put away. - * return 0: find no resource - * > 0: must be LDLM_ITER_STOP/LDLM_ITER_CONTINUE. - * < 0: errors - */ -int ldlm_resource_iterate(struct ldlm_namespace *ns, - const struct ldlm_res_id *res_id, - ldlm_iterator_t iter, void *data) -{ - struct ldlm_resource *res; - int rc; - - LASSERTF(ns, "must pass in namespace\n"); - - res = ldlm_resource_get(ns, NULL, res_id, 0, 0); - if (IS_ERR(res)) - return 0; - - LDLM_RESOURCE_ADDREF(res); - rc = ldlm_resource_foreach(res, iter, data); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); - return rc; -} -EXPORT_SYMBOL(ldlm_resource_iterate); - -/* Lock replay */ - -static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) -{ - struct list_head *list = closure; - - /* we use l_pending_chain here, because it's unused on clients. */ - LASSERTF(list_empty(&lock->l_pending_chain), - "lock %p next %p prev %p\n", - lock, &lock->l_pending_chain.next, - &lock->l_pending_chain.prev); - /* bug 9573: don't replay locks left after eviction, or - * bug 17614: locks being actively cancelled. Get a reference - * on a lock so that it does not disappear under us (e.g. 
due to cancel) - */ - if (!(lock->l_flags & (LDLM_FL_FAILED | LDLM_FL_BL_DONE))) { - list_add(&lock->l_pending_chain, list); - LDLM_LOCK_GET(lock); - } - - return LDLM_ITER_CONTINUE; -} - -static int replay_lock_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct ldlm_async_args *aa, int rc) -{ - struct ldlm_lock *lock; - struct ldlm_reply *reply; - struct obd_export *exp; - - atomic_dec(&req->rq_import->imp_replay_inflight); - if (rc != ELDLM_OK) - goto out; - - reply = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - if (!reply) { - rc = -EPROTO; - goto out; - } - - lock = ldlm_handle2lock(&aa->lock_handle); - if (!lock) { - CERROR("received replay ack for unknown local cookie %#llx remote cookie %#llx from server %s id %s\n", - aa->lock_handle.cookie, reply->lock_handle.cookie, - req->rq_export->exp_client_uuid.uuid, - libcfs_id2str(req->rq_peer)); - rc = -ESTALE; - goto out; - } - - /* Key change rehash lock in per-export hash with new key */ - exp = req->rq_export; - lock->l_remote_handle = reply->lock_handle; - - LDLM_DEBUG(lock, "replayed lock:"); - ptlrpc_import_recovery_state_machine(req->rq_import); - LDLM_LOCK_PUT(lock); -out: - if (rc != ELDLM_OK) - ptlrpc_connect_import(req->rq_import); - - return rc; -} - -static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) -{ - struct ptlrpc_request *req; - struct ldlm_async_args *aa; - struct ldlm_request *body; - int flags; - - /* Bug 11974: Do not replay a lock which is actively being canceled */ - if (ldlm_is_bl_done(lock)) { - LDLM_DEBUG(lock, "Not replaying canceled lock:"); - return 0; - } - - /* If this is reply-less callback lock, we cannot replay it, since - * server might have long dropped it, but notification of that event was - * lost by network. (and server granted conflicting lock already) - */ - if (ldlm_is_cancel_on_block(lock)) { - LDLM_DEBUG(lock, "Not replaying reply-less lock:"); - ldlm_lock_cancel(lock); - return 0; - } - - /* - * If granted mode matches the requested mode, this lock is granted. - * - * If they differ, but we have a granted mode, then we were granted - * one mode and now want another: ergo, converting. - * - * If we haven't been granted anything and are on a resource list, - * then we're blocked/waiting. - * - * If we haven't been granted anything and we're NOT on a resource list, - * then we haven't got a reply yet and don't have a known disposition. - * This happens whenever a lock enqueue is the request that triggers - * recovery. - */ - if (lock->l_granted_mode == lock->l_req_mode) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; - else if (lock->l_granted_mode) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; - else if (!list_empty(&lock->l_res_link)) - flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; - else - flags = LDLM_FL_REPLAY; - - req = ptlrpc_request_alloc_pack(imp, &RQF_LDLM_ENQUEUE, - LUSTRE_DLM_VERSION, LDLM_ENQUEUE); - if (!req) - return -ENOMEM; - - /* We're part of recovery, so don't wait for it. */ - req->rq_send_state = LUSTRE_IMP_REPLAY_LOCKS; - - body = req_capsule_client_get(&req->rq_pill, &RMF_DLM_REQ); - ldlm_lock2desc(lock, &body->lock_desc); - body->lock_flags = ldlm_flags_to_wire(flags); - - ldlm_lock2handle(lock, &body->lock_handle[0]); - if (lock->l_lvb_len > 0) - req_capsule_extend(&req->rq_pill, &RQF_LDLM_ENQUEUE_LVB); - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, - lock->l_lvb_len); - ptlrpc_request_set_replen(req); - /* notify the server we've replayed all requests. 
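The comment above encodes a four-way disposition: granted, converting, blocked/waiting, or enqueue-in-flight. A compact restatement of that ladder, using toy types and illustrative flag values rather than the real LDLM_FL_* bits:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits only; not the real LDLM_FL_* wire values. */
#define TOY_FL_REPLAY		0x1
#define TOY_FL_BLOCK_GRANTED	0x2
#define TOY_FL_BLOCK_CONV	0x4
#define TOY_FL_BLOCK_WAIT	0x8

struct toy_lock {
	int granted_mode;	/* 0 means nothing granted yet */
	int req_mode;
	bool on_resource_list;
};

/* Same ladder as replay_one_lock(): granted, converting, waiting,
 * or an enqueue still in flight (REPLAY alone).
 */
static int toy_replay_flags(const struct toy_lock *lock)
{
	if (lock->granted_mode == lock->req_mode)
		return TOY_FL_REPLAY | TOY_FL_BLOCK_GRANTED;
	if (lock->granted_mode)
		return TOY_FL_REPLAY | TOY_FL_BLOCK_CONV;
	if (lock->on_resource_list)
		return TOY_FL_REPLAY | TOY_FL_BLOCK_WAIT;
	return TOY_FL_REPLAY;
}

int main(void)
{
	struct toy_lock granted = { 5, 5, true };
	struct toy_lock waiting = { 0, 5, true };

	printf("granted: %#x, waiting: %#x\n",
	       toy_replay_flags(&granted), toy_replay_flags(&waiting));
	return 0;
}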
- * also, we mark the request to be put on a dedicated
- * queue to be processed after all request replays.
- * bug 6063
- */
-	lustre_msg_set_flags(req->rq_reqmsg, MSG_REQ_REPLAY_DONE);
-
-	LDLM_DEBUG(lock, "replaying lock:");
-
-	atomic_inc(&req->rq_import->imp_replay_inflight);
-	BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
-	aa = ptlrpc_req_async_args(req);
-	aa->lock_handle = body->lock_handle[0];
-	req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
-	ptlrpcd_add_req(req);
-
-	return 0;
-}
-
-/**
- * Cancel as many unused locks as possible before replay. Since we are
- * in recovery, we can't wait for any outstanding RPCs to send any RPC
- * to the server.
- *
- * Called only in recovery before replaying locks. There is no need to
- * replay locks that are unused. Since the clients may hold thousands of
- * cached unused locks, dropping the unused locks can greatly reduce the
- * load on the servers at recovery time.
- */
-static void ldlm_cancel_unused_locks_for_replay(struct ldlm_namespace *ns)
-{
-	int canceled;
-	LIST_HEAD(cancels);
-
-	CDEBUG(D_DLMTRACE,
-	       "Dropping as many unused locks as possible before replay for namespace %s (%d)\n",
-	       ldlm_ns_name(ns), ns->ns_nr_unused);
-
-	/* We don't need to care whether or not LRU resize is enabled
-	 * because the LDLM_LRU_FLAG_NO_WAIT policy doesn't use the
-	 * count parameter.
-	 */
-	canceled = ldlm_cancel_lru_local(ns, &cancels, ns->ns_nr_unused, 0,
-					 LCF_LOCAL, LDLM_LRU_FLAG_NO_WAIT);
-
-	CDEBUG(D_DLMTRACE, "Canceled %d unused locks from namespace %s\n",
-	       canceled, ldlm_ns_name(ns));
-}
-
-int ldlm_replay_locks(struct obd_import *imp)
-{
-	struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
-	LIST_HEAD(list);
-	struct ldlm_lock *lock, *next;
-	int rc = 0;
-
-	LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
-
-	/* don't replay locks if import failed recovery */
-	if (imp->imp_vbr_failed)
-		return 0;
-
-	/* ensure this doesn't fall to 0 before all have been queued */
-	atomic_inc(&imp->imp_replay_inflight);
-
-	if (ldlm_cancel_unused_locks_before_replay)
-		ldlm_cancel_unused_locks_for_replay(ns);
-
-	ldlm_namespace_foreach(ns, ldlm_chain_lock_for_replay, &list);
-
-	list_for_each_entry_safe(lock, next, &list, l_pending_chain) {
-		list_del_init(&lock->l_pending_chain);
-		if (rc) {
-			LDLM_LOCK_RELEASE(lock);
-			continue; /* or try to do the rest? */
-		}
-		rc = replay_one_lock(imp, lock);
-		LDLM_LOCK_RELEASE(lock);
-	}
-
-	atomic_dec(&imp->imp_replay_inflight);
-
-	return rc;
-}
diff --git a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c b/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
deleted file mode 100644
index c93b019b8e37ce623748a71b5bf3cc1672e62f1a..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/ldlm/ldlm_resource.c
+++ /dev/null
@@ -1,1318 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ldlm/ldlm_resource.c - * - * Author: Phil Schwan - * Author: Peter Braam - */ - -#define DEBUG_SUBSYSTEM S_LDLM -#include -#include -#include -#include "ldlm_internal.h" -#include - -struct kmem_cache *ldlm_resource_slab, *ldlm_lock_slab; - -int ldlm_srv_namespace_nr; -int ldlm_cli_namespace_nr; - -struct mutex ldlm_srv_namespace_lock; -LIST_HEAD(ldlm_srv_namespace_list); - -struct mutex ldlm_cli_namespace_lock; -/* Client Namespaces that have active resources in them. - * Once all resources go away, ldlm_poold moves such namespaces to the - * inactive list - */ -LIST_HEAD(ldlm_cli_active_namespace_list); -/* Client namespaces that don't have any locks in them */ -static LIST_HEAD(ldlm_cli_inactive_namespace_list); - -static struct dentry *ldlm_debugfs_dir; -static struct dentry *ldlm_ns_debugfs_dir; -struct dentry *ldlm_svc_debugfs_dir; - -/* during debug dump certain amount of granted locks for one resource to avoid - * DDOS. - */ -static unsigned int ldlm_dump_granted_max = 256; - -static ssize_t -lprocfs_wr_dump_ns(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - ldlm_dump_all_namespaces(LDLM_NAMESPACE_SERVER, D_DLMTRACE); - ldlm_dump_all_namespaces(LDLM_NAMESPACE_CLIENT, D_DLMTRACE); - return count; -} - -LPROC_SEQ_FOPS_WR_ONLY(ldlm, dump_ns); - -static int ldlm_rw_uint_seq_show(struct seq_file *m, void *v) -{ - seq_printf(m, "%u\n", *(unsigned int *)m->private); - return 0; -} - -static ssize_t -ldlm_rw_uint_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *seq = file->private_data; - - if (count == 0) - return 0; - return kstrtouint_from_user(buffer, count, 0, - (unsigned int *)seq->private); -} - -LPROC_SEQ_FOPS(ldlm_rw_uint); - -static struct lprocfs_vars ldlm_debugfs_list[] = { - { "dump_namespaces", &ldlm_dump_ns_fops, NULL, 0222 }, - { "dump_granted_max", &ldlm_rw_uint_fops, &ldlm_dump_granted_max }, - { NULL } -}; - -void ldlm_debugfs_setup(void) -{ - ldlm_debugfs_dir = debugfs_create_dir(OBD_LDLM_DEVICENAME, - debugfs_lustre_root); - - ldlm_ns_debugfs_dir = debugfs_create_dir("namespaces", - ldlm_debugfs_dir); - - ldlm_svc_debugfs_dir = debugfs_create_dir("services", ldlm_debugfs_dir); - - ldebugfs_add_vars(ldlm_debugfs_dir, ldlm_debugfs_list, NULL); -} - -void ldlm_debugfs_cleanup(void) -{ - debugfs_remove_recursive(ldlm_svc_debugfs_dir); - debugfs_remove_recursive(ldlm_ns_debugfs_dir); - debugfs_remove_recursive(ldlm_debugfs_dir); -} - -static ssize_t resource_count_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - __u64 res = 0; - struct cfs_hash_bd bd; - int i; - - /* result is not strictly consistent */ - cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, i) - res += cfs_hash_bd_count_get(&bd); - return sprintf(buf, "%lld\n", res); -} -LUSTRE_RO_ATTR(resource_count); - -static ssize_t lock_count_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ldlm_namespace *ns = 
container_of(kobj, struct ldlm_namespace, - ns_kobj); - __u64 locks; - - locks = lprocfs_stats_collector(ns->ns_stats, LDLM_NSS_LOCKS, - LPROCFS_FIELDS_FLAGS_SUM); - return sprintf(buf, "%lld\n", locks); -} -LUSTRE_RO_ATTR(lock_count); - -static ssize_t lock_unused_count_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - - return sprintf(buf, "%d\n", ns->ns_nr_unused); -} -LUSTRE_RO_ATTR(lock_unused_count); - -static ssize_t lru_size_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - __u32 *nr = &ns->ns_max_unused; - - if (ns_connect_lru_resize(ns)) - nr = &ns->ns_nr_unused; - return sprintf(buf, "%u\n", *nr); -} - -static ssize_t lru_size_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - unsigned long tmp; - int lru_resize; - int err; - - if (strncmp(buffer, "clear", 5) == 0) { - CDEBUG(D_DLMTRACE, - "dropping all unused locks from namespace %s\n", - ldlm_ns_name(ns)); - if (ns_connect_lru_resize(ns)) { - int canceled, unused = ns->ns_nr_unused; - - /* Try to cancel all @ns_nr_unused locks. */ - canceled = ldlm_cancel_lru(ns, unused, 0, - LDLM_LRU_FLAG_PASSED); - if (canceled < unused) { - CDEBUG(D_DLMTRACE, - "not all requested locks are canceled, requested: %d, canceled: %d\n", - unused, - canceled); - return -EINVAL; - } - } else { - tmp = ns->ns_max_unused; - ns->ns_max_unused = 0; - ldlm_cancel_lru(ns, 0, 0, LDLM_LRU_FLAG_PASSED); - ns->ns_max_unused = tmp; - } - return count; - } - - err = kstrtoul(buffer, 10, &tmp); - if (err != 0) { - CERROR("lru_size: invalid value written\n"); - return -EINVAL; - } - lru_resize = (tmp == 0); - - if (ns_connect_lru_resize(ns)) { - if (!lru_resize) - ns->ns_max_unused = (unsigned int)tmp; - - if (tmp > ns->ns_nr_unused) - tmp = ns->ns_nr_unused; - tmp = ns->ns_nr_unused - tmp; - - CDEBUG(D_DLMTRACE, - "changing namespace %s unused locks from %u to %u\n", - ldlm_ns_name(ns), ns->ns_nr_unused, - (unsigned int)tmp); - ldlm_cancel_lru(ns, tmp, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); - - if (!lru_resize) { - CDEBUG(D_DLMTRACE, - "disable lru_resize for namespace %s\n", - ldlm_ns_name(ns)); - ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; - } - } else { - CDEBUG(D_DLMTRACE, - "changing namespace %s max_unused from %u to %u\n", - ldlm_ns_name(ns), ns->ns_max_unused, - (unsigned int)tmp); - ns->ns_max_unused = (unsigned int)tmp; - ldlm_cancel_lru(ns, 0, LCF_ASYNC, LDLM_LRU_FLAG_PASSED); - - /* Make sure that LRU resize was originally supported before - * turning it on here. 
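The lru_size store handler above accepts either the literal string "clear" or a decimal count, where writing 0 re-enables server-driven LRU resize. A userspace sketch of just that parsing contract, with strtoul standing in for kstrtoul and all names invented:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parsed meaning of a write to a hypothetical lru_size file. */
struct lru_cmd {
	int clear;		/* drop all unused locks now */
	int enable_resize;	/* 0 was written: let the server drive the LRU */
	unsigned long max_unused;
};

static int parse_lru_size(const char *buf, struct lru_cmd *cmd)
{
	char *end;

	memset(cmd, 0, sizeof(*cmd));
	if (strncmp(buf, "clear", 5) == 0) {
		cmd->clear = 1;
		return 0;
	}

	errno = 0;
	cmd->max_unused = strtoul(buf, &end, 10);
	if (errno || end == buf)
		return -EINVAL;	/* mirrors the kernel's "invalid value" path */

	cmd->enable_resize = (cmd->max_unused == 0);
	return 0;
}

int main(void)
{
	struct lru_cmd cmd;

	if (!parse_lru_size("0", &cmd))
		printf("resize=%d max=%lu\n", cmd.enable_resize, cmd.max_unused);
	return 0;
}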
- */ - if (lru_resize && - (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { - CDEBUG(D_DLMTRACE, - "enable lru_resize for namespace %s\n", - ldlm_ns_name(ns)); - ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; - } - } - - return count; -} -LUSTRE_RW_ATTR(lru_size); - -static ssize_t lru_max_age_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - - return sprintf(buf, "%u\n", ns->ns_max_age); -} - -static ssize_t lru_max_age_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - unsigned long tmp; - int err; - - err = kstrtoul(buffer, 10, &tmp); - if (err != 0) - return -EINVAL; - - ns->ns_max_age = tmp; - - return count; -} -LUSTRE_RW_ATTR(lru_max_age); - -static ssize_t early_lock_cancel_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - - return sprintf(buf, "%d\n", ns_connect_cancelset(ns)); -} - -static ssize_t early_lock_cancel_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - unsigned long supp = -1; - int rc; - - rc = kstrtoul(buffer, 10, &supp); - if (rc < 0) - return rc; - - if (supp == 0) - ns->ns_connect_flags &= ~OBD_CONNECT_CANCELSET; - else if (ns->ns_orig_connect_flags & OBD_CONNECT_CANCELSET) - ns->ns_connect_flags |= OBD_CONNECT_CANCELSET; - return count; -} -LUSTRE_RW_ATTR(early_lock_cancel); - -/* These are for namespaces in /sys/fs/lustre/ldlm/namespaces/ */ -static struct attribute *ldlm_ns_attrs[] = { - &lustre_attr_resource_count.attr, - &lustre_attr_lock_count.attr, - &lustre_attr_lock_unused_count.attr, - &lustre_attr_lru_size.attr, - &lustre_attr_lru_max_age.attr, - &lustre_attr_early_lock_cancel.attr, - NULL, -}; - -static void ldlm_ns_release(struct kobject *kobj) -{ - struct ldlm_namespace *ns = container_of(kobj, struct ldlm_namespace, - ns_kobj); - complete(&ns->ns_kobj_unregister); -} - -static struct kobj_type ldlm_ns_ktype = { - .default_attrs = ldlm_ns_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = ldlm_ns_release, -}; - -static void ldlm_namespace_debugfs_unregister(struct ldlm_namespace *ns) -{ - debugfs_remove_recursive(ns->ns_debugfs_entry); - - if (ns->ns_stats) - lprocfs_free_stats(&ns->ns_stats); -} - -static void ldlm_namespace_sysfs_unregister(struct ldlm_namespace *ns) -{ - kobject_put(&ns->ns_kobj); - wait_for_completion(&ns->ns_kobj_unregister); -} - -static int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) -{ - int err; - - ns->ns_kobj.kset = ldlm_ns_kset; - init_completion(&ns->ns_kobj_unregister); - err = kobject_init_and_add(&ns->ns_kobj, &ldlm_ns_ktype, NULL, - "%s", ldlm_ns_name(ns)); - - ns->ns_stats = lprocfs_alloc_stats(LDLM_NSS_LAST, 0); - if (!ns->ns_stats) { - kobject_put(&ns->ns_kobj); - return -ENOMEM; - } - - lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, - LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); - - return err; -} - -static int ldlm_namespace_debugfs_register(struct ldlm_namespace *ns) -{ - struct dentry *ns_entry; - - if (!IS_ERR_OR_NULL(ns->ns_debugfs_entry)) { - ns_entry = ns->ns_debugfs_entry; - } else { - ns_entry = debugfs_create_dir(ldlm_ns_name(ns), - ldlm_ns_debugfs_dir); - if (!ns_entry) - return -ENOMEM; - ns->ns_debugfs_entry = 
ns_entry; - } - - return 0; -} - -#undef MAX_STRING_SIZE - -static struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res) -{ - LASSERT(res); - LASSERT(res != LP_POISON); - atomic_inc(&res->lr_refcount); - CDEBUG(D_INFO, "getref res: %p count: %d\n", res, - atomic_read(&res->lr_refcount)); - return res; -} - -static unsigned int ldlm_res_hop_hash(struct cfs_hash *hs, - const void *key, unsigned int mask) -{ - const struct ldlm_res_id *id = key; - unsigned int val = 0; - unsigned int i; - - for (i = 0; i < RES_NAME_SIZE; i++) - val += id->name[i]; - return val & mask; -} - -static unsigned int ldlm_res_hop_fid_hash(struct cfs_hash *hs, - const void *key, unsigned int mask) -{ - const struct ldlm_res_id *id = key; - struct lu_fid fid; - __u32 hash; - __u32 val; - - fid.f_seq = id->name[LUSTRE_RES_ID_SEQ_OFF]; - fid.f_oid = (__u32)id->name[LUSTRE_RES_ID_VER_OID_OFF]; - fid.f_ver = (__u32)(id->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); - - hash = fid_flatten32(&fid); - hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ - if (id->name[LUSTRE_RES_ID_HSH_OFF] != 0) { - val = id->name[LUSTRE_RES_ID_HSH_OFF]; - hash += (val >> 5) + (val << 11); - } else { - val = fid_oid(&fid); - } - hash = hash_long(hash, hs->hs_bkt_bits); - /* give me another random factor */ - hash -= hash_long((unsigned long)hs, val % 11 + 3); - - hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; - hash |= ldlm_res_hop_hash(hs, key, CFS_HASH_NBKT(hs) - 1); - - return hash & mask; -} - -static void *ldlm_res_hop_key(struct hlist_node *hnode) -{ - struct ldlm_resource *res; - - res = hlist_entry(hnode, struct ldlm_resource, lr_hash); - return &res->lr_name; -} - -static int ldlm_res_hop_keycmp(const void *key, struct hlist_node *hnode) -{ - struct ldlm_resource *res; - - res = hlist_entry(hnode, struct ldlm_resource, lr_hash); - return ldlm_res_eq((const struct ldlm_res_id *)key, - (const struct ldlm_res_id *)&res->lr_name); -} - -static void *ldlm_res_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct ldlm_resource, lr_hash); -} - -static void ldlm_res_hop_get_locked(struct cfs_hash *hs, - struct hlist_node *hnode) -{ - struct ldlm_resource *res; - - res = hlist_entry(hnode, struct ldlm_resource, lr_hash); - ldlm_resource_getref(res); -} - -static void ldlm_res_hop_put(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct ldlm_resource *res; - - res = hlist_entry(hnode, struct ldlm_resource, lr_hash); - ldlm_resource_putref(res); -} - -static struct cfs_hash_ops ldlm_ns_hash_ops = { - .hs_hash = ldlm_res_hop_hash, - .hs_key = ldlm_res_hop_key, - .hs_keycmp = ldlm_res_hop_keycmp, - .hs_keycpy = NULL, - .hs_object = ldlm_res_hop_object, - .hs_get = ldlm_res_hop_get_locked, - .hs_put = ldlm_res_hop_put -}; - -static struct cfs_hash_ops ldlm_ns_fid_hash_ops = { - .hs_hash = ldlm_res_hop_fid_hash, - .hs_key = ldlm_res_hop_key, - .hs_keycmp = ldlm_res_hop_keycmp, - .hs_keycpy = NULL, - .hs_object = ldlm_res_hop_object, - .hs_get = ldlm_res_hop_get_locked, - .hs_put = ldlm_res_hop_put -}; - -struct ldlm_ns_hash_def { - enum ldlm_ns_type nsd_type; - /** hash bucket bits */ - unsigned int nsd_bkt_bits; - /** hash bits */ - unsigned int nsd_all_bits; - /** hash operations */ - struct cfs_hash_ops *nsd_hops; -}; - -static struct ldlm_ns_hash_def ldlm_ns_hash_defs[] = { - { - .nsd_type = LDLM_NS_TYPE_MDC, - .nsd_bkt_bits = 11, - .nsd_all_bits = 16, - .nsd_hops = &ldlm_ns_fid_hash_ops, - }, - { - .nsd_type = LDLM_NS_TYPE_MDT, - .nsd_bkt_bits = 14, - .nsd_all_bits = 21, - .nsd_hops = 
&ldlm_ns_fid_hash_ops, - }, - { - .nsd_type = LDLM_NS_TYPE_OSC, - .nsd_bkt_bits = 8, - .nsd_all_bits = 12, - .nsd_hops = &ldlm_ns_hash_ops, - }, - { - .nsd_type = LDLM_NS_TYPE_OST, - .nsd_bkt_bits = 11, - .nsd_all_bits = 17, - .nsd_hops = &ldlm_ns_hash_ops, - }, - { - .nsd_type = LDLM_NS_TYPE_MGC, - .nsd_bkt_bits = 4, - .nsd_all_bits = 4, - .nsd_hops = &ldlm_ns_hash_ops, - }, - { - .nsd_type = LDLM_NS_TYPE_MGT, - .nsd_bkt_bits = 4, - .nsd_all_bits = 4, - .nsd_hops = &ldlm_ns_hash_ops, - }, - { - .nsd_type = LDLM_NS_TYPE_UNKNOWN, - }, -}; - -/** Register \a ns in the list of namespaces */ -static void ldlm_namespace_register(struct ldlm_namespace *ns, - enum ldlm_side client) -{ - mutex_lock(ldlm_namespace_lock(client)); - LASSERT(list_empty(&ns->ns_list_chain)); - list_add(&ns->ns_list_chain, &ldlm_cli_inactive_namespace_list); - ldlm_namespace_nr_inc(client); - mutex_unlock(ldlm_namespace_lock(client)); -} - -/** - * Create and initialize new empty namespace. - */ -struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, - enum ldlm_side client, - enum ldlm_appetite apt, - enum ldlm_ns_type ns_type) -{ - struct ldlm_namespace *ns = NULL; - struct ldlm_ns_bucket *nsb; - struct ldlm_ns_hash_def *nsd; - struct cfs_hash_bd bd; - int idx; - int rc; - - LASSERT(obd); - - rc = ldlm_get_ref(); - if (rc) { - CERROR("ldlm_get_ref failed: %d\n", rc); - return NULL; - } - - for (idx = 0;; idx++) { - nsd = &ldlm_ns_hash_defs[idx]; - if (nsd->nsd_type == LDLM_NS_TYPE_UNKNOWN) { - CERROR("Unknown type %d for ns %s\n", ns_type, name); - goto out_ref; - } - - if (nsd->nsd_type == ns_type) - break; - } - - ns = kzalloc(sizeof(*ns), GFP_NOFS); - if (!ns) - goto out_ref; - - ns->ns_rs_hash = cfs_hash_create(name, - nsd->nsd_all_bits, nsd->nsd_all_bits, - nsd->nsd_bkt_bits, sizeof(*nsb), - CFS_HASH_MIN_THETA, - CFS_HASH_MAX_THETA, - nsd->nsd_hops, - CFS_HASH_DEPTH | - CFS_HASH_BIGNAME | - CFS_HASH_SPIN_BKTLOCK | - CFS_HASH_NO_ITEMREF); - if (!ns->ns_rs_hash) - goto out_ns; - - cfs_hash_for_each_bucket(ns->ns_rs_hash, &bd, idx) { - nsb = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); - at_init(&nsb->nsb_at_estimate, ldlm_enqueue_min, 0); - nsb->nsb_namespace = ns; - } - - ns->ns_obd = obd; - ns->ns_appetite = apt; - ns->ns_client = client; - ns->ns_name = kstrdup(name, GFP_KERNEL); - if (!ns->ns_name) - goto out_hash; - - INIT_LIST_HEAD(&ns->ns_list_chain); - INIT_LIST_HEAD(&ns->ns_unused_list); - spin_lock_init(&ns->ns_lock); - atomic_set(&ns->ns_bref, 0); - init_waitqueue_head(&ns->ns_waitq); - - ns->ns_max_parallel_ast = LDLM_DEFAULT_PARALLEL_AST_LIMIT; - ns->ns_nr_unused = 0; - ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; - ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE; - ns->ns_orig_connect_flags = 0; - ns->ns_connect_flags = 0; - ns->ns_stopping = 0; - - rc = ldlm_namespace_sysfs_register(ns); - if (rc != 0) { - CERROR("Can't initialize ns sysfs, rc %d\n", rc); - goto out_hash; - } - - rc = ldlm_namespace_debugfs_register(ns); - if (rc != 0) { - CERROR("Can't initialize ns proc, rc %d\n", rc); - goto out_sysfs; - } - - idx = ldlm_namespace_nr_read(client); - rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); - if (rc) { - CERROR("Can't initialize lock pool, rc %d\n", rc); - goto out_proc; - } - - ldlm_namespace_register(ns, client); - return ns; -out_proc: - ldlm_namespace_debugfs_unregister(ns); -out_sysfs: - ldlm_namespace_sysfs_unregister(ns); - ldlm_namespace_cleanup(ns, 0); -out_hash: - kfree(ns->ns_name); - cfs_hash_putref(ns->ns_rs_hash); -out_ns: - kfree(ns); -out_ref: - 
ldlm_put_ref(); - return NULL; -} -EXPORT_SYMBOL(ldlm_namespace_new); - -extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); - -/** - * Cancel and destroy all locks on a resource. - * - * If flags contains FL_LOCAL_ONLY, don't try to tell the server, just - * clean up. This is currently only used for recovery, and we make - * certain assumptions as a result--notably, that we shouldn't cancel - * locks with refs. - */ -static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, - __u64 flags) -{ - int rc = 0; - bool local_only = !!(flags & LDLM_FL_LOCAL_ONLY); - - do { - struct ldlm_lock *lock = NULL, *tmp; - struct lustre_handle lockh; - - /* First, we look for non-cleaned-yet lock - * all cleaned locks are marked by CLEANED flag. - */ - lock_res(res); - list_for_each_entry(tmp, q, l_res_link) { - if (ldlm_is_cleaned(tmp)) - continue; - - lock = tmp; - LDLM_LOCK_GET(lock); - ldlm_set_cleaned(lock); - break; - } - - if (!lock) { - unlock_res(res); - break; - } - - /* Set CBPENDING so nothing in the cancellation path - * can match this lock. - */ - ldlm_set_cbpending(lock); - ldlm_set_failed(lock); - lock->l_flags |= flags; - - /* ... without sending a CANCEL message for local_only. */ - if (local_only) - ldlm_set_local_only(lock); - - if (local_only && (lock->l_readers || lock->l_writers)) { - /* This is a little bit gross, but much better than the - * alternative: pretend that we got a blocking AST from - * the server, so that when the lock is decref'd, it - * will go away ... - */ - unlock_res(res); - LDLM_DEBUG(lock, "setting FL_LOCAL_ONLY"); - if (lock->l_flags & LDLM_FL_FAIL_LOC) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(4 * HZ); - set_current_state(TASK_RUNNING); - } - if (lock->l_completion_ast) - lock->l_completion_ast(lock, LDLM_FL_FAILED, - NULL); - LDLM_LOCK_RELEASE(lock); - continue; - } - - unlock_res(res); - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_LOCAL); - if (rc) - CERROR("ldlm_cli_cancel: %d\n", rc); - LDLM_LOCK_RELEASE(lock); - } while (1); -} - -static int ldlm_resource_clean(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - __u64 flags = *(__u64 *)arg; - - cleanup_resource(res, &res->lr_granted, flags); - cleanup_resource(res, &res->lr_waiting, flags); - - return 0; -} - -static int ldlm_resource_complain(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - - lock_res(res); - CERROR("%s: namespace resource " DLDLMRES - " (%p) refcount nonzero (%d) after lock cleanup; forcing cleanup.\n", - ldlm_ns_name(ldlm_res_to_ns(res)), PLDLMRES(res), res, - atomic_read(&res->lr_refcount) - 1); - - ldlm_resource_dump(D_ERROR, res); - unlock_res(res); - return 0; -} - -/** - * Cancel and destroy all locks in the namespace. - * - * Typically used during evictions when server notified client that it was - * evicted and all of its state needs to be destroyed. - * Also used during shutdown. - */ -int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags) -{ - if (!ns) { - CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); - return ELDLM_OK; - } - - cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_clean, - &flags, 0); - cfs_hash_for_each_nolock(ns->ns_rs_hash, ldlm_resource_complain, - NULL, 0); - return ELDLM_OK; -} -EXPORT_SYMBOL(ldlm_namespace_cleanup); - -/** - * Attempts to free namespace. 
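cleanup_resource() above uses a mark-and-restart idiom: each pass takes the resource lock, finds the first lock not yet flagged CLEANED, flags it, drops the resource lock, and only then does the potentially blocking cancellation. A toy model of the idiom, with an array in place of the granted list and no real locking:

#include <stdbool.h>
#include <stdio.h>

struct toy_lock {
	int id;
	bool cleaned;	/* set once the lock has been processed */
};

/* Process each lock once, restarting the scan after every unlock/relock
 * cycle; the CLEANED-style flag is what makes the restart safe.
 */
static void toy_cleanup(struct toy_lock *locks, int n)
{
	for (;;) {
		struct toy_lock *victim = NULL;
		int i;

		/* "lock_res(res);" would go here in the kernel code. */
		for (i = 0; i < n; i++) {
			if (!locks[i].cleaned) {
				victim = &locks[i];
				victim->cleaned = true;
				break;
			}
		}
		/* "unlock_res(res);" before the potentially blocking work. */
		if (!victim)
			break;

		printf("cancelling lock %d\n", victim->id);
	}
}

int main(void)
{
	struct toy_lock locks[] = { { 1, false }, { 2, false }, { 3, false } };

	toy_cleanup(locks, 3);
	return 0;
}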
- *
- * Only used when namespace goes away, like during an unmount.
- */
-static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force)
-{
-	/* At shutdown time, don't call the cancellation callback */
-	ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0);
-
-	if (atomic_read(&ns->ns_bref) > 0) {
-		int rc;
-
-		CDEBUG(D_DLMTRACE,
-		       "dlm namespace %s free waiting on refcount %d\n",
-		       ldlm_ns_name(ns), atomic_read(&ns->ns_bref));
-force_wait:
-		if (force)
-			rc = wait_event_idle_timeout(ns->ns_waitq,
-						     atomic_read(&ns->ns_bref) == 0,
-						     obd_timeout * HZ / 4) ? 0 : -ETIMEDOUT;
-		else
-			rc = l_wait_event_abortable(ns->ns_waitq,
-						    atomic_read(&ns->ns_bref) == 0);
-
-		/* Forced cleanups should be able to reclaim all references,
-		 * so it's safe to wait forever... we can't leak locks...
-		 */
-		if (force && rc == -ETIMEDOUT) {
-			LCONSOLE_ERROR("Forced cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n",
-				       ldlm_ns_name(ns),
-				       atomic_read(&ns->ns_bref), rc);
-			goto force_wait;
-		}
-
-		if (atomic_read(&ns->ns_bref)) {
-			LCONSOLE_ERROR("Cleanup waiting for %s namespace with %d resources in use, (rc=%d)\n",
-				       ldlm_ns_name(ns),
-				       atomic_read(&ns->ns_bref), rc);
-			return ELDLM_NAMESPACE_EXISTS;
-		}
-		CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n",
-		       ldlm_ns_name(ns));
-	}
-
-	return ELDLM_OK;
-}
-
-/**
- * Performs various cleanups for the passed \a ns to make it drop its refc
- * and be ready for freeing. Waits for refc == 0.
- *
- * The following is done:
- * (0) Unregister \a ns from its list to make it inaccessible for potential
- *     users like the pools thread and others;
- * (1) Clear all locks in \a ns.
- */
-void ldlm_namespace_free_prior(struct ldlm_namespace *ns,
-			       struct obd_import *imp,
-			       int force)
-{
-	int rc;
-
-	if (!ns)
-		return;
-
-	spin_lock(&ns->ns_lock);
-	ns->ns_stopping = 1;
-	spin_unlock(&ns->ns_lock);
-
-	/*
-	 * Can fail with -EINTR when force == 0, in which case try harder.
-	 */
-	rc = __ldlm_namespace_free(ns, force);
-	if (rc != ELDLM_OK) {
-		if (imp) {
-			ptlrpc_disconnect_import(imp, 0);
-			ptlrpc_invalidate_import(imp);
-		}
-
-		/*
-		 * With all requests dropped and the import inactive
-		 * we are guaranteed all references will be dropped.
-		 */
-		rc = __ldlm_namespace_free(ns, 1);
-		LASSERT(rc == 0);
-	}
-}
-
-/** Unregister \a ns from the list of namespaces. */
-static void ldlm_namespace_unregister(struct ldlm_namespace *ns,
-				      enum ldlm_side client)
-{
-	mutex_lock(ldlm_namespace_lock(client));
-	LASSERT(!list_empty(&ns->ns_list_chain));
-	/* Some asserts and possibly other parts of the code are still
-	 * using list_empty(&ns->ns_list_chain). This is why it is
-	 * important to use list_del_init() here.
-	 */
-	list_del_init(&ns->ns_list_chain);
-	ldlm_namespace_nr_dec(client);
-	mutex_unlock(ldlm_namespace_lock(client));
-}
-
-/**
- * Frees memory structures related to \a ns. This is only done
- * when ldlm_namespace_free_prior() successfully removed all resources
- * referencing \a ns and its refc == 0.
- */
-void ldlm_namespace_free_post(struct ldlm_namespace *ns)
-{
-	if (!ns)
-		return;
-
-	/* Make sure that nobody can find this ns in its list. */
-	ldlm_namespace_unregister(ns, ns->ns_client);
-	/* Fini pool _before_ parent proc dir is removed. This is important as
-	 * ldlm_pool_fini() removes its own proc dir, which is a child of @dir.
-	 * Removing it after @dir may cause oops.
- */ - ldlm_pool_fini(&ns->ns_pool); - - ldlm_namespace_debugfs_unregister(ns); - ldlm_namespace_sysfs_unregister(ns); - cfs_hash_putref(ns->ns_rs_hash); - kfree(ns->ns_name); - /* Namespace \a ns should be not on list at this time, otherwise - * this will cause issues related to using freed \a ns in poold - * thread. - */ - LASSERT(list_empty(&ns->ns_list_chain)); - kfree(ns); - ldlm_put_ref(); -} - -void ldlm_namespace_get(struct ldlm_namespace *ns) -{ - atomic_inc(&ns->ns_bref); -} - -/* This is only for callers that care about refcount */ -static int ldlm_namespace_get_return(struct ldlm_namespace *ns) -{ - return atomic_inc_return(&ns->ns_bref); -} - -void ldlm_namespace_put(struct ldlm_namespace *ns) -{ - if (atomic_dec_and_lock(&ns->ns_bref, &ns->ns_lock)) { - wake_up(&ns->ns_waitq); - spin_unlock(&ns->ns_lock); - } -} - -/** Should be called with ldlm_namespace_lock(client) taken. */ -void ldlm_namespace_move_to_active_locked(struct ldlm_namespace *ns, - enum ldlm_side client) -{ - LASSERT(!list_empty(&ns->ns_list_chain)); - LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); - list_move_tail(&ns->ns_list_chain, ldlm_namespace_list(client)); -} - -/** Should be called with ldlm_namespace_lock(client) taken. */ -void ldlm_namespace_move_to_inactive_locked(struct ldlm_namespace *ns, - enum ldlm_side client) -{ - LASSERT(!list_empty(&ns->ns_list_chain)); - LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); - list_move_tail(&ns->ns_list_chain, &ldlm_cli_inactive_namespace_list); -} - -/** Should be called with ldlm_namespace_lock(client) taken. */ -struct ldlm_namespace *ldlm_namespace_first_locked(enum ldlm_side client) -{ - LASSERT(mutex_is_locked(ldlm_namespace_lock(client))); - LASSERT(!list_empty(ldlm_namespace_list(client))); - return container_of(ldlm_namespace_list(client)->next, - struct ldlm_namespace, ns_list_chain); -} - -/** Create and initialize new resource. */ -static struct ldlm_resource *ldlm_resource_new(void) -{ - struct ldlm_resource *res; - int idx; - - res = kmem_cache_zalloc(ldlm_resource_slab, GFP_NOFS); - if (!res) - return NULL; - - INIT_LIST_HEAD(&res->lr_granted); - INIT_LIST_HEAD(&res->lr_waiting); - - /* Initialize interval trees for each lock mode. */ - for (idx = 0; idx < LCK_MODE_NUM; idx++) { - res->lr_itree[idx].lit_size = 0; - res->lr_itree[idx].lit_mode = 1 << idx; - res->lr_itree[idx].lit_root = NULL; - } - - atomic_set(&res->lr_refcount, 1); - spin_lock_init(&res->lr_lock); - lu_ref_init(&res->lr_reference); - - /* The creator of the resource must unlock the mutex after LVB - * initialization. - */ - mutex_init(&res->lr_lvb_mutex); - mutex_lock(&res->lr_lvb_mutex); - - return res; -} - -/** - * Return a reference to resource with given name, creating it if necessary. 
- * Args: namespace with ns_lock unlocked - * Locks: takes and releases NS hash-lock and res->lr_lock - * Returns: referenced, unlocked ldlm_resource or NULL - */ -struct ldlm_resource * -ldlm_resource_get(struct ldlm_namespace *ns, struct ldlm_resource *parent, - const struct ldlm_res_id *name, enum ldlm_type type, - int create) -{ - struct hlist_node *hnode; - struct ldlm_resource *res = NULL; - struct cfs_hash_bd bd; - __u64 version; - int ns_refcount = 0; - int rc; - - LASSERT(!parent); - LASSERT(ns->ns_rs_hash); - LASSERT(name->name[0] != 0); - - cfs_hash_bd_get_and_lock(ns->ns_rs_hash, (void *)name, &bd, 0); - hnode = cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); - if (hnode) { - cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); - goto lvbo_init; - } - - version = cfs_hash_bd_version_get(&bd); - cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 0); - - if (create == 0) - return ERR_PTR(-ENOENT); - - LASSERTF(type >= LDLM_MIN_TYPE && type < LDLM_MAX_TYPE, - "type: %d\n", type); - res = ldlm_resource_new(); - if (!res) - return ERR_PTR(-ENOMEM); - - res->lr_ns_bucket = cfs_hash_bd_extra_get(ns->ns_rs_hash, &bd); - res->lr_name = *name; - res->lr_type = type; - - cfs_hash_bd_lock(ns->ns_rs_hash, &bd, 1); - hnode = (version == cfs_hash_bd_version_get(&bd)) ? NULL : - cfs_hash_bd_lookup_locked(ns->ns_rs_hash, &bd, (void *)name); - - if (hnode) { - /* Someone won the race and already added the resource. */ - cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); - /* Clean lu_ref for failed resource. */ - lu_ref_fini(&res->lr_reference); - /* We have taken lr_lvb_mutex. Drop it. */ - mutex_unlock(&res->lr_lvb_mutex); - kmem_cache_free(ldlm_resource_slab, res); -lvbo_init: - res = hlist_entry(hnode, struct ldlm_resource, lr_hash); - /* Synchronize with regard to resource creation. */ - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { - mutex_lock(&res->lr_lvb_mutex); - mutex_unlock(&res->lr_lvb_mutex); - } - - if (unlikely(res->lr_lvb_len < 0)) { - rc = res->lr_lvb_len; - ldlm_resource_putref(res); - res = ERR_PTR(rc); - } - return res; - } - /* We won! Let's add the resource. */ - cfs_hash_bd_add_locked(ns->ns_rs_hash, &bd, &res->lr_hash); - if (cfs_hash_bd_count_get(&bd) == 1) - ns_refcount = ldlm_namespace_get_return(ns); - - cfs_hash_bd_unlock(ns->ns_rs_hash, &bd, 1); - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_init) { - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_CREATE_RESOURCE, 2); - rc = ns->ns_lvbo->lvbo_init(res); - if (rc < 0) { - CERROR("%s: lvbo_init failed for resource %#llx:%#llx: rc = %d\n", - ns->ns_obd->obd_name, name->name[0], - name->name[1], rc); - res->lr_lvb_len = rc; - mutex_unlock(&res->lr_lvb_mutex); - ldlm_resource_putref(res); - return ERR_PTR(rc); - } - } - - /* We create resource with locked lr_lvb_mutex. */ - mutex_unlock(&res->lr_lvb_mutex); - - /* Let's see if we happened to be the very first resource in this - * namespace. If so, and this is a client namespace, we need to move - * the namespace into the active namespaces list to be patrolled by - * the ldlm_poold. 
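ldlm_resource_get() is a classic optimistic insert: look up under the bucket lock, record the bucket version, allocate the new resource outside any lock, then re-check the version and redo the lookup only if the bucket changed; the race loser frees its fresh allocation and takes a reference on the winner's object. A minimal sketch of the same dance, assuming a single pthread-protected bucket and toy types (none of these names exist in Lustre):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_res {
	unsigned long long name;
	struct toy_res *next;
	int refs;
};

static struct toy_res *bucket;
static unsigned long bucket_version;
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

static struct toy_res *lookup_locked(unsigned long long name)
{
	struct toy_res *res;

	for (res = bucket; res; res = res->next)
		if (res->name == name)
			return res;
	return NULL;
}

static struct toy_res *toy_res_get(unsigned long long name)
{
	struct toy_res *res, *found;
	unsigned long version;

	pthread_mutex_lock(&bucket_lock);
	found = lookup_locked(name);
	if (found)
		found->refs++;
	version = bucket_version;
	pthread_mutex_unlock(&bucket_lock);
	if (found)
		return found;

	res = calloc(1, sizeof(*res));	/* allocate outside the lock */
	if (!res)
		return NULL;
	res->name = name;
	res->refs = 1;

	pthread_mutex_lock(&bucket_lock);
	/* Redo the lookup only if someone changed the bucket meanwhile. */
	found = (version == bucket_version) ? NULL : lookup_locked(name);
	if (found) {
		found->refs++;	/* lost the race: use the winner's object */
		pthread_mutex_unlock(&bucket_lock);
		free(res);
		return found;
	}
	res->next = bucket;
	bucket = res;
	bucket_version++;
	pthread_mutex_unlock(&bucket_lock);
	return res;
}

int main(void)
{
	struct toy_res *a = toy_res_get(42);
	struct toy_res *b = toy_res_get(42);

	printf("same object: %d, refs: %d\n", a == b, b->refs);
	return 0;
}

The version counter is what lets the allocation happen outside the lock without a mandatory second lookup on every create.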
- */ - if (ns_refcount == 1) { - mutex_lock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); - ldlm_namespace_move_to_active_locked(ns, LDLM_NAMESPACE_CLIENT); - mutex_unlock(ldlm_namespace_lock(LDLM_NAMESPACE_CLIENT)); - } - - return res; -} -EXPORT_SYMBOL(ldlm_resource_get); - -static void __ldlm_resource_putref_final(struct cfs_hash_bd *bd, - struct ldlm_resource *res) -{ - struct ldlm_ns_bucket *nsb = res->lr_ns_bucket; - struct ldlm_namespace *ns = nsb->nsb_namespace; - - if (!list_empty(&res->lr_granted)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } - - if (!list_empty(&res->lr_waiting)) { - ldlm_resource_dump(D_ERROR, res); - LBUG(); - } - - cfs_hash_bd_del_locked(ns->ns_rs_hash, - bd, &res->lr_hash); - lu_ref_fini(&res->lr_reference); - cfs_hash_bd_unlock(ns->ns_rs_hash, bd, 1); - if (ns->ns_lvbo && ns->ns_lvbo->lvbo_free) - ns->ns_lvbo->lvbo_free(res); - if (cfs_hash_bd_count_get(bd) == 0) - ldlm_namespace_put(ns); - kmem_cache_free(ldlm_resource_slab, res); -} - -void ldlm_resource_putref(struct ldlm_resource *res) -{ - struct ldlm_namespace *ns = ldlm_res_to_ns(res); - struct cfs_hash_bd bd; - - LASSERT_ATOMIC_GT_LT(&res->lr_refcount, 0, LI_POISON); - CDEBUG(D_INFO, "putref res: %p count: %d\n", - res, atomic_read(&res->lr_refcount) - 1); - - cfs_hash_bd_get(ns->ns_rs_hash, &res->lr_name, &bd); - if (cfs_hash_bd_dec_and_lock(ns->ns_rs_hash, &bd, &res->lr_refcount)) - __ldlm_resource_putref_final(&bd, res); -} -EXPORT_SYMBOL(ldlm_resource_putref); - -/** - * Add a lock into a given resource into specified lock list. - */ -void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, - struct ldlm_lock *lock) -{ - check_res_locked(res); - - LDLM_DEBUG(lock, "About to add this lock:"); - - if (ldlm_is_destroyed(lock)) { - CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); - return; - } - - LASSERT(list_empty(&lock->l_res_link)); - - list_add_tail(&lock->l_res_link, head); -} - -void ldlm_resource_unlink_lock(struct ldlm_lock *lock) -{ - int type = lock->l_resource->lr_type; - - check_res_locked(lock->l_resource); - if (type == LDLM_IBITS || type == LDLM_PLAIN) - ldlm_unlink_lock_skiplist(lock); - else if (type == LDLM_EXTENT) - ldlm_extent_unlink_lock(lock); - list_del_init(&lock->l_res_link); -} -EXPORT_SYMBOL(ldlm_resource_unlink_lock); - -void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc) -{ - desc->lr_type = res->lr_type; - desc->lr_name = res->lr_name; -} - -/** - * Print information about all locks in all namespaces on this node to debug - * log. - */ -void ldlm_dump_all_namespaces(enum ldlm_side client, int level) -{ - struct ldlm_namespace *ns; - - if (!((libcfs_debug | D_ERROR) & level)) - return; - - mutex_lock(ldlm_namespace_lock(client)); - - list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) - ldlm_namespace_dump(level, ns); - - mutex_unlock(ldlm_namespace_lock(client)); -} - -static int ldlm_res_hash_dump(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - int level = (int)(unsigned long)arg; - - lock_res(res); - ldlm_resource_dump(level, res); - unlock_res(res); - - return 0; -} - -/** - * Print information about all locks in this namespace on this node to debug - * log. 
- */ -void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) -{ - if (!((libcfs_debug | D_ERROR) & level)) - return; - - CDEBUG(level, "--- Namespace: %s (rc: %d, side: client)\n", - ldlm_ns_name(ns), atomic_read(&ns->ns_bref)); - - if (time_before(jiffies, ns->ns_next_dump)) - return; - - cfs_hash_for_each_nolock(ns->ns_rs_hash, - ldlm_res_hash_dump, - (void *)(unsigned long)level, 0); - spin_lock(&ns->ns_lock); - ns->ns_next_dump = jiffies + 10 * HZ; - spin_unlock(&ns->ns_lock); -} - -/** - * Print information about all locks in this resource to debug log. - */ -void ldlm_resource_dump(int level, struct ldlm_resource *res) -{ - struct ldlm_lock *lock; - unsigned int granted = 0; - - BUILD_BUG_ON(RES_NAME_SIZE != 4); - - if (!((libcfs_debug | D_ERROR) & level)) - return; - - CDEBUG(level, "--- Resource: " DLDLMRES " (%p) refcount = %d\n", - PLDLMRES(res), res, atomic_read(&res->lr_refcount)); - - if (!list_empty(&res->lr_granted)) { - CDEBUG(level, "Granted locks (in reverse order):\n"); - list_for_each_entry_reverse(lock, &res->lr_granted, - l_res_link) { - LDLM_DEBUG_LIMIT(level, lock, "###"); - if (!(level & D_CANTMASK) && - ++granted > ldlm_dump_granted_max) { - CDEBUG(level, - "only dump %d granted locks to avoid DDOS.\n", - granted); - break; - } - } - } - if (!list_empty(&res->lr_waiting)) { - CDEBUG(level, "Waiting locks:\n"); - list_for_each_entry(lock, &res->lr_waiting, l_res_link) - LDLM_DEBUG_LIMIT(level, lock, "###"); - } -} -EXPORT_SYMBOL(ldlm_resource_dump); diff --git a/drivers/staging/lustre/lustre/llite/Makefile b/drivers/staging/lustre/lustre/llite/Makefile deleted file mode 100644 index 5200924182ae69270a0a679ac64a65b728247bfc..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += lustre.o -lustre-y := dcache.o dir.o file.o llite_lib.o llite_nfs.o \ - rw.o rw26.o namei.o symlink.o llite_mmap.o range_lock.o \ - xattr.o xattr_cache.o xattr_security.o \ - super25.o statahead.o glimpse.o lcommon_cl.o lcommon_misc.o \ - vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o \ - lproc_llite.o - -lustre-$(CONFIG_FS_POSIX_ACL) += acl.o diff --git a/drivers/staging/lustre/lustre/llite/acl.c b/drivers/staging/lustre/lustre/llite/acl.c deleted file mode 100644 index 2ee9ff931236132ff0dde33d28843ee730052f66..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/acl.c +++ /dev/null @@ -1,108 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/acl.c - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include "llite_internal.h" - -struct posix_acl *ll_get_acl(struct inode *inode, int type) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct posix_acl *acl = NULL; - - spin_lock(&lli->lli_lock); - /* VFS' acl_permission_check->check_acl will release the refcount */ - acl = posix_acl_dup(lli->lli_posix_acl); - spin_unlock(&lli->lli_lock); - - return acl; -} - -int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - const char *name = NULL; - size_t value_size = 0; - char *value = NULL; - int rc = 0; - - switch (type) { - case ACL_TYPE_ACCESS: - name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - break; - - case ACL_TYPE_DEFAULT: - name = XATTR_NAME_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - rc = acl ? -EACCES : 0; - break; - - default: - rc = -EINVAL; - break; - } - if (rc) - return rc; - - if (acl) { - value_size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(value_size, GFP_NOFS); - if (!value) { - rc = -ENOMEM; - goto out; - } - - rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size); - if (rc < 0) - goto out_value; - } - - rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), - value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM, - name, value, value_size, 0, 0, &req); - - ptlrpc_req_finished(req); -out_value: - kfree(value); -out: - if (rc) - forget_cached_acl(inode, type); - else - set_cached_acl(inode, type, acl); - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/dcache.c b/drivers/staging/lustre/lustre/llite/dcache.c deleted file mode 100644 index 11b82c639bfe07d5c71cd4300feea983938dcac2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/dcache.c +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include - -#include "llite_internal.h" - -static void free_dentry_data(struct rcu_head *head) -{ - struct ll_dentry_data *lld; - - lld = container_of(head, struct ll_dentry_data, lld_rcu_head); - kfree(lld); -} - -/* should NOT be called with the dcache lock, see fs/dcache.c */ -static void ll_release(struct dentry *de) -{ - struct ll_dentry_data *lld; - - LASSERT(de); - lld = ll_d2d(de); - if (lld->lld_it) { - ll_intent_release(lld->lld_it); - kfree(lld->lld_it); - } - - de->d_fsdata = NULL; - call_rcu(&lld->lld_rcu_head, free_dentry_data); -} - -/* Compare if two dentries are the same. Don't match if the existing dentry - * is marked invalid. Returns 1 if different, 0 if the same. - * - * This avoids a race where ll_lookup_it() instantiates a dentry, but we get - * an AST before calling d_revalidate_it(). The dentry still exists (marked - * INVALID) so d_lookup() matches it, but we have no lock on it (so - * lock_match() fails) and we spin around real_lookup(). - * - * This race doesn't apply to lookups in d_alloc_parallel(), and for - * those we want to ensure that only one dentry with a given name is - * in ll_lookup_nd() at a time. So allow invalid dentries to match - * while d_in_lookup(). We will be called again when the lookup - * completes, and can give a different answer then. - */ -static int ll_dcompare(const struct dentry *dentry, - unsigned int len, const char *str, - const struct qstr *name) -{ - if (len != name->len) - return 1; - - if (memcmp(str, name->name, len)) - return 1; - - CDEBUG(D_DENTRY, "found name %.*s(%p) flags %#x refc %d\n", - name->len, name->name, dentry, dentry->d_flags, - d_count(dentry)); - - /* mountpoint is always valid */ - if (d_mountpoint(dentry)) - return 0; - - /* ensure exclusion against parallel lookup of the same name */ - if (d_in_lookup((struct dentry *)dentry)) - return 0; - - if (d_lustre_invalid(dentry)) - return 1; - - return 0; -} - -/** - * Called when last reference to a dentry is dropped and dcache wants to know - * whether or not it should cache it: - * - return 1 to delete the dentry immediately - * - return 0 to cache the dentry - * Should NOT be called with the dcache lock, see fs/dcache.c - */ -static int ll_ddelete(const struct dentry *de) -{ - LASSERT(de); - - CDEBUG(D_DENTRY, "%s dentry %pd (%p, parent %p, inode %p) %s%s\n", - d_lustre_invalid(de) ? "deleting" : "keeping", - de, de, de->d_parent, d_inode(de), - d_unhashed(de) ? "" : "hashed,", - list_empty(&de->d_subdirs) ? "" : "subdirs"); - - /* kernel >= 2.6.38 last refcount is decreased after this function. 
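ll_dcompare() above inverts the usual memcmp() convention, returning 1 for "different" and 0 for "the same", and deliberately lets an invalid dentry match while a parallel lookup is in flight. The decision table restated over a toy dentry, with the d_mountpoint()/d_in_lookup()/d_lustre_invalid() checks reduced to booleans:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct toy_dentry {
	const char *name;
	bool mountpoint;
	bool in_lookup;	/* parallel lookup in flight */
	bool invalid;	/* lustre-invalidated */
};

/* Returns 1 if "different", 0 if "the same" -- the dcache convention. */
static int toy_dcompare(const struct toy_dentry *de,
			const char *str, size_t len)
{
	if (len != strlen(de->name) || memcmp(str, de->name, len))
		return 1;
	if (de->mountpoint)	/* mountpoints are always valid */
		return 0;
	if (de->in_lookup)	/* defer the validity question */
		return 0;
	return de->invalid ? 1 : 0;
}

int main(void)
{
	struct toy_dentry de = { "foo", false, false, true };

	/* Invalid dentry: same name, but reported as "different". */
	printf("%d\n", toy_dcompare(&de, "foo", 3));
	return 0;
}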
*/ - LASSERT(d_count(de) == 1); - - if (d_lustre_invalid(de)) - return 1; - return 0; -} - -static int ll_d_init(struct dentry *de) -{ - struct ll_dentry_data *lld = kzalloc(sizeof(*lld), GFP_KERNEL); - - if (unlikely(!lld)) - return -ENOMEM; - lld->lld_invalid = 1; - de->d_fsdata = lld; - return 0; -} - -void ll_intent_drop_lock(struct lookup_intent *it) -{ - if (it->it_op && it->it_lock_mode) { - struct lustre_handle handle; - - handle.cookie = it->it_lock_handle; - - CDEBUG(D_DLMTRACE, - "releasing lock with cookie %#llx from it %p\n", - handle.cookie, it); - ldlm_lock_decref(&handle, it->it_lock_mode); - - /* bug 494: intent_release may be called multiple times, from - * this thread and we don't want to double-decref this lock - */ - it->it_lock_mode = 0; - if (it->it_remote_lock_mode != 0) { - handle.cookie = it->it_remote_lock_handle; - - CDEBUG(D_DLMTRACE, - "releasing remote lock with cookie%#llx from it %p\n", - handle.cookie, it); - ldlm_lock_decref(&handle, - it->it_remote_lock_mode); - it->it_remote_lock_mode = 0; - } - } -} - -void ll_intent_release(struct lookup_intent *it) -{ - CDEBUG(D_INFO, "intent %p released\n", it); - ll_intent_drop_lock(it); - /* We are still holding extra reference on a request, need to free it */ - if (it_disposition(it, DISP_ENQ_OPEN_REF)) - ptlrpc_req_finished(it->it_request); /* ll_file_open */ - - if (it_disposition(it, DISP_ENQ_CREATE_REF)) /* create rec */ - ptlrpc_req_finished(it->it_request); - - it->it_disposition = 0; - it->it_request = NULL; -} - -void ll_invalidate_aliases(struct inode *inode) -{ - struct dentry *dentry; - - CDEBUG(D_INODE, "marking dentries for ino " DFID "(%p) invalid\n", - PFID(ll_inode2fid(inode)), inode); - - spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - CDEBUG(D_DENTRY, - "dentry in drop %pd (%p) parent %p inode %p flags %d\n", - dentry, dentry, dentry->d_parent, - d_inode(dentry), dentry->d_flags); - - d_lustre_invalidate(dentry, 0); - } - spin_unlock(&inode->i_lock); -} - -int ll_revalidate_it_finish(struct ptlrpc_request *request, - struct lookup_intent *it, - struct inode *inode) -{ - int rc = 0; - - if (!request) - return 0; - - if (it_disposition(it, DISP_LOOKUP_NEG)) - return -ENOENT; - - rc = ll_prep_inode(&inode, request, NULL, it); - - return rc; -} - -void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode) -{ - if (it->it_lock_mode && inode) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - - CDEBUG(D_DLMTRACE, "setting l_data to inode " DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); - } - - /* drop lookup or getattr locks immediately */ - if (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR) { - /* on 2.6 there are situation when several lookups and - * revalidations may be requested during single operation. - * therefore, we don't release intent here -bzzz - */ - ll_intent_drop_lock(it); - } -} - -static int ll_revalidate_dentry(struct dentry *dentry, - unsigned int lookup_flags) -{ - struct inode *dir = d_inode(dentry->d_parent); - - /* If this is intermediate component path lookup and we were able to get - * to this dentry, then its lock has not been revoked and the - * path component is valid. 
- */ - if (lookup_flags & LOOKUP_PARENT) - return 1; - - /* Symlink - always valid as long as the dentry was found */ - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) - return 1; - - /* - * VFS warns us that this is the second go around and previous - * operation failed (most likely open|creat), so this time - * we better talk to the server via the lookup path by name, - * not by fid. - */ - if (lookup_flags & LOOKUP_REVAL) - return 0; - - if (!dentry_may_statahead(dir, dentry)) - return 1; - - if (lookup_flags & LOOKUP_RCU) - return -ECHILD; - - ll_statahead(dir, &dentry, !d_inode(dentry)); - return 1; -} - -/* - * Always trust cached dentries. Update statahead window if necessary. - */ -static int ll_revalidate_nd(struct dentry *dentry, unsigned int flags) -{ - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, flags=%u\n", - dentry, flags); - - return ll_revalidate_dentry(dentry, flags); -} - -const struct dentry_operations ll_d_ops = { - .d_init = ll_d_init, - .d_revalidate = ll_revalidate_nd, - .d_release = ll_release, - .d_delete = ll_ddelete, - .d_compare = ll_dcompare, -}; diff --git a/drivers/staging/lustre/lustre/llite/dir.c b/drivers/staging/lustre/lustre/llite/dir.c deleted file mode 100644 index 688dddf3ca47bcbefbb35a2205ed350133b8253c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/dir.c +++ /dev/null @@ -1,1708 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/dir.c - * - * Directory code for lustre client. - */ - -#include -#include -#include -#include -#include /* for wait_on_buffer */ -#include -#include - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "llite_internal.h" - -/* - * (new) readdir implementation overview. - * - * Original lustre readdir implementation cached exact copy of raw directory - * pages on the client. These pages were indexed in client page cache by - * logical offset in the directory file. This design, while very simple and - * intuitive had some inherent problems: - * - * . it implies that byte offset to the directory entry serves as a - * telldir(3)/seekdir(3) cookie, but that offset is not stable: in - * ext3/htree directory entries may move due to splits, and more - * importantly, - * - * . 
it is incompatible with the design of split directories for cmd3,
- *   that assumes that names are distributed across nodes based on their
- *   hash, and so readdir should be done in hash order.
- *
- * The new readdir implementation does readdir in hash order, and uses the
- * hash of a file name as a telldir/seekdir cookie. This led to a number of
- * complications:
- *
- * . the hash is not unique, so it cannot be used to index cached directory
- *   pages on the client (note that it requires a whole pageful of hash
- *   collided entries to cause two pages to have identical hashes);
- *
- * . the hash is not unique, so it cannot, strictly speaking, be used as an
- *   entry cookie. ext3/htree has the same problem and the lustre
- *   implementation mimics their solution: seekdir(hash) positions the
- *   directory at the first entry with the given hash.
- *
- * Client side.
- *
- * 0. caching
- *
- * The client caches directory pages using the hash of the first entry as an
- * index. As noted above, the hash is not unique, so this solution doesn't
- * work as is: special processing is needed for "page hash chains" (i.e.,
- * sequences of pages filled with entries all having the same hash value).
- *
- * First, such chains have to be detected. To this end, the server returns to
- * the client the hash of the first entry on the page next to the one
- * returned. When the client detects that this hash is the same as the hash
- * of the first entry on the returned page, a page hash collision has to be
- * handled. Pages in the hash chain, except the first one, are termed
- * "overflow pages".
- *
- * The solution to the index uniqueness problem is to not cache overflow
- * pages. Instead, when a page hash collision is detected, all overflow pages
- * from the emerging chain are immediately requested from the server and
- * placed in a special data structure (struct ll_dir_chain). This data
- * structure is used by ll_readdir() to process entries from overflow pages.
- * When a readdir invocation finishes, overflow pages are discarded. If the
- * page hash collision chain wasn't completely processed, the next call to
- * readdir will again detect the page hash collision, again read the overflow
- * pages in, process the next portion of entries and again discard the pages.
- * This is not as wasteful as it looks, because, given a reasonable hash,
- * page hash collisions are extremely rare.
- *
- * 1. directory positioning
- *
- * When seekdir(hash) is called, original
- *
- *
- *
- *
- *
- *
- *
- *
- * Server.
- *
- * identification of and access to overflow pages
- *
- * page format
- *
- * A page in the MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page
- * contains a header lu_dirpage which describes the start/end hash, and
- * whether this page is empty (contains no dir entry) or hash-collides with
- * the next page. After the client receives the reply, several pages will be
- * integrated into the dir page in PAGE_SIZE (if PAGE_SIZE is greater than
- * LU_PAGE_SIZE), and the lu_dirpage for this integrated page will be
- * adjusted. See lmv_adjust_dirpages().
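Per the overview above, chain detection costs one comparison per reply page: if the hash that starts the next page equals the hash of the first entry on the current page, the next page is an overflow page and must not be cached. A toy version of that check over a modeled per-page header (the fields are abbreviations, not the real struct lu_dirpage layout; the collide flag mirrors LDF_COLLIDE):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model of the per-page header: start/end hash plus a collide flag. */
struct toy_dirpage {
	uint64_t hash_start;
	uint64_t hash_end;	/* hash starting the *next* page */
	unsigned int collide;	/* server set an LDF_COLLIDE-like flag */
};

/* A page belongs to a hash chain when the next page starts at the same
 * hash as this one's first entry; such "overflow pages" are not cached.
 */
static bool toy_next_is_overflow(const struct toy_dirpage *dp,
				 uint64_t first_entry_hash)
{
	return dp->collide || dp->hash_end == first_entry_hash;
}

int main(void)
{
	struct toy_dirpage dp = { 0x100, 0x100, 0 };

	printf("next page overflows: %d\n",
	       toy_next_is_overflow(&dp, 0x100));
	return 0;
}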
- * - */ -struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, - __u64 offset) -{ - struct md_callback cb_op; - struct page *page; - int rc; - - cb_op.md_blocking_ast = ll_md_blocking_ast; - rc = md_read_page(ll_i2mdexp(dir), op_data, &cb_op, offset, &page); - if (rc) - return ERR_PTR(rc); - - return page; -} - -void ll_release_page(struct inode *inode, struct page *page, bool remove) -{ - kunmap(page); - - /* - * Always remove the page for a striped dir, because the page is - * built temporarily in the LMV layer - */ - if (inode && S_ISDIR(inode->i_mode) && - ll_i2info(inode)->lli_lsm_md) { - __free_page(page); - return; - } - - if (remove) { - lock_page(page); - if (likely(page->mapping)) - truncate_complete_page(page->mapping, page); - unlock_page(page); - } - put_page(page); -} - -/** - * Return the IF_* type for the given lu_dirent entry. - * The IF_* flag should be converted to a particular OS file type in - * the platform llite module. - */ -static __u16 ll_dirent_type_get(struct lu_dirent *ent) -{ - __u16 type = 0; - struct luda_type *lt; - int len = 0; - - if (le32_to_cpu(ent->lde_attrs) & LUDA_TYPE) { - const unsigned int align = sizeof(struct luda_type) - 1; - - len = le16_to_cpu(ent->lde_namelen); - len = (len + align) & ~align; - lt = (void *)ent->lde_name + len; - type = IFTODT(le16_to_cpu(lt->lt_type)); - } - return type; -} - -int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, - struct dir_context *ctx) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - __u64 pos = *ppos; - int is_api32 = ll_need_32bit_api(sbi); - int is_hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; - struct page *page; - bool done = false; - int rc = 0; - - page = ll_get_dir_page(inode, op_data, pos); - - while (rc == 0 && !done) { - struct lu_dirpage *dp; - struct lu_dirent *ent; - __u64 hash; - __u64 next; - - if (IS_ERR(page)) { - rc = PTR_ERR(page); - break; - } - - hash = MDS_DIR_END_OFF; - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent && !done; - ent = lu_dirent_next(ent)) { - __u16 type; - int namelen; - struct lu_fid fid; - __u64 lhash; - __u64 ino; - - hash = le64_to_cpu(ent->lde_hash); - if (hash < pos) - /* - * Skip until we find the target hash - * value. - */ - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (namelen == 0) - /* - * Skip dummy record. - */ - continue; - - if (is_api32 && is_hash64) - lhash = hash >> 32; - else - lhash = hash; - fid_le_to_cpu(&fid, &ent->lde_fid); - ino = cl_fid_build_ino(&fid, is_api32); - type = ll_dirent_type_get(ent); - ctx->pos = lhash; - /* For 'll_nfs_get_name_filldir()', it will try - * to access the 'ent' through its 'lde_name', - * so the parameter 'name' for 'ctx->actor()' - * must be part of the 'ent'. - */ - done = !dir_emit(ctx, ent->lde_name, - namelen, ino, type); - } - - if (done) { - pos = hash; - ll_release_page(inode, page, false); - break; - } - - next = le64_to_cpu(dp->ldp_hash_end); - pos = next; - if (pos == MDS_DIR_END_OFF) { - /* - * End of directory reached. - */ - done = true; - ll_release_page(inode, page, false); - } else { - /* - * Normal case: continue to the next - * page. - */ - ll_release_page(inode, page, - le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - next = pos; - page = ll_get_dir_page(inode, op_data, pos); - } - } - - ctx->pos = pos; - return rc; -} - -static int ll_readdir(struct file *filp, struct dir_context *ctx) -{ - struct inode *inode = file_inode(filp); - struct ll_file_data *lfd = LUSTRE_FPRIVATE(filp); - struct ll_sb_info *sbi = ll_i2sbi(inode); - __u64 pos = lfd ? 
lfd->lfd_pos : 0; - int hash64 = sbi->ll_flags & LL_SBI_64BIT_HASH; - int api32 = ll_need_32bit_api(sbi); - struct md_op_data *op_data; - int rc; - - CDEBUG(D_VFSTRACE, - "VFS Op:inode=" DFID "(%p) pos/size %lu/%llu 32bit_api %d\n", - PFID(ll_inode2fid(inode)), inode, (unsigned long)pos, - i_size_read(inode), api32); - - if (pos == MDS_DIR_END_OFF) { - /* - * end-of-file. - */ - rc = 0; - goto out; - } - - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, - LUSTRE_OPC_ANY, inode); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - if (unlikely(op_data->op_mea1)) { - /* - * This is only needed for a striped dir to fill .., - * see lmv_read_page - */ - if (file_dentry(filp)->d_parent && - file_dentry(filp)->d_parent->d_inode) { - __u64 ibits = MDS_INODELOCK_UPDATE; - struct inode *parent; - - parent = file_dentry(filp)->d_parent->d_inode; - if (ll_have_md_lock(parent, &ibits, LCK_MINMODE)) - op_data->op_fid3 = *ll_inode2fid(parent); - } - - /* - * If it cannot be found in the cache, do a lookup of .. on the - * master object - */ - if (fid_is_zero(&op_data->op_fid3)) { - rc = ll_dir_get_parent_fid(inode, &op_data->op_fid3); - if (rc) { - ll_finish_md_op_data(op_data); - return rc; - } - } - } - op_data->op_max_pages = sbi->ll_md_brw_pages; - ctx->pos = pos; - rc = ll_dir_read(inode, &pos, op_data, ctx); - pos = ctx->pos; - if (lfd) - lfd->lfd_pos = pos; - - if (pos == MDS_DIR_END_OFF) { - if (api32) - pos = LL_DIR_END_OFF_32BIT; - else - pos = LL_DIR_END_OFF; - } else { - if (api32 && hash64) - pos >>= 32; - } - ctx->pos = pos; - ll_finish_md_op_data(op_data); -out: - if (!rc) - ll_stats_ops_tally(sbi, LPROC_LL_READDIR, 1); - - return rc; -} - -static int ll_send_mgc_param(struct obd_export *mgc, char *string) -{ - struct mgs_send_param *msp; - int rc = 0; - - msp = kzalloc(sizeof(*msp), GFP_NOFS); - if (!msp) - return -ENOMEM; - - strlcpy(msp->mgs_param, string, sizeof(msp->mgs_param)); - rc = obd_set_info_async(NULL, mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, - sizeof(struct mgs_send_param), msp, NULL); - if (rc) - CERROR("Failed to set parameter: %d\n", rc); - kfree(msp); - - return rc; -} - -/** - * Create a striped directory with the specified stripes (@lump) - * - * param[in] parent the parent of the directory. - * param[in] lump the specified stripes. - * param[in] dirname the name of the directory. - * param[in] mode the specified mode of the directory. - * - * retval =0 if the striped directory was created successfully. - * <0 if the creation failed. 
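A rough standalone sketch of the up-front validation the function below performs on the user-supplied lmv_user_md; the struct and helper here are simplified stand-ins, covering only the magic and stripe-count checks visible in the original code.

#include <errno.h>
#include <stdint.h>

/* Simplified stand-in for struct lmv_user_md. */
struct lmv_user_md_sketch {
	uint32_t lum_magic;
	uint32_t lum_stripe_count;
	uint32_t lum_stripe_offset;
};

/* Mirrors the checks in ll_dir_setdirstripe(): reject an unknown magic,
 * and reject multi-stripe requests when the server never advertised
 * OBD_CONNECT_DIR_STRIPE support. */
static int check_dirstripe_req(const struct lmv_user_md_sketch *lum,
			       uint32_t expected_magic,
			       int server_has_dir_stripe)
{
	if (lum->lum_magic != expected_magic)
		return -EINVAL;
	if (lum->lum_stripe_count > 1 && !server_has_dir_stripe)
		return -EINVAL;
	return 0;
}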
- */ -static int ll_dir_setdirstripe(struct inode *parent, struct lmv_user_md *lump, - const char *dirname, umode_t mode) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - struct ll_sb_info *sbi = ll_i2sbi(parent); - struct inode *inode = NULL; - struct dentry dentry; - int err; - - if (unlikely(lump->lum_magic != LMV_USER_MAGIC)) - return -EINVAL; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p) name %s stripe_offset %d, stripe_count: %u\n", - PFID(ll_inode2fid(parent)), parent, dirname, - (int)lump->lum_stripe_offset, lump->lum_stripe_count); - - if (lump->lum_stripe_count > 1 && - !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_DIR_STRIPE)) - return -EINVAL; - - if (lump->lum_magic != cpu_to_le32(LMV_USER_MAGIC)) - lustre_swab_lmv_user_md(lump); - - if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) - mode &= ~current_umask(); - mode = (mode & (0777 | S_ISVTX)) | S_IFDIR; - op_data = ll_prep_md_op_data(NULL, parent, NULL, dirname, - strlen(dirname), mode, LUSTRE_OPC_MKDIR, - lump); - if (IS_ERR(op_data)) { - err = PTR_ERR(op_data); - goto err_exit; - } - - op_data->op_cli_flags |= CLI_SET_MEA; - err = md_create(sbi->ll_md_exp, op_data, lump, sizeof(*lump), mode, - from_kuid(&init_user_ns, current_fsuid()), - from_kgid(&init_user_ns, current_fsgid()), - current_cap(), 0, &request); - ll_finish_md_op_data(op_data); - - err = ll_prep_inode(&inode, request, parent->i_sb, NULL); - if (err) - goto err_exit; - - memset(&dentry, 0, sizeof(dentry)); - dentry.d_inode = inode; - - err = ll_init_security(&dentry, inode, parent); - iput(inode); - -err_exit: - ptlrpc_req_finished(request); - return err; -} - -int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, - int set_default) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc = 0; - struct lustre_sb_info *lsi = s2lsi(inode->i_sb); - struct obd_device *mgc = lsi->lsi_mgc; - int lum_size; - - if (lump) { - /* - * This is coming from userspace, so should be in - * local endian. But the MDS would like it in little - * endian, so we swab it before we send it. - */ - switch (lump->lmm_magic) { - case LOV_USER_MAGIC_V1: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V1)) - lustre_swab_lov_user_md_v1(lump); - lum_size = sizeof(struct lov_user_md_v1); - break; - } - case LOV_USER_MAGIC_V3: { - if (lump->lmm_magic != cpu_to_le32(LOV_USER_MAGIC_V3)) - lustre_swab_lov_user_md_v3( - (struct lov_user_md_v3 *)lump); - lum_size = sizeof(struct lov_user_md_v3); - break; - } - case LMV_USER_MAGIC: { - if (lump->lmm_magic != cpu_to_le32(LMV_USER_MAGIC)) - lustre_swab_lmv_user_md( - (struct lmv_user_md *)lump); - lum_size = sizeof(struct lmv_user_md); - break; - } - default: { - CDEBUG(D_IOCTL, - "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", - lump->lmm_magic, LOV_USER_MAGIC_V1, - LOV_USER_MAGIC_V3); - return -EINVAL; - } - } - } else { - lum_size = sizeof(struct lov_user_md_v1); - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - /* swabbing is done in lov_setstripe() on server side */ - rc = md_setattr(sbi->ll_md_exp, op_data, lump, lum_size, &req); - ll_finish_md_op_data(op_data); - ptlrpc_req_finished(req); - if (rc) - return rc; - -#if OBD_OCD_VERSION(2, 13, 53, 0) > LUSTRE_VERSION_CODE - /* - * 2.9 server has stored filesystem default stripe in ROOT xattr, - * and it's stored into system config for backward compatibility. 
- * - * In the following we use the fact that LOV_USER_MAGIC_V1 and - * LOV_USER_MAGIC_V3 have the same initial fields so we do not - * need to make the distinction between the 2 versions - */ - if (set_default && mgc->u.cli.cl_mgc_mgsexp) { - char *param = NULL; - char *buf; - - param = kzalloc(MGS_PARAM_MAXLEN, GFP_NOFS); - if (!param) - return -ENOMEM; - - buf = param; - /* Get fsname and assume devname to be -MDT0000. */ - ll_get_fsname(inode->i_sb, buf, MTI_NAME_MAXLEN); - strcat(buf, "-MDT0000.lov"); - buf += strlen(buf); - - /* Set root stripesize */ - sprintf(buf, ".stripesize=%u", - lump ? le32_to_cpu(lump->lmm_stripe_size) : 0); - rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); - if (rc) - goto end; - - /* Set root stripecount */ - sprintf(buf, ".stripecount=%hd", - lump ? le16_to_cpu(lump->lmm_stripe_count) : 0); - rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); - if (rc) - goto end; - - /* Set root stripeoffset */ - sprintf(buf, ".stripeoffset=%hd", - lump ? le16_to_cpu(lump->lmm_stripe_offset) : - (typeof(lump->lmm_stripe_offset))(-1)); - rc = ll_send_mgc_param(mgc->u.cli.cl_mgc_mgsexp, param); - -end: - kfree(param); - } -#endif - return rc; -} - -/** - * This function will be used to get default LOV/LMV/Default LMV - * @valid will be used to indicate which stripe it will retrieve - * OBD_MD_MEA LMV stripe EA - * OBD_MD_DEFAULT_MEA Default LMV stripe EA - * otherwise Default LOV EA. - * Each time, it can only retrieve 1 stripe EA - **/ -int ll_dir_getstripe(struct inode *inode, void **plmm, int *plmm_size, - struct ptlrpc_request **request, u64 valid) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct mdt_body *body; - struct lov_mds_md *lmm = NULL; - struct ptlrpc_request *req = NULL; - int rc, lmmsize; - struct md_op_data *op_data; - - rc = ll_get_max_mdsize(sbi, &lmmsize); - if (rc) - return rc; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, lmmsize, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc < 0) { - CDEBUG(D_INFO, "md_getattr failed on inode " DFID ": rc %d\n", - PFID(ll_inode2fid(inode)), rc); - goto out; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - lmmsize = body->mbo_eadatasize; - - if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || - lmmsize == 0) { - rc = -ENODATA; - goto out; - } - - lmm = req_capsule_server_sized_get(&req->rq_pill, - &RMF_MDT_MD, lmmsize); - LASSERT(lmm); - - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. 
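The swab guards in the switch that follows rely on a standard idiom: a constant equals its little-endian encoding only on a little-endian host, so no runtime byte-order probe is needed. A minimal sketch of that idiom, with illustrative helper names:

#include <stdint.h>

/* Convert to wire (little-endian) order: swap only on a big-endian host. */
static inline uint32_t host_to_le32(uint32_t v)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return __builtin_bswap32(v);
#else
	return v;
#endif
}

/* Mirrors guards such as "if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC)":
 * nonzero only on a big-endian host, where a reply arriving in the
 * little-endian wire format still needs byte swapping. */
static inline int reply_needs_swab(uint32_t magic)
{
	return host_to_le32(magic) != magic;
}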
- */ - /* We don't swab objects for directories */ - switch (le32_to_cpu(lmm->lmm_magic)) { - case LOV_MAGIC_V1: - if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) - lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm); - break; - case LOV_MAGIC_V3: - if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) - lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm); - break; - case LMV_MAGIC_V1: - if (cpu_to_le32(LMV_MAGIC) != LMV_MAGIC) - lustre_swab_lmv_mds_md((union lmv_mds_md *)lmm); - break; - case LMV_USER_MAGIC: - if (cpu_to_le32(LMV_USER_MAGIC) != LMV_USER_MAGIC) - lustre_swab_lmv_user_md((struct lmv_user_md *)lmm); - break; - default: - CERROR("unknown magic: %lX\n", (unsigned long)lmm->lmm_magic); - rc = -EPROTO; - } -out: - *plmm = lmm; - *plmm_size = lmmsize; - *request = req; - return rc; -} - -int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid) -{ - struct md_op_data *op_data; - int mdt_index, rc; - - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) - return -ENOMEM; - - op_data->op_flags |= MF_GET_MDT_IDX; - op_data->op_fid1 = *fid; - rc = md_getattr(sbi->ll_md_exp, op_data, NULL); - mdt_index = op_data->op_mds; - kvfree(op_data); - if (rc < 0) - return rc; - - return mdt_index; -} - -/* - * Get MDT index for the inode. - */ -int ll_get_mdt_idx(struct inode *inode) -{ - return ll_get_mdt_idx_by_fid(ll_i2sbi(inode), ll_inode2fid(inode)); -} - -/** - * Generic handler to do any pre-copy work. - * - * It sends a first hsm_progress (with extent length == 0) to coordinator as a - * first information for it that real work has started. - * - * Moreover, for a ARCHIVE request, it will sample the file data version and - * store it in \a copy. - * - * \return 0 on success. - */ -static int ll_ioc_copy_start(struct super_block *sb, struct hsm_copy *copy) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct hsm_progress_kernel hpk; - int rc2, rc = 0; - - /* Forge a hsm_progress based on data from copy. */ - hpk.hpk_fid = copy->hc_hai.hai_fid; - hpk.hpk_cookie = copy->hc_hai.hai_cookie; - hpk.hpk_extent.offset = copy->hc_hai.hai_extent.offset; - hpk.hpk_extent.length = 0; - hpk.hpk_flags = 0; - hpk.hpk_errval = 0; - hpk.hpk_data_version = 0; - - /* For archive request, we need to read the current file version. */ - if (copy->hc_hai.hai_action == HSMA_ARCHIVE) { - struct inode *inode; - __u64 data_version = 0; - - /* Get inode for this fid */ - inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); - if (IS_ERR(inode)) { - hpk.hpk_flags |= HP_FLAG_RETRY; - /* hpk_errval is >= 0 */ - hpk.hpk_errval = -PTR_ERR(inode); - rc = PTR_ERR(inode); - goto progress; - } - - /* Read current file data version */ - rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); - iput(inode); - if (rc != 0) { - CDEBUG(D_HSM, - "Could not read file data version of " DFID " (rc = %d). Archive request (%#llx) could not be done.\n", - PFID(©->hc_hai.hai_fid), rc, - copy->hc_hai.hai_cookie); - hpk.hpk_flags |= HP_FLAG_RETRY; - /* hpk_errval must be >= 0 */ - hpk.hpk_errval = -rc; - goto progress; - } - - /* Store in the hsm_copy for later copytool use. - * Always modified even if no lsm. - */ - copy->hc_data_version = data_version; - } - -progress: - /* On error, the request should be considered as completed */ - if (hpk.hpk_errval > 0) - hpk.hpk_flags |= HP_FLAG_COMPLETED; - rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), - &hpk, NULL); - - return rc ? rc : rc2; -} - -/** - * Generic handler to do any post-copy work. 
- * - * It will send the last hsm_progress update to coordinator to inform it - * that copy is finished and whether it was successful or not. - * - * Moreover, - * - for ARCHIVE request, it will sample the file data version and compare it - * with the version saved in ll_ioc_copy_start(). If they do not match, copy - * will be considered as failed. - * - for RESTORE request, it will sample the file data version and send it to - * coordinator which is useful if the file was imported as 'released'. - * - * \return 0 on success. - */ -static int ll_ioc_copy_end(struct super_block *sb, struct hsm_copy *copy) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct hsm_progress_kernel hpk; - int rc2, rc = 0; - - /* If you modify the logic here, also check llapi_hsm_copy_end(). */ - /* Take care: copy->hc_hai.hai_action, len, gid and data are not - * initialized if copy_end was called with copy == NULL. - */ - - /* Forge a hsm_progress based on data from copy. */ - hpk.hpk_fid = copy->hc_hai.hai_fid; - hpk.hpk_cookie = copy->hc_hai.hai_cookie; - hpk.hpk_extent = copy->hc_hai.hai_extent; - hpk.hpk_flags = copy->hc_flags | HP_FLAG_COMPLETED; - hpk.hpk_errval = copy->hc_errval; - hpk.hpk_data_version = 0; - - /* For archive request, we need to check the file data was not changed. - * - * For restore request, we need to send the file data version, this is - * useful when the file was created using hsm_import. - */ - if (((copy->hc_hai.hai_action == HSMA_ARCHIVE) || - (copy->hc_hai.hai_action == HSMA_RESTORE)) && - (copy->hc_errval == 0)) { - struct inode *inode; - __u64 data_version = 0; - - /* Get lsm for this fid */ - inode = search_inode_for_lustre(sb, ©->hc_hai.hai_fid); - if (IS_ERR(inode)) { - hpk.hpk_flags |= HP_FLAG_RETRY; - /* hpk_errval must be >= 0 */ - hpk.hpk_errval = -PTR_ERR(inode); - rc = PTR_ERR(inode); - goto progress; - } - - rc = ll_data_version(inode, &data_version, LL_DV_RD_FLUSH); - iput(inode); - if (rc) { - CDEBUG(D_HSM, - "Could not read file data version. Request could not be confirmed.\n"); - if (hpk.hpk_errval == 0) - hpk.hpk_errval = -rc; - goto progress; - } - - /* Store in the hsm_copy for later copytool use. - * Always modified even if no lsm. - */ - hpk.hpk_data_version = data_version; - - /* File could have been stripped during archiving, so we need - * to check anyway. - */ - if ((copy->hc_hai.hai_action == HSMA_ARCHIVE) && - (copy->hc_data_version != data_version)) { - CDEBUG(D_HSM, "File data version mismatched. File content was changed during archiving. " DFID ", start:%#llx current:%#llx\n", - PFID(©->hc_hai.hai_fid), - copy->hc_data_version, data_version); - /* File was changed, send error to cdt. Do not ask for - * retry because if a file is modified frequently, - * the cdt will loop on retried archive requests. - * The policy engine will ask for a new archive later - * when the file will not be modified for some tunable - * time - */ - hpk.hpk_flags &= ~HP_FLAG_RETRY; - rc = -EBUSY; - /* hpk_errval must be >= 0 */ - hpk.hpk_errval = -rc; - } - } - -progress: - rc2 = obd_iocontrol(LL_IOC_HSM_PROGRESS, sbi->ll_md_exp, sizeof(hpk), - &hpk, NULL); - - return rc ? 
rc : rc2; -} - -static int copy_and_ioctl(int cmd, struct obd_export *exp, - const void __user *data, size_t size) -{ - void *copy; - int rc; - - copy = memdup_user(data, size); - if (IS_ERR(copy)) - return PTR_ERR(copy); - - rc = obd_iocontrol(cmd, exp, size, copy, NULL); - kfree(copy); - - return rc; -} - -static int quotactl_ioctl(struct ll_sb_info *sbi, struct if_quotactl *qctl) -{ - int cmd = qctl->qc_cmd; - int type = qctl->qc_type; - int id = qctl->qc_id; - int valid = qctl->qc_valid; - int rc = 0; - - switch (cmd) { - case Q_SETQUOTA: - case Q_SETINFO: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case Q_GETQUOTA: - if (((type == USRQUOTA && - !uid_eq(current_euid(), make_kuid(&init_user_ns, id))) || - (type == GRPQUOTA && - !in_egroup_p(make_kgid(&init_user_ns, id)))) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case Q_GETINFO: - break; - default: - CERROR("unsupported quotactl op: %#x\n", cmd); - return -ENOTTY; - } - - if (valid != QC_GENERAL) { - if (cmd == Q_GETINFO) - qctl->qc_cmd = Q_GETOINFO; - else if (cmd == Q_GETQUOTA) - qctl->qc_cmd = Q_GETOQUOTA; - else - return -EINVAL; - - switch (valid) { - case QC_MDTIDX: - rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, - sizeof(*qctl), qctl, NULL); - break; - case QC_OSTIDX: - rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp, - sizeof(*qctl), qctl, NULL); - break; - case QC_UUID: - rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp, - sizeof(*qctl), qctl, NULL); - if (rc == -EAGAIN) - rc = obd_iocontrol(OBD_IOC_QUOTACTL, - sbi->ll_dt_exp, - sizeof(*qctl), qctl, NULL); - break; - default: - rc = -EINVAL; - break; - } - - if (rc) - return rc; - - qctl->qc_cmd = cmd; - } else { - struct obd_quotactl *oqctl; - - oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); - if (!oqctl) - return -ENOMEM; - - QCTL_COPY(oqctl, qctl); - rc = obd_quotactl(sbi->ll_md_exp, oqctl); - if (rc) { - kfree(oqctl); - return rc; - } - /* If QIF_SPACE is not set, client should collect the - * space usage from OSSs by itself - */ - if (cmd == Q_GETQUOTA && - !(oqctl->qc_dqblk.dqb_valid & QIF_SPACE) && - !oqctl->qc_dqblk.dqb_curspace) { - struct obd_quotactl *oqctl_tmp; - - oqctl_tmp = kzalloc(sizeof(*oqctl_tmp), GFP_NOFS); - if (!oqctl_tmp) { - rc = -ENOMEM; - goto out; - } - - oqctl_tmp->qc_cmd = Q_GETOQUOTA; - oqctl_tmp->qc_id = oqctl->qc_id; - oqctl_tmp->qc_type = oqctl->qc_type; - - /* collect space usage from OSTs */ - oqctl_tmp->qc_dqblk.dqb_curspace = 0; - rc = obd_quotactl(sbi->ll_dt_exp, oqctl_tmp); - if (!rc || rc == -EREMOTEIO) { - oqctl->qc_dqblk.dqb_curspace = - oqctl_tmp->qc_dqblk.dqb_curspace; - oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; - } - - /* collect space & inode usage from MDTs */ - oqctl_tmp->qc_dqblk.dqb_curspace = 0; - oqctl_tmp->qc_dqblk.dqb_curinodes = 0; - rc = obd_quotactl(sbi->ll_md_exp, oqctl_tmp); - if (!rc || rc == -EREMOTEIO) { - oqctl->qc_dqblk.dqb_curspace += - oqctl_tmp->qc_dqblk.dqb_curspace; - oqctl->qc_dqblk.dqb_curinodes = - oqctl_tmp->qc_dqblk.dqb_curinodes; - oqctl->qc_dqblk.dqb_valid |= QIF_INODES; - } else { - oqctl->qc_dqblk.dqb_valid &= ~QIF_SPACE; - } - - kfree(oqctl_tmp); - } -out: - QCTL_COPY(qctl, oqctl); - kfree(oqctl); - } - - return rc; -} - -/* This function tries to get a single name component, - * to send to the server. 
No actual path traversal involved, - * so we limit to NAME_MAX - */ -static char *ll_getname(const char __user *filename) -{ - int ret = 0, len; - char *tmp; - - tmp = kzalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmp) - return ERR_PTR(-ENOMEM); - - len = strncpy_from_user(tmp, filename, NAME_MAX + 1); - if (len < 0) - ret = len; - else if (len == 0) - ret = -ENOENT; - else if (len > NAME_MAX && tmp[NAME_MAX] != 0) - ret = -ENAMETOOLONG; - - if (ret) { - kfree(tmp); - tmp = ERR_PTR(ret); - } - return tmp; -} - -#define ll_putname(filename) kfree(filename) - -static long ll_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_ioctl_data *data; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), cmd=%#x\n", - PFID(ll_inode2fid(inode)), inode, cmd); - - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - switch (cmd) { - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - return ll_iocontrol(inode, file, cmd, arg); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: - return put_user(inode->i_generation, (int __user *)arg); - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. - case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - return mdtidx; - - if (put_user((int)mdtidx, (int __user *)arg)) - return -EFAULT; - - return 0; - } - case IOC_MDC_LOOKUP: { - int namelen, len = 0; - char *buf = NULL; - char *filename; - - rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc) - return rc; - data = (void *)buf; - - filename = data->ioc_inlbuf1; - namelen = strlen(filename); - - if (namelen < 1) { - CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); - rc = -EINVAL; - goto out_free; - } - - rc = ll_get_fid_by_name(inode, filename, namelen, NULL, NULL); - if (rc < 0) { - CERROR("%s: lookup %.*s failed: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), namelen, - filename, rc); - goto out_free; - } -out_free: - kvfree(buf); - return rc; - } - case LL_IOC_LMV_SETSTRIPE: { - struct lmv_user_md *lum; - char *buf = NULL; - char *filename; - int namelen = 0; - int lumlen = 0; - umode_t mode; - int len; - int rc; - - rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc) - return rc; - - data = (void *)buf; - if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || - data->ioc_inllen1 == 0 || data->ioc_inllen2 == 0) { - rc = -EINVAL; - goto lmv_out_free; - } - - filename = data->ioc_inlbuf1; - namelen = data->ioc_inllen1; - - if (namelen < 1) { - CDEBUG(D_INFO, "IOC_MDC_LOOKUP missing filename\n"); - rc = -EINVAL; - goto lmv_out_free; - } - lum = (struct lmv_user_md *)data->ioc_inlbuf2; - lumlen = data->ioc_inllen2; - - if (lum->lum_magic != LMV_USER_MAGIC || - lumlen != sizeof(*lum)) { - CERROR("%s: wrong lum magic %x or size %d: rc = %d\n", - filename, lum->lum_magic, lumlen, -EFAULT); - rc = -EINVAL; - goto lmv_out_free; - } - -#if OBD_OCD_VERSION(2, 9, 50, 0) > LUSTRE_VERSION_CODE - mode = data->ioc_type != 0 ? 
data->ioc_type : 0777; -#else - mode = data->ioc_type; -#endif - rc = ll_dir_setdirstripe(inode, lum, filename, mode); -lmv_out_free: - kvfree(buf); - return rc; - } - case LL_IOC_LMV_SET_DEFAULT_STRIPE: { - struct lmv_user_md __user *ulump; - struct lmv_user_md lum; - int rc; - - ulump = (struct lmv_user_md __user *)arg; - if (copy_from_user(&lum, ulump, sizeof(lum))) - return -EFAULT; - - if (lum.lum_magic != LMV_USER_MAGIC) - return -EINVAL; - - rc = ll_dir_setstripe(inode, (struct lov_user_md *)&lum, 0); - - return rc; - } - case LL_IOC_LOV_SETSTRIPE: { - struct lov_user_md_v3 lumv3; - struct lov_user_md_v1 *lumv1 = (struct lov_user_md_v1 *)&lumv3; - struct lov_user_md_v1 __user *lumv1p = (void __user *)arg; - struct lov_user_md_v3 __user *lumv3p = (void __user *)arg; - - int set_default = 0; - - LASSERT(sizeof(lumv3) == sizeof(*lumv3p)); - LASSERT(sizeof(lumv3.lmm_objects[0]) == - sizeof(lumv3p->lmm_objects[0])); - /* first try with v1 which is smaller than v3 */ - if (copy_from_user(lumv1, lumv1p, sizeof(*lumv1))) - return -EFAULT; - - if (lumv1->lmm_magic == LOV_USER_MAGIC_V3) { - if (copy_from_user(&lumv3, lumv3p, sizeof(lumv3))) - return -EFAULT; - if (lumv3.lmm_magic != LOV_USER_MAGIC_V3) - return -EINVAL; - } - - if (is_root_inode(inode)) - set_default = 1; - - /* in v1 and v3 cases lumv1 points to data */ - rc = ll_dir_setstripe(inode, lumv1, set_default); - - return rc; - } - case LL_IOC_LMV_GETSTRIPE: { - struct lmv_user_md __user *ulmv; - struct lmv_user_md lum; - struct ptlrpc_request *request = NULL; - struct lmv_user_md *tmp = NULL; - union lmv_mds_md *lmm = NULL; - u64 valid = 0; - int max_stripe_count; - int stripe_count; - int mdt_index; - int lum_size; - int lmmsize; - int rc; - int i; - - ulmv = (struct lmv_user_md __user *)arg; - if (copy_from_user(&lum, ulmv, sizeof(*ulmv))) - return -EFAULT; - - max_stripe_count = lum.lum_stripe_count; - /* - * lum_magic will indicate which stripe the ioctl will like - * to get, LMV_MAGIC_V1 is for normal LMV stripe, LMV_USER_MAGIC - * is for default LMV stripe - */ - if (lum.lum_magic == LMV_MAGIC_V1) - valid |= OBD_MD_MEA; - else if (lum.lum_magic == LMV_USER_MAGIC) - valid |= OBD_MD_DEFAULT_MEA; - else - return -EINVAL; - - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, &request, - valid); - if (rc) - goto finish_req; - - /* Get default LMV EA */ - if (lum.lum_magic == LMV_USER_MAGIC) { - if (lmmsize > sizeof(*ulmv)) { - rc = -EINVAL; - goto finish_req; - } - - if (copy_to_user(ulmv, lmm, lmmsize)) - rc = -EFAULT; - - goto finish_req; - } - - stripe_count = lmv_mds_md_stripe_count_get(lmm); - if (max_stripe_count < stripe_count) { - lum.lum_stripe_count = stripe_count; - if (copy_to_user(ulmv, &lum, sizeof(lum))) { - rc = -EFAULT; - goto finish_req; - } - rc = -E2BIG; - goto finish_req; - } - - lum_size = lmv_user_md_size(stripe_count, LMV_MAGIC_V1); - tmp = kzalloc(lum_size, GFP_NOFS); - if (!tmp) { - rc = -ENOMEM; - goto finish_req; - } - - mdt_index = ll_get_mdt_idx(inode); - if (mdt_index < 0) { - rc = -ENOMEM; - goto out_tmp; - } - tmp->lum_magic = LMV_MAGIC_V1; - tmp->lum_stripe_count = 0; - tmp->lum_stripe_offset = mdt_index; - for (i = 0; i < stripe_count; i++) { - struct lu_fid fid; - - fid_le_to_cpu(&fid, &lmm->lmv_md_v1.lmv_stripe_fids[i]); - mdt_index = ll_get_mdt_idx_by_fid(sbi, &fid); - if (mdt_index < 0) { - rc = mdt_index; - goto out_tmp; - } - tmp->lum_objects[i].lum_mds = mdt_index; - tmp->lum_objects[i].lum_fid = fid; - tmp->lum_stripe_count++; - } - - if (copy_to_user(ulmv, tmp, lum_size)) { - rc = 
-EFAULT; - goto out_tmp; - } -out_tmp: - kfree(tmp); -finish_req: - ptlrpc_req_finished(request); - return rc; - } - - case LL_IOC_LOV_SWAP_LAYOUTS: - return -EPERM; - case IOC_OBD_STATFS: - return ll_obd_statfs(inode, (void __user *)arg); - case LL_IOC_LOV_GETSTRIPE: - case LL_IOC_MDC_GETINFO: - case IOC_MDC_GETFILEINFO: - case IOC_MDC_GETFILESTRIPE: { - struct ptlrpc_request *request = NULL; - struct lov_user_md __user *lump; - struct lov_mds_md *lmm = NULL; - struct mdt_body *body; - char *filename = NULL; - int lmmsize; - - if (cmd == IOC_MDC_GETFILEINFO || - cmd == IOC_MDC_GETFILESTRIPE) { - filename = ll_getname((const char __user *)arg); - if (IS_ERR(filename)) - return PTR_ERR(filename); - - rc = ll_lov_getstripe_ea_info(inode, filename, &lmm, - &lmmsize, &request); - } else { - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmmsize, - &request, 0); - } - - if (request) { - body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - LASSERT(body); - } else { - goto out_req; - } - - if (rc < 0) { - if (rc == -ENODATA && (cmd == IOC_MDC_GETFILEINFO || - cmd == LL_IOC_MDC_GETINFO)) { - rc = 0; - goto skip_lmm; - } - - goto out_req; - } - - if (cmd == IOC_MDC_GETFILESTRIPE || - cmd == LL_IOC_LOV_GETSTRIPE) { - lump = (struct lov_user_md __user *)arg; - } else { - struct lov_user_mds_data __user *lmdp; - - lmdp = (struct lov_user_mds_data __user *)arg; - lump = &lmdp->lmd_lmm; - } - if (copy_to_user(lump, lmm, lmmsize)) { - if (copy_to_user(lump, lmm, sizeof(*lump))) { - rc = -EFAULT; - goto out_req; - } - rc = -EOVERFLOW; - } -skip_lmm: - if (cmd == IOC_MDC_GETFILEINFO || cmd == LL_IOC_MDC_GETINFO) { - struct lov_user_mds_data __user *lmdp; - lstat_t st = { 0 }; - - st.st_dev = inode->i_sb->s_dev; - st.st_mode = body->mbo_mode; - st.st_nlink = body->mbo_nlink; - st.st_uid = body->mbo_uid; - st.st_gid = body->mbo_gid; - st.st_rdev = body->mbo_rdev; - st.st_size = body->mbo_size; - st.st_blksize = PAGE_SIZE; - st.st_blocks = body->mbo_blocks; - st.st_atime = body->mbo_atime; - st.st_mtime = body->mbo_mtime; - st.st_ctime = body->mbo_ctime; - st.st_ino = cl_fid_build_ino(&body->mbo_fid1, - sbi->ll_flags & - LL_SBI_32BIT_API); - - lmdp = (struct lov_user_mds_data __user *)arg; - if (copy_to_user(&lmdp->lmd_st, &st, sizeof(st))) { - rc = -EFAULT; - goto out_req; - } - } - -out_req: - ptlrpc_req_finished(request); - if (filename) - ll_putname(filename); - return rc; - } - case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl; - - qctl = kzalloc(sizeof(*qctl), GFP_NOFS); - if (!qctl) - return -ENOMEM; - - if (copy_from_user(qctl, (void __user *)arg, sizeof(*qctl))) { - rc = -EFAULT; - goto out_quotactl; - } - - rc = quotactl_ioctl(sbi, qctl); - - if (rc == 0 && copy_to_user((void __user *)arg, qctl, - sizeof(*qctl))) - rc = -EFAULT; - -out_quotactl: - kfree(qctl); - return rc; - } - case OBD_IOC_GETDTNAME: - case OBD_IOC_GETMDNAME: - return ll_get_obd_name(inode, cmd, arg); - case LL_IOC_FLUSHCTX: - return ll_flush_ctx(inode); - case LL_IOC_GETOBDCOUNT: { - int count, vallen; - struct obd_export *exp; - - if (copy_from_user(&count, (int __user *)arg, sizeof(int))) - return -EFAULT; - - /* get ost count when count is zero, get mdt count otherwise */ - exp = count ? 
sbi->ll_md_exp : sbi->ll_dt_exp; - vallen = sizeof(count); - rc = obd_get_info(NULL, exp, sizeof(KEY_TGT_COUNT), - KEY_TGT_COUNT, &vallen, &count); - if (rc) { - CERROR("get target count failed: %d\n", rc); - return rc; - } - - if (copy_to_user((int __user *)arg, &count, sizeof(int))) - return -EFAULT; - - return 0; - } - case LL_IOC_PATH2FID: - if (copy_to_user((void __user *)arg, ll_inode2fid(inode), - sizeof(struct lu_fid))) - return -EFAULT; - return 0; - case LL_IOC_GET_CONNECT_FLAGS: { - return obd_iocontrol(cmd, sbi->ll_md_exp, 0, NULL, - (void __user *)arg); - } - case OBD_IOC_CHANGELOG_SEND: - case OBD_IOC_CHANGELOG_CLEAR: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, - sizeof(struct ioc_changelog)); - return rc; - case OBD_IOC_FID2PATH: - return ll_fid2path(inode, (void __user *)arg); - case LL_IOC_GETPARENT: - return ll_getparent(file, (void __user *)arg); - case LL_IOC_FID2MDTIDX: { - struct obd_export *exp = ll_i2mdexp(inode); - struct lu_fid fid; - __u32 index; - - if (copy_from_user(&fid, (const struct lu_fid __user *)arg, - sizeof(fid))) - return -EFAULT; - - /* Call mdc_iocontrol */ - rc = obd_iocontrol(LL_IOC_FID2MDTIDX, exp, sizeof(fid), &fid, - &index); - if (rc) - return rc; - - return index; - } - case LL_IOC_HSM_REQUEST: { - struct hsm_user_request *hur; - ssize_t totalsize; - - hur = memdup_user((void __user *)arg, sizeof(*hur)); - if (IS_ERR(hur)) - return PTR_ERR(hur); - - /* Compute the whole struct size */ - totalsize = hur_len(hur); - kfree(hur); - if (totalsize < 0) - return -E2BIG; - - /* Final size will be more than double totalsize */ - if (totalsize >= MDS_MAXREQSIZE / 3) - return -E2BIG; - - hur = kzalloc(totalsize, GFP_NOFS); - if (!hur) - return -ENOMEM; - - /* Copy the whole struct */ - if (copy_from_user(hur, (void __user *)arg, totalsize)) { - kvfree(hur); - return -EFAULT; - } - - if (hur->hur_request.hr_action == HUA_RELEASE) { - const struct lu_fid *fid; - struct inode *f; - int i; - - for (i = 0; i < hur->hur_request.hr_itemcount; i++) { - fid = &hur->hur_user_item[i].hui_fid; - f = search_inode_for_lustre(inode->i_sb, fid); - if (IS_ERR(f)) { - rc = PTR_ERR(f); - break; - } - - rc = ll_hsm_release(f); - iput(f); - if (rc != 0) - break; - } - } else { - rc = obd_iocontrol(cmd, ll_i2mdexp(inode), totalsize, - hur, NULL); - } - - kvfree(hur); - - return rc; - } - case LL_IOC_HSM_PROGRESS: { - struct hsm_progress_kernel hpk; - struct hsm_progress hp; - - if (copy_from_user(&hp, (void __user *)arg, sizeof(hp))) - return -EFAULT; - - hpk.hpk_fid = hp.hp_fid; - hpk.hpk_cookie = hp.hp_cookie; - hpk.hpk_extent = hp.hp_extent; - hpk.hpk_flags = hp.hp_flags; - hpk.hpk_errval = hp.hp_errval; - hpk.hpk_data_version = 0; - - /* File may not exist in Lustre; all progress - * reported to Lustre root - */ - rc = obd_iocontrol(cmd, sbi->ll_md_exp, sizeof(hpk), &hpk, - NULL); - return rc; - } - case LL_IOC_HSM_CT_START: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - rc = copy_and_ioctl(cmd, sbi->ll_md_exp, (void __user *)arg, - sizeof(struct lustre_kernelcomm)); - return rc; - - case LL_IOC_HSM_COPY_START: { - struct hsm_copy *copy; - int rc; - - copy = memdup_user((char __user *)arg, sizeof(*copy)); - if (IS_ERR(copy)) - return PTR_ERR(copy); - - rc = ll_ioc_copy_start(inode->i_sb, copy); - if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) - rc = -EFAULT; - - kfree(copy); - return rc; - } - case LL_IOC_HSM_COPY_END: { - struct hsm_copy *copy; - int rc; - - copy = memdup_user((char 
__user *)arg, sizeof(*copy)); - if (IS_ERR(copy)) - return PTR_ERR(copy); - - rc = ll_ioc_copy_end(inode->i_sb, copy); - if (copy_to_user((char __user *)arg, copy, sizeof(*copy))) - rc = -EFAULT; - - kfree(copy); - return rc; - } - case LL_IOC_MIGRATE: { - char *buf = NULL; - const char *filename; - int namelen = 0; - int len; - int rc; - int mdtidx; - - rc = obd_ioctl_getdata(&buf, &len, (void __user *)arg); - if (rc < 0) - return rc; - - data = (struct obd_ioctl_data *)buf; - if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || - !data->ioc_inllen1 || !data->ioc_inllen2) { - rc = -EINVAL; - goto migrate_free; - } - - filename = data->ioc_inlbuf1; - namelen = data->ioc_inllen1; - if (namelen < 1 || namelen != strlen(filename) + 1) { - rc = -EINVAL; - goto migrate_free; - } - - if (data->ioc_inllen2 != sizeof(mdtidx)) { - rc = -EINVAL; - goto migrate_free; - } - mdtidx = *(int *)data->ioc_inlbuf2; - - rc = ll_migrate(inode, file, mdtidx, filename, namelen - 1); -migrate_free: - kvfree(buf); - - return rc; - } - - default: - return obd_iocontrol(cmd, sbi->ll_dt_exp, 0, NULL, - (void __user *)arg); - } -} - -static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file->f_mapping->host; - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int api32 = ll_need_32bit_api(sbi); - loff_t ret = -EINVAL; - - switch (origin) { - case SEEK_SET: - break; - case SEEK_CUR: - offset += file->f_pos; - break; - case SEEK_END: - if (offset > 0) - goto out; - if (api32) - offset += LL_DIR_END_OFF_32BIT; - else - offset += LL_DIR_END_OFF; - break; - default: - goto out; - } - - if (offset >= 0 && - ((api32 && offset <= LL_DIR_END_OFF_32BIT) || - (!api32 && offset <= LL_DIR_END_OFF))) { - if (offset != file->f_pos) { - if ((api32 && offset == LL_DIR_END_OFF_32BIT) || - (!api32 && offset == LL_DIR_END_OFF)) - fd->lfd_pos = MDS_DIR_END_OFF; - else if (api32 && sbi->ll_flags & LL_SBI_64BIT_HASH) - fd->lfd_pos = offset << 32; - else - fd->lfd_pos = offset; - file->f_pos = offset; - } - ret = offset; - } - goto out; - -out: - return ret; -} - -static int ll_dir_open(struct inode *inode, struct file *file) -{ - return ll_file_open(inode, file); -} - -static int ll_dir_release(struct inode *inode, struct file *file) -{ - return ll_file_release(inode, file); -} - -const struct file_operations ll_dir_operations = { - .llseek = ll_dir_seek, - .open = ll_dir_open, - .release = ll_dir_release, - .read = generic_read_dir, - .iterate_shared = ll_readdir, - .unlocked_ioctl = ll_dir_ioctl, - .fsync = ll_fsync, -}; diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c deleted file mode 100644 index 02295931883bcc6166641a1e9fbe5054c0d28b93..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/file.c +++ /dev/null @@ -1,3580 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/file.c - * - * Author: Peter Braam - * Author: Phil Schwan - * Author: Andreas Dilger - */ - -#define DEBUG_SUBSYSTEM S_LLITE -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include "llite_internal.h" - -static int -ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg); - -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken); - -static enum llioc_iter -ll_iocontrol_call(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg, int *rcp); - -static struct ll_file_data *ll_file_data_get(void) -{ - struct ll_file_data *fd; - - fd = kmem_cache_zalloc(ll_file_data_slab, GFP_NOFS); - if (!fd) - return NULL; - fd->fd_write_failed = false; - return fd; -} - -static void ll_file_data_put(struct ll_file_data *fd) -{ - if (fd) - kmem_cache_free(ll_file_data_slab, fd); -} - -/** - * Packs all the attributes into @op_data for the CLOSE rpc. - */ -static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data, - struct obd_client_handle *och) -{ - struct ll_inode_info *lli = ll_i2info(inode); - - ll_prep_md_op_data(op_data, inode, NULL, NULL, - 0, 0, LUSTRE_OPC_ANY, NULL); - - op_data->op_attr.ia_mode = inode->i_mode; - op_data->op_attr.ia_atime = inode->i_atime; - op_data->op_attr.ia_mtime = inode->i_mtime; - op_data->op_attr.ia_ctime = inode->i_ctime; - op_data->op_attr.ia_size = i_size_read(inode); - op_data->op_attr.ia_valid |= ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_CTIME | ATTR_CTIME_SET; - op_data->op_attr_blocks = inode->i_blocks; - op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags); - op_data->op_handle = och->och_fh; - - /* - * For HSM: if inode data has been modified, pack it so that - * MDT can set data dirty flag in the archive. - */ - if (och->och_flags & FMODE_WRITE && - test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) - op_data->op_bias |= MDS_DATA_MODIFIED; -} - -/** - * Perform a close, possibly with a bias. - * The meaning of "data" depends on the value of "bias". - * - * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version. - * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to - * swap layouts with. 
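As a minimal sketch of the bias/data contract documented above (the types, enum values and helper below are stand-ins, not the real Lustre definitions):

#include <assert.h>
#include <stdint.h>

enum close_bias_sketch {
	CLOSE_PLAIN_SKETCH,		/* data must be NULL */
	CLOSE_HSM_RELEASE_SKETCH,	/* data: const uint64_t *data_version */
	CLOSE_LAYOUT_SWAP_SKETCH,	/* data: struct inode_sketch *victim */
};

struct inode_sketch {
	uint64_t fid;
};

/* Fill the close request fields that depend on the bias; mirrors the
 * switch at the top of ll_close_inode_openhandle() below. */
static void prepare_close_sketch(enum close_bias_sketch bias, void *data,
				 uint64_t *data_version, uint64_t *fid2)
{
	switch (bias) {
	case CLOSE_HSM_RELEASE_SKETCH:
		assert(data);
		*data_version = *(const uint64_t *)data;
		break;
	case CLOSE_LAYOUT_SWAP_SKETCH:
		assert(data);
		*fid2 = ((struct inode_sketch *)data)->fid;
		break;
	default:
		assert(!data);	/* a plain close carries no payload */
		break;
	}
}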
- */ -static int ll_close_inode_openhandle(struct inode *inode, - struct obd_client_handle *och, - enum mds_op_bias bias, - void *data) -{ - const struct ll_inode_info *lli = ll_i2info(inode); - struct obd_export *md_exp = ll_i2mdexp(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int rc; - - if (!class_exp2obd(md_exp)) { - CERROR("%s: invalid MDC connection handle closing " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid)); - rc = 0; - goto out; - } - - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - /* - * We leak openhandle and request here on error, but not much to be - * done in OOM case since app won't retry close on error either. - */ - if (!op_data) { - rc = -ENOMEM; - goto out; - } - - ll_prepare_close(inode, op_data, och); - switch (bias) { - case MDS_CLOSE_LAYOUT_SWAP: - LASSERT(data); - op_data->op_bias |= MDS_CLOSE_LAYOUT_SWAP; - op_data->op_data_version = 0; - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_fid2 = *ll_inode2fid(data); - break; - - case MDS_HSM_RELEASE: - LASSERT(data); - op_data->op_bias |= MDS_HSM_RELEASE; - op_data->op_data_version = *(__u64 *)data; - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_attr.ia_valid |= ATTR_SIZE | ATTR_BLOCKS; - break; - - default: - LASSERT(!data); - break; - } - - rc = md_close(md_exp, op_data, och->och_mod, &req); - if (rc && rc != -EINTR) { - CERROR("%s: inode " DFID " mdc close failed: rc = %d\n", - md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc); - } - - if (op_data->op_bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP) && - !rc) { - struct mdt_body *body; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED)) - rc = -EBUSY; - } - - ll_finish_md_op_data(op_data); - -out: - md_clear_open_replay_data(md_exp, och); - och->och_fh.cookie = DEAD_HANDLE_MAGIC; - kfree(och); - - ptlrpc_req_finished(req); - return rc; -} - -int ll_md_real_close(struct inode *inode, fmode_t fmode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle **och_p; - struct obd_client_handle *och; - __u64 *och_usecount; - int rc = 0; - - if (fmode & FMODE_WRITE) { - och_p = &lli->lli_mds_write_och; - och_usecount = &lli->lli_open_fd_write_count; - } else if (fmode & FMODE_EXEC) { - och_p = &lli->lli_mds_exec_och; - och_usecount = &lli->lli_open_fd_exec_count; - } else { - LASSERT(fmode & FMODE_READ); - och_p = &lli->lli_mds_read_och; - och_usecount = &lli->lli_open_fd_read_count; - } - - mutex_lock(&lli->lli_och_mutex); - if (*och_usecount > 0) { - /* There are still users of this handle, so skip - * freeing it. - */ - mutex_unlock(&lli->lli_och_mutex); - return 0; - } - - och = *och_p; - *och_p = NULL; - mutex_unlock(&lli->lli_och_mutex); - - if (och) { - /* There might be a race and this handle may already - * be closed. 
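The per-access-mode handle bookkeeping that ll_md_real_close() above relies on can be sketched as follows; the structure and helper are illustrative only, and the lli_och_mutex locking of the real code is omitted.

#include <stdint.h>

/* One cached MDS open handle per access mode, plus a use count; in the
 * real code the three slots live in ll_inode_info. */
struct mode_handles_sketch {
	void *write_och, *exec_och, *read_och;
	uint64_t write_users, exec_users, read_users;
};

/* Mirrors the och_p/och_usecount selection: write wins over exec,
 * and read is the fallback. */
static void **pick_handle_slot(struct mode_handles_sketch *h,
			       int fmode_write, int fmode_exec,
			       uint64_t **users)
{
	if (fmode_write) {
		*users = &h->write_users;
		return &h->write_och;
	}
	if (fmode_exec) {
		*users = &h->exec_users;
		return &h->exec_och;
	}
	*users = &h->read_users;
	return &h->read_och;
}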
- */ - rc = ll_close_inode_openhandle(inode, och, 0, NULL); - } - - return rc; -} - -static int ll_md_close(struct inode *inode, struct file *file) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_inode_info *lli = ll_i2info(inode); - int lockmode; - __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; - struct lustre_handle lockh; - union ldlm_policy_data policy = { - .l_inodebits = { MDS_INODELOCK_OPEN } - }; - int rc = 0; - - /* clear group lock, if present */ - if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) - ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid); - - if (fd->fd_lease_och) { - bool lease_broken; - - /* Usually the lease is not released when the - * application crashed, we need to release here. - */ - rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken); - CDEBUG(rc ? D_ERROR : D_INODE, - "Clean up lease " DFID " %d/%d\n", - PFID(&lli->lli_fid), rc, lease_broken); - - fd->fd_lease_och = NULL; - } - - if (fd->fd_och) { - rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL); - fd->fd_och = NULL; - goto out; - } - - /* Let's see if we have good enough OPEN lock on the file and if - * we can skip talking to MDS - */ - - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_omode & FMODE_WRITE) { - lockmode = LCK_CW; - LASSERT(lli->lli_open_fd_write_count); - lli->lli_open_fd_write_count--; - } else if (fd->fd_omode & FMODE_EXEC) { - lockmode = LCK_PR; - LASSERT(lli->lli_open_fd_exec_count); - lli->lli_open_fd_exec_count--; - } else { - lockmode = LCK_CR; - LASSERT(lli->lli_open_fd_read_count); - lli->lli_open_fd_read_count--; - } - mutex_unlock(&lli->lli_och_mutex); - - if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode), - LDLM_IBITS, &policy, lockmode, &lockh)) - rc = ll_md_real_close(inode, fd->fd_omode); - -out: - LUSTRE_FPRIVATE(file) = NULL; - ll_file_data_put(fd); - - return rc; -} - -/* While this returns an error code, fput() the caller does not, so we need - * to make every effort to clean up all of our state here. Also, applications - * rarely check close errors and even if an error is returned they will not - * re-try the close call. - */ -int ll_file_release(struct inode *inode, struct file *file) -{ - struct ll_file_data *fd; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - if (!is_root_inode(inode)) - ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1); - fd = LUSTRE_FPRIVATE(file); - LASSERT(fd); - - /* The last ref on @file, maybe not be the owner pid of statahead, - * because parent and child process can share the same file handle. 
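A compact sketch of the open-mode to DLM lock-mode mapping that ll_md_close() above uses when probing for a cached OPEN lock; the enum values are placeholders for the real LCK_* constants.

enum dlm_mode_sketch {
	LCK_CR_SKETCH,	/* concurrent read */
	LCK_PR_SKETCH,	/* protected read, used for exec */
	LCK_CW_SKETCH,	/* concurrent write */
};

/* The mode the file was opened with determines which cached OPEN lock
 * could cover it; if md_lock_match() finds one, the MDS close RPC can
 * be skipped entirely. */
static enum dlm_mode_sketch open_to_lock_mode(int fmode_write, int fmode_exec)
{
	if (fmode_write)
		return LCK_CW_SKETCH;
	if (fmode_exec)
		return LCK_PR_SKETCH;
	return LCK_CR_SKETCH;
}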
- */ - if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd) - ll_deauthorize_statahead(inode, fd); - - if (is_root_inode(inode)) { - LUSTRE_FPRIVATE(file) = NULL; - ll_file_data_put(fd); - return 0; - } - - if (!S_ISDIR(inode->i_mode)) { - if (lli->lli_clob) - lov_read_and_clear_async_rc(lli->lli_clob); - lli->lli_async_rc = 0; - } - - rc = ll_md_close(inode, file); - - if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val)) - libcfs_debug_dumplog(); - - return rc; -} - -static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize, - struct lookup_intent *itp) -{ - struct inode *inode = d_inode(de); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct dentry *parent = de->d_parent; - const char *name = NULL; - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - int len = 0, rc; - - LASSERT(parent); - LASSERT(itp->it_flags & MDS_OPEN_BY_FID); - - /* - * if server supports open-by-fid, or file name is invalid, don't pack - * name in open request - */ - if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) && - lu_name_is_valid_2(de->d_name.name, de->d_name.len)) { - name = de->d_name.name; - len = de->d_name.len; - } - - op_data = ll_prep_md_op_data(NULL, d_inode(parent), inode, name, len, - O_RDWR, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - op_data->op_data = lmm; - op_data->op_data_size = lmmsize; - - rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req, - &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc == -ESTALE) { - /* reason for keep own exit path - don`t flood log - * with messages with -ESTALE errors. - */ - if (!it_disposition(itp, DISP_OPEN_OPEN) || - it_open_error(DISP_OPEN_OPEN, itp)) - goto out; - ll_release_openhandle(inode, itp); - goto out; - } - - if (it_disposition(itp, DISP_LOOKUP_NEG)) { - rc = -ENOENT; - goto out; - } - - if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) { - rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp); - CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc); - goto out; - } - - rc = ll_prep_inode(&inode, req, NULL, itp); - if (!rc && itp->it_lock_mode) - ll_set_lock_data(sbi->ll_md_exp, inode, itp, NULL); - -out: - ptlrpc_req_finished(req); - ll_intent_drop_lock(itp); - - /* - * We did open by fid, but by the time we got to the server, - * the object disappeared. If this is a create, we cannot really - * tell the userspace that the file it was trying to create - * does not exist. Instead let's return -ESTALE, and the VFS will - * retry the create with LOOKUP_REVAL that we are going to catch - * in ll_revalidate_dentry() and use lookup then. 
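The ESTALE/LOOKUP_REVAL retry protocol described above, reduced to a standalone sketch (the flag values are placeholders, not the kernel's):

#include <errno.h>

#define IT_CREAT_SKETCH		0x1	/* stand-in for IT_CREAT */
#define LOOKUP_REVAL_SKETCH	0x2	/* stand-in for LOOKUP_REVAL */

/* Open-by-FID found nothing: for a create, ask the VFS to restart
 * rather than reporting ENOENT for the file it was trying to create. */
static int fixup_open_rc(int rc, unsigned int it_op)
{
	if (rc == -ENOENT && (it_op & IT_CREAT_SKETCH))
		return -ESTALE;
	return rc;
}

/* On the LOOKUP_REVAL retry, declare the dentry invalid so the next
 * attempt goes to the server by name instead of by FID, as in
 * ll_revalidate_dentry(). */
static int revalidate_sketch(unsigned int lookup_flags)
{
	if (lookup_flags & LOOKUP_REVAL_SKETCH)
		return 0;
	return 1;
}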
- */ - if (rc == -ENOENT && itp->it_op & IT_CREAT) - rc = -ESTALE; - - return rc; -} - -static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it, - struct obd_client_handle *och) -{ - struct mdt_body *body; - - body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY); - och->och_fh = body->mbo_handle; - och->och_fid = body->mbo_fid1; - och->och_lease_handle.cookie = it->it_lock_handle; - och->och_magic = OBD_CLIENT_HANDLE_MAGIC; - och->och_flags = it->it_flags; - - return md_set_open_replay_data(md_exp, och, it); -} - -static int ll_local_open(struct file *file, struct lookup_intent *it, - struct ll_file_data *fd, struct obd_client_handle *och) -{ - struct inode *inode = file_inode(file); - - LASSERT(!LUSTRE_FPRIVATE(file)); - - LASSERT(fd); - - if (och) { - int rc; - - rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); - if (rc != 0) - return rc; - } - - LUSTRE_FPRIVATE(file) = fd; - ll_readahead_init(inode, &fd->fd_ras); - fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); - - /* ll_cl_context initialize */ - rwlock_init(&fd->fd_lock); - INIT_LIST_HEAD(&fd->fd_lccs); - - return 0; -} - -/* Open a file, and (for the very first open) create objects on the OSTs at - * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object - * creation or open until ll_lov_setstripe() ioctl is called. - * - * If we already have the stripe MD locally then we don't request it in - * md_open(), by passing a lmm_size = 0. - * - * It is up to the application to ensure no other processes open this file - * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be - * used. We might be able to avoid races of that sort by getting lli_open_sem - * before returning in the O_LOV_DELAY_CREATE case and dropping it here - * or in ll_file_release(), but I'm not sure that is desirable/necessary. - */ -int ll_file_open(struct inode *inode, struct file *file) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lookup_intent *it, oit = { .it_op = IT_OPEN, - .it_flags = file->f_flags }; - struct obd_client_handle **och_p = NULL; - __u64 *och_usecount = NULL; - struct ll_file_data *fd; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), flags %o\n", - PFID(ll_inode2fid(inode)), inode, file->f_flags); - - it = file->private_data; /* XXX: compat macro */ - file->private_data = NULL; /* prevent ll_local_open assertion */ - - fd = ll_file_data_get(); - if (!fd) { - rc = -ENOMEM; - goto out_openerr; - } - - fd->fd_file = file; - if (S_ISDIR(inode->i_mode)) - ll_authorize_statahead(inode, fd); - - if (is_root_inode(inode)) { - LUSTRE_FPRIVATE(file) = fd; - return 0; - } - - if (!it || !it->it_disposition) { - /* Convert f_flags into access mode. We cannot use file->f_mode, - * because everything but O_ACCMODE mask was stripped from - * there - */ - if ((oit.it_flags + 1) & O_ACCMODE) - oit.it_flags++; - if (file->f_flags & O_TRUNC) - oit.it_flags |= FMODE_WRITE; - - /* kernel only call f_op->open in dentry_open. filp_open calls - * dentry_open after call to open_namei that checks permissions. - * Only nfsd_open call dentry_open directly without checking - * permissions and because of that this code below is safe. - */ - if (oit.it_flags & (FMODE_WRITE | FMODE_READ)) - oit.it_flags |= MDS_OPEN_OWNEROVERRIDE; - - /* We do not want O_EXCL here, presumably we opened the file - * already? XXX - NFS implications? 
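The "(it_flags + 1) & O_ACCMODE" conversion above exploits the fact that O_RDONLY/O_WRONLY/O_RDWR are encoded as 0/1/2 while FMODE_READ/FMODE_WRITE are the bit values 1/2, so adding one translates between the two encodings. A standalone demonstration, with local stand-in constants:

#include <stdio.h>

#define O_ACCMODE_SKETCH	0x3	/* stand-in for O_ACCMODE */

/* Isolate the access-mode bits; the real code increments it_flags in
 * place instead, so the remaining open flags survive. */
static unsigned int accmode_to_fmode(unsigned int open_flags)
{
	return (open_flags + 1) & O_ACCMODE_SKETCH;
}

int main(void)
{
	/* O_RDONLY(0) -> 1 (read), O_WRONLY(1) -> 2 (write),
	 * O_RDWR(2) -> 3 (read|write) */
	for (unsigned int f = 0; f < 3; f++)
		printf("%u -> %u\n", f, accmode_to_fmode(f));
	return 0;
}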
- */ - oit.it_flags &= ~O_EXCL; - - /* bug20584, if "it_flags" contains O_CREAT, the file will be - * created if necessary, then "IT_CREAT" should be set to keep - * consistent with it - */ - if (oit.it_flags & O_CREAT) - oit.it_op |= IT_CREAT; - - it = &oit; - } - -restart: - /* Let's see if we have file open on MDS already. */ - if (it->it_flags & FMODE_WRITE) { - och_p = &lli->lli_mds_write_och; - och_usecount = &lli->lli_open_fd_write_count; - } else if (it->it_flags & FMODE_EXEC) { - och_p = &lli->lli_mds_exec_och; - och_usecount = &lli->lli_open_fd_exec_count; - } else { - och_p = &lli->lli_mds_read_och; - och_usecount = &lli->lli_open_fd_read_count; - } - - mutex_lock(&lli->lli_och_mutex); - if (*och_p) { /* Open handle is present */ - if (it_disposition(it, DISP_OPEN_OPEN)) { - /* Well, there's extra open request that we do not need, - * let's close it somehow. This will decref request. - */ - rc = it_open_error(DISP_OPEN_OPEN, it); - if (rc) { - mutex_unlock(&lli->lli_och_mutex); - goto out_openerr; - } - - ll_release_openhandle(inode, it); - } - (*och_usecount)++; - - rc = ll_local_open(file, it, fd, NULL); - if (rc) { - (*och_usecount)--; - mutex_unlock(&lli->lli_och_mutex); - goto out_openerr; - } - } else { - LASSERT(*och_usecount == 0); - if (!it->it_disposition) { - /* We cannot just request lock handle now, new ELC code - * means that one of other OPEN locks for this file - * could be cancelled, and since blocking ast handler - * would attempt to grab och_mutex as well, that would - * result in a deadlock - */ - mutex_unlock(&lli->lli_och_mutex); - /* - * Normally called under two situations: - * 1. NFS export. - * 2. revalidate with IT_OPEN (revalidate doesn't - * execute this intent any more). - * - * Always fetch MDS_OPEN_LOCK if this is not setstripe. - * - * Always specify MDS_OPEN_BY_FID because we don't want - * to get file with different fid. - */ - it->it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID; - rc = ll_intent_file_open(file->f_path.dentry, - NULL, 0, it); - if (rc) - goto out_openerr; - - goto restart; - } - *och_p = kzalloc(sizeof(struct obd_client_handle), GFP_NOFS); - if (!*och_p) { - rc = -ENOMEM; - goto out_och_free; - } - - (*och_usecount)++; - - /* md_intent_lock() didn't get a request ref if there was an - * open error, so don't do cleanup on the request here - * (bug 3430) - */ - /* XXX (green): Should not we bail out on any error here, not - * just open error? 
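A sketch of the cached-handle fast path taken above when an MDS open handle for this mode already exists; names and types are illustrative, and the mutex held in the real code is omitted.

/* Fast path: an MDS open handle for this mode is already cached, so any
 * extra open the intent RPC performed is redundant and must be released
 * again before the handle's use count is bumped. */
static int reuse_or_open(void **och_slot, unsigned long *users,
			 void (*release_extra_open)(void))
{
	if (*och_slot) {
		if (release_extra_open)
			release_extra_open();
		(*users)++;
		return 0;	/* reuse the cached handle */
	}
	return 1;		/* slow path: real open on the MDS */
}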
- */ - rc = it_open_error(DISP_OPEN_OPEN, it); - if (rc) - goto out_och_free; - - LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF), - "inode %p: disposition %x, status %d\n", inode, - it_disposition(it, ~0), it->it_status); - - rc = ll_local_open(file, it, fd, *och_p); - if (rc) - goto out_och_free; - } - mutex_unlock(&lli->lli_och_mutex); - fd = NULL; - - /* Must do this outside lli_och_mutex lock to prevent deadlock where - * different kind of OPEN lock for this same inode gets cancelled - * by ldlm_cancel_lru - */ - if (!S_ISREG(inode->i_mode)) - goto out_och_free; - - cl_lov_delay_create_clear(&file->f_flags); - goto out_och_free; - -out_och_free: - if (rc) { - if (och_p && *och_p) { - kfree(*och_p); - *och_p = NULL; - (*och_usecount)--; - } - mutex_unlock(&lli->lli_och_mutex); - -out_openerr: - if (lli->lli_opendir_key == fd) - ll_deauthorize_statahead(inode, fd); - if (fd) - ll_file_data_put(fd); - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1); - } - - if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) { - ptlrpc_req_finished(it->it_request); - it_clear_disposition(it, DISP_ENQ_OPEN_REF); - } - - return rc; -} - -static int ll_md_blocking_lease_ast(struct ldlm_lock *lock, - struct ldlm_lock_desc *desc, - void *data, int flag) -{ - int rc; - struct lustre_handle lockh; - - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc); - return rc; - } - break; - case LDLM_CB_CANCELING: - /* do nothing */ - break; - } - return 0; -} - -/** - * Acquire a lease and open the file. - */ -static struct obd_client_handle * -ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode, - __u64 open_flags) -{ - struct lookup_intent it = { .it_op = IT_OPEN }; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct ptlrpc_request *req = NULL; - struct lustre_handle old_handle = { 0 }; - struct obd_client_handle *och = NULL; - int rc; - int rc2; - - if (fmode != FMODE_WRITE && fmode != FMODE_READ) - return ERR_PTR(-EINVAL); - - if (file) { - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct obd_client_handle **och_p; - __u64 *och_usecount; - - if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC)) - return ERR_PTR(-EPERM); - - /* Get the openhandle of the file */ - rc = -EBUSY; - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - mutex_unlock(&lli->lli_och_mutex); - return ERR_PTR(rc); - } - - if (!fd->fd_och) { - if (file->f_mode & FMODE_WRITE) { - LASSERT(lli->lli_mds_write_och); - och_p = &lli->lli_mds_write_och; - och_usecount = &lli->lli_open_fd_write_count; - } else { - LASSERT(lli->lli_mds_read_och); - och_p = &lli->lli_mds_read_och; - och_usecount = &lli->lli_open_fd_read_count; - } - if (*och_usecount == 1) { - fd->fd_och = *och_p; - *och_p = NULL; - *och_usecount = 0; - rc = 0; - } - } - mutex_unlock(&lli->lli_och_mutex); - if (rc < 0) /* more than 1 opener */ - return ERR_PTR(rc); - - LASSERT(fd->fd_och); - old_handle = fd->fd_och->och_fh; - } - - och = kzalloc(sizeof(*och), GFP_NOFS); - if (!och) - return ERR_PTR(-ENOMEM); - - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - /* To tell the MDT this openhandle is from the same owner */ - op_data->op_handle = old_handle; - - it.it_flags = fmode | open_flags; - it.it_flags |= MDS_OPEN_LOCK | 
MDS_OPEN_BY_FID | MDS_OPEN_LEASE; - rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, - &ll_md_blocking_lease_ast, - /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise - * it can be cancelled which may mislead applications that the lease is - * broken; - * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal - * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast - * doesn't deal with openhandle, so normal openhandle will be leaked. - */ - LDLM_FL_NO_LRU | LDLM_FL_EXCL); - ll_finish_md_op_data(op_data); - ptlrpc_req_finished(req); - if (rc < 0) - goto out_release_it; - - if (it_disposition(&it, DISP_LOOKUP_NEG)) { - rc = -ENOENT; - goto out_release_it; - } - - rc = it_open_error(DISP_OPEN_OPEN, &it); - if (rc) - goto out_release_it; - - LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF)); - ll_och_fill(sbi->ll_md_exp, &it, och); - - if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */ { - rc = -EOPNOTSUPP; - goto out_close; - } - - /* already get lease, handle lease lock */ - ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); - if (it.it_lock_mode == 0 || - it.it_lock_bits != MDS_INODELOCK_OPEN) { - /* open lock must return for lease */ - CERROR(DFID "lease granted but no open lock, %d/%llu.\n", - PFID(ll_inode2fid(inode)), it.it_lock_mode, - it.it_lock_bits); - rc = -EPROTO; - goto out_close; - } - - ll_intent_release(&it); - return och; - -out_close: - /* Cancel open lock */ - if (it.it_lock_mode != 0) { - ldlm_lock_decref_and_cancel(&och->och_lease_handle, - it.it_lock_mode); - it.it_lock_mode = 0; - och->och_lease_handle.cookie = 0ULL; - } - rc2 = ll_close_inode_openhandle(inode, och, 0, NULL); - if (rc2 < 0) - CERROR("%s: error closing file " DFID ": %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&ll_i2info(inode)->lli_fid), rc2); - och = NULL; /* och has been freed in ll_close_inode_openhandle() */ -out_release_it: - ll_intent_release(&it); -out: - kfree(och); - return ERR_PTR(rc); -} - -/** - * Check whether a layout swap can be done between two inodes. - * - * \param[in] inode1 First inode to check - * \param[in] inode2 Second inode to check - * - * \retval 0 on success, layout swap can be performed between both inodes - * \retval negative error code if requirements are not met - */ -static int ll_check_swap_layouts_validity(struct inode *inode1, - struct inode *inode2) -{ - if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) - return -EINVAL; - - if (inode_permission(inode1, MAY_WRITE) || - inode_permission(inode2, MAY_WRITE)) - return -EPERM; - - if (inode1->i_sb != inode2->i_sb) - return -EXDEV; - - return 0; -} - -static int ll_swap_layouts_close(struct obd_client_handle *och, - struct inode *inode, struct inode *inode2) -{ - const struct lu_fid *fid1 = ll_inode2fid(inode); - const struct lu_fid *fid2; - int rc; - - CDEBUG(D_INODE, "%s: biased close of file " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1)); - - rc = ll_check_swap_layouts_validity(inode, inode2); - if (rc < 0) - goto out_free_och; - - /* We now know that inode2 is a lustre inode */ - fid2 = ll_inode2fid(inode2); - - rc = lu_fid_cmp(fid1, fid2); - if (!rc) { - rc = -EINVAL; - goto out_free_och; - } - - /* - * Close the file and swap layouts between inode & inode2. - * NB: lease lock handle is released in mdc_close_layout_swap_pack() - * because we still need it to pack l_remote_handle to MDT. 
- */ - rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP, - inode2); - - och = NULL; /* freed in ll_close_inode_openhandle() */ - -out_free_och: - kfree(och); - return rc; -} - -/** - * Release lease and close the file. - * It will check if the lease has ever broken. - */ -static int ll_lease_close(struct obd_client_handle *och, struct inode *inode, - bool *lease_broken) -{ - struct ldlm_lock *lock; - bool cancelled = true; - - lock = ldlm_handle2lock(&och->och_lease_handle); - if (lock) { - lock_res_and_lock(lock); - cancelled = ldlm_is_cancel(lock); - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - } - - CDEBUG(D_INODE, "lease for " DFID " broken? %d\n", - PFID(&ll_i2info(inode)->lli_fid), cancelled); - - if (!cancelled) - ldlm_cli_cancel(&och->och_lease_handle, 0); - if (lease_broken) - *lease_broken = cancelled; - - return ll_close_inode_openhandle(inode, och, 0, NULL); -} - -int ll_merge_attr(const struct lu_env *env, struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct cl_attr *attr = vvp_env_thread_attr(env); - s64 atime; - s64 mtime; - s64 ctime; - int rc = 0; - - ll_inode_size_lock(inode); - - /* merge timestamps the most recently obtained from mds with - * timestamps obtained from osts - */ - LTIME_S(inode->i_atime) = lli->lli_atime; - LTIME_S(inode->i_mtime) = lli->lli_mtime; - LTIME_S(inode->i_ctime) = lli->lli_ctime; - - mtime = LTIME_S(inode->i_mtime); - atime = LTIME_S(inode->i_atime); - ctime = LTIME_S(inode->i_ctime); - - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - - if (rc != 0) - goto out_size_unlock; - - if (atime < attr->cat_atime) - atime = attr->cat_atime; - - if (ctime < attr->cat_ctime) - ctime = attr->cat_ctime; - - if (mtime < attr->cat_mtime) - mtime = attr->cat_mtime; - - CDEBUG(D_VFSTRACE, DFID " updating i_size %llu\n", - PFID(&lli->lli_fid), attr->cat_size); - - i_size_write(inode, attr->cat_size); - - inode->i_blocks = attr->cat_blocks; - - LTIME_S(inode->i_mtime) = mtime; - LTIME_S(inode->i_atime) = atime; - LTIME_S(inode->i_ctime) = ctime; - -out_size_unlock: - ll_inode_size_unlock(inode); - - return rc; -} - -static bool file_is_noatime(const struct file *file) -{ - const struct vfsmount *mnt = file->f_path.mnt; - const struct inode *inode = file_inode(file); - - /* Adapted from file_accessed() and touch_atime().*/ - if (file->f_flags & O_NOATIME) - return true; - - if (inode->i_flags & S_NOATIME) - return true; - - if (IS_NOATIME(inode)) - return true; - - if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY)) - return true; - - if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - return true; - - if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) - return true; - - return false; -} - -static void ll_io_init(struct cl_io *io, const struct file *file, int write) -{ - struct inode *inode = file_inode(file); - - io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; - if (write) { - io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND); - io->u.ci_wr.wr_sync = file->f_flags & O_SYNC || - file->f_flags & O_DIRECT || - IS_SYNC(inode); - } - io->ci_obj = ll_i2info(inode)->lli_clob; - io->ci_lockreq = CILR_MAYBE; - if (ll_file_nolock(file)) { - io->ci_lockreq = CILR_NEVER; - io->ci_no_srvlock = 1; - } else if (file->f_flags & O_APPEND) { - io->ci_lockreq = CILR_MANDATORY; - } - - io->ci_noatime = file_is_noatime(file); -} - -static ssize_t -ll_file_io_generic(const struct lu_env *env, 
struct vvp_io_args *args, - struct file *file, enum cl_io_type iot, - loff_t *ppos, size_t count) -{ - struct ll_inode_info *lli = ll_i2info(file_inode(file)); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct vvp_io *vio = vvp_env_io(env); - struct range_lock range; - struct cl_io *io; - ssize_t result = 0; - int rc = 0; - - CDEBUG(D_VFSTRACE, "file: %pD, type: %d ppos: %llu, count: %zu\n", - file, iot, *ppos, count); - -restart: - io = vvp_env_thread_io(env); - ll_io_init(io, file, iot == CIT_WRITE); - - if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { - struct vvp_io *vio = vvp_env_io(env); - bool range_locked = false; - - if (file->f_flags & O_APPEND) - range_lock_init(&range, 0, LUSTRE_EOF); - else - range_lock_init(&range, *ppos, *ppos + count - 1); - - vio->vui_fd = LUSTRE_FPRIVATE(file); - vio->vui_iter = args->u.normal.via_iter; - vio->vui_iocb = args->u.normal.via_iocb; - /* - * Direct IO reads must also take range lock, - * or multiple reads will try to work on the same pages - * See LU-6227 for details. - */ - if (((iot == CIT_WRITE) || - (iot == CIT_READ && (file->f_flags & O_DIRECT))) && - !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - CDEBUG(D_VFSTRACE, "Range lock [%llu, %llu]\n", - range.rl_node.in_extent.start, - range.rl_node.in_extent.end); - rc = range_lock(&lli->lli_write_tree, &range); - if (rc < 0) - goto out; - - range_locked = true; - } - ll_cl_add(file, env, io); - rc = cl_io_loop(env, io); - ll_cl_remove(file, env); - if (range_locked) { - CDEBUG(D_VFSTRACE, "Range unlock [%llu, %llu]\n", - range.rl_node.in_extent.start, - range.rl_node.in_extent.end); - range_unlock(&lli->lli_write_tree, &range); - } - } else { - /* cl_io_rw_init() handled IO */ - rc = io->ci_result; - } - - if (io->ci_nob > 0) { - result = io->ci_nob; - count -= io->ci_nob; - *ppos = io->u.ci_wr.wr.crw_pos; - - /* prepare IO restart */ - if (count > 0) - args->u.normal.via_iter = vio->vui_iter; - } -out: - cl_io_fini(env, io); - - if ((!rc || rc == -ENODATA) && count > 0 && io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - "%s: restart %s from %lld, count:%zu, result: %zd\n", - file_dentry(file)->d_name.name, - iot == CIT_READ ? "read" : "write", - *ppos, count, result); - goto restart; - } - - if (iot == CIT_READ) { - if (result >= 0) - ll_stats_ops_tally(ll_i2sbi(file_inode(file)), - LPROC_LL_READ_BYTES, result); - } else if (iot == CIT_WRITE) { - if (result >= 0) { - ll_stats_ops_tally(ll_i2sbi(file_inode(file)), - LPROC_LL_WRITE_BYTES, result); - fd->fd_write_failed = false; - } else if (!result && !rc) { - rc = io->ci_result; - if (rc < 0) - fd->fd_write_failed = true; - else - fd->fd_write_failed = false; - } else if (rc != -ERESTARTSYS) { - fd->fd_write_failed = true; - } - } - CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result); - - return result > 0 ? result : rc; -} - -static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct lu_env *env; - struct vvp_io_args *args; - ssize_t result; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - args = ll_env_args(env); - args->u.normal.via_iter = to; - args->u.normal.via_iocb = iocb; - - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, - &iocb->ki_pos, iov_iter_count(to)); - cl_env_put(env, &refcheck); - return result; -} - -/* - * Write to a file (through the page cache). 
- */ -static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct lu_env *env; - struct vvp_io_args *args; - ssize_t result; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - args = ll_env_args(env); - args->u.normal.via_iter = from; - args->u.normal.via_iocb = iocb; - - result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, - &iocb->ki_pos, iov_iter_count(from)); - cl_env_put(env, &refcheck); - return result; -} - -int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, - __u64 flags, struct lov_user_md *lum, - int lum_size) -{ - struct lookup_intent oit = { - .it_op = IT_OPEN, - .it_flags = flags | MDS_OPEN_BY_FID, - }; - int rc = 0; - - ll_inode_size_lock(inode); - rc = ll_intent_file_open(dentry, lum, lum_size, &oit); - if (rc < 0) - goto out_unlock; - - ll_release_openhandle(inode, &oit); - -out_unlock: - ll_inode_size_unlock(inode); - ll_intent_release(&oit); - return rc; -} - -int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, - struct lov_mds_md **lmmp, int *lmm_size, - struct ptlrpc_request **request) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct mdt_body *body; - struct lov_mds_md *lmm = NULL; - struct ptlrpc_request *req = NULL; - struct md_op_data *op_data; - int rc, lmmsize; - - rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc) - return rc; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, filename, - strlen(filename), lmmsize, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA; - rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc < 0) { - CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n", - filename, rc); - goto out; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - lmmsize = body->mbo_eadatasize; - - if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) || - lmmsize == 0) { - rc = -ENODATA; - goto out; - } - - lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize); - - if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1)) && - (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3))) { - rc = -EPROTO; - goto out; - } - - /* - * This is coming from the MDS, so is probably in - * little endian. We convert it to host endian before - * passing it to userspace. 
- */
-	if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
-		int stripe_count;
-
-		stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
-		if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED)
-			stripe_count = 0;
-
-		/* if the function is called for a directory, we should
-		 * avoid swabbing nonexistent lsm objects
-		 */
-		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
-			lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lmm);
-			if (S_ISREG(body->mbo_mode))
-				lustre_swab_lov_user_md_objects(
-					((struct lov_user_md_v1 *)lmm)->lmm_objects,
-					stripe_count);
-		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
-			lustre_swab_lov_user_md_v3((struct lov_user_md_v3 *)lmm);
-			if (S_ISREG(body->mbo_mode))
-				lustre_swab_lov_user_md_objects(
-					((struct lov_user_md_v3 *)lmm)->lmm_objects,
-					stripe_count);
-		}
-	}
-
-out:
-	*lmmp = lmm;
-	*lmm_size = lmmsize;
-	*request = req;
-	return rc;
-}
-
-static int ll_lov_setea(struct inode *inode, struct file *file,
-			unsigned long arg)
-{
-	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
-	struct lov_user_md *lump;
-	int lum_size = sizeof(struct lov_user_md) +
-		       sizeof(struct lov_user_ost_data);
-	int rc;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	lump = kzalloc(lum_size, GFP_NOFS);
-	if (!lump)
-		return -ENOMEM;
-
-	if (copy_from_user(lump, (struct lov_user_md __user *)arg, lum_size)) {
-		kvfree(lump);
-		return -EFAULT;
-	}
-
-	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, lump,
-				      lum_size);
-	cl_lov_delay_create_clear(&file->f_flags);
-
-	kvfree(lump);
-	return rc;
-}
-
-static int ll_file_getstripe(struct inode *inode,
-			     struct lov_user_md __user *lum)
-{
-	struct lu_env *env;
-	u16 refcheck;
-	int rc;
-
-	env = cl_env_get(&refcheck);
-	if (IS_ERR(env))
-		return PTR_ERR(env);
-
-	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum);
-	cl_env_put(env, &refcheck);
-	return rc;
-}
-
-static int ll_lov_setstripe(struct inode *inode, struct file *file,
-			    unsigned long arg)
-{
-	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
-	struct lov_user_md *klum;
-	int lum_size, rc;
-	__u64 flags = FMODE_WRITE;
-
-	rc = ll_copy_user_md(lum, &klum);
-	if (rc < 0)
-		return rc;
-
-	lum_size = rc;
-	rc = ll_lov_setstripe_ea_info(inode, file->f_path.dentry, flags, klum,
-				      lum_size);
-	cl_lov_delay_create_clear(&file->f_flags);
-	if (rc == 0) {
-		__u32 gen;
-
-		put_user(0, &lum->lmm_stripe_count);
-
-		ll_layout_refresh(inode, &gen);
-		rc = ll_file_getstripe(inode, (struct lov_user_md __user *)arg);
-	}
-
-	kfree(klum);
-	return rc;
-}
-
-static int
-ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
-{
-	struct ll_inode_info *lli = ll_i2info(inode);
-	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-	struct ll_grouplock grouplock;
-	int rc;
-
-	if (arg == 0) {
-		CWARN("group id for group lock must not be 0\n");
-		return -EINVAL;
-	}
-
-	if (ll_file_nolock(file))
-		return -EOPNOTSUPP;
-
-	spin_lock(&lli->lli_lock);
-	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
-		CWARN("group lock already existed with gid %lu\n",
-		      fd->fd_grouplock.lg_gid);
-		spin_unlock(&lli->lli_lock);
-		return -EINVAL;
-	}
-	LASSERT(!fd->fd_grouplock.lg_lock);
-	spin_unlock(&lli->lli_lock);
-
-	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
-			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
-	if (rc)
-		return rc;
-
-	spin_lock(&lli->lli_lock);
-	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
-		spin_unlock(&lli->lli_lock);
-		CERROR("another thread just won the race\n");
-		cl_put_grouplock(&grouplock);
-		return -EINVAL;
-	}
-
-	fd->fd_flags 
|= LL_FILE_GROUP_LOCKED; - fd->fd_grouplock = grouplock; - spin_unlock(&lli->lli_lock); - - CDEBUG(D_INFO, "group lock %lu obtained\n", arg); - return 0; -} - -static int ll_put_grouplock(struct inode *inode, struct file *file, - unsigned long arg) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_grouplock grouplock; - - spin_lock(&lli->lli_lock); - if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - spin_unlock(&lli->lli_lock); - CWARN("no group lock held\n"); - return -EINVAL; - } - LASSERT(fd->fd_grouplock.lg_lock); - - if (fd->fd_grouplock.lg_gid != arg) { - CWARN("group lock %lu doesn't match current id %lu\n", - arg, fd->fd_grouplock.lg_gid); - spin_unlock(&lli->lli_lock); - return -EINVAL; - } - - grouplock = fd->fd_grouplock; - memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock)); - fd->fd_flags &= ~LL_FILE_GROUP_LOCKED; - spin_unlock(&lli->lli_lock); - - cl_put_grouplock(&grouplock); - CDEBUG(D_INFO, "group lock %lu released\n", arg); - return 0; -} - -/** - * Close inode open handle - * - * \param inode [in] inode in question - * \param it [in,out] intent which contains open info and result - * - * \retval 0 success - * \retval <0 failure - */ -int ll_release_openhandle(struct inode *inode, struct lookup_intent *it) -{ - struct obd_client_handle *och; - int rc; - - LASSERT(inode); - - /* Root ? Do nothing. */ - if (is_root_inode(inode)) - return 0; - - /* No open handle to close? Move away */ - if (!it_disposition(it, DISP_OPEN_OPEN)) - return 0; - - LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0); - - och = kzalloc(sizeof(*och), GFP_NOFS); - if (!och) { - rc = -ENOMEM; - goto out; - } - - ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och); - - rc = ll_close_inode_openhandle(inode, och, 0, NULL); -out: - /* this one is in place of ll_file_open */ - if (it_disposition(it, DISP_ENQ_OPEN_REF)) { - ptlrpc_req_finished(it->it_request); - it_clear_disposition(it, DISP_ENQ_OPEN_REF); - } - return rc; -} - -/** - * Get size for inode for which FIEMAP mapping is requested. - * Make the FIEMAP get_info call and returns the result. 
- *
- * \param fiemap	kernel buffer to hold extents
- * \param num_bytes	kernel buffer size
- */
-static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
-			size_t num_bytes)
-{
-	struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
-	struct lu_env *env;
-	u16 refcheck;
-	int rc = 0;
-
-	/* Checks for fiemap flags */
-	if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
-		fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
-		return -EBADR;
-	}
-
-	/* Check for FIEMAP_FLAG_SYNC */
-	if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
-		rc = filemap_fdatawrite(inode->i_mapping);
-		if (rc)
-			return rc;
-	}
-
-	env = cl_env_get(&refcheck);
-	if (IS_ERR(env))
-		return PTR_ERR(env);
-
-	if (i_size_read(inode) == 0) {
-		rc = ll_glimpse_size(inode);
-		if (rc)
-			goto out;
-	}
-
-	fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-	obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
-	obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
-
-	/* If the file size is 0, then there would be no objects for mapping */
-	if (fmkey.lfik_oa.o_size == 0) {
-		fiemap->fm_mapped_extents = 0;
-		rc = 0;
-		goto out;
-	}
-
-	memcpy(&fmkey.lfik_fiemap, fiemap, sizeof(*fiemap));
-
-	rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
-			      &fmkey, fiemap, &num_bytes);
-out:
-	cl_env_put(env, &refcheck);
-	return rc;
-}
-
-int ll_fid2path(struct inode *inode, void __user *arg)
-{
-	struct obd_export *exp = ll_i2mdexp(inode);
-	const struct getinfo_fid2path __user *gfin = arg;
-	struct getinfo_fid2path *gfout;
-	u32 pathlen;
-	size_t outsize;
-	int rc;
-
-	if (!capable(CAP_DAC_READ_SEARCH) &&
-	    !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
-		return -EPERM;
-
-	/* Only need to get the buflen */
-	if (get_user(pathlen, &gfin->gf_pathlen))
-		return -EFAULT;
-
-	if (pathlen > PATH_MAX)
-		return -EINVAL;
-
-	outsize = sizeof(*gfout) + pathlen;
-
-	gfout = kzalloc(outsize, GFP_NOFS);
-	if (!gfout)
-		return -ENOMEM;
-
-	if (copy_from_user(gfout, arg, sizeof(*gfout))) {
-		rc = -EFAULT;
-		goto gf_free;
-	}
-
-	/* Call mdc_iocontrol */
-	rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
-	if (rc != 0)
-		goto gf_free;
-
-	if (copy_to_user(arg, gfout, outsize))
-		rc = -EFAULT;
-
-gf_free:
-	kfree(gfout);
-	return rc;
-}
-
-/*
- * Read the data_version for the inode.
- *
- * This value is computed using stripe object version on OST.
- * Version is computed using server side locking.
- *
- * @param flags	whether to sync on the OST side;
- *		0: no sync
- *		LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
- *		LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
- */
-int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
-{
-	struct cl_object *obj = ll_i2info(inode)->lli_clob;
-	struct lu_env *env;
-	struct cl_io *io;
-	u16 refcheck;
-	int result;
-
-	/* If no file object is initialized, we consider its version to be 0. */
-	if (!obj) {
-		*data_version = 0;
-		return 0;
-	}
-
-	env = cl_env_get(&refcheck);
-	if (IS_ERR(env))
-		return PTR_ERR(env);
-
-	io = vvp_env_thread_io(env);
-	io->ci_obj = obj;
-	io->u.ci_data_version.dv_data_version = 0;
-	io->u.ci_data_version.dv_flags = flags;
-
-restart:
-	if (!cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj))
-		result = cl_io_loop(env, io);
-	else
-		result = io->ci_result;
-
-	*data_version = io->u.ci_data_version.dv_data_version;
-
-	cl_io_fini(env, io);
-
-	if (unlikely(io->ci_need_restart))
-		goto restart;
-
-	cl_env_put(env, &refcheck);
-
-	return result;
-}
-
-/*
- * Trigger an HSM release request for the provided inode. 
- */ -int ll_hsm_release(struct inode *inode) -{ - struct lu_env *env; - struct obd_client_handle *och = NULL; - __u64 data_version = 0; - int rc; - u16 refcheck; - - CDEBUG(D_INODE, "%s: Releasing file " DFID ".\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&ll_i2info(inode)->lli_fid)); - - och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE); - if (IS_ERR(och)) { - rc = PTR_ERR(och); - goto out; - } - - /* Grab latest data_version and [am]time values */ - rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH); - if (rc != 0) - goto out; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - rc = PTR_ERR(env); - goto out; - } - - ll_merge_attr(env, inode); - cl_env_put(env, &refcheck); - - /* Release the file. - * NB: lease lock handle is released in mdc_hsm_release_pack() because - * we still need it to pack l_remote_handle to MDT. - */ - rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, - &data_version); - och = NULL; - -out: - if (och && !IS_ERR(och)) /* close the file */ - ll_lease_close(och, inode, NULL); - - return rc; -} - -struct ll_swap_stack { - u64 dv1; - u64 dv2; - struct inode *inode1; - struct inode *inode2; - bool check_dv1; - bool check_dv2; -}; - -static int ll_swap_layouts(struct file *file1, struct file *file2, - struct lustre_swap_layouts *lsl) -{ - struct mdc_swap_layouts msl; - struct md_op_data *op_data; - __u32 gid; - __u64 dv; - struct ll_swap_stack *llss = NULL; - int rc; - - llss = kzalloc(sizeof(*llss), GFP_NOFS); - if (!llss) - return -ENOMEM; - - llss->inode1 = file_inode(file1); - llss->inode2 = file_inode(file2); - - rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2); - if (rc < 0) - goto free; - - /* we use 2 bool because it is easier to swap than 2 bits */ - if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1) - llss->check_dv1 = true; - - if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2) - llss->check_dv2 = true; - - /* we cannot use lsl->sl_dvX directly because we may swap them */ - llss->dv1 = lsl->sl_dv1; - llss->dv2 = lsl->sl_dv2; - - rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2)); - if (!rc) /* same file, done! 
*/ - goto free; - - if (rc < 0) { /* sequentialize it */ - swap(llss->inode1, llss->inode2); - swap(file1, file2); - swap(llss->dv1, llss->dv2); - swap(llss->check_dv1, llss->check_dv2); - } - - gid = lsl->sl_gid; - if (gid != 0) { /* application asks to flush dirty cache */ - rc = ll_get_grouplock(llss->inode1, file1, gid); - if (rc < 0) - goto free; - - rc = ll_get_grouplock(llss->inode2, file2, gid); - if (rc < 0) { - ll_put_grouplock(llss->inode1, file1, gid); - goto free; - } - } - - /* ultimate check, before swapping the layouts we check if - * dataversion has changed (if requested) - */ - if (llss->check_dv1) { - rc = ll_data_version(llss->inode1, &dv, 0); - if (rc) - goto putgl; - if (dv != llss->dv1) { - rc = -EAGAIN; - goto putgl; - } - } - - if (llss->check_dv2) { - rc = ll_data_version(llss->inode2, &dv, 0); - if (rc) - goto putgl; - if (dv != llss->dv2) { - rc = -EAGAIN; - goto putgl; - } - } - - /* struct md_op_data is used to send the swap args to the mdt - * only flags is missing, so we use struct mdc_swap_layouts - * through the md_op_data->op_data - */ - /* flags from user space have to be converted before they are send to - * server, no flag is sent today, they are only used on the client - */ - msl.msl_flags = 0; - rc = -ENOMEM; - op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0, - 0, LUSTRE_OPC_ANY, &msl); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto free; - } - - rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1), - sizeof(*op_data), op_data, NULL); - ll_finish_md_op_data(op_data); - -putgl: - if (gid != 0) { - ll_put_grouplock(llss->inode2, file2, gid); - ll_put_grouplock(llss->inode1, file1, gid); - } - -free: - kfree(llss); - - return rc; -} - -int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss) -{ - struct md_op_data *op_data; - int rc; - - /* Detect out-of range masks */ - if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK) - return -EINVAL; - - /* Non-root users are forbidden to set or clear flags which are - * NOT defined in HSM_USER_MASK. 
- */ - if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* Detect out-of range archive id */ - if ((hss->hss_valid & HSS_ARCHIVE_ID) && - (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE)) - return -EINVAL; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hss); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); - - ll_finish_md_op_data(op_data); - - return rc; -} - -static int ll_hsm_import(struct inode *inode, struct file *file, - struct hsm_user_import *hui) -{ - struct hsm_state_set *hss = NULL; - struct iattr *attr = NULL; - int rc; - - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - - /* set HSM flags */ - hss = kzalloc(sizeof(*hss), GFP_NOFS); - if (!hss) - return -ENOMEM; - - hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID; - hss->hss_archive_id = hui->hui_archive_id; - hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED; - rc = ll_hsm_state_set(inode, hss); - if (rc != 0) - goto free_hss; - - attr = kzalloc(sizeof(*attr), GFP_NOFS); - if (!attr) { - rc = -ENOMEM; - goto free_hss; - } - - attr->ia_mode = hui->hui_mode & 0777; - attr->ia_mode |= S_IFREG; - attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid); - attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid); - attr->ia_size = hui->hui_size; - attr->ia_mtime.tv_sec = hui->hui_mtime; - attr->ia_mtime.tv_nsec = hui->hui_mtime_ns; - attr->ia_atime.tv_sec = hui->hui_atime; - attr->ia_atime.tv_nsec = hui->hui_atime_ns; - - attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE | - ATTR_UID | ATTR_GID | - ATTR_MTIME | ATTR_MTIME_SET | - ATTR_ATIME | ATTR_ATIME_SET; - - inode_lock(inode); - - rc = ll_setattr_raw(file->f_path.dentry, attr, true); - if (rc == -ENODATA) - rc = 0; - - inode_unlock(inode); - - kfree(attr); -free_hss: - kfree(hss); - return rc; -} - -static inline long ll_lease_type_from_fmode(fmode_t fmode) -{ - return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) | - ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0); -} - -static long -ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - int flags, rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p),cmd=%x\n", - PFID(ll_inode2fid(inode)), inode, cmd); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1); - - /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */ - if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */ - return -ENOTTY; - - switch (cmd) { - case LL_IOC_GETFLAGS: - /* Get the current value of the file flags */ - return put_user(fd->fd_flags, (int __user *)arg); - case LL_IOC_SETFLAGS: - case LL_IOC_CLRFLAGS: - /* Set or clear specific file flags */ - /* XXX This probably needs checks to ensure the flags are - * not abused, and to handle any flag side effects. 
- */ - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - - if (cmd == LL_IOC_SETFLAGS) { - if ((flags & LL_FILE_IGNORE_LOCK) && - !(file->f_flags & O_DIRECT)) { - CERROR("%s: unable to disable locking on non-O_DIRECT file\n", - current->comm); - return -EINVAL; - } - - fd->fd_flags |= flags; - } else { - fd->fd_flags &= ~flags; - } - return 0; - case LL_IOC_LOV_SETSTRIPE: - return ll_lov_setstripe(inode, file, arg); - case LL_IOC_LOV_SETEA: - return ll_lov_setea(inode, file, arg); - case LL_IOC_LOV_SWAP_LAYOUTS: { - struct file *file2; - struct lustre_swap_layouts lsl; - - if (copy_from_user(&lsl, (char __user *)arg, - sizeof(struct lustre_swap_layouts))) - return -EFAULT; - - if ((file->f_flags & O_ACCMODE) == O_RDONLY) - return -EPERM; - - file2 = fget(lsl.sl_fd); - if (!file2) - return -EBADF; - - /* O_WRONLY or O_RDWR */ - if ((file2->f_flags & O_ACCMODE) == O_RDONLY) { - rc = -EPERM; - goto out; - } - - if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) { - struct obd_client_handle *och = NULL; - struct ll_inode_info *lli; - struct inode *inode2; - - if (lsl.sl_flags != SWAP_LAYOUTS_CLOSE) { - rc = -EINVAL; - goto out; - } - - lli = ll_i2info(inode); - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (!och) { - rc = -ENOLCK; - goto out; - } - inode2 = file_inode(file2); - rc = ll_swap_layouts_close(och, inode, inode2); - } else { - rc = ll_swap_layouts(file, file2, &lsl); - } -out: - fput(file2); - return rc; - } - case LL_IOC_LOV_GETSTRIPE: - return ll_file_getstripe(inode, - (struct lov_user_md __user *)arg); - case FSFILT_IOC_GETFLAGS: - case FSFILT_IOC_SETFLAGS: - return ll_iocontrol(inode, file, cmd, arg); - case FSFILT_IOC_GETVERSION_OLD: - case FSFILT_IOC_GETVERSION: - return put_user(inode->i_generation, (int __user *)arg); - case LL_IOC_GROUP_LOCK: - return ll_get_grouplock(inode, file, arg); - case LL_IOC_GROUP_UNLOCK: - return ll_put_grouplock(inode, file, arg); - case IOC_OBD_STATFS: - return ll_obd_statfs(inode, (void __user *)arg); - - /* We need to special case any other ioctls we want to handle, - * to send them to the MDS/OST as appropriate and to properly - * network encode the arg field. 
- case FSFILT_IOC_SETVERSION_OLD: - case FSFILT_IOC_SETVERSION: - */ - case LL_IOC_FLUSHCTX: - return ll_flush_ctx(inode); - case LL_IOC_PATH2FID: { - if (copy_to_user((void __user *)arg, ll_inode2fid(inode), - sizeof(struct lu_fid))) - return -EFAULT; - - return 0; - } - case LL_IOC_GETPARENT: - return ll_getparent(file, (struct getparent __user *)arg); - case OBD_IOC_FID2PATH: - return ll_fid2path(inode, (void __user *)arg); - case LL_IOC_DATA_VERSION: { - struct ioc_data_version idv; - int rc; - - if (copy_from_user(&idv, (char __user *)arg, sizeof(idv))) - return -EFAULT; - - idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH; - rc = ll_data_version(inode, &idv.idv_version, idv.idv_flags); - if (rc == 0 && copy_to_user((char __user *)arg, &idv, - sizeof(idv))) - return -EFAULT; - - return rc; - } - - case LL_IOC_GET_MDTIDX: { - int mdtidx; - - mdtidx = ll_get_mdt_idx(inode); - if (mdtidx < 0) - return mdtidx; - - if (put_user(mdtidx, (int __user *)arg)) - return -EFAULT; - - return 0; - } - case OBD_IOC_GETDTNAME: - case OBD_IOC_GETMDNAME: - return ll_get_obd_name(inode, cmd, arg); - case LL_IOC_HSM_STATE_GET: { - struct md_op_data *op_data; - struct hsm_user_state *hus; - int rc; - - hus = kzalloc(sizeof(*hus), GFP_NOFS); - if (!hus) - return -ENOMEM; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hus); - if (IS_ERR(op_data)) { - kfree(hus); - return PTR_ERR(op_data); - } - - rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), - op_data, NULL); - - if (copy_to_user((void __user *)arg, hus, sizeof(*hus))) - rc = -EFAULT; - - ll_finish_md_op_data(op_data); - kfree(hus); - return rc; - } - case LL_IOC_HSM_STATE_SET: { - struct hsm_state_set *hss; - int rc; - - hss = memdup_user((char __user *)arg, sizeof(*hss)); - if (IS_ERR(hss)) - return PTR_ERR(hss); - - rc = ll_hsm_state_set(inode, hss); - - kfree(hss); - return rc; - } - case LL_IOC_HSM_ACTION: { - struct md_op_data *op_data; - struct hsm_current_action *hca; - int rc; - - hca = kzalloc(sizeof(*hca), GFP_NOFS); - if (!hca) - return -ENOMEM; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hca); - if (IS_ERR(op_data)) { - kfree(hca); - return PTR_ERR(op_data); - } - - rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data), - op_data, NULL); - - if (copy_to_user((char __user *)arg, hca, sizeof(*hca))) - rc = -EFAULT; - - ll_finish_md_op_data(op_data); - kfree(hca); - return rc; - } - case LL_IOC_SET_LEASE: { - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_client_handle *och = NULL; - bool lease_broken; - fmode_t fmode; - - switch (arg) { - case LL_LEASE_WRLCK: - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - fmode = FMODE_WRITE; - break; - case LL_LEASE_RDLCK: - if (!(file->f_mode & FMODE_READ)) - return -EPERM; - fmode = FMODE_READ; - break; - case LL_LEASE_UNLCK: - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - och = fd->fd_lease_och; - fd->fd_lease_och = NULL; - } - mutex_unlock(&lli->lli_och_mutex); - - if (!och) - return -ENOLCK; - - fmode = och->och_flags; - rc = ll_lease_close(och, inode, &lease_broken); - if (rc < 0) - return rc; - - if (lease_broken) - fmode = 0; - - return ll_lease_type_from_fmode(fmode); - default: - return -EINVAL; - } - - CDEBUG(D_INODE, "Set lease with mode %u\n", fmode); - - /* apply for lease */ - och = ll_lease_open(inode, file, fmode, 0); - if (IS_ERR(och)) - return PTR_ERR(och); - - rc = 0; - mutex_lock(&lli->lli_och_mutex); - if (!fd->fd_lease_och) { - fd->fd_lease_och = och; - och = 
NULL; - } - mutex_unlock(&lli->lli_och_mutex); - if (och) { - /* impossible now that only excl is supported for now */ - ll_lease_close(och, inode, &lease_broken); - rc = -EBUSY; - } - return rc; - } - case LL_IOC_GET_LEASE: { - struct ll_inode_info *lli = ll_i2info(inode); - struct ldlm_lock *lock = NULL; - fmode_t fmode = 0; - - mutex_lock(&lli->lli_och_mutex); - if (fd->fd_lease_och) { - struct obd_client_handle *och = fd->fd_lease_och; - - lock = ldlm_handle2lock(&och->och_lease_handle); - if (lock) { - lock_res_and_lock(lock); - if (!ldlm_is_cancel(lock)) - fmode = och->och_flags; - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - } - } - mutex_unlock(&lli->lli_och_mutex); - return ll_lease_type_from_fmode(fmode); - } - case LL_IOC_HSM_IMPORT: { - struct hsm_user_import *hui; - - hui = memdup_user((void __user *)arg, sizeof(*hui)); - if (IS_ERR(hui)) - return PTR_ERR(hui); - - rc = ll_hsm_import(inode, file, hui); - - kfree(hui); - return rc; - } - default: { - int err; - - if (ll_iocontrol_call(inode, file, cmd, arg, &err) == - LLIOC_STOP) - return err; - - return obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL, - (void __user *)arg); - } - } -} - -static loff_t ll_file_seek(struct file *file, loff_t offset, int origin) -{ - struct inode *inode = file_inode(file); - loff_t retval, eof = 0; - - retval = offset + ((origin == SEEK_END) ? i_size_read(inode) : - (origin == SEEK_CUR) ? file->f_pos : 0); - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), to=%llu=%#llx(%d)\n", - PFID(ll_inode2fid(inode)), inode, retval, retval, origin); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1); - - if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) { - retval = ll_glimpse_size(inode); - if (retval != 0) - return retval; - eof = i_size_read(inode); - } - - return generic_file_llseek_size(file, offset, origin, - ll_file_maxbytes(inode), eof); -} - -static int ll_flush(struct file *file, fl_owner_t id) -{ - struct inode *inode = file_inode(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - int rc, err; - - LASSERT(!S_ISDIR(inode->i_mode)); - - /* catch async errors that were recorded back when async writeback - * failed for pages in this mapping. - */ - rc = lli->lli_async_rc; - lli->lli_async_rc = 0; - if (lli->lli_clob) { - err = lov_read_and_clear_async_rc(lli->lli_clob); - if (!rc) - rc = err; - } - - /* The application has been told about write failure already. - * Do not report failure again. - */ - if (fd->fd_write_failed) - return 0; - return rc ? -EIO : 0; -} - -/** - * Called to make sure a portion of file has been written out. - * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST. - * - * Return how many pages have been written. 
- */ -int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end, - enum cl_fsync_mode mode, int ignore_layout) -{ - struct lu_env *env; - struct cl_io *io; - struct cl_fsync_io *fio; - int result; - u16 refcheck; - - if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL && - mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL) - return -EINVAL; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = vvp_env_thread_io(env); - io->ci_obj = ll_i2info(inode)->lli_clob; - io->ci_ignore_layout = ignore_layout; - - /* initialize parameters for sync */ - fio = &io->u.ci_fsync; - fio->fi_start = start; - fio->fi_end = end; - fio->fi_fid = ll_inode2fid(inode); - fio->fi_mode = mode; - fio->fi_nr_written = 0; - - if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0) - result = cl_io_loop(env, io); - else - result = io->ci_result; - if (result == 0) - result = fio->fi_nr_written; - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - - return result; -} - -int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct inode *inode = file_inode(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct ptlrpc_request *req; - int rc, err; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); - - rc = file_write_and_wait_range(file, start, end); - inode_lock(inode); - - /* catch async errors that were recorded back when async writeback - * failed for pages in this mapping. - */ - if (!S_ISDIR(inode->i_mode)) { - err = lli->lli_async_rc; - lli->lli_async_rc = 0; - if (rc == 0) - rc = err; - if (lli->lli_clob) { - err = lov_read_and_clear_async_rc(lli->lli_clob); - if (rc == 0) - rc = err; - } - } - - err = md_sync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req); - if (!rc) - rc = err; - if (!err) - ptlrpc_req_finished(req); - - if (S_ISREG(inode->i_mode)) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - - err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0); - if (rc == 0 && err < 0) - rc = err; - if (rc < 0) - fd->fd_write_failed = true; - else - fd->fd_write_failed = false; - } - - inode_unlock(inode); - return rc; -} - -static int -ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) -{ - struct inode *inode = file_inode(file); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ldlm_enqueue_info einfo = { - .ei_type = LDLM_FLOCK, - .ei_cb_cp = ldlm_flock_completion_ast, - .ei_cbdata = file_lock, - }; - struct md_op_data *op_data; - struct lustre_handle lockh = {0}; - union ldlm_policy_data flock = { { 0 } }; - int fl_type = file_lock->fl_type; - __u64 flags = 0; - int rc; - int rc2 = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID " file_lock=%p\n", - PFID(ll_inode2fid(inode)), file_lock); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); - - if (file_lock->fl_flags & FL_FLOCK) - LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); - else if (!(file_lock->fl_flags & FL_POSIX)) - return -EINVAL; - - flock.l_flock.owner = (unsigned long)file_lock->fl_owner; - flock.l_flock.pid = file_lock->fl_pid; - flock.l_flock.start = file_lock->fl_start; - flock.l_flock.end = file_lock->fl_end; - - /* Somewhat ugly workaround for svc lockd. - * lockd installs custom fl_lmops->lm_compare_owner that checks - * for the fl_owner to be the same (which it always is on local node - * I guess between lockd processes) and then compares pid. 
- * As such we assign pid to the owner field to make it all work, - * conflict with normal locks is unlikely since pid space and - * pointer space for current->files are not intersecting - */ - if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner) - flock.l_flock.owner = (unsigned long)file_lock->fl_pid; - - switch (fl_type) { - case F_RDLCK: - einfo.ei_mode = LCK_PR; - break; - case F_UNLCK: - /* An unlock request may or may not have any relation to - * existing locks so we may not be able to pass a lock handle - * via a normal ldlm_lock_cancel() request. The request may even - * unlock a byte range in the middle of an existing lock. In - * order to process an unlock request we need all of the same - * information that is given with a normal read or write record - * lock request. To avoid creating another ldlm unlock (cancel) - * message we'll treat a LCK_NL flock request as an unlock. - */ - einfo.ei_mode = LCK_NL; - break; - case F_WRLCK: - einfo.ei_mode = LCK_PW; - break; - default: - CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type); - return -ENOTSUPP; - } - - switch (cmd) { - case F_SETLKW: -#ifdef F_SETLKW64 - case F_SETLKW64: -#endif - flags = 0; - break; - case F_SETLK: -#ifdef F_SETLK64 - case F_SETLK64: -#endif - flags = LDLM_FL_BLOCK_NOWAIT; - break; - case F_GETLK: -#ifdef F_GETLK64 - case F_GETLK64: -#endif - flags = LDLM_FL_TEST_LOCK; - break; - default: - CERROR("unknown fcntl lock command: %d\n", cmd); - return -EINVAL; - } - - /* - * Save the old mode so that if the mode in the lock changes we - * can decrement the appropriate reader or writer refcount. - */ - file_lock->fl_type = einfo.ei_mode; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - CDEBUG(D_DLMTRACE, "inode=" DFID ", pid=%u, flags=%#llx, mode=%u, start=%llu, end=%llu\n", - PFID(ll_inode2fid(inode)), flock.l_flock.pid, flags, - einfo.ei_mode, flock.l_flock.start, flock.l_flock.end); - - rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh, - flags); - - /* Restore the file lock type if not TEST lock. 
*/ - if (!(flags & LDLM_FL_TEST_LOCK)) - file_lock->fl_type = fl_type; - - if ((rc == 0 || file_lock->fl_type == F_UNLCK) && - !(flags & LDLM_FL_TEST_LOCK)) - rc2 = locks_lock_file_wait(file, file_lock); - - if (rc2 && file_lock->fl_type != F_UNLCK) { - einfo.ei_mode = LCK_NL; - md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, - &lockh, flags); - rc = rc2; - } - - ll_finish_md_op_data(op_data); - - return rc; -} - -int ll_get_fid_by_name(struct inode *parent, const char *name, - int namelen, struct lu_fid *fid, - struct inode **inode) -{ - struct md_op_data *op_data = NULL; - struct ptlrpc_request *req; - struct mdt_body *body; - int rc; - - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE; - rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc < 0) - return rc; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - rc = -EFAULT; - goto out_req; - } - if (fid) - *fid = body->mbo_fid1; - - if (inode) - rc = ll_prep_inode(inode, req, parent->i_sb, NULL); -out_req: - ptlrpc_req_finished(req); - return rc; -} - -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen) -{ - struct ptlrpc_request *request = NULL; - struct obd_client_handle *och = NULL; - struct inode *child_inode = NULL; - struct dentry *dchild = NULL; - struct md_op_data *op_data; - struct mdt_body *body; - u64 data_version = 0; - struct qstr qstr; - int rc; - - CDEBUG(D_VFSTRACE, "migrate %s under " DFID " to MDT%d\n", - name, PFID(ll_inode2fid(parent)), mdtidx); - - op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - /* Get child FID first */ - qstr.hash = full_name_hash(parent, name, namelen); - qstr.name = name; - qstr.len = namelen; - dchild = d_lookup(file_dentry(file), &qstr); - if (dchild) { - op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); - if (dchild->d_inode) - child_inode = igrab(dchild->d_inode); - dput(dchild); - } - - if (!child_inode) { - rc = ll_get_fid_by_name(parent, name, namelen, - &op_data->op_fid3, &child_inode); - if (rc) - goto out_free; - } - - if (!child_inode) { - rc = -EINVAL; - goto out_free; - } - - inode_lock(child_inode); - op_data->op_fid3 = *ll_inode2fid(child_inode); - if (!fid_is_sane(&op_data->op_fid3)) { - CERROR("%s: migrate %s, but fid " DFID " is insane\n", - ll_get_fsname(parent->i_sb, NULL, 0), name, - PFID(&op_data->op_fid3)); - rc = -EINVAL; - goto out_unlock; - } - - rc = ll_get_mdt_idx_by_fid(ll_i2sbi(parent), &op_data->op_fid3); - if (rc < 0) - goto out_unlock; - - if (rc == mdtidx) { - CDEBUG(D_INFO, "%s: " DFID " is already on MDT%d.\n", name, - PFID(&op_data->op_fid3), mdtidx); - rc = 0; - goto out_unlock; - } -again: - if (S_ISREG(child_inode->i_mode)) { - och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0); - if (IS_ERR(och)) { - rc = PTR_ERR(och); - och = NULL; - goto out_unlock; - } - - rc = ll_data_version(child_inode, &data_version, - LL_DV_WR_FLUSH); - if (rc) - goto out_close; - - op_data->op_handle = och->och_fh; - op_data->op_data = och->och_mod; - op_data->op_data_version = data_version; - op_data->op_lease_handle = och->och_lease_handle; - op_data->op_bias |= MDS_RENAME_MIGRATE; - } - - op_data->op_mds = mdtidx; - op_data->op_cli_flags = CLI_MIGRATE; - rc = md_rename(ll_i2sbi(parent)->ll_md_exp, 
op_data, name,
-		       namelen, name, namelen, &request);
-	if (!rc) {
-		LASSERT(request);
-		ll_update_times(request, parent);
-
-		body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
-		LASSERT(body);
-
-		/*
-		 * If the server does release the layout lock, then we clean
-		 * up the client och here; otherwise it is released in
-		 * out_close:
-		 */
-		if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
-			obd_mod_put(och->och_mod);
-			md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
-						  och);
-			och->och_fh.cookie = DEAD_HANDLE_MAGIC;
-			kfree(och);
-			och = NULL;
-		}
-	}
-
-	if (request) {
-		ptlrpc_req_finished(request);
-		request = NULL;
-	}
-
-	/* Try again if the file layout has changed. */
-	if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
-		goto again;
-
-out_close:
-	if (och) /* close the file */
-		ll_lease_close(och, child_inode, NULL);
-	if (!rc)
-		clear_nlink(child_inode);
-out_unlock:
-	inode_unlock(child_inode);
-	iput(child_inode);
-out_free:
-	ll_finish_md_op_data(op_data);
-	return rc;
-}
-
-static int
-ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
-{
-	return -ENOSYS;
-}
-
-/**
- * Test whether some locks matching bits and l_req_mode are acquired
- * - bits can be in different locks
- * - if found, clear the common lock bits in *bits
- * - the bits not found are kept in *bits
- * \param inode [IN]
- * \param bits [IN] searched lock bits
- * \param l_req_mode [IN] searched lock mode
- * \retval boolean, true iff all bits are found
- */
-int ll_have_md_lock(struct inode *inode, __u64 *bits,
-		    enum ldlm_mode l_req_mode)
-{
-	struct lustre_handle lockh;
-	union ldlm_policy_data policy;
-	enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
-			      (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
-	struct lu_fid *fid;
-	__u64 flags;
-	int i;
-
-	if (!inode)
-		return 0;
-
-	fid = &ll_i2info(inode)->lli_fid;
-	CDEBUG(D_INFO, "trying to match res " DFID " mode %s\n", PFID(fid),
-	       ldlm_lockname[mode]);
-
-	flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
-	for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
-		policy.l_inodebits.bits = *bits & (1 << i);
-		if (policy.l_inodebits.bits == 0)
-			continue;
-
-		if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
-				  &policy, mode, &lockh)) {
-			struct ldlm_lock *lock;
-
-			lock = ldlm_handle2lock(&lockh);
-			if (lock) {
-				*bits &=
-					~(lock->l_policy_data.l_inodebits.bits);
-				LDLM_LOCK_PUT(lock);
-			} else {
-				*bits &= ~policy.l_inodebits.bits;
-			}
-		}
-	}
-	return *bits == 0;
-}
-
-enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
-			       struct lustre_handle *lockh, __u64 flags,
-			       enum ldlm_mode mode)
-{
-	union ldlm_policy_data policy = { .l_inodebits = { bits } };
-	struct lu_fid *fid;
-
-	fid = &ll_i2info(inode)->lli_fid;
-	CDEBUG(D_INFO, "trying to match res " DFID "\n", PFID(fid));
-
-	return md_lock_match(ll_i2mdexp(inode), flags | LDLM_FL_BLOCK_GRANTED,
-			     fid, LDLM_IBITS, &policy, mode, lockh);
-}
-
-static int ll_inode_revalidate_fini(struct inode *inode, int rc)
-{
-	/* Already unlinked. Just update nlink and return success */
-	if (rc == -ENOENT) {
-		clear_nlink(inode);
-		/* If it is a striped directory with a bad stripe, let's
-		 * revalidate the dentry again instead of returning an
-		 * error
-		 */
-		if (S_ISDIR(inode->i_mode) && ll_i2info(inode)->lli_lsm_md)
-			return 0;
-
-		/* This path cannot be hit for regular files unless in
-		 * case of obscure races, so no need to validate size. 
- */ - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return 0; - } else if (rc != 0) { - CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR, - "%s: revalidate FID " DFID " error: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), rc); - } - - return rc; -} - -static int __ll_inode_revalidate(struct dentry *dentry, __u64 ibits) -{ - struct inode *inode = d_inode(dentry); - struct ptlrpc_request *req = NULL; - struct obd_export *exp; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p),name=%pd\n", - PFID(ll_inode2fid(inode)), inode, dentry); - - exp = ll_i2mdexp(inode); - - /* XXX: Enable OBD_CONNECT_ATTRFID to reduce unnecessary getattr RPC. - * But under CMD case, it caused some lock issues, should be fixed - * with new CMD ibits lock. See bug 12718 - */ - if (exp_connect_flags(exp) & OBD_CONNECT_ATTRFID) { - struct lookup_intent oit = { .it_op = IT_GETATTR }; - struct md_op_data *op_data; - - if (ibits == MDS_INODELOCK_LOOKUP) - oit.it_op = IT_LOOKUP; - - /* Call getattr by fid, so do not provide name at all. */ - op_data = ll_prep_md_op_data(NULL, inode, - inode, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = md_intent_lock(exp, op_data, &oit, &req, - &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - if (rc < 0) { - rc = ll_inode_revalidate_fini(inode, rc); - goto out; - } - - rc = ll_revalidate_it_finish(req, &oit, inode); - if (rc != 0) { - ll_intent_release(&oit); - goto out; - } - - /* Unlinked? Unhash dentry, so it is not picked up later by - * do_lookup() -> ll_revalidate_it(). We cannot use d_drop - * here to preserve get_cwd functionality on 2.6. - * Bug 10503 - */ - if (!d_inode(dentry)->i_nlink) { - spin_lock(&inode->i_lock); - d_lustre_invalidate(dentry, 0); - spin_unlock(&inode->i_lock); - } - - ll_lookup_finish_locks(&oit, inode); - } else if (!ll_have_md_lock(d_inode(dentry), &ibits, LCK_MINMODE)) { - struct ll_sb_info *sbi = ll_i2sbi(d_inode(dentry)); - u64 valid = OBD_MD_FLGETATTR; - struct md_op_data *op_data; - int ealen = 0; - - if (S_ISREG(inode->i_mode)) { - rc = ll_get_default_mdsize(sbi, &ealen); - if (rc) - return rc; - valid |= OBD_MD_FLEASIZE | OBD_MD_FLMODEASIZE; - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, ealen, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = valid; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) - return ll_inode_revalidate_fini(inode, rc); - - rc = ll_prep_inode(&inode, req, NULL, NULL); - } -out: - ptlrpc_req_finished(req); - return rc; -} - -static int ll_merge_md_attr(struct inode *inode) -{ - struct cl_attr attr = { 0 }; - int rc; - - LASSERT(ll_i2info(inode)->lli_lsm_md); - rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md, - &attr, ll_md_blocking_ast); - if (rc) - return rc; - - set_nlink(inode, attr.cat_nlink); - inode->i_blocks = attr.cat_blocks; - i_size_write(inode, attr.cat_size); - - ll_i2info(inode)->lli_atime = attr.cat_atime; - ll_i2info(inode)->lli_mtime = attr.cat_mtime; - ll_i2info(inode)->lli_ctime = attr.cat_ctime; - - return 0; -} - -static int ll_inode_revalidate(struct dentry *dentry, __u64 ibits) -{ - struct inode *inode = d_inode(dentry); - int rc; - - rc = __ll_inode_revalidate(dentry, ibits); - if (rc != 0) - return rc; - - /* if object isn't regular file, don't validate size */ - if (!S_ISREG(inode->i_mode)) { - if (S_ISDIR(inode->i_mode) && - 
ll_i2info(inode)->lli_lsm_md) {
-			rc = ll_merge_md_attr(inode);
-			if (rc)
-				return rc;
-		}
-
-		LTIME_S(inode->i_atime) = ll_i2info(inode)->lli_atime;
-		LTIME_S(inode->i_mtime) = ll_i2info(inode)->lli_mtime;
-		LTIME_S(inode->i_ctime) = ll_i2info(inode)->lli_ctime;
-	} else {
-		struct ll_inode_info *lli = ll_i2info(inode);
-
-		/* In case of restore, the MDT has the right size and has
-		 * already sent it back without granting the layout lock;
-		 * the inode is up to date, so a glimpse is useless.
-		 * Also, to glimpse we need the layout; in case of a running
-		 * restore the MDT holds the layout lock, so the glimpse will
-		 * block until the end of restore (getattr will block)
-		 */
-		if (!test_bit(LLIF_FILE_RESTORING, &lli->lli_flags))
-			rc = ll_glimpse_size(inode);
-	}
-	return rc;
-}
-
-int ll_getattr(const struct path *path, struct kstat *stat,
-	       u32 request_mask, unsigned int flags)
-{
-	struct inode *inode = d_inode(path->dentry);
-	struct ll_sb_info *sbi = ll_i2sbi(inode);
-	struct ll_inode_info *lli = ll_i2info(inode);
-	int res;
-
-	res = ll_inode_revalidate(path->dentry,
-				  MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP);
-	ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
-
-	if (res)
-		return res;
-
-	OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
-
-	stat->dev = inode->i_sb->s_dev;
-	if (ll_need_32bit_api(sbi))
-		stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
-	else
-		stat->ino = inode->i_ino;
-	stat->mode = inode->i_mode;
-	stat->uid = inode->i_uid;
-	stat->gid = inode->i_gid;
-	stat->rdev = inode->i_rdev;
-	stat->atime = inode->i_atime;
-	stat->mtime = inode->i_mtime;
-	stat->ctime = inode->i_ctime;
-	stat->blksize = 1 << inode->i_blkbits;
-
-	stat->nlink = inode->i_nlink;
-	stat->size = i_size_read(inode);
-	stat->blocks = inode->i_blocks;
-
-	return 0;
-}
-
-static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
-		     __u64 start, __u64 len)
-{
-	int rc;
-	size_t num_bytes;
-	struct fiemap *fiemap;
-	unsigned int extent_count = fieinfo->fi_extents_max;
-
-	num_bytes = sizeof(*fiemap) + (extent_count *
-				       sizeof(struct fiemap_extent));
-	fiemap = kvzalloc(num_bytes, GFP_KERNEL);
-	if (!fiemap)
-		return -ENOMEM;
-
-	fiemap->fm_flags = fieinfo->fi_flags;
-	fiemap->fm_extent_count = fieinfo->fi_extents_max;
-	fiemap->fm_start = start;
-	fiemap->fm_length = len;
-
-	if (extent_count > 0 &&
-	    copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
-			   sizeof(struct fiemap_extent))) {
-		rc = -EFAULT;
-		goto out;
-	}
-
-	rc = ll_do_fiemap(inode, fiemap, num_bytes);
-
-	fieinfo->fi_flags = fiemap->fm_flags;
-	fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
-	if (extent_count > 0 &&
-	    copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
-			 fiemap->fm_mapped_extents *
-			 sizeof(struct fiemap_extent))) {
-		rc = -EFAULT;
-		goto out;
-	}
-out:
-	kvfree(fiemap);
-	return rc;
-}
-
-int ll_inode_permission(struct inode *inode, int mask)
-{
-	struct ll_sb_info *sbi;
-	struct root_squash_info *squash;
-	const struct cred *old_cred = NULL;
-	struct cred *cred = NULL;
-	bool squash_id = false;
-	int rc = 0;
-
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
-	/* as the root inode is NOT validated during lookup, we need
-	 * to do it before the permission check. 
- */ - - if (is_root_inode(inode)) { - rc = __ll_inode_revalidate(inode->i_sb->s_root, - MDS_INODELOCK_LOOKUP); - if (rc) - return rc; - } - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), inode mode %x mask %o\n", - PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask); - - /* squash fsuid/fsgid if needed */ - sbi = ll_i2sbi(inode); - squash = &sbi->ll_squash; - if (unlikely(squash->rsi_uid && - uid_eq(current_fsuid(), GLOBAL_ROOT_UID) && - !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) { - squash_id = true; - } - - if (squash_id) { - CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n", - __kuid_val(current_fsuid()), __kgid_val(current_fsgid()), - squash->rsi_uid, squash->rsi_gid); - - /* - * update current process's credentials - * and FS capability - */ - cred = prepare_creds(); - if (!cred) - return -ENOMEM; - - cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid); - cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid); - cred->cap_effective = cap_drop_nfsd_set(cred->cap_effective); - cred->cap_effective = cap_drop_fs_set(cred->cap_effective); - - old_cred = override_creds(cred); - } - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_INODE_PERM, 1); - rc = generic_permission(inode, mask); - - /* restore current process's credentials and FS capability */ - if (squash_id) { - revert_creds(old_cred); - put_cred(cred); - } - - return rc; -} - -/* -o localflock - only provides locally consistent flock locks */ -const struct file_operations ll_file_operations = { - .read_iter = ll_file_read_iter, - .write_iter = ll_file_write_iter, - .unlocked_ioctl = ll_file_ioctl, - .open = ll_file_open, - .release = ll_file_release, - .mmap = ll_file_mmap, - .llseek = ll_file_seek, - .splice_read = generic_file_splice_read, - .fsync = ll_fsync, - .flush = ll_flush -}; - -const struct file_operations ll_file_operations_flock = { - .read_iter = ll_file_read_iter, - .write_iter = ll_file_write_iter, - .unlocked_ioctl = ll_file_ioctl, - .open = ll_file_open, - .release = ll_file_release, - .mmap = ll_file_mmap, - .llseek = ll_file_seek, - .splice_read = generic_file_splice_read, - .fsync = ll_fsync, - .flush = ll_flush, - .flock = ll_file_flock, - .lock = ll_file_flock -}; - -/* These are for -o noflock - to return ENOSYS on flock calls */ -const struct file_operations ll_file_operations_noflock = { - .read_iter = ll_file_read_iter, - .write_iter = ll_file_write_iter, - .unlocked_ioctl = ll_file_ioctl, - .open = ll_file_open, - .release = ll_file_release, - .mmap = ll_file_mmap, - .llseek = ll_file_seek, - .splice_read = generic_file_splice_read, - .fsync = ll_fsync, - .flush = ll_flush, - .flock = ll_file_noflock, - .lock = ll_file_noflock -}; - -const struct inode_operations ll_file_inode_operations = { - .setattr = ll_setattr, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, - .fiemap = ll_fiemap, - .get_acl = ll_get_acl, -}; - -/* dynamic ioctl number support routines */ -static struct llioc_ctl_data { - struct rw_semaphore ioc_sem; - struct list_head ioc_head; -} llioc = { - __RWSEM_INITIALIZER(llioc.ioc_sem), - LIST_HEAD_INIT(llioc.ioc_head) -}; - -struct llioc_data { - struct list_head iocd_list; - unsigned int iocd_size; - llioc_callback_t iocd_cb; - unsigned int iocd_count; - unsigned int iocd_cmd[0]; -}; - -void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd) -{ - unsigned int size; - struct llioc_data *in_data = NULL; - - if (!cb || !cmd || count > LLIOC_MAX_CMD || count < 0) - return NULL; - - size = sizeof(*in_data) + count * 
sizeof(unsigned int); - in_data = kzalloc(size, GFP_NOFS); - if (!in_data) - return NULL; - - in_data->iocd_size = size; - in_data->iocd_cb = cb; - in_data->iocd_count = count; - memcpy(in_data->iocd_cmd, cmd, sizeof(unsigned int) * count); - - down_write(&llioc.ioc_sem); - list_add_tail(&in_data->iocd_list, &llioc.ioc_head); - up_write(&llioc.ioc_sem); - - return in_data; -} -EXPORT_SYMBOL(ll_iocontrol_register); - -void ll_iocontrol_unregister(void *magic) -{ - struct llioc_data *tmp; - - if (!magic) - return; - - down_write(&llioc.ioc_sem); - list_for_each_entry(tmp, &llioc.ioc_head, iocd_list) { - if (tmp == magic) { - list_del(&tmp->iocd_list); - up_write(&llioc.ioc_sem); - - kfree(tmp); - return; - } - } - up_write(&llioc.ioc_sem); - - CWARN("didn't find iocontrol register block with magic: %p\n", magic); -} -EXPORT_SYMBOL(ll_iocontrol_unregister); - -static enum llioc_iter -ll_iocontrol_call(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg, int *rcp) -{ - enum llioc_iter ret = LLIOC_CONT; - struct llioc_data *data; - int rc = -EINVAL, i; - - down_read(&llioc.ioc_sem); - list_for_each_entry(data, &llioc.ioc_head, iocd_list) { - for (i = 0; i < data->iocd_count; i++) { - if (cmd != data->iocd_cmd[i]) - continue; - - ret = data->iocd_cb(inode, file, cmd, arg, data, &rc); - break; - } - - if (ret == LLIOC_STOP) - break; - } - up_read(&llioc.ioc_sem); - - if (rcp) - *rcp = rc; - return ret; -} - -int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct lu_env *env; - int rc; - u16 refcheck; - - if (!obj) - return 0; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - rc = cl_conf_set(env, obj, conf); - if (rc < 0) - goto out; - - if (conf->coc_opc == OBJECT_CONF_SET) { - struct ldlm_lock *lock = conf->coc_lock; - struct cl_layout cl = { - .cl_layout_gen = 0, - }; - - LASSERT(lock); - LASSERT(ldlm_has_layout(lock)); - - /* it can only be allowed to match after layout is - * applied to inode otherwise false layout would be - * seen. Applying layout should happen before dropping - * the intent lock. - */ - ldlm_lock_allow_match(lock); - - rc = cl_object_layout_get(env, obj, &cl); - if (rc < 0) - goto out; - - CDEBUG(D_VFSTRACE, DFID ": layout version change: %u -> %u\n", - PFID(&lli->lli_fid), ll_layout_version_get(lli), - cl.cl_layout_gen); - ll_layout_version_set(lli, cl.cl_layout_gen); - } -out: - cl_env_put(env, &refcheck); - return rc; -} - -/* Fetch layout from MDT with getxattr request, if it's not ready yet */ -static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock) - -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req; - struct mdt_body *body; - void *lvbdata; - void *lmm; - int lmmsize; - int rc; - - CDEBUG(D_INODE, DFID " LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n", - PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock), - lock->l_lvb_data, lock->l_lvb_len); - - if (lock->l_lvb_data && ldlm_is_lvb_ready(lock)) - return 0; - - /* if layout lock was granted right away, the layout is returned - * within DLM_LVB of dlm reply; otherwise if the lock was ever - * blocked and then granted via completion ast, we have to fetch - * layout here. 
Note that we can't use the LVB buffer in the
-	 * completion AST, because it is not large enough.
-	 */
-	rc = ll_get_default_mdsize(sbi, &lmmsize);
-	if (rc == 0)
-		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
-				 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
-	if (rc < 0)
-		return rc;
-
-	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
-	if (!body) {
-		rc = -EPROTO;
-		goto out;
-	}
-
-	lmmsize = body->mbo_eadatasize;
-	if (lmmsize == 0) /* empty layout */ {
-		rc = 0;
-		goto out;
-	}
-
-	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
-	if (!lmm) {
-		rc = -EFAULT;
-		goto out;
-	}
-
-	lvbdata = kvzalloc(lmmsize, GFP_NOFS);
-	if (!lvbdata) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	memcpy(lvbdata, lmm, lmmsize);
-	lock_res_and_lock(lock);
-	if (lock->l_lvb_data)
-		kvfree(lock->l_lvb_data);
-
-	lock->l_lvb_data = lvbdata;
-	lock->l_lvb_len = lmmsize;
-	unlock_res_and_lock(lock);
-
-out:
-	ptlrpc_req_finished(req);
-	return rc;
-}
-
-/**
- * Apply the layout to the inode. The layout lock is held and will be
- * released in this function.
- */
-static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
-			      struct inode *inode)
-{
-	struct ll_inode_info *lli = ll_i2info(inode);
-	struct ll_sb_info *sbi = ll_i2sbi(inode);
-	struct ldlm_lock *lock;
-	struct cl_object_conf conf;
-	int rc = 0;
-	bool lvb_ready;
-	bool wait_layout = false;
-
-	LASSERT(lustre_handle_is_used(lockh));
-
-	lock = ldlm_handle2lock(lockh);
-	LASSERT(lock);
-	LASSERT(ldlm_has_layout(lock));
-
-	LDLM_DEBUG(lock, "File " DFID "(%p) being reconfigured",
-		   PFID(&lli->lli_fid), inode);
-
-	/* in case this is a cached lock, reinstate it with the new inode */
-	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
-
-	lock_res_and_lock(lock);
-	lvb_ready = ldlm_is_lvb_ready(lock);
-	unlock_res_and_lock(lock);
-	/* checking lvb_ready is racy, but that is okay. The worst case is
-	 * that multiple processes may configure the file at the same time.
-	 */
-	if (lvb_ready) {
-		rc = 0;
-		goto out;
-	}
-
-	rc = ll_layout_fetch(inode, lock);
-	if (rc < 0)
-		goto out;
-
-	/* For a layout lock the lmm is returned in the lock's LVB;
-	 * lvb_data is immutable while the lock is held, so it is safe
-	 * to access it without the res lock.
-	 *
-	 * Set the layout on the file. It is unlikely this will fail, as
-	 * the old layout has surely been eliminated by now.
-	 */
-	memset(&conf, 0, sizeof(conf));
-	conf.coc_opc = OBJECT_CONF_SET;
-	conf.coc_inode = inode;
-	conf.coc_lock = lock;
-	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
-	conf.u.coc_layout.lb_len = lock->l_lvb_len;
-	rc = ll_layout_conf(inode, &conf);
-
-	/* refreshing the layout failed, need to wait */
-	wait_layout = rc == -EBUSY;
-
-out:
-	LDLM_LOCK_PUT(lock);
-	ldlm_lock_decref(lockh, mode);
-
-	/* wait for IO to complete if the layout is still being used.
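-	 * The wait itself is done below with a second ll_layout_conf()
-	 * call using OBJECT_CONF_WAIT; returning -EAGAIN afterwards makes
-	 * the caller retry the whole match-or-enqueue cycle once the
-	 * in-flight IO against the old layout has drained.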
*/ - if (wait_layout) { - CDEBUG(D_INODE, "%s: " DFID "(%p) wait for layout reconf\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid), inode); - - memset(&conf, 0, sizeof(conf)); - conf.coc_opc = OBJECT_CONF_WAIT; - conf.coc_inode = inode; - rc = ll_layout_conf(inode, &conf); - if (rc == 0) - rc = -EAGAIN; - - CDEBUG(D_INODE, - "%s: file=" DFID " waiting layout return: %d.\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid), rc); - } - return rc; -} - -static int ll_layout_refresh_locked(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct md_op_data *op_data; - struct lookup_intent it; - struct lustre_handle lockh; - enum ldlm_mode mode; - struct ptlrpc_request *req; - int rc; - -again: - /* mostly layout lock is caching on the local side, so try to match - * it before grabbing layout lock mutex. - */ - mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0, - LCK_CR | LCK_CW | LCK_PR | LCK_PW); - if (mode != 0) { /* hit cached lock */ - rc = ll_layout_lock_set(&lockh, mode, inode); - if (rc == -EAGAIN) - goto again; - return rc; - } - - op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, - 0, 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - /* have to enqueue one */ - memset(&it, 0, sizeof(it)); - it.it_op = IT_LAYOUT; - - LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file " DFID "(%p)", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(&lli->lli_fid), inode); - - rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req, - &ll_md_blocking_ast, 0); - ptlrpc_req_finished(it.it_request); - it.it_request = NULL; - - ll_finish_md_op_data(op_data); - - mode = it.it_lock_mode; - it.it_lock_mode = 0; - ll_intent_drop_lock(&it); - - if (rc == 0) { - /* set lock data in case this is a new lock */ - ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL); - lockh.cookie = it.it_lock_handle; - rc = ll_layout_lock_set(&lockh, mode, inode); - if (rc == -EAGAIN) - goto again; - } - - return rc; -} - -/** - * This function checks if there exists a LAYOUT lock on the client side, - * or enqueues it if it doesn't have one in cache. - * - * This function will not hold layout lock so it may be revoked any time after - * this function returns. Any operations depend on layout should be redone - * in that case. - * - * This function should be called before lov_io_init() to get an uptodate - * layout version, the caller should save the version number and after IO - * is finished, this function should be called again to verify that layout - * is not changed during IO time. - */ -int ll_layout_refresh(struct inode *inode, __u32 *gen) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc; - - *gen = ll_layout_version_get(lli); - if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE) - return 0; - - /* sanity checks */ - LASSERT(fid_is_sane(ll_inode2fid(inode))); - LASSERT(S_ISREG(inode->i_mode)); - - /* take layout lock mutex to enqueue layout lock exclusively. 
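-	 * As described above, a caller typically brackets its IO with two
-	 * calls (a rough sketch only; error handling omitted):
-	 *
-	 *	ll_layout_refresh(inode, &gen);    <-- before lov_io_init()
-	 *	... perform the IO ...
-	 *	ll_layout_refresh(inode, &gen2);   <-- after the IO completes
-	 *	if (gen2 != gen)
-	 *		redo the IO;
-	 *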
*/
-	mutex_lock(&lli->lli_layout_mutex);
-
-	rc = ll_layout_refresh_locked(inode);
-	if (rc < 0)
-		goto out;
-
-	*gen = ll_layout_version_get(lli);
-out:
-	mutex_unlock(&lli->lli_layout_mutex);
-
-	return rc;
-}
-
-/**
- * This function sends a restore request to the MDT.
- */
-int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
-{
-	struct hsm_user_request *hur;
-	int len, rc;
-
-	len = sizeof(struct hsm_user_request) +
-	      sizeof(struct hsm_user_item);
-	hur = kzalloc(len, GFP_NOFS);
-	if (!hur)
-		return -ENOMEM;
-
-	hur->hur_request.hr_action = HUA_RESTORE;
-	hur->hur_request.hr_archive_id = 0;
-	hur->hur_request.hr_flags = 0;
-	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
-	       sizeof(hur->hur_user_item[0].hui_fid));
-	hur->hur_user_item[0].hui_extent.offset = offset;
-	hur->hur_user_item[0].hui_extent.length = length;
-	hur->hur_request.hr_itemcount = 1;
-	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
-			   len, hur, NULL);
-	kfree(hur);
-	return rc;
-}
diff --git a/drivers/staging/lustre/lustre/llite/glimpse.c b/drivers/staging/lustre/lustre/llite/glimpse.c
deleted file mode 100644
index ce0d51767da378a2224feea64510e3181ba780b8..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/llite/glimpse.c
+++ /dev/null
@@ -1,205 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2012, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * glimpse code shared between vvp and liblustre (and other Lustre clients in
- * the future).
- *
- * Author: Nikita Danilov
- * Author: Oleg Drokin
- */
-
-#include
-#include
-#include
-
-#include
-#include
-#include
-#include
-
-#include
-#include "llite_internal.h"
-
-static const struct cl_lock_descr whole_file = {
-	.cld_start = 0,
-	.cld_end   = CL_PAGE_EOF,
-	.cld_mode  = CLM_READ
-};
-
-/*
- * Check whether the file has possibly unwritten pages.
- *
- * \retval 1	file is mmap-ed or has dirty pages
- *         0	otherwise
- */
-blkcnt_t dirty_cnt(struct inode *inode)
-{
-	blkcnt_t cnt = 0;
-	struct vvp_object *vob = cl_inode2vvp(inode);
-	void *results[1];
-
-	if (inode->i_mapping)
-		cnt += radix_tree_gang_lookup_tag(&inode->i_mapping->i_pages,
-						  results, 0, 1,
-						  PAGECACHE_TAG_DIRTY);
-	if (cnt == 0 && atomic_read(&vob->vob_mmap_cnt) > 0)
-		cnt = 1;
-
-	return (cnt > 0) ?
1 : 0; -} - -int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, - struct inode *inode, struct cl_object *clob, int agl) -{ - const struct lu_fid *fid = lu_object_fid(&clob->co_lu); - struct cl_lock *lock = vvp_env_lock(env); - struct cl_lock_descr *descr = &lock->cll_descr; - int result = 0; - - CDEBUG(D_DLMTRACE, "Glimpsing inode " DFID "\n", PFID(fid)); - - /* NOTE: this looks like DLM lock request, but it may - * not be one. Due to CEF_ASYNC flag (translated - * to LDLM_FL_HAS_INTENT by osc), this is - * glimpse request, that won't revoke any - * conflicting DLM locks held. Instead, - * ll_glimpse_callback() will be called on each - * client holding a DLM lock against this file, - * and resulting size will be returned for each - * stripe. DLM lock on [0, EOF] is acquired only - * if there were no conflicting locks. If there - * were conflicting locks, enqueuing or waiting - * fails with -ENAVAIL, but valid inode - * attributes are returned anyway. - */ - *descr = whole_file; - descr->cld_obj = clob; - descr->cld_mode = CLM_READ; - descr->cld_enq_flags = CEF_ASYNC | CEF_MUST; - if (agl) - descr->cld_enq_flags |= CEF_AGL; - /* - * CEF_ASYNC is used because glimpse sub-locks cannot - * deadlock (because they never conflict with other - * locks) and, hence, can be enqueued out-of-order. - * - * CEF_MUST protects glimpse lock from conversion into - * a lockless mode. - */ - result = cl_lock_request(env, io, lock); - if (result < 0) - return result; - - if (!agl) { - ll_merge_attr(env, inode); - if (i_size_read(inode) > 0 && !inode->i_blocks) { - /* - * LU-417: Add dirty pages block count - * lest i_blocks reports 0, some "cp" or - * "tar" may think it's a completely - * sparse file and skip it. - */ - inode->i_blocks = dirty_cnt(inode); - } - } - - cl_lock_release(env, lock); - - return result; -} - -static int cl_io_get(struct inode *inode, struct lu_env **envout, - struct cl_io **ioout, u16 *refcheck) -{ - struct lu_env *env; - struct cl_io *io; - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *clob = lli->lli_clob; - int result; - - if (S_ISREG(inode->i_mode)) { - env = cl_env_get(refcheck); - if (!IS_ERR(env)) { - io = vvp_env_thread_io(env); - io->ci_obj = clob; - *envout = env; - *ioout = io; - result = 1; - } else { - result = PTR_ERR(env); - } - } else { - result = 0; - } - return result; -} - -int cl_glimpse_size0(struct inode *inode, int agl) -{ - /* - * We don't need ast_flags argument to cl_glimpse_size(), because - * osc_lock_enqueue() takes care of the possible deadlock that said - * argument was introduced to avoid. - */ - /* - * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to - * cl_glimpse_size(), which doesn't make sense: glimpse locks are not - * blocking anyway. - */ - struct lu_env *env = NULL; - struct cl_io *io = NULL; - int result; - u16 refcheck; - - result = cl_io_get(inode, &env, &io, &refcheck); - if (result > 0) { -again: - io->ci_verify_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result > 0) - /* - * nothing to do for this io. This currently happens - * when stripe sub-object's are not yet created. 
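-			 * In that case io->ci_result already carries the
-			 * outcome and no glimpse lock is needed; when
-			 * cl_io_init() instead returns 0, we go on to call
-			 * cl_glimpse_lock() below.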
- */ - result = io->ci_result; - else if (result == 0) - result = cl_glimpse_lock(env, io, inode, io->ci_obj, - agl); - - OBD_FAIL_TIMEOUT(OBD_FAIL_GLIMPSE_DELAY, 2); - cl_io_fini(env, io); - if (unlikely(io->ci_need_restart)) - goto again; - cl_env_put(env, &refcheck); - } - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/lcommon_cl.c b/drivers/staging/lustre/lustre/llite/lcommon_cl.c deleted file mode 100644 index d7ea39ce0cb243b1241042e5833a69ce9e03ae20..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/lcommon_cl.c +++ /dev/null @@ -1,292 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl code shared between vvp and liblustre (and other Lustre clients in the - * future). - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "llite_internal.h" - -/* - * ccc_ prefix stands for "Common Client Code". - */ - -/***************************************************************************** - * - * Vvp device and device type functions. - * - */ - -/** - * An `emergency' environment used by cl_inode_fini() when cl_env_get() - * fails. Access to this environment is serialized by cl_inode_fini_guard - * mutex. - */ -struct lu_env *cl_inode_fini_env; -u16 cl_inode_fini_refcheck; - -/** - * A mutex serializing calls to slp_inode_fini() under extreme memory - * pressure, when environments cannot be allocated. 
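- * cl_inode_fini() takes this mutex and reuses the shared
- * cl_inode_fini_env whenever cl_env_get() fails, so that inode
- * teardown can still make progress when no environment can be
- * allocated.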
- */
-static DEFINE_MUTEX(cl_inode_fini_guard);
-
-int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr,
-		   unsigned int attr_flags)
-{
-	struct lu_env *env;
-	struct cl_io *io;
-	int result;
-	u16 refcheck;
-
-	env = cl_env_get(&refcheck);
-	if (IS_ERR(env))
-		return PTR_ERR(env);
-
-	io = vvp_env_thread_io(env);
-	io->ci_obj = obj;
-	io->ci_verify_layout = 1;
-
-	io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
-	io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
-	io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
-	io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
-	io->u.ci_setattr.sa_attr_flags = attr_flags;
-	io->u.ci_setattr.sa_valid = attr->ia_valid;
-	io->u.ci_setattr.sa_parent_fid = lu_object_fid(&obj->co_lu);
-
-again:
-	if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
-		struct vvp_io *vio = vvp_env_io(env);
-
-		if (attr->ia_valid & ATTR_FILE)
-			/* populate the file descriptor for ftruncate to
-			 * honor the group lock - see LU-787
-			 */
-			vio->vui_fd = LUSTRE_FPRIVATE(attr->ia_file);
-
-		result = cl_io_loop(env, io);
-	} else {
-		result = io->ci_result;
-	}
-	cl_io_fini(env, io);
-	if (unlikely(io->ci_need_restart))
-		goto again;
-
-	cl_env_put(env, &refcheck);
-	return result;
-}
-
-/**
- * Initialize or update CLIO structures for regular files when new
- * meta-data arrives from the server.
- *
- * \param inode	regular file inode
- * \param md	new file metadata from MDS
- * - allocates cl_object if necessary,
- * - updates the layout if the object was already there.
- */
-int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
-{
-	struct lu_env *env;
-	struct ll_inode_info *lli;
-	struct cl_object *clob;
-	struct lu_site *site;
-	struct lu_fid *fid;
-	struct cl_object_conf conf = {
-		.coc_inode = inode,
-		.u = {
-			.coc_layout = md->layout,
-		}
-	};
-	int result = 0;
-	u16 refcheck;
-
-	LASSERT(md->body->mbo_valid & OBD_MD_FLID);
-	LASSERT(S_ISREG(inode->i_mode));
-
-	env = cl_env_get(&refcheck);
-	if (IS_ERR(env))
-		return PTR_ERR(env);
-
-	site = ll_i2sbi(inode)->ll_site;
-	lli = ll_i2info(inode);
-	fid = &lli->lli_fid;
-	LASSERT(fid_is_sane(fid));
-
-	if (!lli->lli_clob) {
-		/* The clob is a slave of the inode: an empty lli_clob means
-		 * this is a new inode, for which no clob exists in the cache
-		 * with the given fid, so it is unnecessary to perform
-		 * lookup-alloc-lookup-insert; just alloc and insert directly.
-		 */
-		LASSERT(inode->i_state & I_NEW);
-		conf.coc_lu.loc_flags = LOC_F_NEW;
-		clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
-				      fid, &conf);
-		if (!IS_ERR(clob)) {
-			/*
-			 * No locking is necessary, as the new inode is
-			 * locked by the I_NEW bit.
-			 */
-			lli->lli_clob = clob;
-			lu_object_ref_add(&clob->co_lu, "inode", inode);
-		} else {
-			result = PTR_ERR(clob);
-		}
-	} else {
-		result = cl_conf_set(env, lli->lli_clob, &conf);
-	}
-
-	cl_env_put(env, &refcheck);
-
-	if (result != 0)
-		CERROR("Failure to initialize cl object " DFID ": %d\n",
-		       PFID(fid), result);
-	return result;
-}
-
-/**
- * Wait for others to drop their references to the object first, then drop
- * the last one ourselves, which causes the object to be destroyed
- * immediately. Must be called after cl_object_kill() against this object.
- *
- * The reason we want this is: destroying the top object waits for its sub
- * objects to be destroyed first, so we can't let the bottom layer (e.g.
- * from ASTs) initiate destruction of the top object, as that may deadlock.
- * See bz22520.
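- * To that end, the implementation below parks on the lu_site wait
- * queue for the object's FID and sleeps uninterruptibly until
- * loh_ref has dropped to 1 before putting the final reference.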
- */ -static void cl_object_put_last(struct lu_env *env, struct cl_object *obj) -{ - struct lu_object_header *header = obj->co_lu.lo_header; - wait_queue_entry_t waiter; - - if (unlikely(atomic_read(&header->loh_ref) != 1)) { - struct lu_site *site = obj->co_lu.lo_dev->ld_site; - wait_queue_head_t *wq; - - wq = lu_site_wq_from_fid(site, &header->loh_fid); - - init_waitqueue_entry(&waiter, current); - add_wait_queue(wq, &waiter); - - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (atomic_read(&header->loh_ref) == 1) - break; - schedule(); - } - - set_current_state(TASK_RUNNING); - remove_wait_queue(wq, &waiter); - } - - cl_object_put(env, obj); -} - -void cl_inode_fini(struct inode *inode) -{ - struct lu_env *env; - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *clob = lli->lli_clob; - u16 refcheck; - int emergency; - - if (clob) { - env = cl_env_get(&refcheck); - emergency = IS_ERR(env); - if (emergency) { - mutex_lock(&cl_inode_fini_guard); - LASSERT(cl_inode_fini_env); - env = cl_inode_fini_env; - } - /* - * cl_object cache is a slave to inode cache (which, in turn - * is a slave to dentry cache), don't keep cl_object in memory - * when its master is evicted. - */ - cl_object_kill(env, clob); - lu_object_ref_del(&clob->co_lu, "inode", inode); - cl_object_put_last(env, clob); - lli->lli_clob = NULL; - if (emergency) - mutex_unlock(&cl_inode_fini_guard); - else - cl_env_put(env, &refcheck); - } -} - -/** - * build inode number from passed @fid - */ -__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32) -{ - if (BITS_PER_LONG == 32 || api32) - return fid_flatten32(fid); - else - return fid_flatten(fid); -} - -/** - * build inode generation from passed @fid. If our FID overflows the 32-bit - * inode number then return a non-zero generation to distinguish them. - */ -__u32 cl_fid_build_gen(const struct lu_fid *fid) -{ - __u32 gen; - - if (fid_is_igif(fid)) { - gen = lu_igif_gen(fid); - return gen; - } - - gen = fid_flatten(fid) >> 32; - return gen; -} diff --git a/drivers/staging/lustre/lustre/llite/lcommon_misc.c b/drivers/staging/lustre/lustre/llite/lcommon_misc.c deleted file mode 100644 index a246b955306e0da919a2bf8db78d7735339fcf5f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/lcommon_misc.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * cl code shared between vvp and liblustre (and other Lustre clients in the - * future). - * - */ -#define DEBUG_SUBSYSTEM S_LLITE -#include -#include -#include -#include - -#include "llite_internal.h" - -/* Initialize the default and maximum LOV EA and cookie sizes. This allows - * us to make MDS RPCs with large enough reply buffers to hold the - * maximum-sized (= maximum striped) EA and cookie without having to - * calculate this (via a call into the LOV + OSCs) each time we make an RPC. - */ -int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp) -{ - u32 val_size, max_easize, def_easize; - int rc; - - val_size = sizeof(max_easize); - rc = obd_get_info(NULL, dt_exp, sizeof(KEY_MAX_EASIZE), KEY_MAX_EASIZE, - &val_size, &max_easize); - if (rc) - return rc; - - val_size = sizeof(def_easize); - rc = obd_get_info(NULL, dt_exp, sizeof(KEY_DEFAULT_EASIZE), - KEY_DEFAULT_EASIZE, &val_size, &def_easize); - if (rc) - return rc; - - /* - * default cookiesize is 0 because from 2.4 server doesn't send - * llog cookies to client. - */ - CDEBUG(D_HA, "updating def/max_easize: %d/%d\n", - def_easize, max_easize); - - rc = md_init_ea_size(md_exp, max_easize, def_easize); - return rc; -} - -/** - * This function is used as an upcall-callback hooked by liblustre and llite - * clients into obd_notify() listeners chain to handle notifications about - * change of import connect_flags. See llu_fsswop_mount() and - * lustre_common_fill_super(). - */ -int cl_ocd_update(struct obd_device *host, - struct obd_device *watched, - enum obd_notify_event ev, void *owner, void *data) -{ - struct lustre_client_ocd *lco; - struct client_obd *cli; - __u64 flags; - int result; - - if (!strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) && - watched->obd_set_up && !watched->obd_stopping) { - cli = &watched->u.cli; - lco = owner; - flags = cli->cl_import->imp_connect_data.ocd_connect_flags; - CDEBUG(D_SUPER, "Changing connect_flags: %#llx -> %#llx\n", - lco->lco_flags, flags); - mutex_lock(&lco->lco_lock); - lco->lco_flags &= flags; - /* for each osc event update ea size */ - if (lco->lco_dt_exp) - cl_init_ea_size(lco->lco_md_exp, lco->lco_dt_exp); - - mutex_unlock(&lco->lco_lock); - result = 0; - } else { - CERROR("unexpected notification from %s %s (setup:%d,stopping:%d)!\n", - watched->obd_type->typ_name, - watched->obd_name, watched->obd_set_up, - watched->obd_stopping); - result = -EINVAL; - } - return result; -} - -#define GROUPLOCK_SCOPE "grouplock" - -int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, - struct ll_grouplock *cg) -{ - struct lu_env *env; - struct cl_io *io; - struct cl_lock *lock; - struct cl_lock_descr *descr; - __u32 enqflags; - u16 refcheck; - int rc; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = vvp_env_thread_io(env); - io->ci_obj = obj; - - rc = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (rc != 0) { - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - /* Does not make sense to take GL for released layout */ - if (rc > 0) - rc = -ENOTSUPP; - return rc; - } - - lock = vvp_env_lock(env); - descr = &lock->cll_descr; - descr->cld_obj = obj; - descr->cld_start = 0; - descr->cld_end = CL_PAGE_EOF; - descr->cld_gid = gid; - descr->cld_mode = CLM_GROUP; - - enqflags = CEF_MUST | (nonblock ? 
CEF_NONBLOCK : 0); - descr->cld_enq_flags = enqflags; - - rc = cl_lock_request(env, io, lock); - if (rc < 0) { - cl_io_fini(env, io); - cl_env_put(env, &refcheck); - return rc; - } - - cg->lg_env = env; - cg->lg_io = io; - cg->lg_lock = lock; - cg->lg_gid = gid; - - return 0; -} - -void cl_put_grouplock(struct ll_grouplock *cg) -{ - struct lu_env *env = cg->lg_env; - struct cl_io *io = cg->lg_io; - struct cl_lock *lock = cg->lg_lock; - - LASSERT(cg->lg_env); - LASSERT(cg->lg_gid); - - cl_lock_release(env, lock); - cl_io_fini(env, io); - cl_env_put(env, NULL); -} diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h deleted file mode 100644 index c08a6e14b6d748dc3212800392dcc1cb31e61373..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ /dev/null @@ -1,1344 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef LLITE_INTERNAL_H -#define LLITE_INTERNAL_H -#include -#include -#include /* for s2sbi */ -#include - -/* for struct cl_lock_descr and struct cl_io */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "vvp_internal.h" -#include "range_lock.h" - -#ifndef FMODE_EXEC -#define FMODE_EXEC 0 -#endif - -#ifndef VM_FAULT_RETRY -#define VM_FAULT_RETRY 0 -#endif - -/** Only used on client-side for indicating the tail of dir hash/offset. */ -#define LL_DIR_END_OFF 0x7fffffffffffffffULL -#define LL_DIR_END_OFF_32BIT 0x7fffffffUL - -/* 4UL * 1024 * 1024 */ -#define LL_MAX_BLKSIZE_BITS 22 - -#define LL_IT2STR(it) ((it) ? ldlm_it2str((it)->it_op) : "0") -#define LUSTRE_FPRIVATE(file) ((file)->private_data) - -struct ll_dentry_data { - struct lookup_intent *lld_it; - unsigned int lld_sa_generation; - unsigned int lld_invalid:1; - unsigned int lld_nfs_dentry:1; - struct rcu_head lld_rcu_head; -}; - -#define ll_d2d(de) ((struct ll_dentry_data *)((de)->d_fsdata)) - -#define LLI_INODE_MAGIC 0x111d0de5 -#define LLI_INODE_DEAD 0xdeadd00d - -struct ll_getname_data { - struct dir_context ctx; - char *lgd_name; /* points to buffer with NAME_MAX+1 size */ - struct lu_fid lgd_fid; /* target fid we are looking for */ - int lgd_found; /* inode matched? 
*/
-};
-
-struct ll_grouplock {
-	struct lu_env *lg_env;
-	struct cl_io *lg_io;
-	struct cl_lock *lg_lock;
-	unsigned long lg_gid;
-};
-
-enum ll_file_flags {
-	/* File data is modified. */
-	LLIF_DATA_MODIFIED = 0,
-	/* File is being restored */
-	LLIF_FILE_RESTORING = 1,
-	/* Xattr cache is attached to the file */
-	LLIF_XATTR_CACHE = 2,
-};
-
-struct ll_inode_info {
-	__u32				lli_inode_magic;
-
-	spinlock_t			lli_lock;
-	unsigned long			lli_flags;
-	struct posix_acl		*lli_posix_acl;
-
-	/* identifying fields for both metadata and data stacks. */
-	struct lu_fid			lli_fid;
-	/* master inode fid for stripe directory */
-	struct lu_fid			lli_pfid;
-
-	/* We need all three because every inode may be opened in different
-	 * modes.
-	 */
-	struct obd_client_handle	*lli_mds_read_och;
-	struct obd_client_handle	*lli_mds_write_och;
-	struct obd_client_handle	*lli_mds_exec_och;
-	__u64				lli_open_fd_read_count;
-	__u64				lli_open_fd_write_count;
-	__u64				lli_open_fd_exec_count;
-	/* Protects access to och pointers and their usage counters */
-	struct mutex			lli_och_mutex;
-
-	struct inode			lli_vfs_inode;
-
-	/* the most recent timestamps obtained from the MDS */
-	s64				lli_atime;
-	s64				lli_mtime;
-	s64				lli_ctime;
-	spinlock_t			lli_agl_lock;
-
-	/* Try to keep the d::member and f::member aligned. Before using
-	 * these members, make clear whether it is a directory or not.
-	 */
-	union {
-		/* for directory */
-		struct {
-			/* serialize normal readdir and statahead-readdir. */
-			struct mutex		lli_readdir_mutex;
-
-			/* metadata statahead */
-			/* since parent and child threads can share the same
-			 * @file struct, "opendir_key" is the token used at
-			 * dir close to handle the case where the parent
-			 * exits before the child -- the key holder is the
-			 * one who should clean up the dir readahead.
-			 */
-			void			*lli_opendir_key;
-			struct ll_statahead_info *lli_sai;
-			/* protect statahead stuff. */
-			spinlock_t		lli_sa_lock;
-			/* "opendir_pid" is the token when lookup/revalidate
-			 * -- I am the owner of dir statahead.
-			 */
-			pid_t			lli_opendir_pid;
-			/* stat will try to access statahead entries or start
-			 * statahead if this flag is set; the flag is set upon
-			 * dir open, and cleared when the dir is closed, when
-			 * the statahead hit ratio is too low, or when starting
-			 * the statahead thread has failed.
-			 */
-			unsigned int		lli_sa_enabled:1;
-			/* generation for statahead */
-			unsigned int		lli_sa_generation;
-			/* directory stripe information */
-			struct lmv_stripe_md	*lli_lsm_md;
-			/* default directory stripe offset. This is extracted
-			 * from the "dmv" xattr in order to decide which MDT
-			 * to create a subdirectory on. The MDS itself fetches
-			 * "dmv" and gets the rest of the default layout
-			 * itself (count, hash, etc).
-			 */
-			__u32			lli_def_stripe_offset;
-		};
-
-		/* for non-directory */
-		struct {
-			struct mutex		lli_size_mutex;
-			char			*lli_symlink_name;
-			/*
-			 * struct rw_semaphore {
-			 *	signed long	count;     // align d.d_def_acl
-			 *	spinlock_t	wait_lock; // align d.d_sa_lock
-			 *	struct list_head wait_list;
-			 * }
-			 */
-			struct rw_semaphore	lli_trunc_sem;
-			struct range_lock_tree	lli_write_tree;
-
-			struct rw_semaphore	lli_glimpse_sem;
-			unsigned long		lli_glimpse_time;
-			struct list_head	lli_agl_list;
-			__u64			lli_agl_index;
-
-			/* for writepage() only to communicate to fsync */
-			int			lli_async_rc;
-
-			/*
-			 * Whenever a process tries to read/write the file,
-			 * the jobid of the process is saved here, and it
-			 * will be packed into the write RPC when flushing
-			 * later.
-			 *
-			 * So the read/write statistics for a jobid will not
-			 * be accurate if the file is shared by different jobs.
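-			 * For example, if processes from two different jobs
-			 * take turns writing the same file, lli_jobid simply
-			 * holds whichever jobid was recorded most recently,
-			 * so per-job accounting is only approximate there.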
- */ - char lli_jobid[LUSTRE_JOBID_SIZE]; - }; - }; - - /* XXX: For following frequent used members, although they maybe special - * used for non-directory object, it is some time-wasting to check - * whether the object is directory or not before using them. On the - * other hand, currently, sizeof(f) > sizeof(d), it cannot reduce - * the "ll_inode_info" size even if moving those members into u.f. - * So keep them out side. - * - * In the future, if more members are added only for directory, - * some of the following members can be moved into u.f. - */ - struct cl_object *lli_clob; - - /* mutex to request for layout lock exclusively. */ - struct mutex lli_layout_mutex; - /* Layout version, protected by lli_layout_lock */ - __u32 lli_layout_gen; - spinlock_t lli_layout_lock; - - struct rw_semaphore lli_xattrs_list_rwsem; - struct mutex lli_xattrs_enq_lock; - struct list_head lli_xattrs;/* ll_xattr_entry->xe_list */ -}; - -static inline __u32 ll_layout_version_get(struct ll_inode_info *lli) -{ - __u32 gen; - - spin_lock(&lli->lli_layout_lock); - gen = lli->lli_layout_gen; - spin_unlock(&lli->lli_layout_lock); - - return gen; -} - -static inline void ll_layout_version_set(struct ll_inode_info *lli, __u32 gen) -{ - spin_lock(&lli->lli_layout_lock); - lli->lli_layout_gen = gen; - spin_unlock(&lli->lli_layout_lock); -} - -int ll_xattr_cache_destroy(struct inode *inode); - -int ll_xattr_cache_get(struct inode *inode, const char *name, - char *buffer, size_t size, __u64 valid); - -int ll_init_security(struct dentry *dentry, struct inode *inode, - struct inode *dir); - -/* - * Locking to guarantee consistency of non-atomic updates to long long i_size, - * consistency between file size and KMS. - * - * Implemented by ->lli_size_mutex and ->lsm_lock, nested in that order. - */ - -void ll_inode_size_lock(struct inode *inode); -void ll_inode_size_unlock(struct inode *inode); - -/* FIXME: replace the name of this with LL_I to conform to kernel stuff */ -/* static inline struct ll_inode_info *LL_I(struct inode *inode) */ -static inline struct ll_inode_info *ll_i2info(struct inode *inode) -{ - return container_of(inode, struct ll_inode_info, lli_vfs_inode); -} - -/* default to about 64M of readahead on a given system. */ -#define SBI_DEFAULT_READAHEAD_MAX (64UL << (20 - PAGE_SHIFT)) - -/* default to read-ahead full files smaller than 2MB on the second read */ -#define SBI_DEFAULT_READAHEAD_WHOLE_MAX (2UL << (20 - PAGE_SHIFT)) - -enum ra_stat { - RA_STAT_HIT = 0, - RA_STAT_MISS, - RA_STAT_DISTANT_READPAGE, - RA_STAT_MISS_IN_WINDOW, - RA_STAT_FAILED_GRAB_PAGE, - RA_STAT_FAILED_MATCH, - RA_STAT_DISCARDED, - RA_STAT_ZERO_LEN, - RA_STAT_ZERO_WINDOW, - RA_STAT_EOF, - RA_STAT_MAX_IN_FLIGHT, - RA_STAT_WRONG_GRAB_PAGE, - RA_STAT_FAILED_REACH_END, - _NR_RA_STAT, -}; - -struct ll_ra_info { - atomic_t ra_cur_pages; - unsigned long ra_max_pages; - unsigned long ra_max_pages_per_file; - unsigned long ra_max_read_ahead_whole_pages; -}; - -/* ra_io_arg will be filled in the beginning of ll_readahead with - * ras_lock, then the following ll_read_ahead_pages will read RA - * pages according to this arg, all the items in this structure are - * counted by page index. 
- */ -struct ra_io_arg { - unsigned long ria_start; /* start offset of read-ahead*/ - unsigned long ria_end; /* end offset of read-ahead*/ - unsigned long ria_reserved; /* reserved pages for read-ahead */ - unsigned long ria_end_min; /* minimum end to cover current read */ - bool ria_eof; /* reach end of file */ - /* If stride read pattern is detected, ria_stoff means where - * stride read is started. Note: for normal read-ahead, the - * value here is meaningless, and also it will not be accessed - */ - pgoff_t ria_stoff; - /* ria_length and ria_pages are the length and pages length in the - * stride I/O mode. And they will also be used to check whether - * it is stride I/O read-ahead in the read-ahead pages - */ - unsigned long ria_length; - unsigned long ria_pages; -}; - -/* LL_HIST_MAX=32 causes an overflow */ -#define LL_HIST_MAX 28 -#define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ -#define LL_PROCESS_HIST_MAX 10 -struct per_process_info { - pid_t pid; - struct obd_histogram pp_r_hist; - struct obd_histogram pp_w_hist; -}; - -/* pp_extents[LL_PROCESS_HIST_MAX] will hold the combined process info */ -struct ll_rw_extents_info { - struct per_process_info pp_extents[LL_PROCESS_HIST_MAX + 1]; -}; - -#define LL_OFFSET_HIST_MAX 100 -struct ll_rw_process_info { - pid_t rw_pid; - int rw_op; - loff_t rw_range_start; - loff_t rw_range_end; - loff_t rw_last_file_pos; - loff_t rw_offset; - size_t rw_smallest_extent; - size_t rw_largest_extent; - struct ll_file_data *rw_last_file; -}; - -enum stats_track_type { - STATS_TRACK_ALL = 0, /* track all processes */ - STATS_TRACK_PID, /* track process with this pid */ - STATS_TRACK_PPID, /* track processes with this ppid */ - STATS_TRACK_GID, /* track processes with this gid */ - STATS_TRACK_LAST, -}; - -/* flags for sbi->ll_flags */ -#define LL_SBI_NOLCK 0x01 /* DLM locking disabled (directio-only) */ -#define LL_SBI_CHECKSUM 0x02 /* checksum each page as it's written */ -#define LL_SBI_FLOCK 0x04 -#define LL_SBI_USER_XATTR 0x08 /* support user xattr */ -#define LL_SBI_ACL 0x10 /* support ACL */ -/* LL_SBI_RMT_CLIENT 0x40 remote client */ -#define LL_SBI_MDS_CAPA 0x80 /* support mds capa, obsolete */ -#define LL_SBI_OSS_CAPA 0x100 /* support oss capa, obsolete */ -#define LL_SBI_LOCALFLOCK 0x200 /* Local flocks support by kernel */ -#define LL_SBI_LRU_RESIZE 0x400 /* lru resize support */ -#define LL_SBI_LAZYSTATFS 0x800 /* lazystatfs mount option */ -/* LL_SBI_SOM_PREVIEW 0x1000 SOM preview mount option, obsolete */ -#define LL_SBI_32BIT_API 0x2000 /* generate 32 bit inodes. 
*/ -#define LL_SBI_64BIT_HASH 0x4000 /* support 64-bits dir hash/offset */ -#define LL_SBI_AGL_ENABLED 0x8000 /* enable agl */ -#define LL_SBI_VERBOSE 0x10000 /* verbose mount/umount */ -#define LL_SBI_LAYOUT_LOCK 0x20000 /* layout lock support */ -#define LL_SBI_USER_FID2PATH 0x40000 /* allow fid2path by unprivileged users */ -#define LL_SBI_XATTR_CACHE 0x80000 /* support for xattr cache */ -#define LL_SBI_NOROOTSQUASH 0x100000 /* do not apply root squash */ -#define LL_SBI_ALWAYS_PING 0x200000 /* always ping even if server - * suppress_pings - */ - -#define LL_SBI_FLAGS { \ - "nolck", \ - "checksum", \ - "flock", \ - "user_xattr", \ - "acl", \ - "???", \ - "???", \ - "mds_capa", \ - "oss_capa", \ - "flock", \ - "lru_resize", \ - "lazy_statfs", \ - "som", \ - "32bit_api", \ - "64bit_hash", \ - "agl", \ - "verbose", \ - "layout", \ - "user_fid2path",\ - "xattr_cache", \ - "norootsquash", \ - "always_ping", \ -} - -/* - * This is embedded into llite super-blocks to keep track of connect - * flags (capabilities) supported by all imports given mount is - * connected to. - */ -struct lustre_client_ocd { - /* - * This is conjunction of connect_flags across all imports - * (LOVs) this mount is connected to. This field is updated by - * cl_ocd_update() under ->lco_lock. - */ - __u64 lco_flags; - struct mutex lco_lock; - struct obd_export *lco_md_exp; - struct obd_export *lco_dt_exp; -}; - -struct ll_sb_info { - /* this protects pglist and ra_info. It isn't safe to - * grab from interrupt contexts - */ - spinlock_t ll_lock; - spinlock_t ll_pp_extent_lock; /* pp_extent entry*/ - spinlock_t ll_process_lock; /* ll_rw_process_info */ - struct obd_uuid ll_sb_uuid; - struct obd_export *ll_md_exp; - struct obd_export *ll_dt_exp; - struct dentry *ll_debugfs_entry; - struct lu_fid ll_root_fid; /* root object fid */ - - int ll_flags; - unsigned int ll_umounting:1, - ll_xattr_cache_enabled:1, - ll_client_common_fill_super_succeeded:1; - - struct lustre_client_ocd ll_lco; - - struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ - - /* - * Used to track "unstable" pages on a client, and maintain a - * LRU list of clean pages. An "unstable" page is defined as - * any page which is sent to a server as part of a bulk request, - * but is uncommitted to stable storage. 
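-	 * Roughly speaking, such pages stay accounted here until the
-	 * server commits the corresponding bulk write, at which point
-	 * they become ordinary clean pages again.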
- */ - struct cl_client_cache *ll_cache; - - struct lprocfs_stats *ll_ra_stats; - - struct ll_ra_info ll_ra_info; - unsigned int ll_namelen; - const struct file_operations *ll_fop; - - unsigned int ll_md_brw_pages; /* readdir pages per RPC */ - - struct lu_site *ll_site; - struct cl_device *ll_cl; - /* Statistics */ - struct ll_rw_extents_info ll_rw_extents_info; - int ll_extent_process_count; - struct ll_rw_process_info ll_rw_process_info[LL_PROCESS_HIST_MAX]; - unsigned int ll_offset_process_count; - struct ll_rw_process_info ll_rw_offset_info[LL_OFFSET_HIST_MAX]; - unsigned int ll_rw_offset_entry_count; - int ll_stats_track_id; - enum stats_track_type ll_stats_track_type; - int ll_rw_stats_on; - - /* metadata stat-ahead */ - unsigned int ll_sa_max; /* max statahead RPCs */ - atomic_t ll_sa_total; /* statahead thread started - * count - */ - atomic_t ll_sa_wrong; /* statahead thread stopped for - * low hit ratio - */ - atomic_t ll_sa_running; /* running statahead thread - * count - */ - atomic_t ll_agl_total; /* AGL thread started count */ - - dev_t ll_sdev_orig; /* save s_dev before assign for - * clustered nfs - */ - /* root squash */ - struct root_squash_info ll_squash; - struct path ll_mnt; - - __kernel_fsid_t ll_fsid; - struct kobject ll_kobj; /* sysfs object */ - struct super_block *ll_sb; /* struct super_block (for sysfs code)*/ - struct completion ll_kobj_unregister; -}; - -/* - * per file-descriptor read-ahead data. - */ -struct ll_readahead_state { - spinlock_t ras_lock; - /* - * index of the last page that read(2) needed and that wasn't in the - * cache. Used by ras_update() to detect seeks. - * - * XXX nikita: if access seeks into cached region, Lustre doesn't see - * this. - */ - unsigned long ras_last_readpage; - /* - * number of pages read after last read-ahead window reset. As window - * is reset on each seek, this is effectively a number of consecutive - * accesses. Maybe ->ras_accessed_in_window is better name. - * - * XXX nikita: window is also reset (by ras_update()) when Lustre - * believes that memory pressure evicts read-ahead pages. In that - * case, it probably doesn't make sense to expand window to - * PTLRPC_MAX_BRW_PAGES on the third access. - */ - unsigned long ras_consecutive_pages; - /* - * number of read requests after the last read-ahead window reset - * As window is reset on each seek, this is effectively the number - * on consecutive read request and is used to trigger read-ahead. - */ - unsigned long ras_consecutive_requests; - /* - * Parameters of current read-ahead window. Handled by - * ras_update(). On the initial access to the file or after a seek, - * window is reset to 0. After 3 consecutive accesses, window is - * expanded to PTLRPC_MAX_BRW_PAGES. Afterwards, window is enlarged by - * PTLRPC_MAX_BRW_PAGES chunks up to ->ra_max_pages. - */ - unsigned long ras_window_start, ras_window_len; - /* - * Optimal RPC size. It decides how many pages will be sent - * for each read-ahead. - */ - unsigned long ras_rpc_size; - /* - * Where next read-ahead should start at. This lies within read-ahead - * window. Read-ahead window is read in pieces rather than at once - * because: 1. lustre limits total number of pages under read-ahead by - * ->ra_max_pages (see ll_ra_count_get()), 2. client cannot read pages - * not covered by DLM lock. - */ - unsigned long ras_next_readahead; - /* - * Total number of ll_file_read requests issued, reads originating - * due to mmap are not counted in this total. 
This value is used to
-	 * trigger full file read-ahead after multiple reads to a small file.
-	 */
-	unsigned long ras_requests;
-	/*
-	 * Page index with respect to the current request; these values
-	 * will not be accurate when dealing with reads issued via mmap.
-	 */
-	unsigned long ras_request_index;
-	/*
-	 * The following 3 items are used for detecting the stride I/O
-	 * mode.
-	 * In stride I/O mode,
-	 * ...............|-----data-----|****gap*****|--------|******|....
-	 *    offset      |-stride_pages-|-stride_gap-|
-	 * ras_stride_offset = offset;
-	 * ras_stride_length = stride_pages + stride_gap;
-	 * ras_stride_pages = stride_pages;
-	 * Note: all three items are counted in pages.
-	 */
-	unsigned long ras_stride_length;
-	unsigned long ras_stride_pages;
-	pgoff_t ras_stride_offset;
-	/*
-	 * Number of consecutive stride requests; similar to
-	 * ras_consecutive_requests, but used in stride I/O mode.
-	 * Note: stride read-ahead is enabled only once more than 2
-	 * consecutive stride requests have been detected.
-	 */
-	unsigned long ras_consecutive_stride_requests;
-};
-
-extern struct kmem_cache *ll_file_data_slab;
-struct lustre_handle;
-struct ll_file_data {
-	struct ll_readahead_state fd_ras;
-	struct ll_grouplock fd_grouplock;
-	__u64 lfd_pos;
-	__u32 fd_flags;
-	fmode_t fd_omode;
-	/* open handle if a lease exists for this file.
-	 * Borrows lli->lli_och_mutex to protect assignment.
-	 */
-	struct obd_client_handle *fd_lease_och;
-	struct obd_client_handle *fd_och;
-	struct file *fd_file;
-	/* Indicates whether a failure needs to be reported on close.
-	 * true: the failure is already known, do not report it again.
-	 * false: unknown failure, should be reported.
-	 */
-	bool fd_write_failed;
-	rwlock_t fd_lock; /* protect lcc list */
-	struct list_head fd_lccs; /* list of ll_cl_context */
-};
-
-extern struct dentry *llite_root;
-extern struct kset *llite_kset;
-
-static inline struct inode *ll_info2i(struct ll_inode_info *lli)
-{
-	return &lli->lli_vfs_inode;
-}
-
-__u32 ll_i2suppgid(struct inode *i);
-void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2);
-
-static inline int ll_need_32bit_api(struct ll_sb_info *sbi)
-{
-#if BITS_PER_LONG == 32
-	return 1;
-#elif defined(CONFIG_COMPAT)
-	return unlikely(in_compat_syscall() ||
-			(sbi->ll_flags & LL_SBI_32BIT_API));
-#else
-	return unlikely(sbi->ll_flags & LL_SBI_32BIT_API);
-#endif
-}
-
-void ll_ras_enter(struct file *f);
-
-/* llite/lcommon_misc.c */
-int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp);
-int cl_ocd_update(struct obd_device *host,
-		  struct obd_device *watched,
-		  enum obd_notify_event ev, void *owner, void *data);
-int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock,
-		     struct ll_grouplock *cg);
-void cl_put_grouplock(struct ll_grouplock *cg);
-
-/* llite/lproc_llite.c */
-int ldebugfs_register_mountpoint(struct dentry *parent,
-				 struct super_block *sb, char *osc, char *mdc);
-void ldebugfs_unregister_mountpoint(struct ll_sb_info *sbi);
-void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count);
-void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars);
-void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
-		       struct ll_file_data *file, loff_t pos,
-		       size_t count, int rw);
-
-enum {
-	LPROC_LL_DIRTY_HITS,
-	LPROC_LL_DIRTY_MISSES,
-	LPROC_LL_READ_BYTES,
-	LPROC_LL_WRITE_BYTES,
-	LPROC_LL_BRW_READ,
-	LPROC_LL_BRW_WRITE,
-	LPROC_LL_IOCTL,
-	LPROC_LL_OPEN,
-	LPROC_LL_RELEASE,
-	LPROC_LL_MAP,
-	LPROC_LL_LLSEEK,
-	LPROC_LL_FSYNC,
-	LPROC_LL_READDIR,
-	LPROC_LL_SETATTR,
-	
LPROC_LL_TRUNC, - LPROC_LL_FLOCK, - LPROC_LL_GETATTR, - LPROC_LL_CREATE, - LPROC_LL_LINK, - LPROC_LL_UNLINK, - LPROC_LL_SYMLINK, - LPROC_LL_MKDIR, - LPROC_LL_RMDIR, - LPROC_LL_MKNOD, - LPROC_LL_RENAME, - LPROC_LL_STAFS, - LPROC_LL_ALLOC_INODE, - LPROC_LL_SETXATTR, - LPROC_LL_GETXATTR, - LPROC_LL_GETXATTR_HITS, - LPROC_LL_LISTXATTR, - LPROC_LL_REMOVEXATTR, - LPROC_LL_INODE_PERM, - LPROC_LL_FILE_OPCODES -}; - -/* llite/dir.c */ -extern const struct file_operations ll_dir_operations; -extern const struct inode_operations ll_dir_inode_operations; -int ll_dir_read(struct inode *inode, __u64 *ppos, struct md_op_data *op_data, - struct dir_context *ctx); -int ll_get_mdt_idx(struct inode *inode); -int ll_get_mdt_idx_by_fid(struct ll_sb_info *sbi, const struct lu_fid *fid); -struct page *ll_get_dir_page(struct inode *dir, struct md_op_data *op_data, - __u64 offset); -void ll_release_page(struct inode *inode, struct page *page, bool remove); - -/* llite/namei.c */ -extern const struct inode_operations ll_special_inode_operations; - -struct inode *ll_iget(struct super_block *sb, ino_t hash, - struct lustre_md *lic); -int ll_test_inode_by_fid(struct inode *inode, void *opaque); -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag); -struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de); -void ll_update_times(struct ptlrpc_request *request, struct inode *inode); - -/* llite/rw.c */ -int ll_writepage(struct page *page, struct writeback_control *wbc); -int ll_writepages(struct address_space *mapping, struct writeback_control *wbc); -int ll_readpage(struct file *file, struct page *page); -void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); -int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); -struct ll_cl_context *ll_cl_find(struct file *file); -void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io); -void ll_cl_remove(struct file *file, const struct lu_env *env); - -extern const struct address_space_operations ll_aops; - -/* llite/file.c */ -extern const struct file_operations ll_file_operations; -extern const struct file_operations ll_file_operations_flock; -extern const struct file_operations ll_file_operations_noflock; -extern const struct inode_operations ll_file_inode_operations; -int ll_have_md_lock(struct inode *inode, __u64 *bits, - enum ldlm_mode l_req_mode); -enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits, - struct lustre_handle *lockh, __u64 flags, - enum ldlm_mode mode); -int ll_file_open(struct inode *inode, struct file *file); -int ll_file_release(struct inode *inode, struct file *file); -int ll_release_openhandle(struct inode *inode, struct lookup_intent *it); -int ll_md_real_close(struct inode *inode, fmode_t fmode); -int ll_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -#ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *ll_get_acl(struct inode *inode, int type); -int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type); -#else -#define ll_get_acl NULL -#define ll_set_acl NULL -#endif /* CONFIG_FS_POSIX_ACL */ - -int ll_migrate(struct inode *parent, struct file *file, int mdtidx, - const char *name, int namelen); -int ll_get_fid_by_name(struct inode *parent, const char *name, - int namelen, struct lu_fid *fid, struct inode **inode); -int ll_inode_permission(struct inode *inode, int mask); - -int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry, - __u64 flags, struct 
lov_user_md *lum, - int lum_size); -int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, - struct lov_mds_md **lmm, int *lmm_size, - struct ptlrpc_request **request); -int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, - int set_default); -int ll_dir_getstripe(struct inode *inode, void **lmmp, int *lmm_size, - struct ptlrpc_request **request, u64 valid); -int ll_fsync(struct file *file, loff_t start, loff_t end, int data); -int ll_merge_attr(const struct lu_env *env, struct inode *inode); -int ll_fid2path(struct inode *inode, void __user *arg); -int ll_data_version(struct inode *inode, __u64 *data_version, int flags); -int ll_hsm_release(struct inode *inode); -int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss); - -/* llite/dcache.c */ - -extern const struct dentry_operations ll_d_ops; -void ll_intent_drop_lock(struct lookup_intent *it); -void ll_intent_release(struct lookup_intent *it); -void ll_invalidate_aliases(struct inode *inode); -void ll_lookup_finish_locks(struct lookup_intent *it, struct inode *inode); -int ll_revalidate_it_finish(struct ptlrpc_request *request, - struct lookup_intent *it, struct inode *inode); - -/* llite/llite_lib.c */ -extern struct super_operations lustre_super_operations; - -void ll_lli_init(struct ll_inode_info *lli); -int ll_fill_super(struct super_block *sb); -void ll_put_super(struct super_block *sb); -void ll_kill_super(struct super_block *sb); -struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock); -void ll_dir_clear_lsm_md(struct inode *inode); -void ll_clear_inode(struct inode *inode); -int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import); -int ll_setattr(struct dentry *de, struct iattr *attr); -int ll_statfs(struct dentry *de, struct kstatfs *sfs); -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags); -int ll_update_inode(struct inode *inode, struct lustre_md *md); -int ll_read_inode2(struct inode *inode, void *opaque); -void ll_delete_inode(struct inode *inode); -int ll_iocontrol(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); -int ll_flush_ctx(struct inode *inode); -void ll_umount_begin(struct super_block *sb); -int ll_remount_fs(struct super_block *sb, int *flags, char *data); -int ll_show_options(struct seq_file *seq, struct dentry *dentry); -void ll_dirty_page_discard_warn(struct page *page, int ioret); -int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, - struct super_block *sb, struct lookup_intent *it); -int ll_obd_statfs(struct inode *inode, void __user *arg); -int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); -int ll_get_default_mdsize(struct ll_sb_info *sbi, int *default_mdsize); -int ll_set_default_mdsize(struct ll_sb_info *sbi, int default_mdsize); -int ll_process_config(struct lustre_cfg *lcfg); - -enum { - LUSTRE_OPC_MKDIR = 0, - LUSTRE_OPC_SYMLINK = 1, - LUSTRE_OPC_MKNOD = 2, - LUSTRE_OPC_CREATE = 3, - LUSTRE_OPC_ANY = 5, -}; - -struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, - struct inode *i1, struct inode *i2, - const char *name, size_t namelen, - u32 mode, __u32 opc, void *data); -void ll_finish_md_op_data(struct md_op_data *op_data); -int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg); -char *ll_get_fsname(struct super_block *sb, char *buf, int buflen); -void ll_compute_rootsquash_state(struct ll_sb_info *sbi); -void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request 
*open_req); -ssize_t ll_copy_user_md(const struct lov_user_md __user *md, - struct lov_user_md **kbuf); - -/* Compute expected user md size when passing in a md from user space */ -static inline ssize_t ll_lov_user_md_size(const struct lov_user_md *lum) -{ - switch (lum->lmm_magic) { - case LOV_USER_MAGIC_V1: - return sizeof(struct lov_user_md_v1); - case LOV_USER_MAGIC_V3: - return sizeof(struct lov_user_md_v3); - case LOV_USER_MAGIC_SPECIFIC: - if (lum->lmm_stripe_count > LOV_MAX_STRIPE_COUNT) - return -EINVAL; - - return lov_user_md_size(lum->lmm_stripe_count, - LOV_USER_MAGIC_SPECIFIC); - } - return -EINVAL; -} - -/* llite/llite_nfs.c */ -extern const struct export_operations lustre_export_operations; -__u32 get_uuid2int(const char *name, int len); -void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid); -struct inode *search_inode_for_lustre(struct super_block *sb, - const struct lu_fid *fid); -int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid); - -/* llite/symlink.c */ -extern const struct inode_operations ll_fast_symlink_inode_operations; - -/** - * IO arguments for various VFS I/O interfaces. - */ -struct vvp_io_args { - /** normal/splice */ - union { - struct { - struct kiocb *via_iocb; - struct iov_iter *via_iter; - } normal; - } u; -}; - -struct ll_cl_context { - struct list_head lcc_list; - void *lcc_cookie; - const struct lu_env *lcc_env; - struct cl_io *lcc_io; - struct cl_page *lcc_page; -}; - -struct ll_thread_info { - struct vvp_io_args lti_args; - struct ra_io_arg lti_ria; - struct ll_cl_context lti_io_ctx; -}; - -extern struct lu_context_key ll_thread_key; -static inline struct ll_thread_info *ll_env_info(const struct lu_env *env) -{ - struct ll_thread_info *lti; - - lti = lu_context_key_get(&env->le_ctx, &ll_thread_key); - LASSERT(lti); - return lti; -} - -static inline struct vvp_io_args *ll_env_args(const struct lu_env *env) -{ - return &ll_env_info(env)->lti_args; -} - -/* llite/llite_mmap.c */ - -int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); -int ll_file_mmap(struct file *file, struct vm_area_struct *vma); -void policy_from_vma(union ldlm_policy_data *policy, struct vm_area_struct *vma, - unsigned long addr, size_t count); -struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr, - size_t count); - -static inline void ll_invalidate_page(struct page *vmpage) -{ - struct address_space *mapping = vmpage->mapping; - loff_t offset = vmpage->index << PAGE_SHIFT; - - LASSERT(PageLocked(vmpage)); - if (!mapping) - return; - - /* - * truncate_complete_page() calls - * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). 
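-	 *
-	 * (Editorial note, not in the original source: the
-	 * ll_teardown_mmaps() call below must precede the truncate, so
-	 * that no user mapping can fault the stale page back in while
-	 * cl_page_delete() is removing it from the cl_page layer.)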
- */ - ll_teardown_mmaps(mapping, offset, offset + PAGE_SIZE); - truncate_complete_page(mapping, vmpage); -} - -#define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) - -/* don't need an addref as the sb_info should be holding one */ -static inline struct obd_export *ll_s2dtexp(struct super_block *sb) -{ - return ll_s2sbi(sb)->ll_dt_exp; -} - -/* don't need an addref as the sb_info should be holding one */ -static inline struct obd_export *ll_s2mdexp(struct super_block *sb) -{ - return ll_s2sbi(sb)->ll_md_exp; -} - -static inline struct client_obd *sbi2mdc(struct ll_sb_info *sbi) -{ - struct obd_device *obd = sbi->ll_md_exp->exp_obd; - - if (!obd) - LBUG(); - return &obd->u.cli; -} - -/* FIXME: replace the name of this with LL_SB to conform to kernel stuff */ -static inline struct ll_sb_info *ll_i2sbi(struct inode *inode) -{ - return ll_s2sbi(inode->i_sb); -} - -static inline struct obd_export *ll_i2dtexp(struct inode *inode) -{ - return ll_s2dtexp(inode->i_sb); -} - -static inline struct obd_export *ll_i2mdexp(struct inode *inode) -{ - return ll_s2mdexp(inode->i_sb); -} - -static inline struct lu_fid *ll_inode2fid(struct inode *inode) -{ - struct lu_fid *fid; - - LASSERT(inode); - fid = &ll_i2info(inode)->lli_fid; - - return fid; -} - -static inline loff_t ll_file_maxbytes(struct inode *inode) -{ - struct cl_object *obj = ll_i2info(inode)->lli_clob; - - if (!obj) - return MAX_LFS_FILESIZE; - - return min_t(loff_t, cl_object_maxbytes(obj), MAX_LFS_FILESIZE); -} - -/* llite/xattr.c */ -extern const struct xattr_handler *ll_xattr_handlers[]; - -#define XATTR_USER_T 1 -#define XATTR_TRUSTED_T 2 -#define XATTR_SECURITY_T 3 -#define XATTR_ACL_ACCESS_T 4 -#define XATTR_ACL_DEFAULT_T 5 -#define XATTR_LUSTRE_T 6 -#define XATTR_OTHER_T 7 - -ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ll_xattr_list(struct inode *inode, const char *name, int type, - void *buffer, size_t size, __u64 valid); -const struct xattr_handler *get_xattr_type(const char *name); - -/** - * Common IO arguments for various VFS I/O interfaces. - */ -int cl_sb_init(struct super_block *sb); -int cl_sb_fini(struct super_block *sb); - -enum ras_update_flags { - LL_RAS_HIT = 0x1, - LL_RAS_MMAP = 0x2 -}; -void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); -void ll_ra_stats_inc(struct inode *inode, enum ra_stat which); - -/* statahead.c */ -#define LL_SA_RPC_MIN 2 -#define LL_SA_RPC_DEF 32 -#define LL_SA_RPC_MAX 8192 - -#define LL_SA_CACHE_BIT 5 -#define LL_SA_CACHE_SIZE (1 << LL_SA_CACHE_BIT) -#define LL_SA_CACHE_MASK (LL_SA_CACHE_SIZE - 1) - -/* per inode struct, for dir only */ -struct ll_statahead_info { - struct dentry *sai_dentry; - atomic_t sai_refcount; /* when access this struct, hold - * refcount - */ - unsigned int sai_max; /* max ahead of lookup */ - __u64 sai_sent; /* stat requests sent count */ - __u64 sai_replied; /* stat requests which received - * reply - */ - __u64 sai_index; /* index of statahead entry */ - __u64 sai_index_wait; /* index of entry which is the - * caller is waiting for - */ - __u64 sai_hit; /* hit count */ - __u64 sai_miss; /* miss count: - * for "ls -al" case, it includes - * hidden dentry miss; - * for "ls -l" case, it does not - * include hidden dentry miss. - * "sai_miss_hidden" is used for - * the later case. 
- */ - unsigned int sai_consecutive_miss; /* consecutive miss */ - unsigned int sai_miss_hidden;/* "ls -al", but first dentry - * is not a hidden one - */ - unsigned int sai_skip_hidden;/* skipped hidden dentry count */ - unsigned int sai_ls_all:1, /* "ls -al", do stat-ahead for - * hidden entries - */ - sai_agl_valid:1,/* AGL is valid for the dir */ - sai_in_readpage:1;/* statahead in readdir() */ - wait_queue_head_t sai_waitq; /* stat-ahead wait queue */ - struct task_struct *sai_task; /* stat-ahead thread */ - struct task_struct *sai_agl_task; /* AGL thread */ - struct list_head sai_interim_entries; /* entries which got async - * stat reply, but not - * instantiated - */ - struct list_head sai_entries; /* completed entries */ - struct list_head sai_agls; /* AGLs to be sent */ - struct list_head sai_cache[LL_SA_CACHE_SIZE]; - spinlock_t sai_cache_lock[LL_SA_CACHE_SIZE]; - atomic_t sai_cache_count; /* entry count in cache */ -}; - -int ll_statahead(struct inode *dir, struct dentry **dentry, bool unplug); -void ll_authorize_statahead(struct inode *dir, void *key); -void ll_deauthorize_statahead(struct inode *dir, void *key); - -blkcnt_t dirty_cnt(struct inode *inode); - -int cl_glimpse_size0(struct inode *inode, int agl); -int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, - struct inode *inode, struct cl_object *clob, int agl); - -static inline int cl_glimpse_size(struct inode *inode) -{ - return cl_glimpse_size0(inode, 0); -} - -static inline int cl_agl(struct inode *inode) -{ - return cl_glimpse_size0(inode, 1); -} - -static inline int ll_glimpse_size(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - down_read(&lli->lli_glimpse_sem); - rc = cl_glimpse_size(inode); - lli->lli_glimpse_time = jiffies; - up_read(&lli->lli_glimpse_sem); - return rc; -} - -/* - * dentry may statahead when statahead is enabled and current process has opened - * parent directory, and this dentry hasn't accessed statahead cache before - */ -static inline bool -dentry_may_statahead(struct inode *dir, struct dentry *dentry) -{ - struct ll_inode_info *lli; - struct ll_dentry_data *ldd; - - if (ll_i2sbi(dir)->ll_sa_max == 0) - return false; - - lli = ll_i2info(dir); - - /* - * statahead is not allowed for this dir, there may be three causes: - * 1. dir is not opened. - * 2. statahead hit ratio is too low. - * 3. previous stat started statahead thread failed. - */ - if (!lli->lli_sa_enabled) - return false; - - /* not the same process, don't statahead */ - if (lli->lli_opendir_pid != current->pid) - return false; - - /* - * When stating a dentry, kernel may trigger 'revalidate' or 'lookup' - * multiple times, eg. for 'getattr', 'getxattr' and etc. - * For patchless client, lookup intent is not accurate, which may - * misguide statahead. For example: - * The 'revalidate' call for 'getattr' and 'getxattr' of a dentry will - * have the same intent -- IT_GETATTR, while one dentry should access - * statahead cache once, otherwise statahead windows is messed up. - * The solution is as following: - * Assign 'lld_sa_generation' with 'lli_sa_generation' when a dentry - * IT_GETATTR for the first time, and subsequent IT_GETATTR will - * bypass interacting with statahead cache by checking - * 'lld_sa_generation == lli->lli_sa_generation'. 
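-	 *
-	 * (Editorial worked example, not in the original source: a stat
-	 * of a fresh dentry under "ls -l" triggers IT_GETATTR
-	 * revalidation more than once; the first pass consults the
-	 * statahead cache and stamps lld_sa_generation =
-	 * lli->lli_sa_generation, so the check below makes every later
-	 * pass return false and leaves the statahead window alone.)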
- */
-	ldd = ll_d2d(dentry);
-	if (ldd->lld_sa_generation == lli->lli_sa_generation)
-		return false;
-
-	return true;
-}
-
-/* llite ioctl register support routine */
-enum llioc_iter {
-	LLIOC_CONT = 0,
-	LLIOC_STOP
-};
-
-#define LLIOC_MAX_CMD	256
-
-/*
- * Rules for writing a callback function:
- *
- * Parameters:
- * @magic: The dynamic ioctl call routine will feed this value with the
- *	pointer returned by ll_iocontrol_register. Callback functions
- *	should use this data to check for a potential collision of ioctl
- *	cmds. If a collision is found, the callback function should
- *	return LLIOC_CONT.
- * @rcp: The result of the ioctl command.
- *
- * Return values:
- *	If @magic matches the pointer returned by ll_iocontrol_register,
- *	the callback should return LLIOC_STOP; otherwise it should
- *	return LLIOC_CONT.
- */
-typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
-		struct file *file, unsigned int cmd, unsigned long arg,
-		void *magic, int *rcp);
-
-/* export functions */
-/* Register an ioctl block dynamically for a regular file.
- *
- * @cmd: the array of ioctl command set
- * @count: number of commands in the @cmd
- * @cb: callback function, it will be called if an ioctl command is
- *      found to belong to the command list @cmd.
- *
- * Return value:
- *      A magic pointer will be returned on success;
- *      otherwise, NULL will be returned.
- */
-void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
-void ll_iocontrol_unregister(void *magic);
-
-int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
-		       enum cl_fsync_mode mode, int ignore_layout);
-
-/** direct write pages */
-struct ll_dio_pages {
-	/** page array to be written. We don't support
-	 * partial pages except the last one.
-	 */
-	struct page **ldp_pages;
-	/* offset of each page */
-	loff_t *ldp_offsets;
-	/** if ldp_offsets is NULL, it means a sequential
-	 * run of pages to be written, and this is the file
-	 * offset of the first page.
-	 */
-	loff_t ldp_start_offset;
-	/** how many bytes are to be written. */
-	size_t ldp_size;
-	/** # of pages in the array. */
-	int ldp_nr;
-};
-
-ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io,
-			   int rw, struct inode *inode,
-			   struct ll_dio_pages *pv);
-
-static inline int ll_file_nolock(const struct file *file)
-{
-	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-	struct inode *inode = file_inode(file);
-
-	return ((fd->fd_flags & LL_FILE_IGNORE_LOCK) ||
-		(ll_i2sbi(inode)->ll_flags & LL_SBI_NOLCK));
-}
-
-static inline void ll_set_lock_data(struct obd_export *exp, struct inode *inode,
-				    struct lookup_intent *it, __u64 *bits)
-{
-	if (!it->it_lock_set) {
-		struct lustre_handle handle;
-
-		/* If this inode is a remote object, it will get two
-		 * separate locks in different namespaces: the master MDT,
-		 * where the name entry is, will grant a LOOKUP lock, and
-		 * the remote MDT, where the object is, will grant an
-		 * UPDATE|PERM lock.
The inode will be attached to both - * LOOKUP and PERM locks, so revoking either locks will - * case the dcache being cleared - */ - if (it->it_remote_lock_mode) { - handle.cookie = it->it_remote_lock_handle; - CDEBUG(D_DLMTRACE, "setting l_data to inode " DFID "%p for remote lock %#llx\n", - PFID(ll_inode2fid(inode)), inode, - handle.cookie); - md_set_lock_data(exp, &handle, inode, NULL); - } - - handle.cookie = it->it_lock_handle; - - CDEBUG(D_DLMTRACE, - "setting l_data to inode " DFID "%p for lock %#llx\n", - PFID(ll_inode2fid(inode)), inode, handle.cookie); - - md_set_lock_data(exp, &handle, inode, &it->it_lock_bits); - it->it_lock_set = 1; - } - - if (bits) - *bits = it->it_lock_bits; -} - -static inline int d_lustre_invalid(const struct dentry *dentry) -{ - return ll_d2d(dentry)->lld_invalid; -} - -/* - * Mark dentry INVALID, if dentry refcount is zero (this is normally case for - * ll_md_blocking_ast), unhash this dentry, and let dcache to reclaim it later; - * else dput() of the last refcount will unhash this dentry and kill it. - */ -static inline void d_lustre_invalidate(struct dentry *dentry, int nested) -{ - CDEBUG(D_DENTRY, - "invalidate dentry %pd (%p) parent %p inode %p refc %d\n", - dentry, dentry, - dentry->d_parent, d_inode(dentry), d_count(dentry)); - - spin_lock_nested(&dentry->d_lock, - nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL); - ll_d2d(dentry)->lld_invalid = 1; - if (d_count(dentry) == 0) - __d_drop(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void d_lustre_revalidate(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - LASSERT(ll_d2d(dentry)); - ll_d2d(dentry)->lld_invalid = 0; - spin_unlock(&dentry->d_lock); -} - -int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf); -int ll_layout_refresh(struct inode *inode, __u32 *gen); -int ll_layout_restore(struct inode *inode, loff_t start, __u64 length); - -int ll_xattr_init(void); -void ll_xattr_fini(void); - -int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, enum cl_req_type crt); - -int ll_getparent(struct file *file, struct getparent __user *arg); - -/* lcommon_cl.c */ -int cl_setattr_ost(struct cl_object *obj, const struct iattr *attr, - unsigned int attr_flags); - -extern struct lu_env *cl_inode_fini_env; -extern u16 cl_inode_fini_refcheck; - -int cl_file_inode_init(struct inode *inode, struct lustre_md *md); -void cl_inode_fini(struct inode *inode); - -__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); -__u32 cl_fid_build_gen(const struct lu_fid *fid); - -#endif /* LLITE_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c deleted file mode 100644 index 36066c839160c4c45c45cf29cb50a5e9e11c255b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ /dev/null @@ -1,2668 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/llite/llite_lib.c - * - * Lustre Light Super operations - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "llite_internal.h" - -struct kmem_cache *ll_file_data_slab; -struct dentry *llite_root; -struct kset *llite_kset; - -#ifndef log2 -#define log2(n) ffz(~(n)) -#endif - -static struct ll_sb_info *ll_init_sbi(struct super_block *sb) -{ - struct ll_sb_info *sbi = NULL; - unsigned long pages; - unsigned long lru_page_max; - struct sysinfo si; - class_uuid_t uuid; - int i; - - sbi = kzalloc(sizeof(*sbi), GFP_NOFS); - if (!sbi) - return NULL; - - spin_lock_init(&sbi->ll_lock); - mutex_init(&sbi->ll_lco.lco_lock); - spin_lock_init(&sbi->ll_pp_extent_lock); - spin_lock_init(&sbi->ll_process_lock); - sbi->ll_rw_stats_on = 0; - - si_meminfo(&si); - pages = si.totalram - si.totalhigh; - lru_page_max = pages / 2; - - sbi->ll_cache = cl_cache_init(lru_page_max); - if (!sbi->ll_cache) { - kfree(sbi); - return NULL; - } - - sbi->ll_ra_info.ra_max_pages_per_file = min(pages / 32, - SBI_DEFAULT_READAHEAD_MAX); - sbi->ll_ra_info.ra_max_pages = sbi->ll_ra_info.ra_max_pages_per_file; - sbi->ll_ra_info.ra_max_read_ahead_whole_pages = - SBI_DEFAULT_READAHEAD_WHOLE_MAX; - - ll_generate_random_uuid(uuid); - class_uuid_unparse(uuid, &sbi->ll_sb_uuid); - CDEBUG(D_CONFIG, "generated uuid: %s\n", sbi->ll_sb_uuid.uuid); - - sbi->ll_flags |= LL_SBI_VERBOSE; - sbi->ll_flags |= LL_SBI_CHECKSUM; - - sbi->ll_flags |= LL_SBI_LRU_RESIZE; - sbi->ll_flags |= LL_SBI_LAZYSTATFS; - - for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { - spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. - pp_r_hist.oh_lock); - spin_lock_init(&sbi->ll_rw_extents_info.pp_extents[i]. 
- pp_w_hist.oh_lock); - } - - /* metadata statahead is enabled by default */ - sbi->ll_sa_max = LL_SA_RPC_DEF; - atomic_set(&sbi->ll_sa_total, 0); - atomic_set(&sbi->ll_sa_wrong, 0); - atomic_set(&sbi->ll_sa_running, 0); - atomic_set(&sbi->ll_agl_total, 0); - sbi->ll_flags |= LL_SBI_AGL_ENABLED; - - /* root squash */ - sbi->ll_squash.rsi_uid = 0; - sbi->ll_squash.rsi_gid = 0; - INIT_LIST_HEAD(&sbi->ll_squash.rsi_nosquash_nids); - init_rwsem(&sbi->ll_squash.rsi_sem); - - sbi->ll_sb = sb; - - return sbi; -} - -static void ll_free_sbi(struct super_block *sb) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - - if (sbi->ll_cache) { - if (!list_empty(&sbi->ll_squash.rsi_nosquash_nids)) - cfs_free_nidlist(&sbi->ll_squash.rsi_nosquash_nids); - cl_cache_decref(sbi->ll_cache); - sbi->ll_cache = NULL; - } - - kfree(sbi); -} - -static int client_common_fill_super(struct super_block *sb, char *md, char *dt) -{ - struct inode *root = NULL; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct obd_statfs *osfs = NULL; - struct ptlrpc_request *request = NULL; - struct obd_connect_data *data = NULL; - struct obd_uuid *uuid; - struct md_op_data *op_data; - struct lustre_md lmd; - u64 valid; - int size, err, checksum; - - obd = class_name2obd(md); - if (!obd) { - CERROR("MD %s: not setup or attached\n", md); - return -EINVAL; - } - - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) - return -ENOMEM; - - osfs = kzalloc(sizeof(*osfs), GFP_NOFS); - if (!osfs) { - kfree(data); - return -ENOMEM; - } - - /* indicate the features supported by this client */ - data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | - OBD_CONNECT_ATTRFID | - OBD_CONNECT_VERSION | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_AT | OBD_CONNECT_LOV_V3 | - OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | - OBD_CONNECT_64BITHASH | - OBD_CONNECT_EINPROGRESS | - OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LAYOUTLOCK | - OBD_CONNECT_PINGLESS | - OBD_CONNECT_MAX_EASIZE | - OBD_CONNECT_FLOCK_DEAD | - OBD_CONNECT_DISP_STRIPE | OBD_CONNECT_LFSCK | - OBD_CONNECT_OPEN_BY_FID | - OBD_CONNECT_DIR_STRIPE | - OBD_CONNECT_BULK_MBITS; - - if (sbi->ll_flags & LL_SBI_LRU_RESIZE) - data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; -#ifdef CONFIG_FS_POSIX_ACL - data->ocd_connect_flags |= OBD_CONNECT_ACL | OBD_CONNECT_UMASK | - OBD_CONNECT_LARGE_ACL; -#endif - - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_LIGHTWEIGHT)) - /* flag mdc connection as lightweight, only used for test - * purpose, use with care - */ - data->ocd_connect_flags |= OBD_CONNECT_LIGHTWEIGHT; - - data->ocd_ibits_known = MDS_INODELOCK_FULL; - data->ocd_version = LUSTRE_VERSION_CODE; - - if (sb_rdonly(sb)) - data->ocd_connect_flags |= OBD_CONNECT_RDONLY; - if (sbi->ll_flags & LL_SBI_USER_XATTR) - data->ocd_connect_flags |= OBD_CONNECT_XATTR; - - if (sbi->ll_flags & LL_SBI_FLOCK) - sbi->ll_fop = &ll_file_operations_flock; - else if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - sbi->ll_fop = &ll_file_operations; - else - sbi->ll_fop = &ll_file_operations_noflock; - - /* always ping even if server suppress_pings */ - if (sbi->ll_flags & LL_SBI_ALWAYS_PING) - data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; - - data->ocd_brw_size = MD_MAX_BRW_SIZE; - - err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, - data, NULL); - if (err == -EBUSY) { - LCONSOLE_ERROR_MSG(0x14f, - "An MDT (md %s) is performing recovery, of which this client is not a part. 
Please wait for recovery to complete, abort, or time out.\n", - md); - goto out; - } - - if (err) { - CERROR("cannot connect to %s: rc = %d\n", md, err); - goto out; - } - - sbi->ll_md_exp->exp_connect_data = *data; - - err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, - LUSTRE_SEQ_METADATA); - if (err) { - CERROR("%s: Can't init metadata layer FID infrastructure, rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, err); - goto out_md; - } - - /* For mount, we only need fs info from MDT0, and also in DNE, it - * can make sure the client can be mounted as long as MDT0 is - * available - */ - err = obd_statfs(NULL, sbi->ll_md_exp, osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_FOR_MDT0); - if (err) - goto out_md_fid; - - /* This needs to be after statfs to ensure connect has finished. - * Note that "data" does NOT contain the valid connect reply. - * If connecting to a 1.8 server there will be no LMV device, so - * we can access the MDC export directly and exp_connect_flags will - * be non-zero, but if accessing an upgraded 2.1 server it will - * have the correct flags filled in. - * XXX: fill in the LMV exp_connect_flags from MDC(s). - */ - valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; - if (exp_connect_flags(sbi->ll_md_exp) != 0 && - valid != CLIENT_CONNECT_MDT_REQD) { - char *buf; - - buf = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!buf) { - err = -ENOMEM; - goto out_md_fid; - } - obd_connect_flags2str(buf, PAGE_SIZE, - valid ^ CLIENT_CONNECT_MDT_REQD, ","); - LCONSOLE_ERROR_MSG(0x170, - "Server %s does not support feature(s) needed for correct operation of this client (%s). Please upgrade server or downgrade client.\n", - sbi->ll_md_exp->exp_obd->obd_name, buf); - kfree(buf); - err = -EPROTO; - goto out_md_fid; - } - - size = sizeof(*data); - err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), - KEY_CONN_DATA, &size, data); - if (err) { - CERROR("%s: Get connect data failed: rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, err); - goto out_md_fid; - } - - LASSERT(osfs->os_bsize); - sb->s_blocksize = osfs->os_bsize; - sb->s_blocksize_bits = log2(osfs->os_bsize); - sb->s_magic = LL_SUPER_MAGIC; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sbi->ll_namelen = osfs->os_namelen; - sbi->ll_mnt.mnt = current->fs->root.mnt; - - if ((sbi->ll_flags & LL_SBI_USER_XATTR) && - !(data->ocd_connect_flags & OBD_CONNECT_XATTR)) { - LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } - - if (data->ocd_connect_flags & OBD_CONNECT_ACL) { - sb->s_flags |= SB_POSIXACL; - sbi->ll_flags |= LL_SBI_ACL; - } else { - LCONSOLE_INFO("client wants to enable acl, but mdt not!\n"); - sb->s_flags &= ~SB_POSIXACL; - sbi->ll_flags &= ~LL_SBI_ACL; - } - - if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) - sbi->ll_flags |= LL_SBI_64BIT_HASH; - - if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) - sbi->ll_md_brw_pages = data->ocd_brw_size >> PAGE_SHIFT; - else - sbi->ll_md_brw_pages = 1; - - if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) - sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; - - if (data->ocd_ibits_known & MDS_INODELOCK_XATTR) { - if (!(data->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE)) { - LCONSOLE_INFO( - "%s: disabling xattr cache due to unknown maximum xattr size.\n", - dt); - } else { - sbi->ll_flags |= LL_SBI_XATTR_CACHE; - sbi->ll_xattr_cache_enabled = 1; - } - } - - obd = class_name2obd(dt); - if (!obd) { - CERROR("DT %s: not setup or attached\n", dt); - err = -ENODEV; 
- goto out_md_fid; - } - - data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | - OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | - OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| - OBD_CONNECT_AT | OBD_CONNECT_OSS_CAPA | - OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | - OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | - OBD_CONNECT_EINPROGRESS | - OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_LAYOUTLOCK | - OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | - OBD_CONNECT_BULK_MBITS; - - if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { - /* OBD_CONNECT_CKSUM should always be set, even if checksums are - * disabled by default, because it can still be enabled on the - * fly via /sys. As a consequence, we still need to come to an - * agreement on the supported algorithms at connect time - */ - data->ocd_connect_flags |= OBD_CONNECT_CKSUM; - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CKSUM_ADLER_ONLY)) - data->ocd_cksum_types = OBD_CKSUM_ADLER; - else - data->ocd_cksum_types = cksum_types_supported_client(); - } - - data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; - - /* always ping even if server suppress_pings */ - if (sbi->ll_flags & LL_SBI_ALWAYS_PING) - data->ocd_connect_flags &= ~OBD_CONNECT_PINGLESS; - - CDEBUG(D_RPCTRACE, - "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d\n", - data->ocd_connect_flags, - data->ocd_version, data->ocd_grant); - - obd->obd_upcall.onu_owner = &sbi->ll_lco; - obd->obd_upcall.onu_upcall = cl_ocd_update; - - data->ocd_brw_size = DT_MAX_BRW_SIZE; - - err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, - NULL); - if (err == -EBUSY) { - LCONSOLE_ERROR_MSG(0x150, - "An OST (dt %s) is performing recovery, of which this client is not a part. Please wait for recovery to complete, abort, or time out.\n", - dt); - goto out_md_fid; - } else if (err) { - CERROR("%s: Cannot connect to %s: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, dt, err); - goto out_md_fid; - } - - sbi->ll_dt_exp->exp_connect_data = *data; - - err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, - LUSTRE_SEQ_METADATA); - if (err) { - CERROR("%s: Can't init data layer FID infrastructure, rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - goto out_dt; - } - - mutex_lock(&sbi->ll_lco.lco_lock); - sbi->ll_lco.lco_flags = data->ocd_connect_flags; - sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; - sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; - mutex_unlock(&sbi->ll_lco.lco_lock); - - fid_zero(&sbi->ll_root_fid); - err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid); - if (err) { - CERROR("cannot mds_connect: rc = %d\n", err); - goto out_lock_cn_cb; - } - if (!fid_is_sane(&sbi->ll_root_fid)) { - CERROR("%s: Invalid root fid " DFID " during mount\n", - sbi->ll_md_exp->exp_obd->obd_name, - PFID(&sbi->ll_root_fid)); - err = -EINVAL; - goto out_lock_cn_cb; - } - CDEBUG(D_SUPER, "rootfid " DFID "\n", PFID(&sbi->ll_root_fid)); - - sb->s_op = &lustre_super_operations; - sb->s_xattr = ll_xattr_handlers; -#if THREAD_SIZE >= 8192 /*b=17630*/ - sb->s_export_op = &lustre_export_operations; -#endif - - /* make root inode - * XXX: move this to after cbd setup? 
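-	 *
-	 * (Editorial sketch, not in the original source: the root inode
-	 * is instantiated from a plain md_getattr RPC keyed by
-	 * ll_root_fid; the "valid" mask below requests the basic
-	 * attributes, plus the ACL when LL_SBI_ACL is set, and the reply
-	 * is turned into an inode by md_get_lustre_md() + ll_iget().)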
- */ - valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMODEASIZE; - if (sbi->ll_flags & LL_SBI_ACL) - valid |= OBD_MD_FLACL; - - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) { - err = -ENOMEM; - goto out_lock_cn_cb; - } - - op_data->op_fid1 = sbi->ll_root_fid; - op_data->op_mode = 0; - op_data->op_valid = valid; - - err = md_getattr(sbi->ll_md_exp, op_data, &request); - kfree(op_data); - if (err) { - CERROR("%s: md_getattr failed for root: rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, err); - goto out_lock_cn_cb; - } - - err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, - sbi->ll_md_exp, &lmd); - if (err) { - CERROR("failed to understand root inode md: rc = %d\n", err); - ptlrpc_req_finished(request); - goto out_lock_cn_cb; - } - - LASSERT(fid_is_sane(&sbi->ll_root_fid)); - root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, - sbi->ll_flags & LL_SBI_32BIT_API), - &lmd); - md_free_lustre_md(sbi->ll_md_exp, &lmd); - ptlrpc_req_finished(request); - - if (IS_ERR(root)) { -#ifdef CONFIG_FS_POSIX_ACL - if (lmd.posix_acl) { - posix_acl_release(lmd.posix_acl); - lmd.posix_acl = NULL; - } -#endif - err = -EBADF; - CERROR("lustre_lite: bad iget4 for root\n"); - goto out_root; - } - - checksum = sbi->ll_flags & LL_SBI_CHECKSUM; - err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(checksum), &checksum, - NULL); - if (err) { - CERROR("%s: Set checksum failed: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - goto out_root; - } - cl_sb_init(sb); - - err = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CACHE_SET), - KEY_CACHE_SET, sizeof(*sbi->ll_cache), - sbi->ll_cache, NULL); - if (err) { - CERROR("%s: Set cache_set failed: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, err); - goto out_root; - } - - sb->s_root = d_make_root(root); - if (!sb->s_root) { - CERROR("%s: can't make root dentry\n", - ll_get_fsname(sb, NULL, 0)); - err = -ENOMEM; - goto out_lock_cn_cb; - } - - sbi->ll_sdev_orig = sb->s_dev; - - /* We set sb->s_dev equal on all lustre clients in order to support - * NFS export clustering. NFSD requires that the FSID be the same - * on all clients. - */ - /* s_dev is also used in lt_compare() to compare two fs, but that is - * only a node-local comparison. 
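-	 *
-	 * (Editorial illustration, not in the original source, assuming
-	 * obd_get_uuid() returns the server-side target uuid here: every
-	 * client then computes the same s_dev via get_uuid2int() below,
-	 * giving NFS re-exports from different clients one stable fsid.)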
- */ - uuid = obd_get_uuid(sbi->ll_md_exp); - if (uuid) { - sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid)); - get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), &sbi->ll_fsid); - } - - kfree(data); - kfree(osfs); - - if (llite_root) { - err = ldebugfs_register_mountpoint(llite_root, sb, dt, md); - if (err < 0) { - CERROR("%s: could not register mount in debugfs: " - "rc = %d\n", ll_get_fsname(sb, NULL, 0), err); - err = 0; - } - } - - return err; -out_root: - iput(root); -out_lock_cn_cb: - obd_fid_fini(sbi->ll_dt_exp->exp_obd); -out_dt: - obd_disconnect(sbi->ll_dt_exp); - sbi->ll_dt_exp = NULL; -out_md_fid: - obd_fid_fini(sbi->ll_md_exp->exp_obd); -out_md: - obd_disconnect(sbi->ll_md_exp); - sbi->ll_md_exp = NULL; -out: - kfree(data); - kfree(osfs); - return err; -} - -int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) -{ - int size, rc; - - size = sizeof(*lmmsize); - rc = obd_get_info(NULL, sbi->ll_dt_exp, sizeof(KEY_MAX_EASIZE), - KEY_MAX_EASIZE, &size, lmmsize); - if (rc) { - CERROR("%s: cannot get max LOV EA size: rc = %d\n", - sbi->ll_dt_exp->exp_obd->obd_name, rc); - return rc; - } - - size = sizeof(int); - rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_MAX_EASIZE), - KEY_MAX_EASIZE, &size, lmmsize); - if (rc) - CERROR("Get max mdsize error rc %d\n", rc); - - return rc; -} - -/** - * Get the value of the default_easize parameter. - * - * \see client_obd::cl_default_mds_easize - * - * \param[in] sbi superblock info for this filesystem - * \param[out] lmmsize pointer to storage location for value - * - * \retval 0 on success - * \retval negative negated errno on failure - */ -int ll_get_default_mdsize(struct ll_sb_info *sbi, int *lmmsize) -{ - int size, rc; - - size = sizeof(int); - rc = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_DEFAULT_EASIZE), - KEY_DEFAULT_EASIZE, &size, lmmsize); - if (rc) - CERROR("Get default mdsize error rc %d\n", rc); - - return rc; -} - -/** - * Set the default_easize parameter to the given value. 
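- *
- * (Editorial example, not in the original source: the value must lie
- * in [sizeof(struct lov_mds_md), OBD_MAX_DEFAULT_EA_SIZE]; anything
- * outside that range is rejected below with -EINVAL.)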
- * - * \see client_obd::cl_default_mds_easize - * - * \param[in] sbi superblock info for this filesystem - * \param[in] lmmsize the size to set - * - * \retval 0 on success - * \retval negative negated errno on failure - */ -int ll_set_default_mdsize(struct ll_sb_info *sbi, int lmmsize) -{ - if (lmmsize < sizeof(struct lov_mds_md) || - lmmsize > OBD_MAX_DEFAULT_EA_SIZE) - return -EINVAL; - - return obd_set_info_async(NULL, sbi->ll_md_exp, - sizeof(KEY_DEFAULT_EASIZE), - KEY_DEFAULT_EASIZE, - sizeof(int), &lmmsize, NULL); -} - -static void client_common_put_super(struct super_block *sb) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - - cl_sb_fini(sb); - - obd_fid_fini(sbi->ll_dt_exp->exp_obd); - obd_disconnect(sbi->ll_dt_exp); - sbi->ll_dt_exp = NULL; - - ldebugfs_unregister_mountpoint(sbi); - - obd_fid_fini(sbi->ll_md_exp->exp_obd); - obd_disconnect(sbi->ll_md_exp); - sbi->ll_md_exp = NULL; -} - -void ll_kill_super(struct super_block *sb) -{ - struct ll_sb_info *sbi; - - /* not init sb ?*/ - if (!(sb->s_flags & SB_ACTIVE)) - return; - - sbi = ll_s2sbi(sb); - /* we need to restore s_dev from changed for clustered NFS before - * put_super because new kernels have cached s_dev and change sb->s_dev - * in put_super not affected real removing devices - */ - if (sbi) { - sb->s_dev = sbi->ll_sdev_orig; - sbi->ll_umounting = 1; - - /* wait running statahead threads to quit */ - while (atomic_read(&sbi->ll_sa_running) > 0) { - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC >> 3)); - } - } -} - -static inline int ll_set_opt(const char *opt, char *data, int fl) -{ - if (strncmp(opt, data, strlen(opt)) != 0) - return 0; - else - return fl; -} - -/* non-client-specific mount options are parsed in lmd_parse */ -static int ll_options(char *options, int *flags) -{ - int tmp; - char *s1 = options, *s2; - - if (!options) - return 0; - - CDEBUG(D_CONFIG, "Parsing opts %s\n", options); - - while (*s1) { - CDEBUG(D_SUPER, "next opt=%s\n", s1); - tmp = ll_set_opt("nolock", s1, LL_SBI_NOLCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("flock", s1, LL_SBI_FLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("localflock", s1, LL_SBI_LOCALFLOCK); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("noflock", s1, - LL_SBI_FLOCK | LL_SBI_LOCALFLOCK); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("user_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_xattr", s1, LL_SBI_USER_XATTR); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("context", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("fscontext", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("defcontext", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("rootcontext", s1, 1); - if (tmp) - goto next; - tmp = ll_set_opt("user_fid2path", s1, LL_SBI_USER_FID2PATH); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nouser_fid2path", s1, LL_SBI_USER_FID2PATH); - if (tmp) { - *flags &= ~tmp; - goto next; - } - - tmp = ll_set_opt("checksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nochecksum", s1, LL_SBI_CHECKSUM); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("lruresize", s1, LL_SBI_LRU_RESIZE); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nolruresize", s1, LL_SBI_LRU_RESIZE); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("lazystatfs", s1, 
LL_SBI_LAZYSTATFS); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("nolazystatfs", s1, LL_SBI_LAZYSTATFS); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("32bitapi", s1, LL_SBI_32BIT_API); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("verbose", s1, LL_SBI_VERBOSE); - if (tmp) { - *flags |= tmp; - goto next; - } - tmp = ll_set_opt("noverbose", s1, LL_SBI_VERBOSE); - if (tmp) { - *flags &= ~tmp; - goto next; - } - tmp = ll_set_opt("always_ping", s1, LL_SBI_ALWAYS_PING); - if (tmp) { - *flags |= tmp; - goto next; - } - LCONSOLE_ERROR_MSG(0x152, "Unknown option '%s', won't mount.\n", - s1); - return -EINVAL; - -next: - /* Find next opt */ - s2 = strchr(s1, ','); - if (!s2) - break; - s1 = s2 + 1; - } - return 0; -} - -void ll_lli_init(struct ll_inode_info *lli) -{ - lli->lli_inode_magic = LLI_INODE_MAGIC; - lli->lli_flags = 0; - spin_lock_init(&lli->lli_lock); - lli->lli_posix_acl = NULL; - /* Do not set lli_fid, it has been initialized already. */ - fid_zero(&lli->lli_pfid); - lli->lli_mds_read_och = NULL; - lli->lli_mds_write_och = NULL; - lli->lli_mds_exec_och = NULL; - lli->lli_open_fd_read_count = 0; - lli->lli_open_fd_write_count = 0; - lli->lli_open_fd_exec_count = 0; - mutex_init(&lli->lli_och_mutex); - spin_lock_init(&lli->lli_agl_lock); - spin_lock_init(&lli->lli_layout_lock); - ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); - lli->lli_clob = NULL; - - init_rwsem(&lli->lli_xattrs_list_rwsem); - mutex_init(&lli->lli_xattrs_enq_lock); - - LASSERT(lli->lli_vfs_inode.i_mode != 0); - if (S_ISDIR(lli->lli_vfs_inode.i_mode)) { - mutex_init(&lli->lli_readdir_mutex); - lli->lli_opendir_key = NULL; - lli->lli_sai = NULL; - spin_lock_init(&lli->lli_sa_lock); - lli->lli_opendir_pid = 0; - lli->lli_sa_enabled = 0; - lli->lli_def_stripe_offset = -1; - } else { - mutex_init(&lli->lli_size_mutex); - lli->lli_symlink_name = NULL; - init_rwsem(&lli->lli_trunc_sem); - range_lock_tree_init(&lli->lli_write_tree); - init_rwsem(&lli->lli_glimpse_sem); - lli->lli_glimpse_time = 0; - INIT_LIST_HEAD(&lli->lli_agl_list); - lli->lli_agl_index = 0; - lli->lli_async_rc = 0; - } - mutex_init(&lli->lli_layout_mutex); -} - -int ll_fill_super(struct super_block *sb) -{ - struct lustre_profile *lprof = NULL; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi; - char *dt = NULL, *md = NULL; - char *profilenm = get_profile_name(sb); - struct config_llog_instance *cfg; - int err; - static atomic_t ll_bdi_num = ATOMIC_INIT(0); - - CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); - - err = ptlrpc_inc_ref(); - if (err) - return err; - - cfg = kzalloc(sizeof(*cfg), GFP_NOFS); - if (!cfg) { - err = -ENOMEM; - goto out_put; - } - - try_module_get(THIS_MODULE); - - /* client additional sb info */ - sbi = ll_init_sbi(sb); - lsi->lsi_llsbi = sbi; - if (!sbi) { - module_put(THIS_MODULE); - kfree(cfg); - err = -ENOMEM; - goto out_put; - } - - err = ll_options(lsi->lsi_lmd->lmd_opts, &sbi->ll_flags); - if (err) - goto out_free; - - err = super_setup_bdi_name(sb, "lustre-%d", - atomic_inc_return(&ll_bdi_num)); - if (err) - goto out_free; - - /* kernel >= 2.6.38 store dentry operations in sb->s_d_op. */ - sb->s_d_op = &ll_d_ops; - - /* Generate a string unique to this super, in case some joker tries - * to mount the same fs at two mount points. - * Use the address of the super itself. 
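-	 *
-	 * (Editorial illustration with a made-up pointer value, not in
-	 * the original source: with cfg_instance set to the super_block
-	 * address, the kasprintf() calls below produce obd names such as
-	 * "lustre-clilov-ffff8800b9c57000", so two mounts of the same
-	 * profile get distinct mdc/osc device names.)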
- */ - cfg->cfg_instance = sb; - cfg->cfg_uuid = lsi->lsi_llsbi->ll_sb_uuid; - cfg->cfg_callback = class_config_llog_handler; - /* set up client obds */ - err = lustre_process_log(sb, profilenm, cfg); - if (err < 0) - goto out_free; - - /* Profile set with LCFG_MOUNTOPT so we can find our mdc and osc obds */ - lprof = class_get_profile(profilenm); - if (!lprof) { - LCONSOLE_ERROR_MSG(0x156, - "The client profile '%s' could not be read from the MGS. Does that filesystem exist?\n", - profilenm); - err = -EINVAL; - goto out_free; - } - CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, - lprof->lp_md, lprof->lp_dt); - - dt = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_dt, cfg->cfg_instance); - if (!dt) { - err = -ENOMEM; - goto out_free; - } - - md = kasprintf(GFP_NOFS, "%s-%p", lprof->lp_md, cfg->cfg_instance); - if (!md) { - err = -ENOMEM; - goto out_free; - } - - /* connections, registrations, sb setup */ - err = client_common_fill_super(sb, md, dt); - if (!err) - sbi->ll_client_common_fill_super_succeeded = 1; - -out_free: - kfree(md); - kfree(dt); - if (lprof) - class_put_profile(lprof); - if (err) - ll_put_super(sb); - else if (sbi->ll_flags & LL_SBI_VERBOSE) - LCONSOLE_WARN("Mounted %s\n", profilenm); - - kfree(cfg); -out_put: - if (err) - ptlrpc_dec_ref(); - return err; -} /* ll_fill_super */ - -void ll_put_super(struct super_block *sb) -{ - struct config_llog_instance cfg, params_cfg; - struct obd_device *obd; - struct lustre_sb_info *lsi = s2lsi(sb); - struct ll_sb_info *sbi = ll_s2sbi(sb); - char *profilenm = get_profile_name(sb); - int next, force = 1, rc = 0; - long ccc_count; - - CDEBUG(D_VFSTRACE, "VFS Op: sb %p - %s\n", sb, profilenm); - - cfg.cfg_instance = sb; - lustre_end_log(sb, profilenm, &cfg); - - params_cfg.cfg_instance = sb; - lustre_end_log(sb, PARAMS_FILENAME, ¶ms_cfg); - - if (sbi->ll_md_exp) { - obd = class_exp2obd(sbi->ll_md_exp); - if (obd) - force = obd->obd_force; - } - - /* Wait for unstable pages to be committed to stable storage */ - if (!force) - rc = l_wait_event_abortable(sbi->ll_cache->ccc_unstable_waitq, - !atomic_long_read(&sbi->ll_cache->ccc_unstable_nr)); - - ccc_count = atomic_long_read(&sbi->ll_cache->ccc_unstable_nr); - if (!force && rc != -ERESTARTSYS) - LASSERTF(!ccc_count, "count: %li\n", ccc_count); - - /* We need to set force before the lov_disconnect in - * lustre_common_put_super, since l_d cleans up osc's as well. - */ - if (force) { - next = 0; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, - &next)) != NULL) { - obd->obd_force = force; - } - } - - if (sbi->ll_client_common_fill_super_succeeded) { - /* Only if client_common_fill_super succeeded */ - client_common_put_super(sb); - } - - next = 0; - while ((obd = class_devices_in_group(&sbi->ll_sb_uuid, &next))) - class_manual_cleanup(obd); - - if (sbi->ll_flags & LL_SBI_VERBOSE) - LCONSOLE_WARN("Unmounted %s\n", profilenm ? 
profilenm : ""); - - if (profilenm) - class_del_profile(profilenm); - - ll_free_sbi(sb); - lsi->lsi_llsbi = NULL; - - lustre_common_put_super(sb); - - cl_env_cache_purge(~0); - - module_put(THIS_MODULE); - - ptlrpc_dec_ref(); -} /* client_put_super */ - -struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) -{ - struct inode *inode = NULL; - - /* NOTE: we depend on atomic igrab() -bzzz */ - lock_res_and_lock(lock); - if (lock->l_resource->lr_lvb_inode) { - struct ll_inode_info *lli; - - lli = ll_i2info(lock->l_resource->lr_lvb_inode); - if (lli->lli_inode_magic == LLI_INODE_MAGIC) { - inode = igrab(lock->l_resource->lr_lvb_inode); - } else { - inode = lock->l_resource->lr_lvb_inode; - LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : - D_WARNING, lock, - "lr_lvb_inode %p is bogus: magic %08x", - lock->l_resource->lr_lvb_inode, - lli->lli_inode_magic); - inode = NULL; - } - } - unlock_res_and_lock(lock); - return inode; -} - -void ll_dir_clear_lsm_md(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - - LASSERT(S_ISDIR(inode->i_mode)); - - if (lli->lli_lsm_md) { - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - } -} - -static struct inode *ll_iget_anon_dir(struct super_block *sb, - const struct lu_fid *fid, - struct lustre_md *md) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct mdt_body *body = md->body; - struct inode *inode; - ino_t ino; - - ino = cl_fid_build_ino(fid, sbi->ll_flags & LL_SBI_32BIT_API); - inode = iget_locked(sb, ino); - if (!inode) { - CERROR("%s: failed get simple inode " DFID ": rc = -ENOENT\n", - ll_get_fsname(sb, NULL, 0), PFID(fid)); - return ERR_PTR(-ENOENT); - } - - if (inode->i_state & I_NEW) { - struct ll_inode_info *lli = ll_i2info(inode); - struct lmv_stripe_md *lsm = md->lmv; - - inode->i_mode = (inode->i_mode & ~S_IFMT) | - (body->mbo_mode & S_IFMT); - LASSERTF(S_ISDIR(inode->i_mode), "Not slave inode " DFID "\n", - PFID(fid)); - - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - inode->i_rdev = 0; - - inode->i_op = &ll_dir_inode_operations; - inode->i_fop = &ll_dir_operations; - lli->lli_fid = *fid; - ll_lli_init(lli); - - LASSERT(lsm); - /* master object FID */ - lli->lli_pfid = body->mbo_fid1; - CDEBUG(D_INODE, "lli %p slave " DFID " master " DFID "\n", - lli, PFID(fid), PFID(&lli->lli_pfid)); - unlock_new_inode(inode); - } - - return inode; -} - -static int ll_init_lsm_md(struct inode *inode, struct lustre_md *md) -{ - struct lmv_stripe_md *lsm = md->lmv; - struct lu_fid *fid; - int i; - - LASSERT(lsm); - /* - * XXX sigh, this lsm_root initialization should be in - * LMV layer, but it needs ll_iget right now, so we - * put this here right now. - */ - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - fid = &lsm->lsm_md_oinfo[i].lmo_fid; - LASSERT(!lsm->lsm_md_oinfo[i].lmo_root); - /* Unfortunately ll_iget will call ll_update_inode, - * where the initialization of slave inode is slightly - * different, so it reset lsm_md to NULL to avoid - * initializing lsm for slave inode. 
- */ - /* For migrating inode, master stripe and master object will - * be same, so we only need assign this inode - */ - if (lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && !i) - lsm->lsm_md_oinfo[i].lmo_root = inode; - else - lsm->lsm_md_oinfo[i].lmo_root = - ll_iget_anon_dir(inode->i_sb, fid, md); - if (IS_ERR(lsm->lsm_md_oinfo[i].lmo_root)) { - int rc = PTR_ERR(lsm->lsm_md_oinfo[i].lmo_root); - - lsm->lsm_md_oinfo[i].lmo_root = NULL; - return rc; - } - } - - return 0; -} - -static inline int lli_lsm_md_eq(const struct lmv_stripe_md *lsm_md1, - const struct lmv_stripe_md *lsm_md2) -{ - return lsm_md1->lsm_md_magic == lsm_md2->lsm_md_magic && - lsm_md1->lsm_md_stripe_count == lsm_md2->lsm_md_stripe_count && - lsm_md1->lsm_md_master_mdt_index == - lsm_md2->lsm_md_master_mdt_index && - lsm_md1->lsm_md_hash_type == lsm_md2->lsm_md_hash_type && - lsm_md1->lsm_md_layout_version == - lsm_md2->lsm_md_layout_version && - !strcmp(lsm_md1->lsm_md_pool_name, - lsm_md2->lsm_md_pool_name); -} - -static int ll_update_lsm_md(struct inode *inode, struct lustre_md *md) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lmv_stripe_md *lsm = md->lmv; - int rc; - - LASSERT(S_ISDIR(inode->i_mode)); - CDEBUG(D_INODE, "update lsm %p of " DFID "\n", lli->lli_lsm_md, - PFID(ll_inode2fid(inode))); - - /* no striped information from request. */ - if (!lsm) { - if (!lli->lli_lsm_md) { - return 0; - } else if (lli->lli_lsm_md->lsm_md_hash_type & - LMV_HASH_FLAG_MIGRATION) { - /* - * migration is done, the temporay MIGRATE layout has - * been removed - */ - CDEBUG(D_INODE, DFID " finish migration.\n", - PFID(ll_inode2fid(inode))); - lmv_free_memmd(lli->lli_lsm_md); - lli->lli_lsm_md = NULL; - return 0; - } - /* - * The lustre_md from req does not include stripeEA, - * see ll_md_setattr - */ - return 0; - } - - /* set the directory layout */ - if (!lli->lli_lsm_md) { - struct cl_attr *attr; - - rc = ll_init_lsm_md(inode, md); - if (rc) - return rc; - - /* - * set lsm_md to NULL, so the following free lustre_md - * will not free this lsm - */ - md->lmv = NULL; - lli->lli_lsm_md = lsm; - - attr = kzalloc(sizeof(*attr), GFP_NOFS); - if (!attr) - return -ENOMEM; - - /* validate the lsm */ - rc = md_merge_attr(ll_i2mdexp(inode), lsm, attr, - ll_md_blocking_ast); - if (rc) { - kfree(attr); - return rc; - } - - if (md->body->mbo_valid & OBD_MD_FLNLINK) - md->body->mbo_nlink = attr->cat_nlink; - if (md->body->mbo_valid & OBD_MD_FLSIZE) - md->body->mbo_size = attr->cat_size; - if (md->body->mbo_valid & OBD_MD_FLATIME) - md->body->mbo_atime = attr->cat_atime; - if (md->body->mbo_valid & OBD_MD_FLCTIME) - md->body->mbo_ctime = attr->cat_ctime; - if (md->body->mbo_valid & OBD_MD_FLMTIME) - md->body->mbo_mtime = attr->cat_mtime; - - kfree(attr); - - CDEBUG(D_INODE, "Set lsm %p magic %x to " DFID "\n", lsm, - lsm->lsm_md_magic, PFID(ll_inode2fid(inode))); - return 0; - } - - /* Compare the old and new stripe information */ - if (!lsm_md_eq(lli->lli_lsm_md, lsm)) { - struct lmv_stripe_md *old_lsm = lli->lli_lsm_md; - int idx; - - CERROR("%s: inode " DFID "(%p)'s lmv layout mismatch (%p)/(%p) magic:0x%x/0x%x stripe count: %d/%d master_mdt: %d/%d hash_type:0x%x/0x%x layout: 0x%x/0x%x pool:%s/%s\n", - ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), - inode, lsm, old_lsm, - lsm->lsm_md_magic, old_lsm->lsm_md_magic, - lsm->lsm_md_stripe_count, - old_lsm->lsm_md_stripe_count, - lsm->lsm_md_master_mdt_index, - old_lsm->lsm_md_master_mdt_index, - lsm->lsm_md_hash_type, old_lsm->lsm_md_hash_type, - 
lsm->lsm_md_layout_version, - old_lsm->lsm_md_layout_version, - lsm->lsm_md_pool_name, - old_lsm->lsm_md_pool_name); - - for (idx = 0; idx < old_lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in old lsm idx %d, old: " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&old_lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - for (idx = 0; idx < lsm->lsm_md_stripe_count; idx++) { - CERROR("%s: sub FIDs in new lsm idx %d, new: " DFID "\n", - ll_get_fsname(inode->i_sb, NULL, 0), idx, - PFID(&lsm->lsm_md_oinfo[idx].lmo_fid)); - } - - return -EIO; - } - - return 0; -} - -void ll_clear_inode(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - if (S_ISDIR(inode->i_mode)) { - /* these should have been cleared in ll_file_release */ - LASSERT(!lli->lli_opendir_key); - LASSERT(!lli->lli_sai); - LASSERT(lli->lli_opendir_pid == 0); - } - - md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); - - LASSERT(!lli->lli_open_fd_write_count); - LASSERT(!lli->lli_open_fd_read_count); - LASSERT(!lli->lli_open_fd_exec_count); - - if (lli->lli_mds_write_och) - ll_md_real_close(inode, FMODE_WRITE); - if (lli->lli_mds_exec_och) - ll_md_real_close(inode, FMODE_EXEC); - if (lli->lli_mds_read_och) - ll_md_real_close(inode, FMODE_READ); - - if (S_ISLNK(inode->i_mode)) { - kfree(lli->lli_symlink_name); - lli->lli_symlink_name = NULL; - } - - ll_xattr_cache_destroy(inode); - -#ifdef CONFIG_FS_POSIX_ACL - forget_all_cached_acls(inode); - if (lli->lli_posix_acl) { - posix_acl_release(lli->lli_posix_acl); - lli->lli_posix_acl = NULL; - } -#endif - lli->lli_inode_magic = LLI_INODE_DEAD; - - if (S_ISDIR(inode->i_mode)) - ll_dir_clear_lsm_md(inode); - if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) - LASSERT(list_empty(&lli->lli_agl_list)); - - /* - * XXX This has to be done before lsm is freed below, because - * cl_object still uses inode lsm. - */ - cl_inode_fini(inode); -} - -#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) - -static int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data) -{ - struct lustre_md md; - struct inode *inode = d_inode(dentry); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *request = NULL; - int rc, ia_valid; - - op_data = ll_prep_md_op_data(op_data, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &request); - if (rc) { - ptlrpc_req_finished(request); - if (rc == -ENOENT) { - clear_nlink(inode); - /* Unlinked special device node? Or just a race? - * Pretend we did everything. - */ - if (!S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) { - ia_valid = op_data->op_attr.ia_valid; - op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; - rc = simple_setattr(dentry, &op_data->op_attr); - op_data->op_attr.ia_valid = ia_valid; - } - } else if (rc != -EPERM && rc != -EACCES && rc != -ETXTBSY) { - CERROR("md_setattr fails: rc = %d\n", rc); - } - return rc; - } - - rc = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, - sbi->ll_md_exp, &md); - if (rc) { - ptlrpc_req_finished(request); - return rc; - } - - ia_valid = op_data->op_attr.ia_valid; - /* inode size will be in cl_setattr_ost, can't do it now since dirty - * cache is not cleared yet. 
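-	 *
-	 * (Editorial note, not in the original source: ATTR_SIZE is
-	 * therefore masked out of the simple_setattr() call below and
-	 * applied later through cl_setattr_ost(), once the OST-side
-	 * truncate can deal with the still-dirty pages.)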
- */
-	op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE);
-	if (S_ISREG(inode->i_mode))
-		inode_lock(inode);
-	rc = simple_setattr(dentry, &op_data->op_attr);
-	if (S_ISREG(inode->i_mode))
-		inode_unlock(inode);
-	op_data->op_attr.ia_valid = ia_valid;
-
-	rc = ll_update_inode(inode, &md);
-	ptlrpc_req_finished(request);
-
-	return rc;
-}
-
-/* If this inode has objects allocated to it (lsm != NULL), then the OST
- * object(s) determine the file size and mtime. Otherwise, the MDS will
- * keep these values until such a time that objects are allocated for it.
- * We do the MDS operations first, as it is checking permissions for us.
- * We don't do the MDS RPC if there is nothing that we want to store there,
- * otherwise there is no harm in updating mtime/atime on the MDS if we are
- * going to do an RPC anyway.
- *
- * If we are doing a truncate, we will send the mtime and ctime updates
- * to the OST with the punch RPC, otherwise we do an explicit setattr RPC.
- * I don't believe it is possible to get e.g. ATTR_MTIME_SET and ATTR_SIZE
- * at the same time.
- *
- * In case of HSM import, we only set attr on MDS.
- */
-int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import)
-{
-	struct inode *inode = d_inode(dentry);
-	struct ll_inode_info *lli = ll_i2info(inode);
-	struct md_op_data *op_data = NULL;
-	int rc = 0;
-
-	CDEBUG(D_VFSTRACE, "%s: setattr inode " DFID "(%p) from %llu to %llu, valid %x, hsm_import %d\n",
-	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(&lli->lli_fid), inode,
-	       i_size_read(inode), attr->ia_size, attr->ia_valid, hsm_import);
-
-	if (attr->ia_valid & ATTR_SIZE) {
-		/* Check new size against VFS/VM file size limit and rlimit */
-		rc = inode_newsize_ok(inode, attr->ia_size);
-		if (rc)
-			return rc;
-
-		/* The maximum Lustre file size is variable, based on the
-		 * OST maximum object size and number of stripes. This
-		 * needs another check in addition to the VFS check above.
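-		 *
-		 * (Editorial worked example, not in the original source:
-		 * with a 16 TiB maximum OST object size and a stripe
-		 * count of 4, cl_object_maxbytes() yields 64 TiB, so a
-		 * size beyond that fails with -EFBIG below even though
-		 * MAX_LFS_FILESIZE is far larger on 64-bit kernels.)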
- */ - if (attr->ia_size > ll_file_maxbytes(inode)) { - CDEBUG(D_INODE, "file " DFID " too large %llu > %llu\n", - PFID(&lli->lli_fid), attr->ia_size, - ll_file_maxbytes(inode)); - return -EFBIG; - } - - attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; - } - - /* POSIX: check before ATTR_*TIME_SET set (from setattr_prepare) */ - if (attr->ia_valid & TIMES_SET_FLAGS) { - if ((!uid_eq(current_fsuid(), inode->i_uid)) && - !capable(CAP_FOWNER)) - return -EPERM; - } - - /* We mark all of the fields "set" so MDS/OST does not re-set them */ - if (attr->ia_valid & ATTR_CTIME) { - attr->ia_ctime = current_time(inode); - attr->ia_valid |= ATTR_CTIME_SET; - } - if (!(attr->ia_valid & ATTR_ATIME_SET) && - (attr->ia_valid & ATTR_ATIME)) { - attr->ia_atime = current_time(inode); - attr->ia_valid |= ATTR_ATIME_SET; - } - if (!(attr->ia_valid & ATTR_MTIME_SET) && - (attr->ia_valid & ATTR_MTIME)) { - attr->ia_mtime = current_time(inode); - attr->ia_valid |= ATTR_MTIME_SET; - } - - if (attr->ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime %lu, ctime %lu, now = %llu\n", - LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), - (s64)ktime_get_real_seconds()); - - if (S_ISREG(inode->i_mode)) - inode_unlock(inode); - - /* - * We always do an MDS RPC, even if we're only changing the size; - * only the MDS knows whether truncate() should fail with -ETXTBUSY - */ - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - if (!op_data) { - rc = -ENOMEM; - goto out; - } - - if (!hsm_import && attr->ia_valid & ATTR_SIZE) { - /* - * If we are changing file size, file content is - * modified, flag it. - */ - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; - op_data->op_bias |= MDS_DATA_MODIFIED; - clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); - } - - op_data->op_attr = *attr; - - rc = ll_md_setattr(dentry, op_data); - if (rc) - goto out; - - if (!S_ISREG(inode->i_mode) || hsm_import) { - rc = 0; - goto out; - } - - if (attr->ia_valid & (ATTR_SIZE | - ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET)) { - /* For truncate and utimes sending attributes to OSTs, setting - * mtime/atime to the past will be performed under PW [0:EOF] - * extent lock (new_size:EOF for truncate). It may seem - * excessive to send mtime/atime updates to OSTs when not - * setting times to past, but it is necessary due to possible - * time de-synchronization between MDT inode and OST objects - */ - rc = cl_setattr_ost(ll_i2info(inode)->lli_clob, attr, 0); - } - - /* - * If the file was restored, it needs to set dirty flag. - * - * We've already sent MDS_DATA_MODIFIED flag in - * ll_md_setattr() for truncate. However, the MDT refuses to - * set the HS_DIRTY flag on released files, so we have to set - * it again if the file has been restored. Please check how - * LLIF_DATA_MODIFIED is set in vvp_io_setattr_fini(). - * - * Please notice that if the file is not released, the previous - * MDS_DATA_MODIFIED has taken effect and usually - * LLIF_DATA_MODIFIED is not set(see vvp_io_setattr_fini()). - * This way we can save an RPC for common open + trunc - * operation. - */ - if (test_and_clear_bit(LLIF_DATA_MODIFIED, &lli->lli_flags)) { - struct hsm_state_set hss = { - .hss_valid = HSS_SETMASK, - .hss_setmask = HS_DIRTY, - }; - int rc2; - - rc2 = ll_hsm_state_set(inode, &hss); - /* - * truncate and write can happen at the same time, so that - * the file can be set modified even though the file is not - * restored from released state, and ll_hsm_state_set() is - * not applicable for the file, and rc2 < 0 is normal in this - * case. 
- */ - if (rc2 < 0) - CDEBUG(D_INFO, DFID "HSM set dirty failed: rc2 = %d\n", - PFID(ll_inode2fid(inode)), rc2); - } - -out: - if (op_data) - ll_finish_md_op_data(op_data); - - if (S_ISREG(inode->i_mode)) { - inode_lock(inode); - if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) - inode_dio_wait(inode); - } - - ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? - LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); - - return rc; -} - -int ll_setattr(struct dentry *de, struct iattr *attr) -{ - int mode = d_inode(de)->i_mode; - - if ((attr->ia_valid & (ATTR_CTIME | ATTR_SIZE | ATTR_MODE)) == - (ATTR_CTIME | ATTR_SIZE | ATTR_MODE)) - attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; - - if (((attr->ia_valid & (ATTR_MODE | ATTR_FORCE | ATTR_SIZE)) == - (ATTR_SIZE | ATTR_MODE)) && - (((mode & S_ISUID) && !(attr->ia_mode & S_ISUID)) || - (((mode & (S_ISGID | 0010)) == (S_ISGID | 0010)) && - !(attr->ia_mode & S_ISGID)))) - attr->ia_valid |= ATTR_FORCE; - - if ((attr->ia_valid & ATTR_MODE) && - (mode & S_ISUID) && - !(attr->ia_mode & S_ISUID) && - !(attr->ia_valid & ATTR_KILL_SUID)) - attr->ia_valid |= ATTR_KILL_SUID; - - if ((attr->ia_valid & ATTR_MODE) && - ((mode & (S_ISGID | 0010)) == (S_ISGID | 0010)) && - !(attr->ia_mode & S_ISGID) && - !(attr->ia_valid & ATTR_KILL_SGID)) - attr->ia_valid |= ATTR_KILL_SGID; - - return ll_setattr_raw(de, attr, false); -} - -int ll_statfs_internal(struct super_block *sb, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_statfs obd_osfs; - int rc; - - rc = obd_statfs(NULL, sbi->ll_md_exp, osfs, max_age, flags); - if (rc) { - CERROR("md_statfs fails: rc = %d\n", rc); - return rc; - } - - osfs->os_type = sb->s_magic; - - CDEBUG(D_SUPER, "MDC blocks %llu/%llu objects %llu/%llu\n", - osfs->os_bavail, osfs->os_blocks, osfs->os_ffree, - osfs->os_files); - - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - flags |= OBD_STATFS_NODELAY; - - rc = obd_statfs_rqset(sbi->ll_dt_exp, &obd_osfs, max_age, flags); - if (rc) { - CERROR("obd_statfs fails: rc = %d\n", rc); - return rc; - } - - CDEBUG(D_SUPER, "OSC blocks %llu/%llu objects %llu/%llu\n", - obd_osfs.os_bavail, obd_osfs.os_blocks, obd_osfs.os_ffree, - obd_osfs.os_files); - - osfs->os_bsize = obd_osfs.os_bsize; - osfs->os_blocks = obd_osfs.os_blocks; - osfs->os_bfree = obd_osfs.os_bfree; - osfs->os_bavail = obd_osfs.os_bavail; - - /* If we don't have as many objects free on the OST as inodes - * on the MDS, we reduce the total number of inodes to - * compensate, so that the "inodes in use" number is correct. - */ - if (obd_osfs.os_ffree < osfs->os_ffree) { - osfs->os_files = (osfs->os_files - osfs->os_ffree) + - obd_osfs.os_ffree; - osfs->os_ffree = obd_osfs.os_ffree; - } - - return rc; -} - -int ll_statfs(struct dentry *de, struct kstatfs *sfs) -{ - struct super_block *sb = de->d_sb; - struct obd_statfs osfs; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op: at %llu jiffies\n", get_jiffies_64()); - ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_STAFS, 1); - - /* Some amount of caching on the client is allowed */ - rc = ll_statfs_internal(sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - 0); - if (rc) - return rc; - - statfs_unpack(sfs, &osfs); - - /* We need to downshift for all 32-bit kernels, because we can't - * tell if the kernel is being called via sys_statfs64() or not. - * Stop before overflowing f_bsize - in which case it is better - * to just risk EOVERFLOW if caller is using old sys_statfs(). 
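
The aggregation rule implemented by ll_statfs_internal() above, restated as a
minimal sketch (simplified; error handling and the remaining fields elided):
space comes from the OSTs, inode counts from the MDS, except that free inodes
are capped by the objects the OSTs can still create, so "inodes in use" stays
truthful while the totals shrink:

	static void example_merge_statfs(struct obd_statfs *md,
					 const struct obd_statfs *dt)
	{
		md->os_bsize = dt->os_bsize;	/* block counts are OST-based */
		md->os_blocks = dt->os_blocks;
		md->os_bfree = dt->os_bfree;
		md->os_bavail = dt->os_bavail;

		if (dt->os_ffree < md->os_ffree) {
			/* used inodes (files - ffree) are preserved */
			md->os_files = (md->os_files - md->os_ffree) +
				       dt->os_ffree;
			md->os_ffree = dt->os_ffree;
		}
	}

The hunk that follows handles 32-bit userspace: halving the block counts while
doubling f_bsize keeps f_bsize * f_blocks constant, so the totals survive a
32-bit statfs() without overflowing.
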
- */ - if (sizeof(long) < 8) { - while (osfs.os_blocks > ~0UL && sfs->f_bsize < 0x40000000) { - sfs->f_bsize <<= 1; - - osfs.os_blocks >>= 1; - osfs.os_bfree >>= 1; - osfs.os_bavail >>= 1; - } - } - - sfs->f_blocks = osfs.os_blocks; - sfs->f_bfree = osfs.os_bfree; - sfs->f_bavail = osfs.os_bavail; - sfs->f_fsid = ll_s2sbi(sb)->ll_fsid; - return 0; -} - -void ll_inode_size_lock(struct inode *inode) -{ - struct ll_inode_info *lli; - - LASSERT(!S_ISDIR(inode->i_mode)); - - lli = ll_i2info(inode); - mutex_lock(&lli->lli_size_mutex); -} - -void ll_inode_size_unlock(struct inode *inode) -{ - struct ll_inode_info *lli; - - lli = ll_i2info(inode); - mutex_unlock(&lli->lli_size_mutex); -} - -int ll_update_inode(struct inode *inode, struct lustre_md *md) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct mdt_body *body = md->body; - struct ll_sb_info *sbi = ll_i2sbi(inode); - - if (body->mbo_valid & OBD_MD_FLEASIZE) - cl_file_inode_init(inode, md); - - if (S_ISDIR(inode->i_mode)) { - int rc; - - rc = ll_update_lsm_md(inode, md); - if (rc) - return rc; - } - -#ifdef CONFIG_FS_POSIX_ACL - if (body->mbo_valid & OBD_MD_FLACL) { - spin_lock(&lli->lli_lock); - if (lli->lli_posix_acl) - posix_acl_release(lli->lli_posix_acl); - lli->lli_posix_acl = md->posix_acl; - spin_unlock(&lli->lli_lock); - } -#endif - inode->i_ino = cl_fid_build_ino(&body->mbo_fid1, - sbi->ll_flags & LL_SBI_32BIT_API); - inode->i_generation = cl_fid_build_gen(&body->mbo_fid1); - - if (body->mbo_valid & OBD_MD_FLATIME) { - if (body->mbo_atime > LTIME_S(inode->i_atime)) - LTIME_S(inode->i_atime) = body->mbo_atime; - lli->lli_atime = body->mbo_atime; - } - if (body->mbo_valid & OBD_MD_FLMTIME) { - if (body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, - "setting ino %lu mtime from %lu to %llu\n", - inode->i_ino, LTIME_S(inode->i_mtime), - body->mbo_mtime); - LTIME_S(inode->i_mtime) = body->mbo_mtime; - } - lli->lli_mtime = body->mbo_mtime; - } - if (body->mbo_valid & OBD_MD_FLCTIME) { - if (body->mbo_ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->mbo_ctime; - lli->lli_ctime = body->mbo_ctime; - } - if (body->mbo_valid & OBD_MD_FLMODE) - inode->i_mode = (inode->i_mode & S_IFMT) | - (body->mbo_mode & ~S_IFMT); - if (body->mbo_valid & OBD_MD_FLTYPE) - inode->i_mode = (inode->i_mode & ~S_IFMT) | - (body->mbo_mode & S_IFMT); - LASSERT(inode->i_mode != 0); - if (S_ISREG(inode->i_mode)) - inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS + 1, - LL_MAX_BLKSIZE_BITS); - else - inode->i_blkbits = inode->i_sb->s_blocksize_bits; - if (body->mbo_valid & OBD_MD_FLUID) - inode->i_uid = make_kuid(&init_user_ns, body->mbo_uid); - if (body->mbo_valid & OBD_MD_FLGID) - inode->i_gid = make_kgid(&init_user_ns, body->mbo_gid); - if (body->mbo_valid & OBD_MD_FLFLAGS) - inode->i_flags = ll_ext_to_inode_flags(body->mbo_flags); - if (body->mbo_valid & OBD_MD_FLNLINK) - set_nlink(inode, body->mbo_nlink); - if (body->mbo_valid & OBD_MD_FLRDEV) - inode->i_rdev = old_decode_dev(body->mbo_rdev); - - if (body->mbo_valid & OBD_MD_FLID) { - /* FID shouldn't be changed! 
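
Some context for the assertion being closed here: both i_ino and i_generation
are derived from the FID (see the cl_fid_build_ino()/cl_fid_build_gen() calls
earlier in ll_update_inode()), so a FID change would silently re-identify the
inode. The fold below is made up purely to illustrate the idea; the real
mapping is cl_fid_build_ino(), and collisions are resolved by comparing full
FIDs on lookup (ll_test_inode_by_fid()):

	static unsigned long example_fid_to_ino(const struct lu_fid *fid,
						bool api32)
	{
		/* fold the wide FID into an inode number */
		u64 ino = fid->f_seq ^ (((u64)fid->f_ver << 32) | fid->f_oid);

		/* 32-bit API (e.g. re-export via NFSv4): squash to 32 bits */
		return api32 ? (unsigned long)(u32)ino : (unsigned long)ino;
	}
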
*/ - if (fid_is_sane(&lli->lli_fid)) { - LASSERTF(lu_fid_eq(&lli->lli_fid, &body->mbo_fid1), - "Trying to change FID " DFID " to the " DFID ", inode " DFID "(%p)\n", - PFID(&lli->lli_fid), PFID(&body->mbo_fid1), - PFID(ll_inode2fid(inode)), inode); - } else { - lli->lli_fid = body->mbo_fid1; - } - } - - LASSERT(fid_seq(&lli->lli_fid) != 0); - - if (body->mbo_valid & OBD_MD_FLSIZE) { - i_size_write(inode, body->mbo_size); - - CDEBUG(D_VFSTRACE, "inode=" DFID ", updating i_size %llu\n", - PFID(ll_inode2fid(inode)), - (unsigned long long)body->mbo_size); - - if (body->mbo_valid & OBD_MD_FLBLOCKS) - inode->i_blocks = body->mbo_blocks; - } - - if (body->mbo_valid & OBD_MD_TSTATE) { - if (body->mbo_t_state & MS_RESTORE) - set_bit(LLIF_FILE_RESTORING, &lli->lli_flags); - } - - return 0; -} - -int ll_read_inode2(struct inode *inode, void *opaque) -{ - struct lustre_md *md = opaque; - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(&lli->lli_fid), inode); - - /* Core attributes from the MDS first. This is a new inode, and - * the VFS doesn't zero times in the core inode so we have to do - * it ourselves. They will be overwritten by either MDS or OST - * attributes - we just need to make sure they aren't newer. - */ - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - inode->i_rdev = 0; - rc = ll_update_inode(inode, md); - if (rc) - return rc; - - /* OIDEBUG(inode); */ - - if (S_ISREG(inode->i_mode)) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - - inode->i_op = &ll_file_inode_operations; - inode->i_fop = sbi->ll_fop; - inode->i_mapping->a_ops = (struct address_space_operations *)&ll_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &ll_dir_inode_operations; - inode->i_fop = &ll_dir_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &ll_fast_symlink_inode_operations; - } else { - inode->i_op = &ll_special_inode_operations; - - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - } - - return 0; -} - -void ll_delete_inode(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - - if (S_ISREG(inode->i_mode) && lli->lli_clob) - /* discard all dirty pages before truncating them, required by - * osc_extent implementation at LU-1030. 
- */ - cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, - CL_FSYNC_LOCAL, 1); - - truncate_inode_pages_final(&inode->i_data); - - LASSERTF(!inode->i_data.nrpages, - "inode=" DFID "(%p) nrpages=%lu, see http://jira.whamcloud.com/browse/LU-118\n", - PFID(ll_inode2fid(inode)), inode, inode->i_data.nrpages); - - ll_clear_inode(inode); - clear_inode(inode); -} - -int ll_iocontrol(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - int rc, flags = 0; - - switch (cmd) { - case FSFILT_IOC_GETFLAGS: { - struct mdt_body *body; - struct md_op_data *op_data; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, - 0, 0, LUSTRE_OPC_ANY, - NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_FLFLAGS; - rc = md_getattr(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) { - CERROR("%s: failure inode " DFID ": rc = %d\n", - sbi->ll_md_exp->exp_obd->obd_name, - PFID(ll_inode2fid(inode)), rc); - return -abs(rc); - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - flags = body->mbo_flags; - - ptlrpc_req_finished(req); - - return put_user(flags, (int __user *)arg); - } - case FSFILT_IOC_SETFLAGS: { - struct md_op_data *op_data; - struct cl_object *obj; - struct iattr *attr; - - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_attr_flags = flags; - op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; - rc = md_setattr(sbi->ll_md_exp, op_data, NULL, 0, &req); - ll_finish_md_op_data(op_data); - ptlrpc_req_finished(req); - if (rc) - return rc; - - inode->i_flags = ll_ext_to_inode_flags(flags); - - obj = ll_i2info(inode)->lli_clob; - if (!obj) - return 0; - - attr = kzalloc(sizeof(*attr), GFP_NOFS); - if (!attr) - return -ENOMEM; - - attr->ia_valid = ATTR_ATTR_FLAG; - rc = cl_setattr_ost(obj, attr, flags); - kfree(attr); - return rc; - } - default: - return -ENOSYS; - } - - return 0; -} - -int ll_flush_ctx(struct inode *inode) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - - CDEBUG(D_SEC, "flush context for user %d\n", - from_kuid(&init_user_ns, current_uid())); - - obd_set_info_async(NULL, sbi->ll_md_exp, - sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, - 0, NULL, NULL); - obd_set_info_async(NULL, sbi->ll_dt_exp, - sizeof(KEY_FLUSH_CTX), KEY_FLUSH_CTX, - 0, NULL, NULL); - return 0; -} - -/* umount -f client means force down, don't save state */ -void ll_umount_begin(struct super_block *sb) -{ - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct obd_ioctl_data *ioc_data; - int cnt = 0; - - CDEBUG(D_VFSTRACE, "VFS Op: superblock %p count %d active %d\n", sb, - sb->s_count, atomic_read(&sb->s_active)); - - obd = class_exp2obd(sbi->ll_md_exp); - if (!obd) { - CERROR("Invalid MDC connection handle %#llx\n", - sbi->ll_md_exp->exp_handle.h_cookie); - return; - } - obd->obd_force = 1; - - obd = class_exp2obd(sbi->ll_dt_exp); - if (!obd) { - CERROR("Invalid LOV connection handle %#llx\n", - sbi->ll_dt_exp->exp_handle.h_cookie); - return; - } - obd->obd_force = 1; - - ioc_data = kzalloc(sizeof(*ioc_data), GFP_NOFS); - if (ioc_data) { - obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, - sizeof(*ioc_data), ioc_data, NULL); - - obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, - sizeof(*ioc_data), ioc_data, NULL); - - kfree(ioc_data); - } - - /* Really, we'd like to 
wait until there are no requests outstanding,
- * and then continue. For now, we just periodically check for the VFS
- * to decrement mnt_cnt and hope to finish within 10 sec.
- */
- while (cnt < 10 && !may_umount(sbi->ll_mnt.mnt)) {
- schedule_timeout_uninterruptible(HZ);
- cnt++;
- }
-
- schedule();
-}
-
-int ll_remount_fs(struct super_block *sb, int *flags, char *data)
-{
- struct ll_sb_info *sbi = ll_s2sbi(sb);
- char *profilenm = get_profile_name(sb);
- int err;
- __u32 read_only;
-
- if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) {
- read_only = *flags & SB_RDONLY;
- err = obd_set_info_async(NULL, sbi->ll_md_exp,
- sizeof(KEY_READ_ONLY),
- KEY_READ_ONLY, sizeof(read_only),
- &read_only, NULL);
- if (err) {
- LCONSOLE_WARN("Failed to remount %s %s (%d)\n",
- profilenm, read_only ?
- "read-only" : "read-write", err);
- return err;
- }
-
- if (read_only)
- sb->s_flags |= SB_RDONLY;
- else
- sb->s_flags &= ~SB_RDONLY;
-
- if (sbi->ll_flags & LL_SBI_VERBOSE)
- LCONSOLE_WARN("Remounted %s %s\n", profilenm,
- read_only ? "read-only" : "read-write");
- }
- return 0;
-}
-
-/**
- * Cleanup the open handle that is cached on the MDT side.
- *
- * In the open case, the client-side open handling thread may hit an error
- * after the MDT grants the open. In such a case, the client should
- * send a close RPC to the MDT as cleanup; otherwise, the open handle
- * is leaked on the MDT until the client unmounts or is evicted.
- *
- * Further, if someone unlinks the file, the open handle still holds a
- * reference on the file/object and will block subsequent threads that
- * want to locate that object via FID.
- *
- * \param[in] sb super block for this file-system
- * \param[in] open_req pointer to the original open request
- */
-void ll_open_cleanup(struct super_block *sb, struct ptlrpc_request *open_req)
-{
- struct mdt_body *body;
- struct md_op_data *op_data;
- struct ptlrpc_request *close_req = NULL;
- struct obd_export *exp = ll_s2sbi(sb)->ll_md_exp;
-
- body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY);
- op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
- if (!op_data)
- return;
-
- op_data->op_fid1 = body->mbo_fid1;
- op_data->op_handle = body->mbo_handle;
- op_data->op_mod_time = get_seconds();
- md_close(exp, op_data, NULL, &close_req);
- ptlrpc_req_finished(close_req);
- ll_finish_md_op_data(op_data);
-}
-
-int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
- struct super_block *sb, struct lookup_intent *it)
-{
- struct ll_sb_info *sbi = NULL;
- struct lustre_md md = { NULL };
- int rc;
-
- LASSERT(*inode || sb);
- sbi = sb ? ll_s2sbi(sb) : ll_i2sbi(*inode);
- rc = md_get_lustre_md(sbi->ll_md_exp, req, sbi->ll_dt_exp,
- sbi->ll_md_exp, &md);
- if (rc)
- goto cleanup;
-
- if (*inode) {
- rc = ll_update_inode(*inode, &md);
- if (rc)
- goto out;
- } else {
- LASSERT(sb);
-
- /*
- * At this point the server returns to the client the same FID
- * the client generated when creating the file, so using ->fid1
- * is okay here.
- */
- if (!fid_is_sane(&md.body->mbo_fid1)) {
- CERROR("%s: Fid is insane " DFID "\n",
- ll_get_fsname(sb, NULL, 0),
- PFID(&md.body->mbo_fid1));
- rc = -EINVAL;
- goto out;
- }
-
- *inode = ll_iget(sb, cl_fid_build_ino(&md.body->mbo_fid1,
- sbi->ll_flags & LL_SBI_32BIT_API),
- &md);
- if (IS_ERR(*inode)) {
-#ifdef CONFIG_FS_POSIX_ACL
- if (md.posix_acl) {
- posix_acl_release(md.posix_acl);
- md.posix_acl = NULL;
- }
-#endif
- rc = PTR_ERR(*inode);
- CERROR("new_inode -fatal: rc %d\n", rc);
- goto out;
- }
- }
-
- /* Handling piggyback layout lock.
- * Layout lock can be piggybacked by getattr and open request. - * The lsm can be applied to inode only if it comes with a layout lock - * otherwise correct layout may be overwritten, for example: - * 1. proc1: mdt returns a lsm but not granting layout - * 2. layout was changed by another client - * 3. proc2: refresh layout and layout lock granted - * 4. proc1: to apply a stale layout - */ - if (it && it->it_lock_mode != 0) { - struct lustre_handle lockh; - struct ldlm_lock *lock; - - lockh.cookie = it->it_lock_handle; - lock = ldlm_handle2lock(&lockh); - LASSERT(lock); - if (ldlm_has_layout(lock)) { - struct cl_object_conf conf; - - memset(&conf, 0, sizeof(conf)); - conf.coc_opc = OBJECT_CONF_SET; - conf.coc_inode = *inode; - conf.coc_lock = lock; - conf.u.coc_layout = md.layout; - (void)ll_layout_conf(*inode, &conf); - } - LDLM_LOCK_PUT(lock); - } - -out: - md_free_lustre_md(sbi->ll_md_exp, &md); -cleanup: - if (rc != 0 && it && it->it_op & IT_OPEN) - ll_open_cleanup(sb ? sb : (*inode)->i_sb, req); - - return rc; -} - -int ll_obd_statfs(struct inode *inode, void __user *arg) -{ - struct ll_sb_info *sbi = NULL; - struct obd_export *exp; - char *buf = NULL; - struct obd_ioctl_data *data = NULL; - __u32 type; - int len = 0, rc; - - if (!inode) { - rc = -EINVAL; - goto out_statfs; - } - - sbi = ll_i2sbi(inode); - if (!sbi) { - rc = -EINVAL; - goto out_statfs; - } - - rc = obd_ioctl_getdata(&buf, &len, arg); - if (rc) - goto out_statfs; - - data = (void *)buf; - if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2 || - !data->ioc_pbuf1 || !data->ioc_pbuf2) { - rc = -EINVAL; - goto out_statfs; - } - - if (data->ioc_inllen1 != sizeof(__u32) || - data->ioc_inllen2 != sizeof(__u32) || - data->ioc_plen1 != sizeof(struct obd_statfs) || - data->ioc_plen2 != sizeof(struct obd_uuid)) { - rc = -EINVAL; - goto out_statfs; - } - - memcpy(&type, data->ioc_inlbuf1, sizeof(__u32)); - if (type & LL_STATFS_LMV) { - exp = sbi->ll_md_exp; - } else if (type & LL_STATFS_LOV) { - exp = sbi->ll_dt_exp; - } else { - rc = -ENODEV; - goto out_statfs; - } - - rc = obd_iocontrol(IOC_OBD_STATFS, exp, len, buf, NULL); - if (rc) - goto out_statfs; -out_statfs: - kvfree(buf); - return rc; -} - -int ll_process_config(struct lustre_cfg *lcfg) -{ - char *ptr; - void *sb; - struct lprocfs_static_vars lvars; - unsigned long x; - int rc = 0; - - lprocfs_llite_init_vars(&lvars); - - /* The instance name contains the sb: lustre-client-aacfe000 */ - ptr = strrchr(lustre_cfg_string(lcfg, 0), '-'); - if (!ptr || !*(++ptr)) - return -EINVAL; - rc = kstrtoul(ptr, 16, &x); - if (rc != 0) - return -EINVAL; - sb = (void *)x; - /* This better be a real Lustre superblock! */ - LASSERT(s2lsi((struct super_block *)sb)->lsi_lmd->lmd_magic == - LMD_MAGIC); - - /* Note we have not called client_common_fill_super yet, so - * proc fns must be able to handle that! - */ - rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, - lcfg, sb); - if (rc > 0) - rc = 0; - return rc; -} - -/* this function prepares md_op_data hint for passing ot down to MD stack. */ -struct md_op_data *ll_prep_md_op_data(struct md_op_data *op_data, - struct inode *i1, struct inode *i2, - const char *name, size_t namelen, - u32 mode, __u32 opc, void *data) -{ - if (!name) { - /* Do not reuse namelen for something else. 
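
An aside on ll_process_config() above: the configuration instance name ends
with the superblock address in hex ("lustre-client-aacfe000"), and the parser
simply turns that suffix back into a pointer. The same trick standalone, with
a made-up helper name:

	static struct super_block *example_instance_to_sb(const char *instance)
	{
		const char *ptr = strrchr(instance, '-');
		unsigned long addr;

		if (!ptr || kstrtoul(ptr + 1, 16, &addr))
			return NULL;	/* no suffix, or not hex */

		return (struct super_block *)addr;
	}
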
*/ - if (namelen) - return ERR_PTR(-EINVAL); - } else { - if (namelen > ll_i2sbi(i1)->ll_namelen) - return ERR_PTR(-ENAMETOOLONG); - - if (!lu_name_is_valid_2(name, namelen)) - return ERR_PTR(-EINVAL); - } - - if (!op_data) - op_data = kzalloc(sizeof(*op_data), GFP_NOFS); - - if (!op_data) - return ERR_PTR(-ENOMEM); - - ll_i2gids(op_data->op_suppgids, i1, i2); - op_data->op_fid1 = *ll_inode2fid(i1); - op_data->op_default_stripe_offset = -1; - if (S_ISDIR(i1->i_mode)) { - op_data->op_mea1 = ll_i2info(i1)->lli_lsm_md; - if (opc == LUSTRE_OPC_MKDIR) - op_data->op_default_stripe_offset = - ll_i2info(i1)->lli_def_stripe_offset; - } - - if (i2) { - op_data->op_fid2 = *ll_inode2fid(i2); - if (S_ISDIR(i2->i_mode)) - op_data->op_mea2 = ll_i2info(i2)->lli_lsm_md; - } else { - fid_zero(&op_data->op_fid2); - } - - if (ll_i2sbi(i1)->ll_flags & LL_SBI_64BIT_HASH) - op_data->op_cli_flags |= CLI_HASH64; - - if (ll_need_32bit_api(ll_i2sbi(i1))) - op_data->op_cli_flags |= CLI_API32; - - op_data->op_name = name; - op_data->op_namelen = namelen; - op_data->op_mode = mode; - op_data->op_mod_time = ktime_get_real_seconds(); - op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); - op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); - op_data->op_cap = current_cap(); - if ((opc == LUSTRE_OPC_CREATE) && name && - filename_is_volatile(name, namelen, &op_data->op_mds)) - op_data->op_bias |= MDS_CREATE_VOLATILE; - else - op_data->op_mds = 0; - op_data->op_data = data; - - return op_data; -} - -void ll_finish_md_op_data(struct md_op_data *op_data) -{ - kfree(op_data); -} - -int ll_show_options(struct seq_file *seq, struct dentry *dentry) -{ - struct ll_sb_info *sbi; - - LASSERT(seq && dentry); - sbi = ll_s2sbi(dentry->d_sb); - - if (sbi->ll_flags & LL_SBI_NOLCK) - seq_puts(seq, ",nolock"); - - if (sbi->ll_flags & LL_SBI_FLOCK) - seq_puts(seq, ",flock"); - - if (sbi->ll_flags & LL_SBI_LOCALFLOCK) - seq_puts(seq, ",localflock"); - - if (sbi->ll_flags & LL_SBI_USER_XATTR) - seq_puts(seq, ",user_xattr"); - - if (sbi->ll_flags & LL_SBI_LAZYSTATFS) - seq_puts(seq, ",lazystatfs"); - - if (sbi->ll_flags & LL_SBI_USER_FID2PATH) - seq_puts(seq, ",user_fid2path"); - - if (sbi->ll_flags & LL_SBI_ALWAYS_PING) - seq_puts(seq, ",always_ping"); - - return 0; -} - -/** - * Get obd name by cmd, and copy out to user space - */ -int ll_get_obd_name(struct inode *inode, unsigned int cmd, unsigned long arg) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_device *obd; - - if (cmd == OBD_IOC_GETDTNAME) - obd = class_exp2obd(sbi->ll_dt_exp); - else if (cmd == OBD_IOC_GETMDNAME) - obd = class_exp2obd(sbi->ll_md_exp); - else - return -EINVAL; - - if (!obd) - return -ENOENT; - - if (copy_to_user((void __user *)arg, obd->obd_name, - strlen(obd->obd_name) + 1)) - return -EFAULT; - - return 0; -} - -/** - * Get lustre file system name by \a sbi. If \a buf is provided(non-NULL), the - * fsname will be returned in this buffer; otherwise, a static buffer will be - * used to store the fsname and returned to caller. - */ -char *ll_get_fsname(struct super_block *sb, char *buf, int buflen) -{ - static char fsname_static[MTI_NAME_MAXLEN]; - struct lustre_sb_info *lsi = s2lsi(sb); - char *ptr; - int len; - - if (!buf) { - /* this means the caller wants to use static buffer - * and it doesn't care about race. 
Usually this is - * in error reporting path - */ - buf = fsname_static; - buflen = sizeof(fsname_static); - } - - len = strlen(lsi->lsi_lmd->lmd_profile); - ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; - - if (unlikely(len >= buflen)) - len = buflen - 1; - strncpy(buf, lsi->lsi_lmd->lmd_profile, len); - buf[len] = '\0'; - - return buf; -} - -void ll_dirty_page_discard_warn(struct page *page, int ioret) -{ - char *buf, *path = NULL; - struct dentry *dentry = NULL; - struct vvp_object *obj = cl_inode2vvp(page->mapping->host); - - /* this can be called inside spin lock so use GFP_ATOMIC. */ - buf = (char *)__get_free_page(GFP_ATOMIC); - if (buf) { - dentry = d_find_alias(page->mapping->host); - if (dentry) - path = dentry_path_raw(dentry, buf, PAGE_SIZE); - } - - CDEBUG(D_WARNING, - "%s: dirty page discard: %s/fid: " DFID "/%s may get corrupted (rc %d)\n", - ll_get_fsname(page->mapping->host->i_sb, NULL, 0), - s2lsi(page->mapping->host->i_sb)->lsi_lmd->lmd_dev, - PFID(&obj->vob_header.coh_lu.loh_fid), - (path && !IS_ERR(path)) ? path : "", ioret); - - if (dentry) - dput(dentry); - - if (buf) - free_page((unsigned long)buf); -} - -ssize_t ll_copy_user_md(const struct lov_user_md __user *md, - struct lov_user_md **kbuf) -{ - struct lov_user_md lum; - ssize_t lum_size; - - if (copy_from_user(&lum, md, sizeof(lum))) { - lum_size = -EFAULT; - goto no_kbuf; - } - - lum_size = ll_lov_user_md_size(&lum); - if (lum_size < 0) - goto no_kbuf; - - *kbuf = kzalloc(lum_size, GFP_NOFS); - if (!*kbuf) { - lum_size = -ENOMEM; - goto no_kbuf; - } - - if (copy_from_user(*kbuf, md, lum_size) != 0) { - kfree(*kbuf); - *kbuf = NULL; - lum_size = -EFAULT; - } -no_kbuf: - return lum_size; -} - -/* - * Compute llite root squash state after a change of root squash - * configuration setting or add/remove of a lnet nid - */ -void ll_compute_rootsquash_state(struct ll_sb_info *sbi) -{ - struct root_squash_info *squash = &sbi->ll_squash; - struct lnet_process_id id; - bool matched; - int i; - - /* Update norootsquash flag */ - down_write(&squash->rsi_sem); - if (list_empty(&squash->rsi_nosquash_nids)) { - sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; - } else { - /* - * Do not apply root squash as soon as one of our NIDs is - * in the nosquash_nids list - */ - matched = false; - i = 0; - - while (LNetGetId(i++, &id) != -ENOENT) { - if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) - continue; - if (cfs_match_nid(id.nid, &squash->rsi_nosquash_nids)) { - matched = true; - break; - } - } - if (matched) - sbi->ll_flags |= LL_SBI_NOROOTSQUASH; - else - sbi->ll_flags &= ~LL_SBI_NOROOTSQUASH; - } - up_write(&squash->rsi_sem); -} - -/** - * Parse linkea content to extract information about a given hardlink - * - * \param[in] ldata - Initialized linkea data - * \param[in] linkno - Link identifier - * \param[out] parent_fid - The entry's parent FID - * \param[in] size - Entry name destination buffer - * - * \retval 0 on success - * \retval Appropriate negative error code on failure - */ -static int ll_linkea_decode(struct linkea_data *ldata, unsigned int linkno, - struct lu_fid *parent_fid, struct lu_name *ln) -{ - unsigned int idx; - int rc; - - rc = linkea_init_with_rec(ldata); - if (rc < 0) - return rc; - - if (linkno >= ldata->ld_leh->leh_reccount) - /* beyond last link */ - return -ENODATA; - - linkea_first_entry(ldata); - for (idx = 0; ldata->ld_lee; idx++) { - linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, ln, - parent_fid); - if (idx == linkno) - break; - - 
linkea_next_entry(ldata); - } - - if (idx < linkno) - return -ENODATA; - - return 0; -} - -/** - * Get parent FID and name of an identified link. Operation is performed for - * a given link number, letting the caller iterate over linkno to list one or - * all links of an entry. - * - * \param[in] file - File descriptor against which to perform the operation - * \param[in,out] arg - User-filled structure containing the linkno to operate - * on and the available size. It is eventually filled - * with the requested information or left untouched on - * error - * - * \retval - 0 on success - * \retval - Appropriate negative error code on failure - */ -int ll_getparent(struct file *file, struct getparent __user *arg) -{ - struct inode *inode = file_inode(file); - struct linkea_data *ldata; - struct lu_fid parent_fid; - struct lu_buf buf = { - .lb_buf = NULL, - .lb_len = 0 - }; - struct lu_name ln; - u32 name_size; - u32 linkno; - int rc; - - if (!capable(CAP_DAC_READ_SEARCH) && - !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH)) - return -EPERM; - - if (get_user(name_size, &arg->gp_name_size)) - return -EFAULT; - - if (get_user(linkno, &arg->gp_linkno)) - return -EFAULT; - - if (name_size > PATH_MAX) - return -EINVAL; - - ldata = kzalloc(sizeof(*ldata), GFP_NOFS); - if (!ldata) - return -ENOMEM; - - rc = linkea_data_new(ldata, &buf); - if (rc < 0) - goto ldata_free; - - rc = ll_xattr_list(inode, XATTR_NAME_LINK, XATTR_TRUSTED_T, buf.lb_buf, - buf.lb_len, OBD_MD_FLXATTR); - if (rc < 0) - goto lb_free; - - rc = ll_linkea_decode(ldata, linkno, &parent_fid, &ln); - if (rc < 0) - goto lb_free; - - if (ln.ln_namelen >= name_size) { - rc = -EOVERFLOW; - goto lb_free; - } - - if (copy_to_user(&arg->gp_fid, &parent_fid, sizeof(arg->gp_fid))) { - rc = -EFAULT; - goto lb_free; - } - - if (copy_to_user(&arg->gp_name, ln.ln_name, ln.ln_namelen)) { - rc = -EFAULT; - goto lb_free; - } - - if (put_user('\0', arg->gp_name + ln.ln_namelen)) { - rc = -EFAULT; - goto lb_free; - } - -lb_free: - kvfree(buf.lb_buf); -ldata_free: - kfree(ldata); - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/llite_mmap.c b/drivers/staging/lustre/lustre/llite/llite_mmap.c deleted file mode 100644 index d7fb5533f707288c17f17ba9a8814b76a3216348..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_mmap.c +++ /dev/null @@ -1,480 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-#define DEBUG_SUBSYSTEM S_LLITE
-
-#include "llite_internal.h"
-
-static const struct vm_operations_struct ll_file_vm_ops;
-
-void policy_from_vma(union ldlm_policy_data *policy,
- struct vm_area_struct *vma, unsigned long addr,
- size_t count)
-{
- policy->l_extent.start = ((addr - vma->vm_start) & PAGE_MASK) +
- (vma->vm_pgoff << PAGE_SHIFT);
- policy->l_extent.end = (policy->l_extent.start + count - 1) |
- ~PAGE_MASK;
-}
-
-struct vm_area_struct *our_vma(struct mm_struct *mm, unsigned long addr,
- size_t count)
-{
- struct vm_area_struct *vma, *ret = NULL;
-
- /* mmap_sem must have been held by caller. */
- LASSERT(!down_write_trylock(&mm->mmap_sem));
-
- for (vma = find_vma(mm, addr);
- vma && vma->vm_start < (addr + count); vma = vma->vm_next) {
- if (vma->vm_ops && vma->vm_ops == &ll_file_vm_ops &&
- vma->vm_flags & VM_SHARED) {
- ret = vma;
- break;
- }
- }
- return ret;
-}
-
-/**
- * API independent part for page fault initialization.
- * \param vma - virtual memory area addressed to page fault
- * \param env - corresponding lu_env for processing
- * \param index - page index corresponding to the fault
- * \param ra_flags - vma readahead flags
- *
- * \return error codes from cl_io_init.
- */
-static struct cl_io *
-ll_fault_io_init(struct lu_env *env, struct vm_area_struct *vma,
- pgoff_t index, unsigned long *ra_flags)
-{
- struct file *file = vma->vm_file;
- struct inode *inode = file_inode(file);
- struct cl_io *io;
- struct cl_fault_io *fio;
- int rc;
-
- if (ll_file_nolock(file))
- return ERR_PTR(-EOPNOTSUPP);
-
-restart:
- io = vvp_env_thread_io(env);
- io->ci_obj = ll_i2info(inode)->lli_clob;
- LASSERT(io->ci_obj);
-
- fio = &io->u.ci_fault;
- fio->ft_index = index;
- fio->ft_executable = vma->vm_flags & VM_EXEC;
-
- /*
- * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
- * the kernel will not read other pages not covered by ldlm in
- * filemap_nopage. We do our readahead in ll_readpage.
- */
- if (ra_flags)
- *ra_flags = vma->vm_flags & (VM_RAND_READ | VM_SEQ_READ);
- vma->vm_flags &= ~VM_SEQ_READ;
- vma->vm_flags |= VM_RAND_READ;
-
- CDEBUG(D_MMAP, "vm_flags: %lx (%lu %d)\n", vma->vm_flags,
- fio->ft_index, fio->ft_executable);
-
- rc = cl_io_init(env, io, CIT_FAULT, io->ci_obj);
- if (rc == 0) {
- struct vvp_io *vio = vvp_env_io(env);
- struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-
- LASSERT(vio->vui_cl.cis_io == io);
-
- /* mmap lock must be MANDATORY, as it has to cache pages.
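
That is, the fault path may not fall back to lockless IO: pages inserted into
the page cache must stay covered by a DLM extent lock. Separately, worked
numbers for policy_from_vma() at the top of this file, assuming 4 KiB pages:
with vm_pgoff = 16 (the mapping starts at file offset 64 KiB),
addr - vm_start = 0x2345 and count = 6 KiB:

	l_extent.start = (0x2345 & PAGE_MASK) + (16 << PAGE_SHIFT)
		       = 0x2000 + 0x10000 = 0x12000
	l_extent.end   = (0x12000 + 0x1800 - 1) | ~PAGE_MASK = 0x137ff

i.e. the DLM extent is the faulted byte range widened to whole pages and
expressed in file offsets.
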
*/ - io->ci_lockreq = CILR_MANDATORY; - vio->vui_fd = fd; - } else { - LASSERT(rc < 0); - cl_io_fini(env, io); - if (io->ci_need_restart) - goto restart; - - io = ERR_PTR(rc); - } - - return io; -} - -/* Sharing code of page_mkwrite method for rhel5 and rhel6 */ -static int ll_page_mkwrite0(struct vm_area_struct *vma, struct page *vmpage, - bool *retry) -{ - struct lu_env *env; - struct cl_io *io; - struct vvp_io *vio; - int result; - u16 refcheck; - sigset_t old, new; - struct inode *inode; - struct ll_inode_info *lli; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = ll_fault_io_init(env, vma, vmpage->index, NULL); - if (IS_ERR(io)) { - result = PTR_ERR(io); - goto out; - } - - result = io->ci_result; - if (result < 0) - goto out_io; - - io->u.ci_fault.ft_mkwrite = 1; - io->u.ci_fault.ft_writable = 1; - - vio = vvp_env_io(env); - vio->u.fault.ft_vma = vma; - vio->u.fault.ft_vmpage = vmpage; - - siginitsetinv(&new, sigmask(SIGKILL) | sigmask(SIGTERM)); - sigprocmask(SIG_BLOCK, &new, &old); - - inode = vvp_object_inode(io->ci_obj); - lli = ll_i2info(inode); - - result = cl_io_loop(env, io); - - sigprocmask(SIG_SETMASK, &old, NULL); - - if (result == 0) { - struct inode *inode = file_inode(vma->vm_file); - struct ll_inode_info *lli = ll_i2info(inode); - - lock_page(vmpage); - if (!vmpage->mapping) { - unlock_page(vmpage); - - /* page was truncated and lock was cancelled, return - * ENODATA so that VM_FAULT_NOPAGE will be returned - * to handle_mm_fault(). - */ - if (result == 0) - result = -ENODATA; - } else if (!PageDirty(vmpage)) { - /* race, the page has been cleaned by ptlrpcd after - * it was unlocked, it has to be added into dirty - * cache again otherwise this soon-to-dirty page won't - * consume any grants, even worse if this page is being - * transferred because it will break RPC checksum. - */ - unlock_page(vmpage); - - CDEBUG(D_MMAP, - "Race on page_mkwrite %p/%lu, page has been written out, retry.\n", - vmpage, vmpage->index); - - *retry = true; - result = -EAGAIN; - } - - if (!result) - set_bit(LLIF_DATA_MODIFIED, &lli->lli_flags); - } - -out_io: - cl_io_fini(env, io); -out: - cl_env_put(env, &refcheck); - CDEBUG(D_MMAP, "%s mkwrite with %d\n", current->comm, result); - LASSERT(ergo(result == 0, PageLocked(vmpage))); - - return result; -} - -static inline int to_fault_error(int result) -{ - switch (result) { - case 0: - result = VM_FAULT_LOCKED; - break; - case -EFAULT: - result = VM_FAULT_NOPAGE; - break; - case -ENOMEM: - result = VM_FAULT_OOM; - break; - default: - result = VM_FAULT_SIGBUS; - break; - } - return result; -} - -/** - * Lustre implementation of a vm_operations_struct::fault() method, called by - * VM to server page fault (both in kernel and user space). 
- * - * \param vma - is virtual area struct related to page fault - * \param vmf - structure which describe type and address where hit fault - * - * \return allocated and filled _locked_ page for address - * \retval VM_FAULT_ERROR on general error - * \retval NOPAGE_OOM not have memory for allocate new page - */ -static int ll_fault0(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct lu_env *env; - struct cl_io *io; - struct vvp_io *vio = NULL; - struct page *vmpage; - unsigned long ra_flags; - int result = 0; - int fault_ret = 0; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - io = ll_fault_io_init(env, vma, vmf->pgoff, &ra_flags); - if (IS_ERR(io)) { - result = to_fault_error(PTR_ERR(io)); - goto out; - } - - result = io->ci_result; - if (result == 0) { - vio = vvp_env_io(env); - vio->u.fault.ft_vma = vma; - vio->u.fault.ft_vmpage = NULL; - vio->u.fault.ft_vmf = vmf; - vio->u.fault.ft_flags = 0; - vio->u.fault.ft_flags_valid = false; - - /* May call ll_readpage() */ - ll_cl_add(vma->vm_file, env, io); - - result = cl_io_loop(env, io); - - ll_cl_remove(vma->vm_file, env); - - /* ft_flags are only valid if we reached - * the call to filemap_fault - */ - if (vio->u.fault.ft_flags_valid) - fault_ret = vio->u.fault.ft_flags; - - vmpage = vio->u.fault.ft_vmpage; - if (result != 0 && vmpage) { - put_page(vmpage); - vmf->page = NULL; - } - } - cl_io_fini(env, io); - - vma->vm_flags |= ra_flags; - -out: - cl_env_put(env, &refcheck); - if (result != 0 && !(fault_ret & VM_FAULT_RETRY)) - fault_ret |= to_fault_error(result); - - CDEBUG(D_MMAP, "%s fault %d/%d\n", current->comm, fault_ret, result); - return fault_ret; -} - -static int ll_fault(struct vm_fault *vmf) -{ - int count = 0; - bool printed = false; - int result; - sigset_t old, new; - - /* Only SIGKILL and SIGTERM are allowed for fault/nopage/mkwrite - * so that it can be killed by admin but not cause segfault by - * other signals. 
- */ - siginitsetinv(&new, sigmask(SIGKILL) | sigmask(SIGTERM)); - sigprocmask(SIG_BLOCK, &new, &old); - -restart: - result = ll_fault0(vmf->vma, vmf); - LASSERT(!(result & VM_FAULT_LOCKED)); - if (result == 0) { - struct page *vmpage = vmf->page; - - /* check if this page has been truncated */ - lock_page(vmpage); - if (unlikely(!vmpage->mapping)) { /* unlucky */ - unlock_page(vmpage); - put_page(vmpage); - vmf->page = NULL; - - if (!printed && ++count > 16) { - CWARN("the page is under heavy contention, maybe your app(%s) needs revising :-)\n", - current->comm); - printed = true; - } - - goto restart; - } - - result = VM_FAULT_LOCKED; - } - sigprocmask(SIG_SETMASK, &old, NULL); - return result; -} - -static int ll_page_mkwrite(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - int count = 0; - bool printed = false; - bool retry; - int result; - - file_update_time(vma->vm_file); - do { - retry = false; - result = ll_page_mkwrite0(vma, vmf->page, &retry); - - if (!printed && ++count > 16) { - const struct dentry *de = vma->vm_file->f_path.dentry; - - CWARN("app(%s): the page %lu of file " DFID " is under heavy contention\n", - current->comm, vmf->pgoff, - PFID(ll_inode2fid(de->d_inode))); - printed = true; - } - } while (retry); - - switch (result) { - case 0: - LASSERT(PageLocked(vmf->page)); - result = VM_FAULT_LOCKED; - break; - case -ENODATA: - case -EAGAIN: - case -EFAULT: - result = VM_FAULT_NOPAGE; - break; - case -ENOMEM: - result = VM_FAULT_OOM; - break; - default: - result = VM_FAULT_SIGBUS; - break; - } - - return result; -} - -/** - * To avoid cancel the locks covering mmapped region for lock cache pressure, - * we track the mapped vma count in vvp_object::vob_mmap_cnt. - */ -static void ll_vm_open(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct vvp_object *vob = cl_inode2vvp(inode); - - LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); - atomic_inc(&vob->vob_mmap_cnt); -} - -/** - * Dual to ll_vm_open(). - */ -static void ll_vm_close(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct vvp_object *vob = cl_inode2vvp(inode); - - atomic_dec(&vob->vob_mmap_cnt); - LASSERT(atomic_read(&vob->vob_mmap_cnt) >= 0); -} - -/* XXX put nice comment here. 
talk about __free_pte -> dirty pages and - * nopage's reference passing to the pte - */ -int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) -{ - int rc = -ENOENT; - - LASSERTF(last > first, "last %llu first %llu\n", last, first); - if (mapping_mapped(mapping)) { - rc = 0; - unmap_mapping_range(mapping, first + PAGE_SIZE - 1, - last - first + 1, 0); - } - - return rc; -} - -static const struct vm_operations_struct ll_file_vm_ops = { - .fault = ll_fault, - .page_mkwrite = ll_page_mkwrite, - .open = ll_vm_open, - .close = ll_vm_close, -}; - -int ll_file_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(file); - int rc; - - if (ll_file_nolock(file)) - return -EOPNOTSUPP; - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_MAP, 1); - rc = generic_file_mmap(file, vma); - if (rc == 0) { - vma->vm_ops = &ll_file_vm_ops; - vma->vm_ops->open(vma); - /* update the inode's size and mtime */ - rc = ll_glimpse_size(inode); - } - - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c deleted file mode 100644 index 14172688d55f86792f9dfde82d3c8ae00121f40a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ /dev/null @@ -1,375 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- *
- * lustre/lustre/llite/llite_nfs.c
- *
- * NFS export of Lustre Light File System
- *
- * Author: Yury Umanets
- * Author: Huang Hua
- */
-
-#define DEBUG_SUBSYSTEM S_LLITE
-#include "llite_internal.h"
-#include 
-
-__u32 get_uuid2int(const char *name, int len)
-{
- __u32 key0 = 0x12a3fe2d, key1 = 0x37abe8f9;
-
- while (len--) {
- __u32 key = key1 + (key0 ^ (*name++ * 7152373));
-
- if (key & 0x80000000)
- key -= 0x7fffffff;
- key1 = key0;
- key0 = key;
- }
- return (key0 << 1);
-}
-
-void get_uuid2fsid(const char *name, int len, __kernel_fsid_t *fsid)
-{
- __u64 key = 0, key0 = 0x12a3fe2d, key1 = 0x37abe8f9;
-
- while (len--) {
- key = key1 + (key0 ^ (*name++ * 7152373));
- if (key & 0x8000000000000000ULL)
- key -= 0x7fffffffffffffffULL;
- key1 = key0;
- key0 = key;
- }
-
- fsid->val[0] = key;
- fsid->val[1] = key >> 32;
-}
-
-struct inode *search_inode_for_lustre(struct super_block *sb,
- const struct lu_fid *fid)
-{
- struct ll_sb_info *sbi = ll_s2sbi(sb);
- struct ptlrpc_request *req = NULL;
- struct inode *inode = NULL;
- int eadatalen = 0;
- unsigned long hash = cl_fid_build_ino(fid,
- ll_need_32bit_api(sbi));
- struct md_op_data *op_data;
- int rc;
-
- CDEBUG(D_INFO, "searching inode for:(%lu," DFID ")\n", hash, PFID(fid));
-
- inode = ilookup5(sb, hash, ll_test_inode_by_fid, (void *)fid);
- if (inode)
- return inode;
-
- rc = ll_get_default_mdsize(sbi, &eadatalen);
- if (rc)
- return ERR_PTR(rc);
-
- /* Because the inode is NULL, ll_prep_md_op_data() cannot
- * be used here, so we allocate op_data ourselves.
- */
- op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
- if (!op_data)
- return ERR_PTR(-ENOMEM);
-
- op_data->op_fid1 = *fid;
- op_data->op_mode = eadatalen;
- op_data->op_valid = OBD_MD_FLEASIZE;
-
- /* mds_fid2dentry ignores f_type */
- rc = md_getattr(sbi->ll_md_exp, op_data, &req);
- kfree(op_data);
- if (rc) {
- CDEBUG(D_INFO, "can't get object attrs, fid " DFID ", rc %d\n",
- PFID(fid), rc);
- return ERR_PTR(rc);
- }
- rc = ll_prep_inode(&inode, req, sb, NULL);
- ptlrpc_req_finished(req);
- if (rc)
- return ERR_PTR(rc);
-
- return inode;
-}
-
-struct lustre_nfs_fid {
- struct lu_fid lnf_child;
- struct lu_fid lnf_parent;
-};
-
-static struct dentry *
-ll_iget_for_nfs(struct super_block *sb,
- struct lu_fid *fid, struct lu_fid *parent)
-{
- struct inode *inode;
- struct dentry *result;
-
- if (!fid_is_sane(fid))
- return ERR_PTR(-ESTALE);
-
- CDEBUG(D_INFO, "Get dentry for fid: " DFID "\n", PFID(fid));
-
- inode = search_inode_for_lustre(sb, fid);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- if (is_bad_inode(inode)) {
- /* we didn't find the right inode.. */
- iput(inode);
- return ERR_PTR(-ESTALE);
- }
-
- /**
- * In case d_obtain_alias() found a disconnected dentry, always update
- * lli_pfid so that a later operation (normally open) has the parent
- * FID, which may be used by the MDS to create data.
- */
- if (parent) {
- struct ll_inode_info *lli = ll_i2info(inode);
-
- spin_lock(&lli->lli_lock);
- lli->lli_pfid = *parent;
- spin_unlock(&lli->lli_lock);
- }
-
- /* N.B.
d_obtain_alias() drops inode ref on error */ - result = d_obtain_alias(inode); - if (!IS_ERR(result)) { - /* - * Need to signal to the ll_intent_file_open that - * we came from NFS and so opencache needs to be - * enabled for this one - */ - ll_d2d(result)->lld_nfs_dentry = 1; - } - - return result; -} - -/** - * \a connectable - is nfsd will connect himself or this should be done - * at lustre - * - * The return value is file handle type: - * 1 -- contains child file handle; - * 2 -- contains child file handle and parent file handle; - * 255 -- error. - */ -static int ll_encode_fh(struct inode *inode, __u32 *fh, int *plen, - struct inode *parent) -{ - int fileid_len = sizeof(struct lustre_nfs_fid) / 4; - struct lustre_nfs_fid *nfs_fid = (void *)fh; - - CDEBUG(D_INFO, "%s: encoding for (" DFID ") maxlen=%d minlen=%d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), *plen, fileid_len); - - if (*plen < fileid_len) { - *plen = fileid_len; - return FILEID_INVALID; - } - - nfs_fid->lnf_child = *ll_inode2fid(inode); - if (parent) - nfs_fid->lnf_parent = *ll_inode2fid(parent); - else - fid_zero(&nfs_fid->lnf_parent); - *plen = fileid_len; - - return FILEID_LUSTRE; -} - -static int ll_nfs_get_name_filldir(struct dir_context *ctx, const char *name, - int namelen, loff_t hash, u64 ino, - unsigned int type) -{ - /* It is hack to access lde_fid for comparison with lgd_fid. - * So the input 'name' must be part of the 'lu_dirent'. - */ - struct lu_dirent *lde = container_of((void*)name, struct lu_dirent, lde_name); - struct ll_getname_data *lgd = - container_of(ctx, struct ll_getname_data, ctx); - struct lu_fid fid; - - fid_le_to_cpu(&fid, &lde->lde_fid); - if (lu_fid_eq(&fid, &lgd->lgd_fid)) { - memcpy(lgd->lgd_name, name, namelen); - lgd->lgd_name[namelen] = 0; - lgd->lgd_found = 1; - } - return lgd->lgd_found; -} - -static int ll_get_name(struct dentry *dentry, char *name, - struct dentry *child) -{ - struct inode *dir = d_inode(dentry); - int rc; - struct ll_getname_data lgd = { - .lgd_name = name, - .lgd_fid = ll_i2info(d_inode(child))->lli_fid, - .ctx.actor = ll_nfs_get_name_filldir, - }; - struct md_op_data *op_data; - __u64 pos = 0; - - if (!dir || !S_ISDIR(dir->i_mode)) { - rc = -ENOTDIR; - goto out; - } - - if (!dir->i_fop) { - rc = -EINVAL; - goto out; - } - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; - inode_lock(dir); - rc = ll_dir_read(dir, &pos, op_data, &lgd.ctx); - inode_unlock(dir); - ll_finish_md_op_data(op_data); - if (!rc && !lgd.lgd_found) - rc = -ENOENT; -out: - return rc; -} - -static struct dentry *ll_fh_to_dentry(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; - - if (fh_type != FILEID_LUSTRE) - return ERR_PTR(-EPROTO); - - return ll_iget_for_nfs(sb, &nfs_fid->lnf_child, &nfs_fid->lnf_parent); -} - -static struct dentry *ll_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct lustre_nfs_fid *nfs_fid = (struct lustre_nfs_fid *)fid; - - if (fh_type != FILEID_LUSTRE) - return ERR_PTR(-EPROTO); - - return ll_iget_for_nfs(sb, &nfs_fid->lnf_parent, NULL); -} - -int ll_dir_get_parent_fid(struct inode *dir, struct lu_fid *parent_fid) -{ - struct ptlrpc_request *req = NULL; - struct ll_sb_info *sbi; - struct mdt_body *body; - static const char dotdot[] = ".."; - struct 
md_op_data *op_data; - int rc; - int lmmsize; - - LASSERT(dir && S_ISDIR(dir->i_mode)); - - sbi = ll_s2sbi(dir->i_sb); - - CDEBUG(D_INFO, "%s: getting parent for (" DFID ")\n", - ll_get_fsname(dir->i_sb, NULL, 0), - PFID(ll_inode2fid(dir))); - - rc = ll_get_default_mdsize(sbi, &lmmsize); - if (rc != 0) - return rc; - - op_data = ll_prep_md_op_data(NULL, dir, NULL, dotdot, - strlen(dotdot), lmmsize, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - rc = md_getattr_name(sbi->ll_md_exp, op_data, &req); - ll_finish_md_op_data(op_data); - if (rc) { - CERROR("%s: failure inode " DFID " get parent: rc = %d\n", - ll_get_fsname(dir->i_sb, NULL, 0), - PFID(ll_inode2fid(dir)), rc); - return rc; - } - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - /* - * LU-3952: MDT may lost the FID of its parent, we should not crash - * the NFS server, ll_iget_for_nfs() will handle the error. - */ - if (body->mbo_valid & OBD_MD_FLID) { - CDEBUG(D_INFO, "parent for " DFID " is " DFID "\n", - PFID(ll_inode2fid(dir)), PFID(&body->mbo_fid1)); - *parent_fid = body->mbo_fid1; - } - - ptlrpc_req_finished(req); - return 0; -} - -static struct dentry *ll_get_parent(struct dentry *dchild) -{ - struct lu_fid parent_fid = { 0 }; - struct dentry *dentry; - int rc; - - rc = ll_dir_get_parent_fid(dchild->d_inode, &parent_fid); - if (rc) - return ERR_PTR(rc); - - dentry = ll_iget_for_nfs(dchild->d_inode->i_sb, &parent_fid, NULL); - - return dentry; -} - -const struct export_operations lustre_export_operations = { - .get_parent = ll_get_parent, - .encode_fh = ll_encode_fh, - .get_name = ll_get_name, - .fh_to_dentry = ll_fh_to_dentry, - .fh_to_parent = ll_fh_to_parent, -}; diff --git a/drivers/staging/lustre/lustre/llite/lproc_llite.c b/drivers/staging/lustre/lustre/llite/lproc_llite.c deleted file mode 100644 index 49bf1b7ee3117009b6342ba1bf4692ea35dc7637..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/lproc_llite.c +++ /dev/null @@ -1,1659 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
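
A note ahead of the kbytes*_show() attributes in the file below: they convert
"blocks of os_bsize bytes" into KiB without a 64-bit division. blk_size starts
as os_bsize >> 10, and each halving of blk_size doubles the block count; for
os_bsize = 4096 that is blk_size = 4, two iterations, so the result is
os_blocks << 2, i.e. blocks * 4 KiB. The same loop in isolation (bsize is
assumed to be a power of two, as it is for the backing filesystems):

	static u64 example_blocks_to_kib(u64 blocks, u32 bsize)
	{
		u32 blk_size = bsize >> 10;

		while (blk_size >>= 1)
			blocks <<= 1;

		return blocks;
	}
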
- */ -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include - -#include "llite_internal.h" -#include "vvp_internal.h" - -/* debugfs llite mount point registration */ -static const struct file_operations ll_rw_extents_stats_fops; -static const struct file_operations ll_rw_extents_stats_pp_fops; -static const struct file_operations ll_rw_offset_stats_fops; - -static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%u\n", osfs.os_bsize); - - return rc; -} -LUSTRE_RO_ATTR(blocksize); - -static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; - - while (blk_size >>= 1) - result <<= 1; - - rc = sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytestotal); - -static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; - - while (blk_size >>= 1) - result <<= 1; - - rc = sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytesfree); - -static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; - - while (blk_size >>= 1) - result <<= 1; - - rc = sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytesavail); - -static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%llu\n", osfs.os_files); - - return rc; -} -LUSTRE_RO_ATTR(filestotal); - -static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - struct obd_statfs osfs; - int rc; - - rc = ll_statfs_internal(sbi->ll_sb, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%llu\n", osfs.os_ffree); - - return rc; -} -LUSTRE_RO_ATTR(filesfree); - -static ssize_t client_type_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "local client\n"); -} -LUSTRE_RO_ATTR(client_type); - -static ssize_t fstype_show(struct kobject *kobj, 
struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%s\n", sbi->ll_sb->s_type->name); -} -LUSTRE_RO_ATTR(fstype); - -static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%s\n", sbi->ll_sb_uuid.uuid); -} -LUSTRE_RO_ATTR(uuid); - -static int ll_site_stats_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - - /* - * See description of statistical counters in struct cl_site, and - * struct lu_site. - */ - return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), m); -} - -LPROC_SEQ_FOPS_RO(ll_site_stats); - -static ssize_t max_read_ahead_mb_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - long pages_number; - int mult; - - spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages; - spin_unlock(&sbi->ll_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult); -} - -static ssize_t max_read_ahead_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */ - - if (pages_number > totalram_pages / 2) { - CERROR("can't set file readahead more than %lu MB\n", - totalram_pages >> (20 - PAGE_SHIFT + 1)); /*1/2 of RAM*/ - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_pages = pages_number; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(max_read_ahead_mb); - -static ssize_t max_read_ahead_per_file_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - long pages_number; - int mult; - - spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_pages_per_file; - spin_unlock(&sbi->ll_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult); -} - -static ssize_t max_read_ahead_per_file_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - if (pages_number > sbi->ll_ra_info.ra_max_pages) { - CERROR("can't set file readahead more than max_read_ahead_mb %lu MB\n", - sbi->ll_ra_info.ra_max_pages); - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_pages_per_file = pages_number; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(max_read_ahead_per_file_mb); - -static ssize_t max_read_ahead_whole_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - long pages_number; - int mult; - - spin_lock(&sbi->ll_lock); - pages_number = sbi->ll_ra_info.ra_max_read_ahead_whole_pages; - spin_unlock(&sbi->ll_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, pages_number, mult); -} - -static ssize_t 
max_read_ahead_whole_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - /* Cap this at the current max readahead window size, the readahead - * algorithm does this anyway so it's pointless to set it larger. - */ - if (pages_number > sbi->ll_ra_info.ra_max_pages_per_file) { - CERROR("can't set max_read_ahead_whole_mb more than max_read_ahead_per_file_mb: %lu\n", - sbi->ll_ra_info.ra_max_pages_per_file >> (20 - PAGE_SHIFT)); - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - sbi->ll_ra_info.ra_max_read_ahead_whole_pages = pages_number; - spin_unlock(&sbi->ll_lock); - - return count; -} -LUSTRE_RW_ATTR(max_read_ahead_whole_mb); - -static int ll_max_cached_mb_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct cl_client_cache *cache = sbi->ll_cache; - int shift = 20 - PAGE_SHIFT; - long max_cached_mb; - long unused_mb; - - max_cached_mb = cache->ccc_lru_max >> shift; - unused_mb = atomic_long_read(&cache->ccc_lru_left) >> shift; - seq_printf(m, - "users: %d\n" - "max_cached_mb: %ld\n" - "used_mb: %ld\n" - "unused_mb: %ld\n" - "reclaim_count: %u\n", - atomic_read(&cache->ccc_users), - max_cached_mb, - max_cached_mb - unused_mb, - unused_mb, - cache->ccc_lru_shrinkers); - return 0; -} - -static ssize_t ll_max_cached_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct super_block *sb = ((struct seq_file *)file->private_data)->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct cl_client_cache *cache = sbi->ll_cache; - struct lu_env *env; - long diff = 0; - long nrpages = 0; - u16 refcheck; - long pages_number; - int mult; - long rc; - u64 val; - char kernbuf[128]; - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; - - mult = 1 << (20 - PAGE_SHIFT); - buffer += lprocfs_find_named_value(kernbuf, "max_cached_mb:", &count) - - kernbuf; - rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult); - if (rc) - return rc; - - if (val > LONG_MAX) - return -ERANGE; - pages_number = (long)val; - - if (pages_number < 0 || pages_number > totalram_pages) { - CERROR("%s: can't set max cache more than %lu MB\n", - ll_get_fsname(sb, NULL, 0), - totalram_pages >> (20 - PAGE_SHIFT)); - return -ERANGE; - } - - spin_lock(&sbi->ll_lock); - diff = pages_number - cache->ccc_lru_max; - spin_unlock(&sbi->ll_lock); - - /* easy - add more LRU slots. */ - if (diff >= 0) { - atomic_long_add(diff, &cache->ccc_lru_left); - rc = 0; - goto out; - } - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return 0; - - diff = -diff; - while (diff > 0) { - long tmp; - - /* reduce LRU budget from free slots. */ - do { - long ov, nv; - - ov = atomic_long_read(&cache->ccc_lru_left); - if (ov == 0) - break; - - nv = ov > diff ? ov - diff : 0; - rc = atomic_long_cmpxchg(&cache->ccc_lru_left, ov, nv); - if (likely(ov == rc)) { - diff -= ov - nv; - nrpages += ov - nv; - break; - } - } while (1); - - if (diff <= 0) - break; - - if (!sbi->ll_dt_exp) { /* being initialized */ - rc = 0; - goto out; - } - - /* difficult - have to ask OSCs to drop LRU slots. 
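The shrink path above is deliberately lock-free: before asking the OSCs to give anything back, the inner do/while races an atomic compare-and-swap against every other user of ccc_lru_left and takes whatever slack is available. A standalone C11 sketch of that reservation step (names and the userspace atomics are illustrative; the kernel code uses atomic_long_cmpxchg()):

#include <stdatomic.h>

/* Take up to 'want' units from a shared budget without going negative;
 * returns how many units were actually taken (0 if the budget is empty). */
static long take_budget(atomic_long *left, long want)
{
	long ov, nv;

	do {
		ov = atomic_load(left);
		if (ov == 0)
			return 0;
		nv = ov > want ? ov - want : 0;
		/* the CAS fails and we retry if another thread moved 'left' */
	} while (!atomic_compare_exchange_weak(left, &ov, nv));

	return ov - nv;
}

int main(void)
{
	atomic_long left = 10;

	/* reserve 4 of 10 units; 6 must remain */
	return take_budget(&left, 4) != 4 || atomic_load(&left) != 6;
}

Only when this fast path cannot cover the whole reduction does the code fall through to obd_set_info_async() and make the OSCs shrink their per-device LRUs.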
*/ - tmp = diff << 1; - rc = obd_set_info_async(env, sbi->ll_dt_exp, - sizeof(KEY_CACHE_LRU_SHRINK), - KEY_CACHE_LRU_SHRINK, - sizeof(tmp), &tmp, NULL); - if (rc < 0) - break; - } - cl_env_put(env, &refcheck); - -out: - if (rc >= 0) { - spin_lock(&sbi->ll_lock); - cache->ccc_lru_max = pages_number; - spin_unlock(&sbi->ll_lock); - rc = count; - } else { - atomic_long_add(nrpages, &cache->ccc_lru_left); - } - return rc; -} - -LPROC_SEQ_FOPS(ll_max_cached_mb); - -static ssize_t checksum_pages_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", (sbi->ll_flags & LL_SBI_CHECKSUM) ? 1 : 0); -} - -static ssize_t checksum_pages_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - if (!sbi->ll_dt_exp) - /* Not set up yet */ - return -EAGAIN; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - if (val) - sbi->ll_flags |= LL_SBI_CHECKSUM; - else - sbi->ll_flags &= ~LL_SBI_CHECKSUM; - - rc = obd_set_info_async(NULL, sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), - KEY_CHECKSUM, sizeof(val), &val, NULL); - if (rc) - CWARN("Failed to set OSC checksum flags: %d\n", rc); - - return count; -} -LUSTRE_RW_ATTR(checksum_pages); - -static ssize_t ll_rd_track_id(struct kobject *kobj, char *buf, - enum stats_track_type type) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - if (sbi->ll_stats_track_type == type) - return sprintf(buf, "%d\n", sbi->ll_stats_track_id); - else if (sbi->ll_stats_track_type == STATS_TRACK_ALL) - return sprintf(buf, "0 (all)\n"); - else - return sprintf(buf, "untracked\n"); -} - -static ssize_t ll_wr_track_id(struct kobject *kobj, const char *buffer, - size_t count, - enum stats_track_type type) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long pid; - - rc = kstrtoul(buffer, 10, &pid); - if (rc) - return rc; - sbi->ll_stats_track_id = pid; - if (pid == 0) - sbi->ll_stats_track_type = STATS_TRACK_ALL; - else - sbi->ll_stats_track_type = type; - lprocfs_clear_stats(sbi->ll_stats); - return count; -} - -static ssize_t stats_track_pid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return ll_rd_track_id(kobj, buf, STATS_TRACK_PID); -} - -static ssize_t stats_track_pid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PID); -} -LUSTRE_RW_ATTR(stats_track_pid); - -static ssize_t stats_track_ppid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return ll_rd_track_id(kobj, buf, STATS_TRACK_PPID); -} - -static ssize_t stats_track_ppid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_PPID); -} -LUSTRE_RW_ATTR(stats_track_ppid); - -static ssize_t stats_track_gid_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - return ll_rd_track_id(kobj, buf, STATS_TRACK_GID); -} - -static ssize_t stats_track_gid_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - return ll_wr_track_id(kobj, buffer, count, STATS_TRACK_GID); -} -LUSTRE_RW_ATTR(stats_track_gid); - -static ssize_t statahead_max_show(struct kobject *kobj, - struct attribute 
*attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_sa_max); -} - -static ssize_t statahead_max_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val <= LL_SA_RPC_MAX) - sbi->ll_sa_max = val; - else - CERROR("Bad statahead_max value %lu. Valid values are in the range [0, %d]\n", - val, LL_SA_RPC_MAX); - - return count; -} -LUSTRE_RW_ATTR(statahead_max); - -static ssize_t statahead_agl_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_AGL_ENABLED ? 1 : 0); -} - -static ssize_t statahead_agl_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val) - sbi->ll_flags |= LL_SBI_AGL_ENABLED; - else - sbi->ll_flags &= ~LL_SBI_AGL_ENABLED; - - return count; -} -LUSTRE_RW_ATTR(statahead_agl); - -static int ll_statahead_stats_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - - seq_printf(m, - "statahead total: %u\n" - "statahead wrong: %u\n" - "agl total: %u\n", - atomic_read(&sbi->ll_sa_total), - atomic_read(&sbi->ll_sa_wrong), - atomic_read(&sbi->ll_agl_total)); - return 0; -} - -LPROC_SEQ_FOPS_RO(ll_statahead_stats); - -static ssize_t lazystatfs_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_flags & LL_SBI_LAZYSTATFS ? 1 : 0); -} - -static ssize_t lazystatfs_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val) - sbi->ll_flags |= LL_SBI_LAZYSTATFS; - else - sbi->ll_flags &= ~LL_SBI_LAZYSTATFS; - - return count; -} -LUSTRE_RW_ATTR(lazystatfs); - -static ssize_t max_easize_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - unsigned int ealen; - int rc; - - rc = ll_get_max_mdsize(sbi, &ealen); - if (rc) - return rc; - - return sprintf(buf, "%u\n", ealen); -} -LUSTRE_RO_ATTR(max_easize); - -/** - * Get default_easize. - * - * \see client_obd::cl_default_mds_easize - * - * \param[in] kobj kernel object for sysfs tree - * \param[in] attr attribute of this kernel object - * \param[in] buf buffer to write data into - * - * \retval positive \a count on success - * \retval negative negated errno on failure - */ -static ssize_t default_easize_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - unsigned int ealen; - int rc; - - rc = ll_get_default_mdsize(sbi, &ealen); - if (rc) - return rc; - - return sprintf(buf, "%u\n", ealen); -} - -/** - * Set default_easize. 
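One behavioural quirk in statahead_max_store() above is easy to miss: a value beyond LL_SA_RPC_MAX is logged with CERROR() and discarded, yet the handler still returns count, so the writing process sees success while the old limit stays in force. A minimal userspace sketch of that pattern (the cap of 512 is an illustrative stand-in for LL_SA_RPC_MAX):

#include <stdio.h>

#define SA_RPC_MAX	512		/* illustrative stand-in */

static unsigned int sa_max = 32;

/* mimics the sysfs store: a rejected value is logged, not reported */
static long store_statahead_max(unsigned long val, long count)
{
	if (val <= SA_RPC_MAX)
		sa_max = val;
	else
		fprintf(stderr, "bad statahead_max %lu, keeping %u\n",
			val, sa_max);
	return count;			/* "success" either way */
}

int main(void)
{
	store_statahead_max(100000, 7);	/* out of range: logged and ignored */
	return sa_max != 32;
}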
- * - * Range checking on the passed value is handled by - * ll_set_default_mdsize(). - * - * \see client_obd::cl_default_mds_easize - * - * \param[in] kobj kernel object for sysfs tree - * \param[in] attr attribute of this kernel object - * \param[in] buffer string passed from user space - * \param[in] count \a buffer length - * - * \retval positive \a count on success - * \retval negative negated errno on failure - */ -static ssize_t default_easize_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - rc = ll_set_default_mdsize(sbi, val); - if (rc) - return rc; - - return count; -} -LUSTRE_RW_ATTR(default_easize); - -static int ll_sbi_flags_seq_show(struct seq_file *m, void *v) -{ - const char *str[] = LL_SBI_FLAGS; - struct super_block *sb = m->private; - int flags = ll_s2sbi(sb)->ll_flags; - int i = 0; - - while (flags != 0) { - if (ARRAY_SIZE(str) <= i) { - CERROR("%s: Revise array LL_SBI_FLAGS to match sbi flags please.\n", - ll_get_fsname(sb, NULL, 0)); - return -EINVAL; - } - - if (flags & 0x1) - seq_printf(m, "%s ", str[i]); - flags >>= 1; - ++i; - } - seq_puts(m, "\b\n"); - return 0; -} - -LPROC_SEQ_FOPS_RO(ll_sbi_flags); - -static ssize_t xattr_cache_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - - return sprintf(buf, "%u\n", sbi->ll_xattr_cache_enabled); -} - -static ssize_t xattr_cache_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val != 0 && val != 1) - return -ERANGE; - - if (val == 1 && !(sbi->ll_flags & LL_SBI_XATTR_CACHE)) - return -ENOTSUPP; - - sbi->ll_xattr_cache_enabled = val; - - return count; -} -LUSTRE_RW_ATTR(xattr_cache); - -static int ll_unstable_stats_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct cl_client_cache *cache = sbi->ll_cache; - long pages; - int mb; - - pages = atomic_long_read(&cache->ccc_unstable_nr); - mb = (pages * PAGE_SIZE) >> 20; - - seq_printf(m, - "unstable_check: %8d\n" - "unstable_pages: %12ld\n" - "unstable_mb: %8d\n", - cache->ccc_unstable_check, pages, mb); - - return 0; -} - -static ssize_t ll_unstable_stats_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct super_block *sb = ((struct seq_file *)file->private_data)->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - char kernbuf[128]; - int val, rc; - - if (!count) - return 0; - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; - - buffer += lprocfs_find_named_value(kernbuf, "unstable_check:", &count) - - kernbuf; - rc = lprocfs_write_helper(buffer, count, &val); - if (rc < 0) - return rc; - - /* borrow lru lock to set the value */ - spin_lock(&sbi->ll_cache->ccc_lru_lock); - sbi->ll_cache->ccc_unstable_check = !!val; - spin_unlock(&sbi->ll_cache->ccc_lru_lock); - - return count; -} -LPROC_SEQ_FOPS(ll_unstable_stats); - -static int ll_root_squash_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct 
ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - - seq_printf(m, "%u:%u\n", squash->rsi_uid, squash->rsi_gid); - return 0; -} - -static ssize_t ll_root_squash_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - - return lprocfs_wr_root_squash(buffer, count, squash, - ll_get_fsname(sb, NULL, 0)); -} -LPROC_SEQ_FOPS(ll_root_squash); - -static int ll_nosquash_nids_seq_show(struct seq_file *m, void *v) -{ - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - int len; - - down_read(&squash->rsi_sem); - if (!list_empty(&squash->rsi_nosquash_nids)) { - len = cfs_print_nidlist(m->buf + m->count, m->size - m->count, - &squash->rsi_nosquash_nids); - m->count += len; - seq_puts(m, "\n"); - } else { - seq_puts(m, "NONE\n"); - } - up_read(&squash->rsi_sem); - - return 0; -} - -static ssize_t ll_nosquash_nids_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct super_block *sb = m->private; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct root_squash_info *squash = &sbi->ll_squash; - int rc; - - rc = lprocfs_wr_nosquash_nids(buffer, count, squash, - ll_get_fsname(sb, NULL, 0)); - if (rc < 0) - return rc; - - ll_compute_rootsquash_state(sbi); - - return rc; -} - -LPROC_SEQ_FOPS(ll_nosquash_nids); - -static struct lprocfs_vars lprocfs_llite_obd_vars[] = { - /* { "mntpt_path", ll_rd_path, 0, 0 }, */ - { "site", &ll_site_stats_fops, NULL, 0 }, - /* { "filegroups", lprocfs_rd_filegroups, 0, 0 }, */ - { "max_cached_mb", &ll_max_cached_mb_fops, NULL }, - { "statahead_stats", &ll_statahead_stats_fops, NULL, 0 }, - { "unstable_stats", &ll_unstable_stats_fops, NULL }, - { "sbi_flags", &ll_sbi_flags_fops, NULL, 0 }, - { .name = "root_squash", - .fops = &ll_root_squash_fops }, - { .name = "nosquash_nids", - .fops = &ll_nosquash_nids_fops }, - { NULL } -}; - -#define MAX_STRING_SIZE 128 - -static struct attribute *llite_attrs[] = { - &lustre_attr_blocksize.attr, - &lustre_attr_kbytestotal.attr, - &lustre_attr_kbytesfree.attr, - &lustre_attr_kbytesavail.attr, - &lustre_attr_filestotal.attr, - &lustre_attr_filesfree.attr, - &lustre_attr_client_type.attr, - &lustre_attr_fstype.attr, - &lustre_attr_uuid.attr, - &lustre_attr_max_read_ahead_mb.attr, - &lustre_attr_max_read_ahead_per_file_mb.attr, - &lustre_attr_max_read_ahead_whole_mb.attr, - &lustre_attr_checksum_pages.attr, - &lustre_attr_stats_track_pid.attr, - &lustre_attr_stats_track_ppid.attr, - &lustre_attr_stats_track_gid.attr, - &lustre_attr_statahead_max.attr, - &lustre_attr_statahead_agl.attr, - &lustre_attr_lazystatfs.attr, - &lustre_attr_max_easize.attr, - &lustre_attr_default_easize.attr, - &lustre_attr_xattr_cache.attr, - NULL, -}; - -static void llite_sb_release(struct kobject *kobj) -{ - struct ll_sb_info *sbi = container_of(kobj, struct ll_sb_info, - ll_kobj); - complete(&sbi->ll_kobj_unregister); -} - -static struct kobj_type llite_ktype = { - .default_attrs = llite_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = llite_sb_release, -}; - -static const struct llite_file_opcode { - __u32 opcode; - __u32 type; - const char *opname; -} llite_opcode_table[LPROC_LL_FILE_OPCODES] = { - /* file operation */ - { LPROC_LL_DIRTY_HITS, 
LPROCFS_TYPE_REGS, "dirty_pages_hits" }, - { LPROC_LL_DIRTY_MISSES, LPROCFS_TYPE_REGS, "dirty_pages_misses" }, - { LPROC_LL_READ_BYTES, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, - "read_bytes" }, - { LPROC_LL_WRITE_BYTES, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, - "write_bytes" }, - { LPROC_LL_BRW_READ, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, - "brw_read" }, - { LPROC_LL_BRW_WRITE, LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, - "brw_write" }, - { LPROC_LL_IOCTL, LPROCFS_TYPE_REGS, "ioctl" }, - { LPROC_LL_OPEN, LPROCFS_TYPE_REGS, "open" }, - { LPROC_LL_RELEASE, LPROCFS_TYPE_REGS, "close" }, - { LPROC_LL_MAP, LPROCFS_TYPE_REGS, "mmap" }, - { LPROC_LL_LLSEEK, LPROCFS_TYPE_REGS, "seek" }, - { LPROC_LL_FSYNC, LPROCFS_TYPE_REGS, "fsync" }, - { LPROC_LL_READDIR, LPROCFS_TYPE_REGS, "readdir" }, - /* inode operation */ - { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, - { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, - { LPROC_LL_FLOCK, LPROCFS_TYPE_REGS, "flock" }, - { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, - /* dir inode operation */ - { LPROC_LL_CREATE, LPROCFS_TYPE_REGS, "create" }, - { LPROC_LL_LINK, LPROCFS_TYPE_REGS, "link" }, - { LPROC_LL_UNLINK, LPROCFS_TYPE_REGS, "unlink" }, - { LPROC_LL_SYMLINK, LPROCFS_TYPE_REGS, "symlink" }, - { LPROC_LL_MKDIR, LPROCFS_TYPE_REGS, "mkdir" }, - { LPROC_LL_RMDIR, LPROCFS_TYPE_REGS, "rmdir" }, - { LPROC_LL_MKNOD, LPROCFS_TYPE_REGS, "mknod" }, - { LPROC_LL_RENAME, LPROCFS_TYPE_REGS, "rename" }, - /* special inode operation */ - { LPROC_LL_STAFS, LPROCFS_TYPE_REGS, "statfs" }, - { LPROC_LL_ALLOC_INODE, LPROCFS_TYPE_REGS, "alloc_inode" }, - { LPROC_LL_SETXATTR, LPROCFS_TYPE_REGS, "setxattr" }, - { LPROC_LL_GETXATTR, LPROCFS_TYPE_REGS, "getxattr" }, - { LPROC_LL_GETXATTR_HITS, LPROCFS_TYPE_REGS, "getxattr_hits" }, - { LPROC_LL_LISTXATTR, LPROCFS_TYPE_REGS, "listxattr" }, - { LPROC_LL_REMOVEXATTR, LPROCFS_TYPE_REGS, "removexattr" }, - { LPROC_LL_INODE_PERM, LPROCFS_TYPE_REGS, "inode_permission" }, -}; - -void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) -{ - if (!sbi->ll_stats) - return; - if (sbi->ll_stats_track_type == STATS_TRACK_ALL) - lprocfs_counter_add(sbi->ll_stats, op, count); - else if (sbi->ll_stats_track_type == STATS_TRACK_PID && - sbi->ll_stats_track_id == current->pid) - lprocfs_counter_add(sbi->ll_stats, op, count); - else if (sbi->ll_stats_track_type == STATS_TRACK_PPID && - sbi->ll_stats_track_id == current->real_parent->pid) - lprocfs_counter_add(sbi->ll_stats, op, count); - else if (sbi->ll_stats_track_type == STATS_TRACK_GID && - sbi->ll_stats_track_id == - from_kgid(&init_user_ns, current_gid())) - lprocfs_counter_add(sbi->ll_stats, op, count); -} -EXPORT_SYMBOL(ll_stats_ops_tally); - -static const char *ra_stat_string[] = { - [RA_STAT_HIT] = "hits", - [RA_STAT_MISS] = "misses", - [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", - [RA_STAT_MISS_IN_WINDOW] = "miss inside window", - [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", - [RA_STAT_FAILED_MATCH] = "failed lock match", - [RA_STAT_DISCARDED] = "read but discarded", - [RA_STAT_ZERO_LEN] = "zero length file", - [RA_STAT_ZERO_WINDOW] = "zero size window", - [RA_STAT_EOF] = "read-ahead to EOF", - [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", - [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", - [RA_STAT_FAILED_REACH_END] = "failed to reach end" -}; - -int ldebugfs_register_mountpoint(struct dentry *parent, - struct super_block *sb, char *osc, char *mdc) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - struct 
ll_sb_info *sbi = ll_s2sbi(sb); - struct obd_device *obd; - struct dentry *dir; - char name[MAX_STRING_SIZE + 1], *ptr; - int err, id, len; - - name[MAX_STRING_SIZE] = '\0'; - - LASSERT(sbi); - LASSERT(mdc); - LASSERT(osc); - - /* Get fsname */ - len = strlen(lsi->lsi_lmd->lmd_profile); - ptr = strrchr(lsi->lsi_lmd->lmd_profile, '-'); - if (ptr && (strcmp(ptr, "-client") == 0)) - len -= 7; - - /* Mount info */ - snprintf(name, MAX_STRING_SIZE, "%.*s-%p", len, - lsi->lsi_lmd->lmd_profile, sb); - - dir = debugfs_create_dir(name, parent); - sbi->ll_debugfs_entry = dir; - - debugfs_create_file("dump_page_cache", 0444, dir, sbi, - &vvp_dump_pgcache_file_ops); - debugfs_create_file("extents_stats", 0644, dir, sbi, - &ll_rw_extents_stats_fops); - debugfs_create_file("extents_stats_per_process", 0644, - dir, sbi, &ll_rw_extents_stats_pp_fops); - debugfs_create_file("offset_stats", 0644, dir, sbi, - &ll_rw_offset_stats_fops); - - /* File operations stats */ - sbi->ll_stats = lprocfs_alloc_stats(LPROC_LL_FILE_OPCODES, - LPROCFS_STATS_FLAG_NONE); - if (!sbi->ll_stats) { - err = -ENOMEM; - goto out; - } - /* do counter init */ - for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { - __u32 type = llite_opcode_table[id].type; - void *ptr = NULL; - - if (type & LPROCFS_TYPE_REGS) - ptr = "regs"; - else if (type & LPROCFS_TYPE_BYTES) - ptr = "bytes"; - else if (type & LPROCFS_TYPE_PAGES) - ptr = "pages"; - lprocfs_counter_init(sbi->ll_stats, - llite_opcode_table[id].opcode, - (type & LPROCFS_CNTR_AVGMINMAX), - llite_opcode_table[id].opname, ptr); - } - - debugfs_create_file("stats", 0644, sbi->ll_debugfs_entry, sbi->ll_stats, - &lprocfs_stats_seq_fops); - - sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), - LPROCFS_STATS_FLAG_NONE); - if (!sbi->ll_ra_stats) { - err = -ENOMEM; - goto out; - } - - for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) - lprocfs_counter_init(sbi->ll_ra_stats, id, 0, - ra_stat_string[id], "pages"); - - debugfs_create_file("stats", 0644, sbi->ll_debugfs_entry, - sbi->ll_ra_stats, &lprocfs_stats_seq_fops); - - ldebugfs_add_vars(sbi->ll_debugfs_entry, lprocfs_llite_obd_vars, sb); - - sbi->ll_kobj.kset = llite_kset; - init_completion(&sbi->ll_kobj_unregister); - err = kobject_init_and_add(&sbi->ll_kobj, &llite_ktype, NULL, - "%s", name); - if (err) - goto out; - - /* MDC info */ - obd = class_name2obd(mdc); - - err = sysfs_create_link(&sbi->ll_kobj, &obd->obd_kobj, - obd->obd_type->typ_name); - if (err) - goto out; - - /* OSC */ - obd = class_name2obd(osc); - - err = sysfs_create_link(&sbi->ll_kobj, &obd->obd_kobj, - obd->obd_type->typ_name); -out: - if (err) { - debugfs_remove_recursive(sbi->ll_debugfs_entry); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); - } - return err; -} - -void ldebugfs_unregister_mountpoint(struct ll_sb_info *sbi) -{ - debugfs_remove_recursive(sbi->ll_debugfs_entry); - kobject_put(&sbi->ll_kobj); - wait_for_completion(&sbi->ll_kobj_unregister); - lprocfs_free_stats(&sbi->ll_ra_stats); - lprocfs_free_stats(&sbi->ll_stats); -} - -#undef MAX_STRING_SIZE - -#define pct(a, b) (b ? 
a * 100 / b : 0) - -static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, - struct seq_file *seq, int which) -{ - unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; - unsigned long start, end, r, w; - char *unitp = "KMGTPEZY"; - int i, units = 10; - struct per_process_info *pp_info = &io_extents->pp_extents[which]; - - read_cum = 0; - write_cum = 0; - start = 0; - - for (i = 0; i < LL_HIST_MAX; i++) { - read_tot += pp_info->pp_r_hist.oh_buckets[i]; - write_tot += pp_info->pp_w_hist.oh_buckets[i]; - } - - for (i = 0; i < LL_HIST_MAX; i++) { - r = pp_info->pp_r_hist.oh_buckets[i]; - w = pp_info->pp_w_hist.oh_buckets[i]; - read_cum += r; - write_cum += w; - end = 1 << (i + LL_HIST_START - units); - seq_printf(seq, "%4lu%c - %4lu%c%c: %14lu %4lu %4lu | %14lu %4lu %4lu\n", - start, *unitp, end, *unitp, - (i == LL_HIST_MAX - 1) ? '+' : ' ', - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); - start = end; - if (start == 1024) { - start = 1; - units += 10; - unitp++; - } - if (read_cum == read_tot && write_cum == write_tot) - break; - } -} - -static int ll_rw_extents_stats_pp_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - int k; - - ktime_get_real_ts64(&now); - - if (!sbi->ll_rw_stats_on) { - seq_printf(seq, "disabled\n" - "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); - return 0; - } - seq_printf(seq, "snapshot_time: %llu.%09lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); - seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", - "extents", "calls", "%", "cum%", - "calls", "%", "cum%"); - spin_lock(&sbi->ll_pp_extent_lock); - for (k = 0; k < LL_PROCESS_HIST_MAX; k++) { - if (io_extents->pp_extents[k].pid != 0) { - seq_printf(seq, "\nPID: %d\n", - io_extents->pp_extents[k].pid); - ll_display_extents_info(io_extents, seq, k); - } - } - spin_unlock(&sbi->ll_pp_extent_lock); - return 0; -} - -static ssize_t ll_rw_extents_stats_pp_seq_write(struct file *file, - const char __user *buf, - size_t len, - loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - int i; - int value = 1, rc = 0; - - if (len == 0) - return -EINVAL; - - rc = lprocfs_write_helper(buf, len, &value); - if (rc < 0 && len < 16) { - char kernbuf[16]; - - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strcmp(kernbuf, "disabled") == 0 || - strcmp(kernbuf, "Disabled") == 0) - value = 0; - } - - if (value == 0) - sbi->ll_rw_stats_on = 0; - else - sbi->ll_rw_stats_on = 1; - - spin_lock(&sbi->ll_pp_extent_lock); - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - io_extents->pp_extents[i].pid = 0; - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); - } - spin_unlock(&sbi->ll_pp_extent_lock); - return len; -} - -LPROC_SEQ_FOPS(ll_rw_extents_stats_pp); - -static int ll_rw_extents_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - - ktime_get_real_ts64(&now); - - if (!sbi->ll_rw_stats_on) { - seq_printf(seq, 
"disabled\n" - "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); - return 0; - } - seq_printf(seq, "snapshot_time: %llu.%09lu (secs.usecs)\n", - (u64)now.tv_sec, (unsigned long)now.tv_nsec); - - seq_printf(seq, "%15s %19s | %20s\n", " ", "read", "write"); - seq_printf(seq, "%13s %14s %4s %4s | %14s %4s %4s\n", - "extents", "calls", "%", "cum%", - "calls", "%", "cum%"); - spin_lock(&sbi->ll_lock); - ll_display_extents_info(io_extents, seq, LL_PROCESS_HIST_MAX); - spin_unlock(&sbi->ll_lock); - - return 0; -} - -static ssize_t ll_rw_extents_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - int i; - int value = 1, rc = 0; - - if (len == 0) - return -EINVAL; - - rc = lprocfs_write_helper(buf, len, &value); - if (rc < 0 && len < 16) { - char kernbuf[16]; - - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strcmp(kernbuf, "disabled") == 0 || - strcmp(kernbuf, "Disabled") == 0) - value = 0; - } - - if (value == 0) - sbi->ll_rw_stats_on = 0; - else - sbi->ll_rw_stats_on = 1; - - spin_lock(&sbi->ll_pp_extent_lock); - for (i = 0; i <= LL_PROCESS_HIST_MAX; i++) { - io_extents->pp_extents[i].pid = 0; - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_r_hist); - lprocfs_oh_clear(&io_extents->pp_extents[i].pp_w_hist); - } - spin_unlock(&sbi->ll_pp_extent_lock); - - return len; -} - -LPROC_SEQ_FOPS(ll_rw_extents_stats); - -void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, - struct ll_file_data *file, loff_t pos, - size_t count, int rw) -{ - int i, cur = -1; - struct ll_rw_process_info *process; - struct ll_rw_process_info *offset; - int *off_count = &sbi->ll_rw_offset_entry_count; - int *process_count = &sbi->ll_offset_process_count; - struct ll_rw_extents_info *io_extents = &sbi->ll_rw_extents_info; - - if (!sbi->ll_rw_stats_on) - return; - process = sbi->ll_rw_process_info; - offset = sbi->ll_rw_offset_info; - - spin_lock(&sbi->ll_pp_extent_lock); - /* Extent statistics */ - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - if (io_extents->pp_extents[i].pid == pid) { - cur = i; - break; - } - } - - if (cur == -1) { - /* new process */ - sbi->ll_extent_process_count = - (sbi->ll_extent_process_count + 1) % LL_PROCESS_HIST_MAX; - cur = sbi->ll_extent_process_count; - io_extents->pp_extents[cur].pid = pid; - lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_r_hist); - lprocfs_oh_clear(&io_extents->pp_extents[cur].pp_w_hist); - } - - for (i = 0; (count >= (1 << LL_HIST_START << i)) && - (i < (LL_HIST_MAX - 1)); i++) - ; - if (rw == 0) { - io_extents->pp_extents[cur].pp_r_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_r_hist.oh_buckets[i]++; - } else { - io_extents->pp_extents[cur].pp_w_hist.oh_buckets[i]++; - io_extents->pp_extents[LL_PROCESS_HIST_MAX].pp_w_hist.oh_buckets[i]++; - } - spin_unlock(&sbi->ll_pp_extent_lock); - - spin_lock(&sbi->ll_process_lock); - /* Offset statistics */ - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - if (process[i].rw_pid == pid) { - if (process[i].rw_last_file != file) { - process[i].rw_range_start = pos; - process[i].rw_last_file_pos = pos + count; - process[i].rw_smallest_extent = count; - process[i].rw_largest_extent = count; - process[i].rw_offset = 0; - process[i].rw_last_file = file; - spin_unlock(&sbi->ll_process_lock); 
- return; - } - if (process[i].rw_last_file_pos != pos) { - *off_count = - (*off_count + 1) % LL_OFFSET_HIST_MAX; - offset[*off_count].rw_op = process[i].rw_op; - offset[*off_count].rw_pid = pid; - offset[*off_count].rw_range_start = - process[i].rw_range_start; - offset[*off_count].rw_range_end = - process[i].rw_last_file_pos; - offset[*off_count].rw_smallest_extent = - process[i].rw_smallest_extent; - offset[*off_count].rw_largest_extent = - process[i].rw_largest_extent; - offset[*off_count].rw_offset = - process[i].rw_offset; - process[i].rw_op = rw; - process[i].rw_range_start = pos; - process[i].rw_smallest_extent = count; - process[i].rw_largest_extent = count; - process[i].rw_offset = pos - - process[i].rw_last_file_pos; - } - if (process[i].rw_smallest_extent > count) - process[i].rw_smallest_extent = count; - if (process[i].rw_largest_extent < count) - process[i].rw_largest_extent = count; - process[i].rw_last_file_pos = pos + count; - spin_unlock(&sbi->ll_process_lock); - return; - } - } - *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; - process[*process_count].rw_pid = pid; - process[*process_count].rw_op = rw; - process[*process_count].rw_range_start = pos; - process[*process_count].rw_last_file_pos = pos + count; - process[*process_count].rw_smallest_extent = count; - process[*process_count].rw_largest_extent = count; - process[*process_count].rw_offset = 0; - process[*process_count].rw_last_file = file; - spin_unlock(&sbi->ll_process_lock); -} - -static int ll_rw_offset_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_process_info *offset = sbi->ll_rw_offset_info; - struct ll_rw_process_info *process = sbi->ll_rw_process_info; - int i; - - ktime_get_real_ts64(&now); - - if (!sbi->ll_rw_stats_on) { - seq_printf(seq, "disabled\n" - "write anything in this file to activate, then 0 or \"[D/d]isabled\" to deactivate\n"); - return 0; - } - spin_lock(&sbi->ll_process_lock); - - seq_printf(seq, "snapshot_time: %llu.%09lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "%3s %10s %14s %14s %17s %17s %14s\n", - "R/W", "PID", "RANGE START", "RANGE END", - "SMALLEST EXTENT", "LARGEST EXTENT", "OFFSET"); - /* We stored the discontiguous offsets here; print them first */ - for (i = 0; i < LL_OFFSET_HIST_MAX; i++) { - if (offset[i].rw_pid != 0) - seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", - offset[i].rw_op == READ ? 'R' : 'W', - offset[i].rw_pid, - offset[i].rw_range_start, - offset[i].rw_range_end, - (unsigned long)offset[i].rw_smallest_extent, - (unsigned long)offset[i].rw_largest_extent, - offset[i].rw_offset); - } - /* Then print the current offsets for each process */ - for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { - if (process[i].rw_pid != 0) - seq_printf(seq, - "%3c %10d %14Lu %14Lu %17lu %17lu %14Lu", - process[i].rw_op == READ ? 
'R' : 'W', - process[i].rw_pid, - process[i].rw_range_start, - process[i].rw_last_file_pos, - (unsigned long)process[i].rw_smallest_extent, - (unsigned long)process[i].rw_largest_extent, - process[i].rw_offset); - } - spin_unlock(&sbi->ll_process_lock); - - return 0; -} - -static ssize_t ll_rw_offset_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct ll_sb_info *sbi = seq->private; - struct ll_rw_process_info *process_info = sbi->ll_rw_process_info; - struct ll_rw_process_info *offset_info = sbi->ll_rw_offset_info; - int value = 1, rc = 0; - - if (len == 0) - return -EINVAL; - - rc = lprocfs_write_helper(buf, len, &value); - - if (rc < 0 && len < 16) { - char kernbuf[16]; - - if (copy_from_user(kernbuf, buf, len)) - return -EFAULT; - kernbuf[len] = 0; - - if (kernbuf[len - 1] == '\n') - kernbuf[len - 1] = 0; - - if (strcmp(kernbuf, "disabled") == 0 || - strcmp(kernbuf, "Disabled") == 0) - value = 0; - } - - if (value == 0) - sbi->ll_rw_stats_on = 0; - else - sbi->ll_rw_stats_on = 1; - - spin_lock(&sbi->ll_process_lock); - sbi->ll_offset_process_count = 0; - sbi->ll_rw_offset_entry_count = 0; - memset(process_info, 0, sizeof(struct ll_rw_process_info) * - LL_PROCESS_HIST_MAX); - memset(offset_info, 0, sizeof(struct ll_rw_process_info) * - LL_OFFSET_HIST_MAX); - spin_unlock(&sbi->ll_process_lock); - - return len; -} - -LPROC_SEQ_FOPS(ll_rw_offset_stats); - -void lprocfs_llite_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->obd_vars = lprocfs_llite_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/llite/namei.c b/drivers/staging/lustre/lustre/llite/namei.c deleted file mode 100644 index d5f6d20afe8c4adc5c8acd399002c3f57eae0bcd..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/namei.c +++ /dev/null @@ -1,1207 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include "llite_internal.h" - -static int ll_create_it(struct inode *dir, struct dentry *dentry, - struct lookup_intent *it); - -/* called from iget5_locked->find_inode() under inode_hash_lock spinlock */ -static int ll_test_inode(struct inode *inode, void *opaque) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lustre_md *md = opaque; - - if (unlikely(!(md->body->mbo_valid & OBD_MD_FLID))) { - CERROR("MDS body missing FID\n"); - return 0; - } - - if (!lu_fid_eq(&lli->lli_fid, &md->body->mbo_fid1)) - return 0; - - return 1; -} - -static int ll_set_inode(struct inode *inode, void *opaque) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct mdt_body *body = ((struct lustre_md *)opaque)->body; - - if (unlikely(!(body->mbo_valid & OBD_MD_FLID))) { - CERROR("MDS body missing FID\n"); - return -EINVAL; - } - - lli->lli_fid = body->mbo_fid1; - if (unlikely(!(body->mbo_valid & OBD_MD_FLTYPE))) { - CERROR("Can not initialize inode " DFID - " without object type: valid = %#llx\n", - PFID(&lli->lli_fid), body->mbo_valid); - return -EINVAL; - } - - inode->i_mode = (inode->i_mode & ~S_IFMT) | (body->mbo_mode & S_IFMT); - if (unlikely(inode->i_mode == 0)) { - CERROR("Invalid inode " DFID " type\n", PFID(&lli->lli_fid)); - return -EINVAL; - } - - ll_lli_init(lli); - - return 0; -} - -/** - * Get an inode by inode number(@hash), which is already instantiated by - * the intent lookup). - */ -struct inode *ll_iget(struct super_block *sb, ino_t hash, - struct lustre_md *md) -{ - struct inode *inode; - int rc = 0; - - LASSERT(hash != 0); - inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); - if (!inode) - return ERR_PTR(-ENOMEM); - - if (inode->i_state & I_NEW) { - rc = ll_read_inode2(inode, md); - if (!rc && S_ISREG(inode->i_mode) && - !ll_i2info(inode)->lli_clob) - rc = cl_file_inode_init(inode, md); - - if (rc) { - /* - * Let's clear directory lsm here, otherwise - * make_bad_inode() will reset the inode mode - * to regular, then ll_clear_inode will not - * be able to clear lsm_md - */ - if (S_ISDIR(inode->i_mode)) - ll_dir_clear_lsm_md(inode); - make_bad_inode(inode); - unlock_new_inode(inode); - iput(inode); - inode = ERR_PTR(rc); - } else { - unlock_new_inode(inode); - } - } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) { - rc = ll_update_inode(inode, md); - CDEBUG(D_VFSTRACE, "got inode: " DFID "(%p): rc = %d\n", - PFID(&md->body->mbo_fid1), inode, rc); - if (rc) { - if (S_ISDIR(inode->i_mode)) - ll_dir_clear_lsm_md(inode); - iput(inode); - inode = ERR_PTR(rc); - } - } - return inode; -} - -static void ll_invalidate_negative_children(struct inode *dir) -{ - struct dentry *dentry, *tmp_subdir; - - spin_lock(&dir->i_lock); - hlist_for_each_entry(dentry, &dir->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!list_empty(&dentry->d_subdirs)) { - struct dentry *child; - - list_for_each_entry_safe(child, tmp_subdir, - &dentry->d_subdirs, - d_child) { - if (d_really_is_negative(child)) - d_lustre_invalidate(child, 1); - } - } - spin_unlock(&dentry->d_lock); - } - spin_unlock(&dir->i_lock); -} - -int ll_test_inode_by_fid(struct inode *inode, void *opaque) -{ - return lu_fid_eq(&ll_i2info(inode)->lli_fid, opaque); -} - -int ll_md_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) -{ - struct lustre_handle lockh; - int rc; - - switch (flag) { - case LDLM_CB_BLOCKING: - 
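ll_iget() above goes through iget5_locked() with the ll_test_inode()/ll_set_inode() pair because the inode number it passes is only a hash hint: the 128-bit Lustre FID is folded down to an unsigned long, so two live FIDs can collide and must be told apart by a full compare. An illustrative sketch of the idea; the fold below is a made-up stand-in, not the real cl_fid_build_ino():

#include <stdbool.h>
#include <stdint.h>

struct fid_sketch { uint64_t seq; uint32_t oid; uint32_t ver; };

/* lossy fold: distinct FIDs may produce the same hash value */
static unsigned long fid_hash(const struct fid_sketch *f)
{
	return (unsigned long)(f->seq ^ f->oid);
}

/* the authoritative comparison used when hash buckets collide */
static bool fid_eq(const struct fid_sketch *a, const struct fid_sketch *b)
{
	return a->seq == b->seq && a->oid == b->oid && a->ver == b->ver;
}

int main(void)
{
	struct fid_sketch a = { 0x200000401ULL, 7, 0 };
	struct fid_sketch b = a;

	return !(fid_hash(&a) == fid_hash(&b) && fid_eq(&a, &b));
}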
ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (rc < 0) { - CDEBUG(D_INODE, "ldlm_cli_cancel: rc = %d\n", rc); - return rc; - } - break; - case LDLM_CB_CANCELING: { - struct inode *inode = ll_inode_from_resource_lock(lock); - __u64 bits = lock->l_policy_data.l_inodebits.bits; - - /* Inode is set to lock->l_resource->lr_lvb_inode - * for mdc - bug 24555 - */ - LASSERT(!lock->l_ast_data); - - if (!inode) - break; - - /* Invalidate all dentries associated with this inode */ - LASSERT(ldlm_is_canceling(lock)); - - if (!fid_res_name_eq(ll_inode2fid(inode), - &lock->l_resource->lr_name)) { - LDLM_ERROR(lock, - "data mismatch with object " DFID "(%p)", - PFID(ll_inode2fid(inode)), inode); - LBUG(); - } - - if (bits & MDS_INODELOCK_XATTR) { - if (S_ISDIR(inode->i_mode)) - ll_i2info(inode)->lli_def_stripe_offset = -1; - ll_xattr_cache_destroy(inode); - bits &= ~MDS_INODELOCK_XATTR; - } - - /* For OPEN locks we differentiate between lock modes - * LCK_CR, LCK_CW, LCK_PR - bug 22891 - */ - if (bits & MDS_INODELOCK_OPEN) - ll_have_md_lock(inode, &bits, lock->l_req_mode); - - if (bits & MDS_INODELOCK_OPEN) { - fmode_t fmode; - - switch (lock->l_req_mode) { - case LCK_CW: - fmode = FMODE_WRITE; - break; - case LCK_PR: - fmode = FMODE_EXEC; - break; - case LCK_CR: - fmode = FMODE_READ; - break; - default: - LDLM_ERROR(lock, "bad lock mode for OPEN lock"); - LBUG(); - } - - ll_md_real_close(inode, fmode); - } - - if (bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LAYOUT | MDS_INODELOCK_PERM)) - ll_have_md_lock(inode, &bits, LCK_MINMODE); - - if (bits & MDS_INODELOCK_LAYOUT) { - struct cl_object_conf conf = { - .coc_opc = OBJECT_CONF_INVALIDATE, - .coc_inode = inode, - }; - - rc = ll_layout_conf(inode, &conf); - if (rc < 0) - CDEBUG(D_INODE, "cannot invalidate layout of " - DFID ": rc = %d\n", - PFID(ll_inode2fid(inode)), rc); - } - - if (bits & MDS_INODELOCK_UPDATE) { - struct ll_inode_info *lli = ll_i2info(inode); - - spin_lock(&lli->lli_lock); - LTIME_S(inode->i_mtime) = 0; - LTIME_S(inode->i_atime) = 0; - LTIME_S(inode->i_ctime) = 0; - spin_unlock(&lli->lli_lock); - } - - if ((bits & MDS_INODELOCK_UPDATE) && S_ISDIR(inode->i_mode)) { - struct ll_inode_info *lli = ll_i2info(inode); - - CDEBUG(D_INODE, "invalidating inode " DFID " lli = %p, pfid = " DFID "\n", - PFID(ll_inode2fid(inode)), lli, - PFID(&lli->lli_pfid)); - - truncate_inode_pages(inode->i_mapping, 0); - - if (unlikely(!fid_is_zero(&lli->lli_pfid))) { - struct inode *master_inode = NULL; - unsigned long hash; - - /* - * This is slave inode, since all of the child - * dentry is connected on the master inode, so - * we have to invalidate the negative children - * on master inode - */ - CDEBUG(D_INODE, - "Invalidate s" DFID " m" DFID "\n", - PFID(ll_inode2fid(inode)), - PFID(&lli->lli_pfid)); - - hash = cl_fid_build_ino(&lli->lli_pfid, - ll_need_32bit_api(ll_i2sbi(inode))); - /* - * Do not lookup the inode with ilookup5, - * otherwise it will cause dead lock, - * - * 1. Client1 send chmod req to the MDT0, then - * on MDT0, it enqueues master and all of its - * slaves lock, (mdt_attr_set() -> - * mdt_lock_slaves()), after gets master and - * stripe0 lock, it will send the enqueue req - * (for stripe1) to MDT1, then MDT1 finds the - * lock has been granted to client2. Then MDT1 - * sends blocking ast to client2. - * - * 2. 
At the same time, client2 tries to unlink - * the striped dir (rm -rf striped_dir), and - * during lookup, it will hold the master inode - * of the striped directory, whose inode state - * is NEW, then tries to revalidate all of its - * slaves, (ll_prep_inode()->ll_iget()-> - * ll_read_inode2()-> ll_update_inode().). And - * it will be blocked on the server side because - * of 1. - * - * 3. Then the client get the blocking_ast req, - * cancel the lock, but being blocked if using - * ->ilookup5()), because master inode state is - * NEW. - */ - master_inode = ilookup5_nowait(inode->i_sb, - hash, - ll_test_inode_by_fid, - (void *)&lli->lli_pfid); - if (master_inode) { - ll_invalidate_negative_children(master_inode); - iput(master_inode); - } - } else { - ll_invalidate_negative_children(inode); - } - } - - if ((bits & (MDS_INODELOCK_LOOKUP | MDS_INODELOCK_PERM)) && - inode->i_sb->s_root && - !is_root_inode(inode)) - ll_invalidate_aliases(inode); - - iput(inode); - break; - } - default: - LBUG(); - } - - return 0; -} - -__u32 ll_i2suppgid(struct inode *i) -{ - if (in_group_p(i->i_gid)) - return (__u32)from_kgid(&init_user_ns, i->i_gid); - else - return (__u32)(-1); -} - -/* Pack the required supplementary groups into the supplied groups array. - * If we don't need to use the groups from the target inode(s) then we - * instead pack one or more groups from the user's supplementary group - * array in case it might be useful. Not needed if doing an MDS-side upcall. - */ -void ll_i2gids(__u32 *suppgids, struct inode *i1, struct inode *i2) -{ - LASSERT(i1); - - suppgids[0] = ll_i2suppgid(i1); - - if (i2) - suppgids[1] = ll_i2suppgid(i2); - else - suppgids[1] = -1; -} - -/* - * Try to reuse unhashed or invalidated dentries. - * This is very similar to d_exact_alias(), and any changes in one should be - * considered for inclusion in the other. The differences are that we don't - * need an unhashed alias, and we don't want d_compare to be used for - * comparison. - */ -static struct dentry *ll_find_alias(struct inode *inode, struct dentry *dentry) -{ - struct dentry *alias; - - if (hlist_empty(&inode->i_dentry)) - return NULL; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - LASSERT(alias != dentry); - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. - */ - - if (alias->d_parent != dentry->d_parent) - continue; - if (alias->d_name.hash != dentry->d_name.hash) - continue; - if (alias->d_name.len != dentry->d_name.len || - memcmp(alias->d_name.name, dentry->d_name.name, - dentry->d_name.len) != 0) - continue; - spin_lock(&alias->d_lock); - dget_dlock(alias); - spin_unlock(&alias->d_lock); - spin_unlock(&inode->i_lock); - return alias; - } - spin_unlock(&inode->i_lock); - - return NULL; -} - -/* - * Similar to d_splice_alias(), but lustre treats invalid alias - * similar to DCACHE_DISCONNECTED, and tries to use it anyway. 
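ll_find_alias() above will only recycle an alias that matches exactly: same parent, same name hash, same length, byte-identical name, with the cheap integer fields checked before the memcmp(). A minimal sketch of that comparison order:

#include <stdbool.h>
#include <string.h>

struct qname {
	unsigned int	hash;
	unsigned int	len;
	const char	*name;
};

/* exact-alias test: cheap rejects first, memcmp() last */
static bool alias_match(const void *parent_a, const void *parent_b,
			const struct qname *a, const struct qname *b)
{
	if (parent_a != parent_b)
		return false;
	if (a->hash != b->hash || a->len != b->len)
		return false;
	return memcmp(a->name, b->name, a->len) == 0;
}

int main(void)
{
	struct qname x = { 42, 3, "foo" }, y = { 42, 3, "foo" };
	int parent;

	return !alias_match(&parent, &parent, &x, &y);
}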
- */ -struct dentry *ll_splice_alias(struct inode *inode, struct dentry *de) -{ - if (inode && !S_ISDIR(inode->i_mode)) { - struct dentry *new = ll_find_alias(inode, de); - - if (new) { - d_move(new, de); - iput(inode); - CDEBUG(D_DENTRY, - "Reuse dentry %p inode %p refc %d flags %#x\n", - new, d_inode(new), d_count(new), new->d_flags); - return new; - } - d_add(de, inode); - } else { - struct dentry *new = d_splice_alias(inode, de); - - if (IS_ERR(new)) - CDEBUG(D_DENTRY, - "splice inode %p as %pd gives error %lu\n", - inode, de, PTR_ERR(new)); - if (new) - de = new; - } - if (!IS_ERR(de)) - CDEBUG(D_DENTRY, "Add dentry %p inode %p refc %d flags %#x\n", - de, d_inode(de), d_count(de), de->d_flags); - return de; -} - -static int ll_lookup_it_finish(struct ptlrpc_request *request, - struct lookup_intent *it, - struct inode *parent, struct dentry **de) -{ - struct inode *inode = NULL; - __u64 bits = 0; - int rc = 0; - struct dentry *alias; - - /* NB 1 request reference will be taken away by ll_intent_lock() - * when I return - */ - CDEBUG(D_DENTRY, "it %p it_disposition %x\n", it, - it->it_disposition); - if (!it_disposition(it, DISP_LOOKUP_NEG)) { - rc = ll_prep_inode(&inode, request, (*de)->d_sb, it); - if (rc) - return rc; - - ll_set_lock_data(ll_i2sbi(parent)->ll_md_exp, inode, it, &bits); - - /* We used to query real size from OSTs here, but actually - * this is not needed. For stat() calls size would be updated - * from subsequent do_revalidate()->ll_inode_revalidate_it() in - * 2.4 and - * vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 - * Everybody else who needs correct file size would call - * ll_glimpse_size or some equivalent themselves anyway. - * Also see bug 7198. - */ - } - - alias = ll_splice_alias(inode, *de); - if (IS_ERR(alias)) { - rc = PTR_ERR(alias); - goto out; - } - *de = alias; - - if (!it_disposition(it, DISP_LOOKUP_NEG)) { - /* We have the "lookup" lock, so unhide dentry */ - if (bits & MDS_INODELOCK_LOOKUP) - d_lustre_revalidate(*de); - } else if (!it_disposition(it, DISP_OPEN_CREATE)) { - /* If file created on server, don't depend on parent UPDATE - * lock to unhide it. It is left hidden and next lookup can - * find it in ll_splice_alias. - */ - /* Check that parent has UPDATE lock. 
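The revalidation below trusts a dentry based on which lock bit the client still holds: a name returned by the lookup itself is unhidden under its own LOOKUP bit, while a name the server reports as created elsewhere needs the parent's UPDATE lock, since that is what guards the directory contents. A condensed sketch of that bit test (the bit values mirror MDS_INODELOCK_LOOKUP/UPDATE but should be read as illustrative):

#include <stdbool.h>

#define INODELOCK_LOOKUP	0x1	/* illustrative bit values */
#define INODELOCK_UPDATE	0x2

/* may the dentry be unhidden, given the cached lock bits? */
static bool may_revalidate(unsigned long bits, bool created_elsewhere)
{
	return bits & (created_elsewhere ? INODELOCK_UPDATE
					 : INODELOCK_LOOKUP);
}

int main(void)
{
	return !may_revalidate(INODELOCK_UPDATE, true);
}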
*/ - struct lookup_intent parent_it = { - .it_op = IT_GETATTR, - .it_lock_handle = 0 }; - struct lu_fid fid = ll_i2info(parent)->lli_fid; - - /* If it is striped directory, get the real stripe parent */ - if (unlikely(ll_i2info(parent)->lli_lsm_md)) { - rc = md_get_fid_from_lsm(ll_i2mdexp(parent), - ll_i2info(parent)->lli_lsm_md, - (*de)->d_name.name, - (*de)->d_name.len, &fid); - if (rc) - return rc; - } - - if (md_revalidate_lock(ll_i2mdexp(parent), &parent_it, &fid, - NULL)) { - d_lustre_revalidate(*de); - ll_intent_release(&parent_it); - } - } - -out: - if (rc != 0 && it->it_op & IT_OPEN) - ll_open_cleanup((*de)->d_sb, request); - - return rc; -} - -static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, - struct lookup_intent *it) -{ - struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; - struct dentry *save = dentry, *retval; - struct ptlrpc_request *req = NULL; - struct md_op_data *op_data = NULL; - struct inode *inode; - __u32 opc; - int rc; - - if (dentry->d_name.len > ll_i2sbi(parent)->ll_namelen) - return ERR_PTR(-ENAMETOOLONG); - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),intent=%s\n", - dentry, PFID(ll_inode2fid(parent)), parent, LL_IT2STR(it)); - - if (d_mountpoint(dentry)) - CERROR("Tell Peter, lookup on mtpt, it %s\n", LL_IT2STR(it)); - - if (!it || it->it_op == IT_GETXATTR) - it = &lookup_it; - - if (it->it_op == IT_GETATTR && dentry_may_statahead(parent, dentry)) { - rc = ll_statahead(parent, &dentry, 0); - if (rc == 1) { - if (dentry == save) - retval = NULL; - else - retval = dentry; - goto out; - } - } - - if (it->it_op & IT_OPEN && it->it_flags & FMODE_WRITE && sb_rdonly(dentry->d_sb)) - return ERR_PTR(-EROFS); - - if (it->it_op & IT_CREAT) - opc = LUSTRE_OPC_CREATE; - else - opc = LUSTRE_OPC_ANY; - - op_data = ll_prep_md_op_data(NULL, parent, NULL, dentry->d_name.name, - dentry->d_name.len, 0, opc, NULL); - if (IS_ERR(op_data)) - return (void *)op_data; - - /* enforce umask if acl disabled or MDS doesn't support umask */ - if (!IS_POSIXACL(parent) || !exp_connect_umask(ll_i2mdexp(parent))) - it->it_create_mode &= ~current_umask(); - - rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, - &ll_md_blocking_ast, 0); - /* - * If the MDS allows the client to chgrp (CFS_SETGRP_PERM), but the - * client does not know which suppgid should be sent to the MDS, or - * some other(s) changed the target file's GID after this RPC sent - * to the MDS with the suppgid as the original GID, then we should - * try again with right suppgid. 
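The code below acts on that comment: on -EACCES with DISP_OPEN_DENY it resends the intent once, substituting the server-reported GID as the second supplementary group, but only when that GID was not already sent and the caller actually belongs to the group. A sketch of just the retry decision:

#include <stdbool.h>

/* should the intent RPC be resent with srv_gid as suppgid[1]? */
static bool retry_with_suppgid(unsigned int sent0, unsigned int sent1,
			       unsigned int srv_gid, bool caller_in_group)
{
	if (sent0 == srv_gid || sent1 == srv_gid)
		return false;		/* this gid was already tried */
	if (!caller_in_group)
		return false;		/* a retry cannot succeed */
	return true;			/* resend once with srv_gid */
}

int main(void)
{
	return !retry_with_suppgid(0, (unsigned int)-1, 100, true);
}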
- */ - if (rc == -EACCES && it->it_op & IT_OPEN && - it_disposition(it, DISP_OPEN_DENY)) { - struct mdt_body *body; - - LASSERT(req); - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (op_data->op_suppgids[0] == body->mbo_gid || - op_data->op_suppgids[1] == body->mbo_gid || - !in_group_p(make_kgid(&init_user_ns, body->mbo_gid))) { - retval = ERR_PTR(-EACCES); - goto out; - } - - fid_zero(&op_data->op_fid2); - op_data->op_suppgids[1] = body->mbo_gid; - ptlrpc_req_finished(req); - req = NULL; - ll_intent_release(it); - rc = md_intent_lock(ll_i2mdexp(parent), op_data, it, &req, - ll_md_blocking_ast, 0); - } - - if (rc < 0) { - retval = ERR_PTR(rc); - goto out; - } - - rc = ll_lookup_it_finish(req, it, parent, &dentry); - if (rc != 0) { - ll_intent_release(it); - retval = ERR_PTR(rc); - goto out; - } - - inode = d_inode(dentry); - if ((it->it_op & IT_OPEN) && inode && - !S_ISREG(inode->i_mode) && - !S_ISDIR(inode->i_mode)) { - ll_release_openhandle(inode, it); - } - ll_lookup_finish_locks(it, inode); - - if (dentry == save) - retval = NULL; - else - retval = dentry; -out: - if (op_data && !IS_ERR(op_data)) - ll_finish_md_op_data(op_data); - - ptlrpc_req_finished(req); - return retval; -} - -static struct dentry *ll_lookup_nd(struct inode *parent, struct dentry *dentry, - unsigned int flags) -{ - struct lookup_intent *itp, it = { .it_op = IT_GETATTR }; - struct dentry *de; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),flags=%u\n", - dentry, PFID(ll_inode2fid(parent)), parent, flags); - - /* Optimize away (CREATE && !OPEN). Let .create handle the race. - * but only if we have write permissions there, otherwise we need - * to proceed with lookup. LU-4185 - */ - if ((flags & LOOKUP_CREATE) && !(flags & LOOKUP_OPEN) && - (inode_permission(parent, MAY_WRITE | MAY_EXEC) == 0)) - return NULL; - - if (flags & (LOOKUP_PARENT | LOOKUP_OPEN | LOOKUP_CREATE)) - itp = NULL; - else - itp = ⁢ - de = ll_lookup_it(parent, dentry, itp); - - if (itp) - ll_intent_release(itp); - - return de; -} - -/* - * For cached negative dentry and new dentry, handle lookup/create/open - * together. - */ -static int ll_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned int open_flags, - umode_t mode, int *opened) -{ - struct lookup_intent *it; - struct dentry *de; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),file %p,open_flags %x,mode %x opened %d\n", - dentry, PFID(ll_inode2fid(dir)), dir, file, open_flags, mode, - *opened); - - /* Only negative dentries enter here */ - LASSERT(!d_inode(dentry)); - - if (!d_in_lookup(dentry)) { - /* A valid negative dentry that just passed revalidation, - * there's little point to try and open it server-side, - * even though there's a minuscle chance it might succeed. - * Either way it's a valid race to just return -ENOENT here. 
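That fast path condenses to two lines of logic: a negative dentry that just passed revalidation answers a plain open with -ENOENT on the spot, and only O_CREAT forces it to be unhashed so the lookup below can run afresh. A sketch:

#include <errno.h>
#include <stdbool.h>

/* -ENOENT for a plain open of a valid negative dentry; 0 when the
 * caller should fall through to lookup (d_drop()ing the dentry first
 * if it was not already in lookup) */
static int negative_open_fastpath(bool in_lookup, bool o_creat)
{
	if (!in_lookup && !o_creat)
		return -ENOENT;	/* trusting the cache here is a valid race */
	return 0;
}

int main(void)
{
	return negative_open_fastpath(false, false) != -ENOENT;
}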
- */ - if (!(open_flags & O_CREAT)) - return -ENOENT; - - /* Otherwise we just unhash it to be rehashed afresh via - * lookup if necessary - */ - d_drop(dentry); - } - - it = kzalloc(sizeof(*it), GFP_NOFS); - if (!it) - return -ENOMEM; - - it->it_op = IT_OPEN; - if (open_flags & O_CREAT) - it->it_op |= IT_CREAT; - it->it_create_mode = (mode & S_IALLUGO) | S_IFREG; - it->it_flags = (open_flags & ~O_ACCMODE) | OPEN_FMODE(open_flags); - it->it_flags &= ~MDS_OPEN_FL_INTERNAL; - - /* Dentry added to dcache tree in ll_lookup_it */ - de = ll_lookup_it(dir, dentry, it); - if (IS_ERR(de)) - rc = PTR_ERR(de); - else if (de) - dentry = de; - - if (!rc) { - if (it_disposition(it, DISP_OPEN_CREATE)) { - /* Dentry instantiated in ll_create_it. */ - rc = ll_create_it(dir, dentry, it); - if (rc) { - /* We dget in ll_splice_alias. */ - if (de) - dput(de); - goto out_release; - } - - *opened |= FILE_CREATED; - } - if (d_really_is_positive(dentry) && - it_disposition(it, DISP_OPEN_OPEN)) { - /* Open dentry. */ - if (S_ISFIFO(d_inode(dentry)->i_mode)) { - /* We cannot call open here as it might - * deadlock. This case is unreachable in - * practice because of OBD_CONNECT_NODEVOH. - */ - rc = finish_no_open(file, de); - } else { - file->private_data = it; - rc = finish_open(file, dentry, NULL, opened); - /* We dget in ll_splice_alias. finish_open takes - * care of dget for fd open. - */ - if (de) - dput(de); - } - } else { - rc = finish_no_open(file, de); - } - } - -out_release: - ll_intent_release(it); - kfree(it); - - return rc; -} - -/* We depend on "mode" being set with the proper file type/umask by now */ -static struct inode *ll_create_node(struct inode *dir, struct lookup_intent *it) -{ - struct inode *inode = NULL; - struct ptlrpc_request *request = NULL; - struct ll_sb_info *sbi = ll_i2sbi(dir); - int rc; - - LASSERT(it && it->it_disposition); - - LASSERT(it_disposition(it, DISP_ENQ_CREATE_REF)); - request = it->it_request; - it_clear_disposition(it, DISP_ENQ_CREATE_REF); - rc = ll_prep_inode(&inode, request, dir->i_sb, it); - if (rc) { - inode = ERR_PTR(rc); - goto out; - } - - LASSERT(hlist_empty(&inode->i_dentry)); - - /* We asked for a lock on the directory, but were granted a - * lock on the inode. Since we finally have an inode pointer, - * stuff it in the lock. - */ - CDEBUG(D_DLMTRACE, "setting l_ast_data to inode " DFID "(%p)\n", - PFID(ll_inode2fid(dir)), inode); - ll_set_lock_data(sbi->ll_md_exp, inode, it, NULL); - out: - ptlrpc_req_finished(request); - return inode; -} - -/* - * By the time this is called, we already have created the directory cache - * entry for the new file, but it is so far negative - it has no inode. - * - * We defer creating the OBD object(s) until open, to keep the intent and - * non-intent code paths similar, and also because we do not have the MDS - * inode number before calling ll_create_node() (which is needed for LOV), - * so we would need to do yet another RPC to the MDS to store the LOV EA - * data on the MDS. If needed, we would pass the PACKED lmm as data and - * lmm_size in datalen (the MDS still has code which will handle that). - * - * If the create succeeds, we fill in the inode information - * with d_instantiate(). 
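- *
- * In sketch form, a successful O_CREAT open of a new file then runs:
- *
- *	ll_atomic_open()
- *	    ll_lookup_it()	enqueue the open|create intent on the MDS
- *	    ll_create_it()
- *	        ll_create_node()	ll_prep_inode() from it_request
- *	        d_instantiate()
- *	    finish_open()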
- */ -static int ll_create_it(struct inode *dir, struct dentry *dentry, - struct lookup_intent *it) -{ - struct inode *inode; - int rc = 0; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p), intent=%s\n", - dentry, PFID(ll_inode2fid(dir)), dir, LL_IT2STR(it)); - - rc = it_open_error(DISP_OPEN_CREATE, it); - if (rc) - return rc; - - inode = ll_create_node(dir, it); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - d_instantiate(dentry, inode); - - return ll_init_security(dentry, inode, dir); -} - -void ll_update_times(struct ptlrpc_request *request, struct inode *inode) -{ - struct mdt_body *body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - - LASSERT(body); - if (body->mbo_valid & OBD_MD_FLMTIME && - body->mbo_mtime > LTIME_S(inode->i_mtime)) { - CDEBUG(D_INODE, "setting fid " DFID " mtime from %lu to %llu\n", - PFID(ll_inode2fid(inode)), LTIME_S(inode->i_mtime), - body->mbo_mtime); - LTIME_S(inode->i_mtime) = body->mbo_mtime; - } - if (body->mbo_valid & OBD_MD_FLCTIME && - body->mbo_ctime > LTIME_S(inode->i_ctime)) - LTIME_S(inode->i_ctime) = body->mbo_ctime; -} - -static int ll_new_node(struct inode *dir, struct dentry *dentry, - const char *tgt, umode_t mode, int rdev, - __u32 opc) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - struct inode *inode = NULL; - struct ll_sb_info *sbi = ll_i2sbi(dir); - int tgt_len = 0; - int err; - - if (unlikely(tgt)) - tgt_len = strlen(tgt) + 1; -again: - op_data = ll_prep_md_op_data(NULL, dir, NULL, - dentry->d_name.name, - dentry->d_name.len, - 0, opc, NULL); - if (IS_ERR(op_data)) { - err = PTR_ERR(op_data); - goto err_exit; - } - - err = md_create(sbi->ll_md_exp, op_data, tgt, tgt_len, mode, - from_kuid(&init_user_ns, current_fsuid()), - from_kgid(&init_user_ns, current_fsgid()), - current_cap(), rdev, &request); - ll_finish_md_op_data(op_data); - if (err < 0 && err != -EREMOTE) - goto err_exit; - - /* - * If the client doesn't know where to create a subdirectory (or - * in case of a race that sends the RPC to the wrong MDS), the - * MDS will return -EREMOTE and the client will fetch the layout - * of the directory, then create the directory on the right MDT. - */ - if (unlikely(err == -EREMOTE)) { - struct ll_inode_info *lli = ll_i2info(dir); - struct lmv_user_md *lum; - int lumsize, err2; - - ptlrpc_req_finished(request); - request = NULL; - - err2 = ll_dir_getstripe(dir, (void **)&lum, &lumsize, &request, - OBD_MD_DEFAULT_MEA); - if (!err2) { - /* Update stripe_offset and retry */ - lli->lli_def_stripe_offset = lum->lum_stripe_offset; - } else if (err2 == -ENODATA && - lli->lli_def_stripe_offset != -1) { - /* - * If there are no default stripe EA on the MDT, but the - * client has default stripe, then it probably means - * default stripe EA has just been deleted. 
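-			 *
-			 * In sketch form, the retry below is:
-			 *	md_create()         -> -EREMOTE
-			 *	ll_dir_getstripe()  -> refresh the default
-			 *	                       stripe offset
-			 *	goto again          -> md_create() on the
-			 *	                       right MDT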
- */ - lli->lli_def_stripe_offset = -1; - } else { - goto err_exit; - } - - ptlrpc_req_finished(request); - request = NULL; - goto again; - } - - ll_update_times(request, dir); - - err = ll_prep_inode(&inode, request, dir->i_sb, NULL); - if (err) - goto err_exit; - - d_instantiate(dentry, inode); - - err = ll_init_security(dentry, inode, dir); -err_exit: - if (request) - ptlrpc_req_finished(request); - - return err; -} - -static int ll_mknod(struct inode *dir, struct dentry *dchild, - umode_t mode, dev_t rdev) -{ - int err; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p) mode %o dev %x\n", - dchild, PFID(ll_inode2fid(dir)), dir, mode, - old_encode_dev(rdev)); - - if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) - mode &= ~current_umask(); - - switch (mode & S_IFMT) { - case 0: - mode |= S_IFREG; - /* for mode = 0 case */ - /* fall through */ - case S_IFREG: - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: - err = ll_new_node(dir, dchild, NULL, mode, - old_encode_dev(rdev), - LUSTRE_OPC_MKNOD); - break; - case S_IFDIR: - err = -EPERM; - break; - default: - err = -EINVAL; - } - - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKNOD, 1); - - return err; -} - -/* - * Plain create. Intent create is handled in atomic_open. - */ -static int ll_create_nd(struct inode *dir, struct dentry *dentry, - umode_t mode, bool want_excl) -{ - int rc; - - CDEBUG(D_VFSTRACE, - "VFS Op:name=%pd, dir=" DFID "(%p), flags=%u, excl=%d\n", - dentry, PFID(ll_inode2fid(dir)), dir, mode, want_excl); - - rc = ll_mknod(dir, dentry, mode, 0); - - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_CREATE, 1); - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, unhashed %d\n", - dentry, d_unhashed(dentry)); - - return rc; -} - -static int ll_unlink(struct inode *dir, struct dentry *dchild) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd,dir=%lu/%u(%p)\n", - dchild, dir->i_ino, dir->i_generation, dir); - - op_data = ll_prep_md_op_data(NULL, dir, NULL, - dchild->d_name.name, - dchild->d_name.len, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - if (dchild->d_inode) - op_data->op_fid3 = *ll_inode2fid(dchild->d_inode); - - op_data->op_fid2 = op_data->op_fid3; - rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); - if (rc) - goto out; - - ll_update_times(request, dir); - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_UNLINK, 1); - - out: - ptlrpc_req_finished(request); - return rc; -} - -static int ll_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int err; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir" DFID "(%p)\n", - dentry, PFID(ll_inode2fid(dir)), dir); - - if (!IS_POSIXACL(dir) || !exp_connect_umask(ll_i2mdexp(dir))) - mode &= ~current_umask(); - mode = (mode & (0777 | S_ISVTX)) | S_IFDIR; - - err = ll_new_node(dir, dentry, NULL, mode, 0, LUSTRE_OPC_MKDIR); - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_MKDIR, 1); - - return err; -} - -static int ll_rmdir(struct inode *dir, struct dentry *dchild) -{ - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p)\n", - dchild, PFID(ll_inode2fid(dir)), dir); - - op_data = ll_prep_md_op_data(NULL, dir, NULL, - dchild->d_name.name, - dchild->d_name.len, - S_IFDIR, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - if (dchild->d_inode) - op_data->op_fid3 = 
*ll_inode2fid(dchild->d_inode); - - op_data->op_fid2 = op_data->op_fid3; - rc = md_unlink(ll_i2sbi(dir)->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); - if (rc == 0) { - ll_update_times(request, dir); - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_RMDIR, 1); - } - - ptlrpc_req_finished(request); - return rc; -} - -static int ll_symlink(struct inode *dir, struct dentry *dentry, - const char *oldname) -{ - int err; - - CDEBUG(D_VFSTRACE, "VFS Op:name=%pd, dir=" DFID "(%p),target=%.*s\n", - dentry, PFID(ll_inode2fid(dir)), dir, 3000, oldname); - - err = ll_new_node(dir, dentry, oldname, S_IFLNK | 0777, - 0, LUSTRE_OPC_SYMLINK); - - if (!err) - ll_stats_ops_tally(ll_i2sbi(dir), LPROC_LL_SYMLINK, 1); - - return err; -} - -static int ll_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry) -{ - struct inode *src = d_inode(old_dentry); - struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ptlrpc_request *request = NULL; - struct md_op_data *op_data; - int err; - - CDEBUG(D_VFSTRACE, - "VFS Op: inode=" DFID "(%p), dir=" DFID "(%p), target=%pd\n", - PFID(ll_inode2fid(src)), src, PFID(ll_inode2fid(dir)), dir, - new_dentry); - - op_data = ll_prep_md_op_data(NULL, src, dir, new_dentry->d_name.name, - new_dentry->d_name.len, - 0, LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - err = md_link(sbi->ll_md_exp, op_data, &request); - ll_finish_md_op_data(op_data); - if (err) - goto out; - - ll_update_times(request, dir); - ll_stats_ops_tally(sbi, LPROC_LL_LINK, 1); -out: - ptlrpc_req_finished(request); - return err; -} - -static int ll_rename(struct inode *src, struct dentry *src_dchild, - struct inode *tgt, struct dentry *tgt_dchild, - unsigned int flags) -{ - struct ptlrpc_request *request = NULL; - struct ll_sb_info *sbi = ll_i2sbi(src); - struct md_op_data *op_data; - int err; - - if (flags) - return -EINVAL; - - CDEBUG(D_VFSTRACE, - "VFS Op:oldname=%pd, src_dir=" DFID "(%p), newname=%pd, tgt_dir=" DFID "(%p)\n", - src_dchild, PFID(ll_inode2fid(src)), src, - tgt_dchild, PFID(ll_inode2fid(tgt)), tgt); - - op_data = ll_prep_md_op_data(NULL, src, tgt, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - if (src_dchild->d_inode) - op_data->op_fid3 = *ll_inode2fid(src_dchild->d_inode); - if (tgt_dchild->d_inode) - op_data->op_fid4 = *ll_inode2fid(tgt_dchild->d_inode); - - err = md_rename(sbi->ll_md_exp, op_data, - src_dchild->d_name.name, - src_dchild->d_name.len, - tgt_dchild->d_name.name, - tgt_dchild->d_name.len, &request); - ll_finish_md_op_data(op_data); - if (!err) { - ll_update_times(request, src); - ll_update_times(request, tgt); - ll_stats_ops_tally(sbi, LPROC_LL_RENAME, 1); - } - - ptlrpc_req_finished(request); - if (!err) - d_move(src_dchild, tgt_dchild); - return err; -} - -const struct inode_operations ll_dir_inode_operations = { - .mknod = ll_mknod, - .atomic_open = ll_atomic_open, - .lookup = ll_lookup_nd, - .create = ll_create_nd, - /* We need all these non-raw things for NFSD, to not patch it. 
*/ - .unlink = ll_unlink, - .mkdir = ll_mkdir, - .rmdir = ll_rmdir, - .symlink = ll_symlink, - .link = ll_link, - .rename = ll_rename, - .setattr = ll_setattr, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, - .get_acl = ll_get_acl, -}; - -const struct inode_operations ll_special_inode_operations = { - .setattr = ll_setattr, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, - .get_acl = ll_get_acl, -}; diff --git a/drivers/staging/lustre/lustre/llite/range_lock.c b/drivers/staging/lustre/lustre/llite/range_lock.c deleted file mode 100644 index 008a8874118d29e990e3e2fc521c1df647dcdb7b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/range_lock.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Range lock is used to allow multiple threads writing a single shared - * file given each thread is writing to a non-overlapping portion of the - * file. - * - * Refer to the possible upstream kernel version of range lock by - * Jan Kara : https://lkml.org/lkml/2013/1/31/480 - * - * This file could later replaced by the upstream kernel version. - */ -/* - * Author: Prakash Surya - * Author: Bobi Jam - */ -#include "range_lock.h" -#include -#include - -/** - * Initialize a range lock tree - * - * \param tree [in] an empty range lock tree - * - * Pre: Caller should have allocated the range lock tree. - * Post: The range lock tree is ready to function. - */ -void range_lock_tree_init(struct range_lock_tree *tree) -{ - tree->rlt_root = NULL; - tree->rlt_sequence = 0; - spin_lock_init(&tree->rlt_lock); -} - -/** - * Initialize a range lock node - * - * \param lock [in] an empty range lock node - * \param start [in] start of the covering region - * \param end [in] end of the covering region - * - * Pre: Caller should have allocated the range lock node. 
- * Post: The range lock node is meant to cover [start, end] region
- */
-int range_lock_init(struct range_lock *lock, __u64 start, __u64 end)
-{
-	int rc;
-
-	memset(&lock->rl_node, 0, sizeof(lock->rl_node));
-	if (end != LUSTRE_EOF)
-		end >>= PAGE_SHIFT;
-	rc = interval_set(&lock->rl_node, start >> PAGE_SHIFT, end);
-	if (rc)
-		return rc;
-
-	INIT_LIST_HEAD(&lock->rl_next_lock);
-	lock->rl_task = NULL;
-	lock->rl_lock_count = 0;
-	lock->rl_blocking_ranges = 0;
-	lock->rl_sequence = 0;
-	return rc;
-}
-
-static inline struct range_lock *next_lock(struct range_lock *lock)
-{
-	return list_entry(lock->rl_next_lock.next, typeof(*lock), rl_next_lock);
-}
-
-/**
- * Helper function of range_unlock()
- *
- * \param node [in]	a range lock found overlapped during interval node
- *			search
- * \param arg [in]	the range lock to be tested
- *
- * \retval INTERVAL_ITER_CONT	indicate to continue the search for the next
- *				overlapping range node
- * \retval INTERVAL_ITER_STOP	indicate to stop the search
- */
-static enum interval_iter range_unlock_cb(struct interval_node *node, void *arg)
-{
-	struct range_lock *lock = arg;
-	struct range_lock *overlap = node2rangelock(node);
-	struct range_lock *iter;
-
-	list_for_each_entry(iter, &overlap->rl_next_lock, rl_next_lock) {
-		if (iter->rl_sequence > lock->rl_sequence) {
-			--iter->rl_blocking_ranges;
-			LASSERT(iter->rl_blocking_ranges > 0);
-		}
-	}
-	if (overlap->rl_sequence > lock->rl_sequence) {
-		--overlap->rl_blocking_ranges;
-		if (overlap->rl_blocking_ranges == 0)
-			wake_up_process(overlap->rl_task);
-	}
-	return INTERVAL_ITER_CONT;
-}
-
-/**
- * Unlock a range lock, wake up locks blocked by this lock.
- *
- * \param tree [in]	range lock tree
- * \param lock [in]	range lock to be deleted
- *
- * If this lock has been granted, release it; if not, just delete it from
- * the tree or the same region lock list. Wake up those locks only blocked
- * by this lock through range_unlock_cb().
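- *
- * A minimal usage sketch (illustrative only, byte range picked
- * arbitrarily):
- *
- *	struct range_lock_tree tree;
- *	struct range_lock lock;
- *
- *	range_lock_tree_init(&tree);
- *	range_lock_init(&lock, pos, pos + count - 1);
- *	if (range_lock(&tree, &lock) == 0) {
- *		... write to [pos, pos + count - 1] ...
- *		range_unlock(&tree, &lock);
- *	}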
- */
-void range_unlock(struct range_lock_tree *tree, struct range_lock *lock)
-{
-	spin_lock(&tree->rlt_lock);
-	if (!list_empty(&lock->rl_next_lock)) {
-		struct range_lock *next;
-
-		if (interval_is_intree(&lock->rl_node)) { /* first lock */
-			/* Insert the next same range lock into the tree */
-			next = next_lock(lock);
-			next->rl_lock_count = lock->rl_lock_count - 1;
-			interval_erase(&lock->rl_node, &tree->rlt_root);
-			interval_insert(&next->rl_node, &tree->rlt_root);
-		} else {
-			/* find the first lock in tree */
-			list_for_each_entry(next, &lock->rl_next_lock,
-					    rl_next_lock) {
-				if (!interval_is_intree(&next->rl_node))
-					continue;
-
-				LASSERT(next->rl_lock_count > 0);
-				next->rl_lock_count--;
-				break;
-			}
-		}
-		list_del_init(&lock->rl_next_lock);
-	} else {
-		LASSERT(interval_is_intree(&lock->rl_node));
-		interval_erase(&lock->rl_node, &tree->rlt_root);
-	}
-
-	interval_search(tree->rlt_root, &lock->rl_node.in_extent,
-			range_unlock_cb, lock);
-	spin_unlock(&tree->rlt_lock);
-}
-
-/**
- * Helper function of range_lock()
- *
- * \param node [in]	a range lock found overlapped during interval node
- *			search
- * \param arg [in]	the range lock to be tested
- *
- * \retval INTERVAL_ITER_CONT	indicate to continue the search for the next
- *				overlapping range node
- * \retval INTERVAL_ITER_STOP	indicate to stop the search
- */
-static enum interval_iter range_lock_cb(struct interval_node *node, void *arg)
-{
-	struct range_lock *lock = arg;
-	struct range_lock *overlap = node2rangelock(node);
-
-	lock->rl_blocking_ranges += overlap->rl_lock_count + 1;
-	return INTERVAL_ITER_CONT;
-}
-
-/**
- * Lock a region
- *
- * \param tree [in]	range lock tree
- * \param lock [in]	range lock node containing the region span
- *
- * \retval 0	got the range lock
- * \retval <0	error code when the range lock was not obtained
- *
- * If an overlapping range lock exists, the new lock will wait and
- * retry; if it later finds that it is not the one chosen to be woken
- * up, it waits again.
- */
-int range_lock(struct range_lock_tree *tree, struct range_lock *lock)
-{
-	struct interval_node *node;
-	int rc = 0;
-
-	spin_lock(&tree->rlt_lock);
-	/*
-	 * We need to check for all conflicting intervals
-	 * already in the tree.
-	 */
-	interval_search(tree->rlt_root, &lock->rl_node.in_extent,
-			range_lock_cb, lock);
-	/*
-	 * Insert into the tree if I am unique, otherwise I've been linked to
-	 * the rl_next_lock of another lock which has the same range as mine
-	 * in range_lock_cb().
-	 */
-	node = interval_insert(&lock->rl_node, &tree->rlt_root);
-	if (node) {
-		struct range_lock *tmp = node2rangelock(node);
-
-		list_add_tail(&lock->rl_next_lock, &tmp->rl_next_lock);
-		tmp->rl_lock_count++;
-	}
-	lock->rl_sequence = ++tree->rlt_sequence;
-
-	while (lock->rl_blocking_ranges > 0) {
-		lock->rl_task = current;
-		__set_current_state(TASK_INTERRUPTIBLE);
-		spin_unlock(&tree->rlt_lock);
-		schedule();
-
-		if (signal_pending(current)) {
-			range_unlock(tree, lock);
-			rc = -EINTR;
-			goto out;
-		}
-		spin_lock(&tree->rlt_lock);
-	}
-	spin_unlock(&tree->rlt_lock);
-out:
-	return rc;
-}
diff --git a/drivers/staging/lustre/lustre/llite/range_lock.h b/drivers/staging/lustre/lustre/llite/range_lock.h
deleted file mode 100644
index 9ebac09160f2cc76317a72c7bf65f6c83d6cd2e4..0000000000000000000000000000000000000000
--- a/drivers/staging/lustre/lustre/llite/range_lock.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Range lock is used to allow multiple threads writing a single shared - * file given each thread is writing to a non-overlapping portion of the - * file. - * - * Refer to the possible upstream kernel version of range lock by - * Jan Kara : https://lkml.org/lkml/2013/1/31/480 - * - * This file could later replaced by the upstream kernel version. - */ -/* - * Author: Prakash Surya - * Author: Bobi Jam - */ -#ifndef _RANGE_LOCK_H -#define _RANGE_LOCK_H - -#include -#include - -struct range_lock { - struct interval_node rl_node; - /** - * Process to enqueue this lock. - */ - struct task_struct *rl_task; - /** - * List of locks with the same range. - */ - struct list_head rl_next_lock; - /** - * Number of locks in the list rl_next_lock - */ - unsigned int rl_lock_count; - /** - * Number of ranges which are blocking acquisition of the lock - */ - unsigned int rl_blocking_ranges; - /** - * Sequence number of range lock. This number is used to get to know - * the order the locks are queued; this is required for range_cancel(). - */ - __u64 rl_sequence; -}; - -static inline struct range_lock *node2rangelock(const struct interval_node *n) -{ - return container_of(n, struct range_lock, rl_node); -} - -struct range_lock_tree { - struct interval_node *rlt_root; - spinlock_t rlt_lock; /* protect range lock tree */ - __u64 rlt_sequence; -}; - -void range_lock_tree_init(struct range_lock_tree *tree); -int range_lock_init(struct range_lock *lock, __u64 start, __u64 end); -int range_lock(struct range_lock_tree *tree, struct range_lock *lock); -void range_unlock(struct range_lock_tree *tree, struct range_lock *lock); -#endif diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c deleted file mode 100644 index 3e008ce7275dc83b9f7053c2c1c8629b006c3d6d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ /dev/null @@ -1,1214 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2011, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lustre/llite/rw.c
- *
- * Lustre Lite I/O page cache routines shared by different kernel revs
- */
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include
-#include
-/* current_is_kswapd() */
-#include <linux/swap.h>
-#include
-
-#define DEBUG_SUBSYSTEM S_LLITE
-
-#include
-#include "llite_internal.h"
-
-static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
-
-/**
- * Get readahead pages from the filesystem readahead pool of the client for a
- * thread.
- *
- * \param sbi	superblock for filesystem readahead state ll_ra_info
- * \param ria	per-thread readahead state
- * \param pages	number of pages requested for readahead for the thread.
- *
- * WARNING: This algorithm is used to reduce contention on sbi->ll_lock.
- * It should work well if the ra_max_pages is much greater than the single
- * file's read-ahead window, and not too many threads contending for
- * these readahead pages.
- *
- * TODO: There may be a 'global sync problem' if many threads are trying
- * to get an ra budget that is larger than the remaining readahead pages
- * and reach here at exactly the same time. They will compute \a ret to
- * consume the remaining pages, but will fail at atomic_add_return() and
- * get a zero ra window, although there is still ra space remaining. - Jay
- */
-static unsigned long ll_ra_count_get(struct ll_sb_info *sbi,
-				     struct ra_io_arg *ria,
-				     unsigned long pages, unsigned long min)
-{
-	struct ll_ra_info *ra = &sbi->ll_ra_info;
-	long ret;
-
-	/* If read-ahead pages left are less than 1M, do not do read-ahead,
-	 * otherwise it will form small read RPCs (< 1M), which hurts server
-	 * performance a lot.
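-	 *
-	 * Worked example (hypothetical numbers; with 4 KiB pages a 1M
-	 * RPC is PTLRPC_MAX_BRW_PAGES = 256 pages): with
-	 * ra_max_pages = 8192, ra_cur_pages = 8000 and a request for
-	 * pages = 512, only 192 pages of budget remain; 192 < 256, so
-	 * the thread gets a zero window instead of issuing a sub-1M
-	 * read RPC.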
- */ - ret = min(ra->ra_max_pages - atomic_read(&ra->ra_cur_pages), pages); - if (ret < 0 || ret < min_t(long, PTLRPC_MAX_BRW_PAGES, pages)) { - ret = 0; - goto out; - } - - if (atomic_add_return(ret, &ra->ra_cur_pages) > ra->ra_max_pages) { - atomic_sub(ret, &ra->ra_cur_pages); - ret = 0; - } - -out: - if (ret < min) { - /* override ra limit for maximum performance */ - atomic_add(min - ret, &ra->ra_cur_pages); - ret = min; - } - return ret; -} - -void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) -{ - struct ll_ra_info *ra = &sbi->ll_ra_info; - - atomic_sub(len, &ra->ra_cur_pages); -} - -static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) -{ - LASSERTF(which < _NR_RA_STAT, "which: %u\n", which); - lprocfs_counter_incr(sbi->ll_ra_stats, which); -} - -void ll_ra_stats_inc(struct inode *inode, enum ra_stat which) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - - ll_ra_stats_inc_sbi(sbi, which); -} - -#define RAS_CDEBUG(ras) \ - CDEBUG(D_READA, \ - "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu rpc %lu " \ - "r %lu ri %lu csr %lu sf %lu sp %lu sl %lu\n", \ - ras->ras_last_readpage, ras->ras_consecutive_requests, \ - ras->ras_consecutive_pages, ras->ras_window_start, \ - ras->ras_window_len, ras->ras_next_readahead, \ - ras->ras_rpc_size, \ - ras->ras_requests, ras->ras_request_index, \ - ras->ras_consecutive_stride_requests, ras->ras_stride_offset, \ - ras->ras_stride_pages, ras->ras_stride_length) - -static int index_in_window(unsigned long index, unsigned long point, - unsigned long before, unsigned long after) -{ - unsigned long start = point - before, end = point + after; - - if (start > point) - start = 0; - if (end < point) - end = ~0; - - return start <= index && index <= end; -} - -void ll_ras_enter(struct file *f) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(f); - struct ll_readahead_state *ras = &fd->fd_ras; - - spin_lock(&ras->ras_lock); - ras->ras_requests++; - ras->ras_request_index = 0; - ras->ras_consecutive_requests++; - spin_unlock(&ras->ras_lock); -} - -/** - * Initiates read-ahead of a page with given index. - * - * \retval +ve: page was already uptodate so it will be skipped - * from being added; - * \retval -ve: page wasn't added to \a queue for error; - * \retval 0: page was added into \a queue for read ahead. 
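- *
- * A caller loop handles the three cases roughly as (sketch):
- *
- *	rc = ll_read_ahead_page(env, io, queue, index);
- *	if (rc < 0)
- *		break;			stop this read-ahead pass
- *	else if (rc == 0)
- *		reserved--;		page really queued, budget used
- *					(rc == 1: already uptodate, skip)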
- */ -static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, pgoff_t index) -{ - enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ - struct cl_object *clob = io->ci_obj; - struct inode *inode = vvp_object_inode(clob); - const char *msg = NULL; - struct cl_page *page; - struct vvp_page *vpg; - struct page *vmpage; - int rc = 0; - - vmpage = grab_cache_page_nowait(inode->i_mapping, index); - if (!vmpage) { - which = RA_STAT_FAILED_GRAB_PAGE; - msg = "g_c_p_n failed"; - rc = -EBUSY; - goto out; - } - - /* Check if vmpage was truncated or reclaimed */ - if (vmpage->mapping != inode->i_mapping) { - which = RA_STAT_WRONG_GRAB_PAGE; - msg = "g_c_p_n returned invalid page"; - rc = -EBUSY; - goto out; - } - - page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); - if (IS_ERR(page)) { - which = RA_STAT_FAILED_GRAB_PAGE; - msg = "cl_page_find failed"; - rc = PTR_ERR(page); - goto out; - } - - lu_ref_add(&page->cp_reference, "ra", current); - cl_page_assume(env, io, page); - vpg = cl2vvp_page(cl_object_page_slice(clob, page)); - if (!vpg->vpg_defer_uptodate && !PageUptodate(vmpage)) { - vpg->vpg_defer_uptodate = 1; - vpg->vpg_ra_used = 0; - cl_page_list_add(queue, page); - } else { - /* skip completed pages */ - cl_page_unassume(env, io, page); - /* This page is already uptodate, returning a positive number - * to tell the callers about this - */ - rc = 1; - } - - lu_ref_del(&page->cp_reference, "ra", current); - cl_page_put(env, page); -out: - if (vmpage) { - if (rc) - unlock_page(vmpage); - put_page(vmpage); - } - if (msg) { - ll_ra_stats_inc(inode, which); - CDEBUG(D_READA, "%s\n", msg); - } - return rc; -} - -#define RIA_DEBUG(ria) \ - CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ - ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ - ria->ria_pages) - -static inline int stride_io_mode(struct ll_readahead_state *ras) -{ - return ras->ras_consecutive_stride_requests > 1; -} - -/* The function calculates how much pages will be read in - * [off, off + length], in such stride IO area, - * stride_offset = st_off, stride_length = st_len, - * stride_pages = st_pgs - * - * |------------------|*****|------------------|*****|------------|*****|.... - * st_off - * |--- st_pgs ---| - * |----- st_len -----| - * - * How many pages it should read in such pattern - * |-------------------------------------------------------------| - * off - * |<------ length ------->| - * - * = |<----->| + |-------------------------------------| + |---| - * start_left st_pgs * i end_left - */ -static unsigned long -stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, - unsigned long off, unsigned long length) -{ - __u64 start = off > st_off ? off - st_off : 0; - __u64 end = off + length > st_off ? 
off + length - st_off : 0;
-	unsigned long start_left = 0;
-	unsigned long end_left = 0;
-	unsigned long pg_count;
-
-	if (st_len == 0 || length == 0 || end == 0)
-		return length;
-
-	start_left = do_div(start, st_len);
-	if (start_left < st_pgs)
-		start_left = st_pgs - start_left;
-	else
-		start_left = 0;
-
-	end_left = do_div(end, st_len);
-	if (end_left > st_pgs)
-		end_left = st_pgs;
-
-	CDEBUG(D_READA, "start %llu, end %llu start_left %lu end_left %lu\n",
-	       start, end, start_left, end_left);
-
-	if (start == end)
-		pg_count = end_left - (st_pgs - start_left);
-	else
-		pg_count = start_left + st_pgs * (end - start - 1) + end_left;
-
-	CDEBUG(D_READA,
-	       "st_off %lu, st_len %lu st_pgs %lu off %lu length %lu pgcount %lu\n",
-	       st_off, st_len, st_pgs, off, length, pg_count);
-
-	return pg_count;
-}
-
-static int ria_page_count(struct ra_io_arg *ria)
-{
-	__u64 length = ria->ria_end >= ria->ria_start ?
-		       ria->ria_end - ria->ria_start + 1 : 0;
-
-	return stride_pg_count(ria->ria_stoff, ria->ria_length,
-			       ria->ria_pages, ria->ria_start,
-			       length);
-}
-
-static unsigned long ras_align(struct ll_readahead_state *ras,
-			       unsigned long index,
-			       unsigned long *remainder)
-{
-	unsigned long rem = index % ras->ras_rpc_size;
-
-	if (remainder)
-		*remainder = rem;
-	return index - rem;
-}
-
-/* Check whether the index is in the defined ra-window */
-static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
-{
-	/* If ria_length == ria_pages, it means non-stride I/O mode,
-	 * idx should always be inside the read-ahead window in this case.
-	 * For stride I/O mode, just check whether the idx is inside
-	 * the ria_pages.
-	 */
-	return ria->ria_length == 0 || ria->ria_length == ria->ria_pages ||
-	       (idx >= ria->ria_stoff && (idx - ria->ria_stoff) %
-		ria->ria_length < ria->ria_pages);
-}
-
-static unsigned long
-ll_read_ahead_pages(const struct lu_env *env, struct cl_io *io,
-		    struct cl_page_list *queue, struct ll_readahead_state *ras,
-		    struct ra_io_arg *ria)
-{
-	struct cl_read_ahead ra = { 0 };
-	unsigned long ra_end = 0;
-	bool stride_ria;
-	pgoff_t page_idx;
-	int rc;
-
-	LASSERT(ria);
-	RIA_DEBUG(ria);
-
-	stride_ria = ria->ria_length > ria->ria_pages && ria->ria_pages > 0;
-	for (page_idx = ria->ria_start;
-	     page_idx <= ria->ria_end && ria->ria_reserved > 0; page_idx++) {
-		if (ras_inside_ra_window(page_idx, ria)) {
-			if (!ra.cra_end || ra.cra_end < page_idx) {
-				unsigned long end;
-
-				cl_read_ahead_release(env, &ra);
-
-				rc = cl_io_read_ahead(env, io, page_idx, &ra);
-				if (rc < 0)
-					break;
-
-				CDEBUG(D_READA, "idx: %lu, ra: %lu, rpc: %lu\n",
-				       page_idx, ra.cra_end, ra.cra_rpc_size);
-				LASSERTF(ra.cra_end >= page_idx,
-					 "object: %p, indices %lu / %lu\n",
-					 io->ci_obj, ra.cra_end, page_idx);
-				/*
-				 * update read ahead RPC size.
- * NB: it's racy but doesn't matter - */ - if (ras->ras_rpc_size > ra.cra_rpc_size && - ra.cra_rpc_size > 0) - ras->ras_rpc_size = ra.cra_rpc_size; - /* trim it to align with optimal RPC size */ - end = ras_align(ras, ria->ria_end + 1, NULL); - if (end > 0 && !ria->ria_eof) - ria->ria_end = end - 1; - if (ria->ria_end < ria->ria_end_min) - ria->ria_end = ria->ria_end_min; - if (ria->ria_end > ra.cra_end) - ria->ria_end = ra.cra_end; - } - - /* If the page is inside the read-ahead window */ - rc = ll_read_ahead_page(env, io, queue, page_idx); - if (rc < 0) - break; - - ra_end = page_idx; - if (!rc) - ria->ria_reserved--; - } else if (stride_ria) { - /* If it is not in the read-ahead window, and it is - * read-ahead mode, then check whether it should skip - * the stride gap - */ - pgoff_t offset; - /* FIXME: This assertion only is valid when it is for - * forward read-ahead, it will be fixed when backward - * read-ahead is implemented - */ - LASSERTF(page_idx >= ria->ria_stoff, - "Invalid page_idx %lu rs %lu re %lu ro %lu rl %lu rp %lu\n", - page_idx, - ria->ria_start, ria->ria_end, ria->ria_stoff, - ria->ria_length, ria->ria_pages); - offset = page_idx - ria->ria_stoff; - offset = offset % (ria->ria_length); - if (offset > ria->ria_pages) { - page_idx += ria->ria_length - offset; - CDEBUG(D_READA, "i %lu skip %lu\n", page_idx, - ria->ria_length - offset); - continue; - } - } - } - cl_read_ahead_release(env, &ra); - - return ra_end; -} - -static int ll_readahead(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *queue, - struct ll_readahead_state *ras, bool hit) -{ - struct vvp_io *vio = vvp_env_io(env); - struct ll_thread_info *lti = ll_env_info(env); - struct cl_attr *attr = vvp_env_thread_attr(env); - unsigned long len, mlen = 0; - pgoff_t ra_end, start = 0, end = 0; - struct inode *inode; - struct ra_io_arg *ria = <i->lti_ria; - struct cl_object *clob; - int ret = 0; - __u64 kms; - - clob = io->ci_obj; - inode = vvp_object_inode(clob); - - memset(ria, 0, sizeof(*ria)); - - cl_object_attr_lock(clob); - ret = cl_object_attr_get(env, clob, attr); - cl_object_attr_unlock(clob); - - if (ret != 0) - return ret; - kms = attr->cat_kms; - if (kms == 0) { - ll_ra_stats_inc(inode, RA_STAT_ZERO_LEN); - return 0; - } - - spin_lock(&ras->ras_lock); - - /** - * Note: other thread might rollback the ras_next_readahead, - * if it can not get the full size of prepared pages, see the - * end of this function. For stride read ahead, it needs to - * make sure the offset is no less than ras_stride_offset, - * so that stride read ahead can work correctly. 
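-	 *
-	 * E.g. (hypothetical numbers): if ras_next_readahead was rolled
-	 * back to index 100 by a previous pass while ras_stride_offset
-	 * is 120, the stride case below starts the window at
-	 * max(100, 120) = 120, never below the stride offset.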
- */ - if (stride_io_mode(ras)) - start = max(ras->ras_next_readahead, ras->ras_stride_offset); - else - start = ras->ras_next_readahead; - - if (ras->ras_window_len > 0) - end = ras->ras_window_start + ras->ras_window_len - 1; - - /* Enlarge the RA window to encompass the full read */ - if (vio->vui_ra_valid && - end < vio->vui_ra_start + vio->vui_ra_count - 1) - end = vio->vui_ra_start + vio->vui_ra_count - 1; - - if (end) { - unsigned long end_index; - - /* Truncate RA window to end of file */ - end_index = (unsigned long)((kms - 1) >> PAGE_SHIFT); - if (end_index <= end) { - end = end_index; - ria->ria_eof = true; - } - - ras->ras_next_readahead = max(end, end + 1); - RAS_CDEBUG(ras); - } - ria->ria_start = start; - ria->ria_end = end; - /* If stride I/O mode is detected, get stride window*/ - if (stride_io_mode(ras)) { - ria->ria_stoff = ras->ras_stride_offset; - ria->ria_length = ras->ras_stride_length; - ria->ria_pages = ras->ras_stride_pages; - } - spin_unlock(&ras->ras_lock); - - if (end == 0) { - ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); - return 0; - } - len = ria_page_count(ria); - if (len == 0) { - ll_ra_stats_inc(inode, RA_STAT_ZERO_WINDOW); - return 0; - } - - CDEBUG(D_READA, DFID ": ria: %lu/%lu, bead: %lu/%lu, hit: %d\n", - PFID(lu_object_fid(&clob->co_lu)), - ria->ria_start, ria->ria_end, - vio->vui_ra_valid ? vio->vui_ra_start : 0, - vio->vui_ra_valid ? vio->vui_ra_count : 0, - hit); - - /* at least to extend the readahead window to cover current read */ - if (!hit && vio->vui_ra_valid && - vio->vui_ra_start + vio->vui_ra_count > ria->ria_start) { - unsigned long remainder; - - /* to the end of current read window. */ - mlen = vio->vui_ra_start + vio->vui_ra_count - ria->ria_start; - /* trim to RPC boundary */ - ras_align(ras, ria->ria_start, &remainder); - mlen = min(mlen, ras->ras_rpc_size - remainder); - ria->ria_end_min = ria->ria_start + mlen; - } - - ria->ria_reserved = ll_ra_count_get(ll_i2sbi(inode), ria, len, mlen); - if (ria->ria_reserved < len) - ll_ra_stats_inc(inode, RA_STAT_MAX_IN_FLIGHT); - - CDEBUG(D_READA, "reserved pages %lu/%lu/%lu, ra_cur %d, ra_max %lu\n", - ria->ria_reserved, len, mlen, - atomic_read(&ll_i2sbi(inode)->ll_ra_info.ra_cur_pages), - ll_i2sbi(inode)->ll_ra_info.ra_max_pages); - - ra_end = ll_read_ahead_pages(env, io, queue, ras, ria); - - if (ria->ria_reserved) - ll_ra_count_put(ll_i2sbi(inode), ria->ria_reserved); - - if (ra_end == end && ra_end == (kms >> PAGE_SHIFT)) - ll_ra_stats_inc(inode, RA_STAT_EOF); - - /* if we didn't get to the end of the region we reserved from - * the ras we need to go back and update the ras so that the - * next read-ahead tries from where we left off. 
we only do so - * if the region we failed to issue read-ahead on is still ahead - * of the app and behind the next index to start read-ahead from - */ - CDEBUG(D_READA, "ra_end = %lu end = %lu stride end = %lu pages = %d\n", - ra_end, end, ria->ria_end, ret); - - if (ra_end > 0 && ra_end != end) { - ll_ra_stats_inc(inode, RA_STAT_FAILED_REACH_END); - spin_lock(&ras->ras_lock); - if (ra_end <= ras->ras_next_readahead && - index_in_window(ra_end, ras->ras_window_start, 0, - ras->ras_window_len)) { - ras->ras_next_readahead = ra_end + 1; - RAS_CDEBUG(ras); - } - spin_unlock(&ras->ras_lock); - } - - return ret; -} - -static void ras_set_start(struct inode *inode, struct ll_readahead_state *ras, - unsigned long index) -{ - ras->ras_window_start = ras_align(ras, index, NULL); -} - -/* called with the ras_lock held or from places where it doesn't matter */ -static void ras_reset(struct inode *inode, struct ll_readahead_state *ras, - unsigned long index) -{ - ras->ras_last_readpage = index; - ras->ras_consecutive_requests = 0; - ras->ras_consecutive_pages = 0; - ras->ras_window_len = 0; - ras_set_start(inode, ras, index); - ras->ras_next_readahead = max(ras->ras_window_start, index + 1); - - RAS_CDEBUG(ras); -} - -/* called with the ras_lock held or from places where it doesn't matter */ -static void ras_stride_reset(struct ll_readahead_state *ras) -{ - ras->ras_consecutive_stride_requests = 0; - ras->ras_stride_length = 0; - ras->ras_stride_pages = 0; - RAS_CDEBUG(ras); -} - -void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) -{ - spin_lock_init(&ras->ras_lock); - ras->ras_rpc_size = PTLRPC_MAX_BRW_PAGES; - ras_reset(inode, ras, 0); - ras->ras_requests = 0; -} - -/* - * Check whether the read request is in the stride window. - * If it is in the stride window, return 1, otherwise return 0. 
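- *
- * Example (hypothetical numbers): with ras_stride_offset = 0,
- * ras_stride_pages = 2 and ras_stride_length = 8, the expected
- * pattern is pages 0-1, 8-9, 16-17, ...  Reading page 9 right after
- * page 8 is contiguous within a chunk, while reading page 16 after
- * page 9 matches the stride gap of 8 - 2 = 6 pages.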
- */ -static int index_in_stride_window(struct ll_readahead_state *ras, - unsigned long index) -{ - unsigned long stride_gap; - - if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0 || - ras->ras_stride_pages == ras->ras_stride_length) - return 0; - - stride_gap = index - ras->ras_last_readpage - 1; - - /* If it is contiguous read */ - if (stride_gap == 0) - return ras->ras_consecutive_pages + 1 <= ras->ras_stride_pages; - - /* Otherwise check the stride by itself */ - return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && - ras->ras_consecutive_pages == ras->ras_stride_pages; -} - -static void ras_update_stride_detector(struct ll_readahead_state *ras, - unsigned long index) -{ - unsigned long stride_gap = index - ras->ras_last_readpage - 1; - - if ((stride_gap != 0 || ras->ras_consecutive_stride_requests == 0) && - !stride_io_mode(ras)) { - ras->ras_stride_pages = ras->ras_consecutive_pages; - ras->ras_stride_length = ras->ras_consecutive_pages + - stride_gap; - } - LASSERT(ras->ras_request_index == 0); - LASSERT(ras->ras_consecutive_stride_requests == 0); - - if (index <= ras->ras_last_readpage) { - /*Reset stride window for forward read*/ - ras_stride_reset(ras); - return; - } - - ras->ras_stride_pages = ras->ras_consecutive_pages; - ras->ras_stride_length = stride_gap + ras->ras_consecutive_pages; - - RAS_CDEBUG(ras); -} - -/* Stride Read-ahead window will be increased inc_len according to - * stride I/O pattern - */ -static void ras_stride_increase_window(struct ll_readahead_state *ras, - struct ll_ra_info *ra, - unsigned long inc_len) -{ - unsigned long left, step, window_len; - unsigned long stride_len; - - LASSERT(ras->ras_stride_length > 0); - LASSERTF(ras->ras_window_start + ras->ras_window_len >= - ras->ras_stride_offset, - "window_start %lu, window_len %lu stride_offset %lu\n", - ras->ras_window_start, - ras->ras_window_len, ras->ras_stride_offset); - - stride_len = ras->ras_window_start + ras->ras_window_len - - ras->ras_stride_offset; - - left = stride_len % ras->ras_stride_length; - window_len = ras->ras_window_len - left; - - if (left < ras->ras_stride_pages) - left += inc_len; - else - left = ras->ras_stride_pages + inc_len; - - LASSERT(ras->ras_stride_pages != 0); - - step = left / ras->ras_stride_pages; - left %= ras->ras_stride_pages; - - window_len += step * ras->ras_stride_length + left; - - if (stride_pg_count(ras->ras_stride_offset, ras->ras_stride_length, - ras->ras_stride_pages, ras->ras_stride_offset, - window_len) <= ra->ra_max_pages_per_file) - ras->ras_window_len = window_len; - - RAS_CDEBUG(ras); -} - -static void ras_increase_window(struct inode *inode, - struct ll_readahead_state *ras, - struct ll_ra_info *ra) -{ - /* The stretch of ra-window should be aligned with max rpc_size - * but current clio architecture does not support retrieve such - * information from lower layer. 
FIXME later - */ - if (stride_io_mode(ras)) { - ras_stride_increase_window(ras, ra, ras->ras_rpc_size); - } else { - unsigned long wlen; - - wlen = min(ras->ras_window_len + ras->ras_rpc_size, - ra->ra_max_pages_per_file); - ras->ras_window_len = ras_align(ras, wlen, NULL); - } -} - -static void ras_update(struct ll_sb_info *sbi, struct inode *inode, - struct ll_readahead_state *ras, unsigned long index, - enum ras_update_flags flags) -{ - struct ll_ra_info *ra = &sbi->ll_ra_info; - int zero = 0, stride_detect = 0, ra_miss = 0; - bool hit = flags & LL_RAS_HIT; - - spin_lock(&ras->ras_lock); - - if (!hit) - CDEBUG(D_READA, DFID " pages at %lu miss.\n", - PFID(ll_inode2fid(inode)), index); - - ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); - - /* reset the read-ahead window in two cases. First when the app seeks - * or reads to some other part of the file. Secondly if we get a - * read-ahead miss that we think we've previously issued. This can - * be a symptom of there being so many read-ahead pages that the VM is - * reclaiming it before we get to it. - */ - if (!index_in_window(index, ras->ras_last_readpage, 8, 8)) { - zero = 1; - ll_ra_stats_inc_sbi(sbi, RA_STAT_DISTANT_READPAGE); - } else if (!hit && ras->ras_window_len && - index < ras->ras_next_readahead && - index_in_window(index, ras->ras_window_start, 0, - ras->ras_window_len)) { - ra_miss = 1; - ll_ra_stats_inc_sbi(sbi, RA_STAT_MISS_IN_WINDOW); - } - - /* On the second access to a file smaller than the tunable - * ra_max_read_ahead_whole_pages trigger RA on all pages in the - * file up to ra_max_pages_per_file. This is simply a best effort - * and only occurs once per open file. Normal RA behavior is reverted - * to for subsequent IO. The mmap case does not increment - * ras_requests and thus can never trigger this behavior. - */ - if (ras->ras_requests >= 2 && !ras->ras_request_index) { - __u64 kms_pages; - - kms_pages = (i_size_read(inode) + PAGE_SIZE - 1) >> - PAGE_SHIFT; - - CDEBUG(D_READA, "kmsp %llu mwp %lu mp %lu\n", kms_pages, - ra->ra_max_read_ahead_whole_pages, - ra->ra_max_pages_per_file); - - if (kms_pages && - kms_pages <= ra->ra_max_read_ahead_whole_pages) { - ras->ras_window_start = 0; - ras->ras_next_readahead = index + 1; - ras->ras_window_len = min(ra->ra_max_pages_per_file, - ra->ra_max_read_ahead_whole_pages); - goto out_unlock; - } - } - if (zero) { - /* check whether it is in stride I/O mode*/ - if (!index_in_stride_window(ras, index)) { - if (ras->ras_consecutive_stride_requests == 0 && - ras->ras_request_index == 0) { - ras_update_stride_detector(ras, index); - ras->ras_consecutive_stride_requests++; - } else { - ras_stride_reset(ras); - } - ras_reset(inode, ras, index); - ras->ras_consecutive_pages++; - goto out_unlock; - } else { - ras->ras_consecutive_pages = 0; - ras->ras_consecutive_requests = 0; - if (++ras->ras_consecutive_stride_requests > 1) - stride_detect = 1; - RAS_CDEBUG(ras); - } - } else { - if (ra_miss) { - if (index_in_stride_window(ras, index) && - stride_io_mode(ras)) { - if (index != ras->ras_last_readpage + 1) - ras->ras_consecutive_pages = 0; - ras_reset(inode, ras, index); - - /* If stride-RA hit cache miss, the stride - * detector will not be reset to avoid the - * overhead of redetecting read-ahead mode, - * but on the condition that the stride window - * is still intersect with normal sequential - * read-ahead window. 
- */ - if (ras->ras_window_start < - ras->ras_stride_offset) - ras_stride_reset(ras); - RAS_CDEBUG(ras); - } else { - /* Reset both stride window and normal RA - * window - */ - ras_reset(inode, ras, index); - ras->ras_consecutive_pages++; - ras_stride_reset(ras); - goto out_unlock; - } - } else if (stride_io_mode(ras)) { - /* If this is contiguous read but in stride I/O mode - * currently, check whether stride step still is valid, - * if invalid, it will reset the stride ra window - */ - if (!index_in_stride_window(ras, index)) { - /* Shrink stride read-ahead window to be zero */ - ras_stride_reset(ras); - ras->ras_window_len = 0; - ras->ras_next_readahead = index; - } - } - } - ras->ras_consecutive_pages++; - ras->ras_last_readpage = index; - ras_set_start(inode, ras, index); - - if (stride_io_mode(ras)) { - /* Since stride readahead is sensitive to the offset - * of read-ahead, so we use original offset here, - * instead of ras_window_start, which is RPC aligned - */ - ras->ras_next_readahead = max(index, ras->ras_next_readahead); - ras->ras_window_start = max(ras->ras_stride_offset, - ras->ras_window_start); - } else { - if (ras->ras_next_readahead < ras->ras_window_start) - ras->ras_next_readahead = ras->ras_window_start; - if (!hit) - ras->ras_next_readahead = index + 1; - } - RAS_CDEBUG(ras); - - /* Trigger RA in the mmap case where ras_consecutive_requests - * is not incremented and thus can't be used to trigger RA - */ - if (ras->ras_consecutive_pages >= 4 && flags & LL_RAS_MMAP) { - ras_increase_window(inode, ras, ra); - /* - * reset consecutive pages so that the readahead window can - * grow gradually. - */ - ras->ras_consecutive_pages = 0; - goto out_unlock; - } - - /* Initially reset the stride window offset to next_readahead*/ - if (ras->ras_consecutive_stride_requests == 2 && stride_detect) { - /** - * Once stride IO mode is detected, next_readahead should be - * reset to make sure next_readahead > stride offset - */ - ras->ras_next_readahead = max(index, ras->ras_next_readahead); - ras->ras_stride_offset = index; - ras->ras_window_start = max(index, ras->ras_window_start); - } - - /* The initial ras_window_len is set to the request size. To avoid - * uselessly reading and discarding pages for random IO the window is - * only increased once per consecutive request received. 
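-	 *
-	 * Example (hypothetical numbers): with ras_rpc_size = 256 and
-	 * ra_max_pages_per_file = 4096, a steady sequential reader
-	 * grows the window as 256, 512, 768, ... pages, one
-	 * ras_rpc_size step per new request, until the per-file cap is
-	 * reached.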
- */ - if ((ras->ras_consecutive_requests > 1 || stride_detect) && - !ras->ras_request_index) - ras_increase_window(inode, ras, ra); -out_unlock: - RAS_CDEBUG(ras); - ras->ras_request_index++; - spin_unlock(&ras->ras_lock); -} - -int ll_writepage(struct page *vmpage, struct writeback_control *wbc) -{ - struct inode *inode = vmpage->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct lu_env *env; - struct cl_io *io; - struct cl_page *page; - struct cl_object *clob; - bool redirtied = false; - bool unlocked = false; - int result; - u16 refcheck; - - LASSERT(PageLocked(vmpage)); - LASSERT(!PageWriteback(vmpage)); - - LASSERT(ll_i2dtexp(inode)); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - result = PTR_ERR(env); - goto out; - } - - clob = ll_i2info(inode)->lli_clob; - LASSERT(clob); - - io = vvp_env_thread_io(env); - io->ci_obj = clob; - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, clob); - if (result == 0) { - page = cl_page_find(env, clob, vmpage->index, - vmpage, CPT_CACHEABLE); - if (!IS_ERR(page)) { - lu_ref_add(&page->cp_reference, "writepage", - current); - cl_page_assume(env, io, page); - result = cl_page_flush(env, io, page); - if (result != 0) { - /* - * Re-dirty page on error so it retries write, - * but not in case when IO has actually - * occurred and completed with an error. - */ - if (!PageError(vmpage)) { - redirty_page_for_writepage(wbc, vmpage); - result = 0; - redirtied = true; - } - } - cl_page_disown(env, io, page); - unlocked = true; - lu_ref_del(&page->cp_reference, - "writepage", current); - cl_page_put(env, page); - } else { - result = PTR_ERR(page); - } - } - cl_io_fini(env, io); - - if (redirtied && wbc->sync_mode == WB_SYNC_ALL) { - loff_t offset = cl_offset(clob, vmpage->index); - - /* Flush page failed because the extent is being written out. - * Wait for the write of extent to be finished to avoid - * breaking kernel which assumes ->writepage should mark - * PageWriteback or clean the page. - */ - result = cl_sync_file_range(inode, offset, - offset + PAGE_SIZE - 1, - CL_FSYNC_LOCAL, 1); - if (result > 0) { - /* actually we may have written more than one page. - * decreasing this page because the caller will count - * it. - */ - wbc->nr_to_write -= result - 1; - result = 0; - } - } - - cl_env_put(env, &refcheck); - goto out; - -out: - if (result < 0) { - if (!lli->lli_async_rc) - lli->lli_async_rc = result; - SetPageError(vmpage); - if (!unlocked) - unlock_page(vmpage); - } - return result; -} - -int ll_writepages(struct address_space *mapping, struct writeback_control *wbc) -{ - struct inode *inode = mapping->host; - struct ll_sb_info *sbi = ll_i2sbi(inode); - loff_t start; - loff_t end; - enum cl_fsync_mode mode; - int range_whole = 0; - int result; - int ignore_layout = 0; - - if (wbc->range_cyclic) { - start = mapping->writeback_index << PAGE_SHIFT; - end = OBD_OBJECT_EOF; - } else { - start = wbc->range_start; - end = wbc->range_end; - if (end == LLONG_MAX) { - end = OBD_OBJECT_EOF; - range_whole = start == 0; - } - } - - mode = CL_FSYNC_NONE; - if (wbc->sync_mode == WB_SYNC_ALL) - mode = CL_FSYNC_LOCAL; - - if (sbi->ll_umounting) - /* if the mountpoint is being umounted, all pages have to be - * evicted to avoid hitting LBUG when truncate_inode_pages() - * is called later on. 
- */ - ignore_layout = 1; - - if (!ll_i2info(inode)->lli_clob) - return 0; - - result = cl_sync_file_range(inode, start, end, mode, ignore_layout); - if (result > 0) { - wbc->nr_to_write -= result; - result = 0; - } - - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) { - if (end == OBD_OBJECT_EOF) - mapping->writeback_index = 0; - else - mapping->writeback_index = (end >> PAGE_SHIFT) + 1; - } - return result; -} - -struct ll_cl_context *ll_cl_find(struct file *file) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_cl_context *lcc; - struct ll_cl_context *found = NULL; - - read_lock(&fd->fd_lock); - list_for_each_entry(lcc, &fd->fd_lccs, lcc_list) { - if (lcc->lcc_cookie == current) { - found = lcc; - break; - } - } - read_unlock(&fd->fd_lock); - - return found; -} - -void ll_cl_add(struct file *file, const struct lu_env *env, struct cl_io *io) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; - - memset(lcc, 0, sizeof(*lcc)); - INIT_LIST_HEAD(&lcc->lcc_list); - lcc->lcc_cookie = current; - lcc->lcc_env = env; - lcc->lcc_io = io; - - write_lock(&fd->fd_lock); - list_add(&lcc->lcc_list, &fd->fd_lccs); - write_unlock(&fd->fd_lock); -} - -void ll_cl_remove(struct file *file, const struct lu_env *env) -{ - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_cl_context *lcc = &ll_env_info(env)->lti_io_ctx; - - write_lock(&fd->fd_lock); - list_del_init(&lcc->lcc_list); - write_unlock(&fd->fd_lock); -} - -static int ll_io_read_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - struct inode *inode = vvp_object_inode(page->cp_obj); - struct ll_file_data *fd = vvp_env_io(env)->vui_fd; - struct ll_readahead_state *ras = &fd->fd_ras; - struct cl_2queue *queue = &io->ci_queue; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct vvp_page *vpg; - bool uptodate; - int rc = 0; - - vpg = cl2vvp_page(cl_object_page_slice(page->cp_obj, page)); - uptodate = vpg->vpg_defer_uptodate; - - if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && - sbi->ll_ra_info.ra_max_pages > 0) { - struct vvp_io *vio = vvp_env_io(env); - enum ras_update_flags flags = 0; - - if (uptodate) - flags |= LL_RAS_HIT; - if (!vio->vui_ra_valid) - flags |= LL_RAS_MMAP; - ras_update(sbi, inode, ras, vvp_index(vpg), flags); - } - - cl_2queue_init(queue); - if (uptodate) { - vpg->vpg_ra_used = 1; - cl_page_export(env, page, 1); - cl_page_disown(env, io, page); - } else { - cl_page_list_add(&queue->c2_qin, page); - } - - if (sbi->ll_ra_info.ra_max_pages_per_file > 0 && - sbi->ll_ra_info.ra_max_pages > 0) { - int rc2; - - rc2 = ll_readahead(env, io, &queue->c2_qin, ras, - uptodate); - CDEBUG(D_READA, DFID "%d pages read ahead at %lu\n", - PFID(ll_inode2fid(inode)), rc2, vvp_index(vpg)); - } - - if (queue->c2_qin.pl_nr > 0) - rc = cl_io_submit_rw(env, io, CRT_READ, queue); - - /* - * Unlock unsent pages in case of error. 
- */ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); - - return rc; -} - -int ll_readpage(struct file *file, struct page *vmpage) -{ - struct cl_object *clob = ll_i2info(file_inode(file))->lli_clob; - struct ll_cl_context *lcc; - const struct lu_env *env; - struct cl_io *io; - struct cl_page *page; - int result; - - lcc = ll_cl_find(file); - if (!lcc) { - unlock_page(vmpage); - return -EIO; - } - - env = lcc->lcc_env; - io = lcc->lcc_io; - LASSERT(io->ci_state == CIS_IO_GOING); - page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); - if (!IS_ERR(page)) { - LASSERT(page->cp_type == CPT_CACHEABLE); - if (likely(!PageUptodate(vmpage))) { - cl_page_assume(env, io, page); - result = ll_io_read_page(env, io, page); - } else { - /* Page from a non-object file. */ - unlock_page(vmpage); - result = 0; - } - cl_page_put(env, page); - } else { - unlock_page(vmpage); - result = PTR_ERR(page); - } - return result; -} - -int ll_page_sync_io(const struct lu_env *env, struct cl_io *io, - struct cl_page *page, enum cl_req_type crt) -{ - struct cl_2queue *queue; - int result; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - queue = &io->ci_queue; - cl_2queue_init_page(queue, page); - - result = cl_io_submit_sync(env, io, crt, queue, 0); - LASSERT(cl_page_is_owned(page, io)); - - if (crt == CRT_READ) - /* - * in CRT_WRITE case page is left locked even in case of - * error. - */ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_2queue_fini(env, queue); - - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c deleted file mode 100644 index 722e5ea1af5f9beb32638b8da4c1ae49401b2c12..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ /dev/null @@ -1,641 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/lustre/llite/rw26.c - * - * Lustre Lite I/O page cache routines for the 2.5/2.6 kernel version - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LLITE - -#include "llite_internal.h" - -/** - * Implements Linux VM address_space::invalidatepage() method. 
This method is - * called when the page is truncate from a file, either as a result of - * explicit truncate, or when inode is removed from memory (as a result of - * final iput(), umount, or memory pressure induced icache shrinking). - * - * [0, offset] bytes of the page remain valid (this is for a case of not-page - * aligned truncate). Lustre leaves partially truncated page in the cache, - * relying on struct inode::i_size to limit further accesses. - */ -static void ll_invalidatepage(struct page *vmpage, unsigned int offset, - unsigned int length) -{ - struct inode *inode; - struct lu_env *env; - struct cl_page *page; - struct cl_object *obj; - - LASSERT(PageLocked(vmpage)); - LASSERT(!PageWriteback(vmpage)); - - /* - * It is safe to not check anything in invalidatepage/releasepage - * below because they are run with page locked and all our io is - * happening with locked page too - */ - if (offset == 0 && length == PAGE_SIZE) { - /* See the comment in ll_releasepage() */ - env = cl_env_percpu_get(); - LASSERT(!IS_ERR(env)); - inode = vmpage->mapping->host; - obj = ll_i2info(inode)->lli_clob; - if (obj) { - page = cl_vmpage_page(vmpage, obj); - if (page) { - cl_page_delete(env, page); - cl_page_put(env, page); - } - } else { - LASSERT(vmpage->private == 0); - } - cl_env_percpu_put(env); - } -} - -static int ll_releasepage(struct page *vmpage, gfp_t gfp_mask) -{ - struct lu_env *env; - struct cl_object *obj; - struct cl_page *page; - struct address_space *mapping; - int result = 0; - - LASSERT(PageLocked(vmpage)); - if (PageWriteback(vmpage) || PageDirty(vmpage)) - return 0; - - mapping = vmpage->mapping; - if (!mapping) - return 1; - - obj = ll_i2info(mapping->host)->lli_clob; - if (!obj) - return 1; - - /* 1 for caller, 1 for cl_page and 1 for page cache */ - if (page_count(vmpage) > 3) - return 0; - - page = cl_vmpage_page(vmpage, obj); - if (!page) - return 1; - - env = cl_env_percpu_get(); - LASSERT(!IS_ERR(env)); - - if (!cl_page_in_use(page)) { - result = 1; - cl_page_delete(env, page); - } - - /* To use percpu env array, the call path can not be rescheduled; - * otherwise percpu array will be messed if ll_releaspage() called - * again on the same CPU. - * - * If this page holds the last refc of cl_object, the following - * call path may cause reschedule: - * cl_page_put -> cl_page_free -> cl_object_put -> - * lu_object_put -> lu_object_free -> lov_delete_raid0. - * - * However, the kernel can't get rid of this inode until all pages have - * been cleaned up. Now that we hold page lock here, it's pretty safe - * that we won't get into object delete path. 
- */ - LASSERT(cl_object_refc(obj) > 1); - cl_page_put(env, page); - - cl_env_percpu_put(env); - return result; -} - -#define MAX_DIRECTIO_SIZE (2 * 1024 * 1024 * 1024UL) - -/* ll_free_user_pages - tear down page struct array - * @pages: array of page struct pointers underlying target buffer - */ -static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) -{ - int i; - - for (i = 0; i < npages; i++) { - if (do_dirty) - set_page_dirty_lock(pages[i]); - put_page(pages[i]); - } - kvfree(pages); -} - -ssize_t ll_direct_rw_pages(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct ll_dio_pages *pv) -{ - struct cl_page *clp; - struct cl_2queue *queue; - struct cl_object *obj = io->ci_obj; - int i; - ssize_t rc = 0; - loff_t file_offset = pv->ldp_start_offset; - size_t size = pv->ldp_size; - int page_count = pv->ldp_nr; - struct page **pages = pv->ldp_pages; - size_t page_size = cl_page_size(obj); - bool do_io; - int io_pages = 0; - - queue = &io->ci_queue; - cl_2queue_init(queue); - for (i = 0; i < page_count; i++) { - if (pv->ldp_offsets) - file_offset = pv->ldp_offsets[i]; - - LASSERT(!(file_offset & (page_size - 1))); - clp = cl_page_find(env, obj, cl_index(obj, file_offset), - pv->ldp_pages[i], CPT_TRANSIENT); - if (IS_ERR(clp)) { - rc = PTR_ERR(clp); - break; - } - - rc = cl_page_own(env, io, clp); - if (rc) { - LASSERT(clp->cp_state == CPS_FREEING); - cl_page_put(env, clp); - break; - } - - do_io = true; - - /* check the page type: if the page is a host page, then do - * write directly - */ - if (clp->cp_type == CPT_CACHEABLE) { - struct page *vmpage = cl_page_vmpage(clp); - struct page *src_page; - struct page *dst_page; - void *src; - void *dst; - - src_page = (rw == WRITE) ? pages[i] : vmpage; - dst_page = (rw == WRITE) ? vmpage : pages[i]; - - src = kmap_atomic(src_page); - dst = kmap_atomic(dst_page); - memcpy(dst, src, min(page_size, size)); - kunmap_atomic(dst); - kunmap_atomic(src); - - /* make sure page will be added to the transfer by - * cl_io_submit()->...->vvp_page_prep_write(). - */ - if (rw == WRITE) - set_page_dirty(vmpage); - - if (rw == READ) { - /* do not issue the page for read, since it - * may reread a ra page which has NOT uptodate - * bit set. - */ - cl_page_disown(env, io, clp); - do_io = false; - } - } - - if (likely(do_io)) { - /* - * Add a page to the incoming page list of 2-queue. - */ - cl_page_list_add(&queue->c2_qin, clp); - - /* - * Set page clip to tell transfer formation engine - * that page has to be sent even if it is beyond KMS. - */ - cl_page_clip(env, clp, 0, min(size, page_size)); - - ++io_pages; - } - - /* drop the reference count for cl_page_find */ - cl_page_put(env, clp); - size -= page_size; - file_offset += page_size; - } - - if (rc == 0 && io_pages) { - rc = cl_io_submit_sync(env, io, - rw == READ ? 
CRT_READ : CRT_WRITE, - queue, 0); - } - if (rc == 0) - rc = pv->ldp_size; - - cl_2queue_discard(env, io, queue); - cl_2queue_disown(env, io, queue); - cl_2queue_fini(env, queue); - return rc; -} -EXPORT_SYMBOL(ll_direct_rw_pages); - -static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, - int rw, struct inode *inode, - struct address_space *mapping, - size_t size, loff_t file_offset, - struct page **pages, int page_count) -{ - struct ll_dio_pages pvec = { - .ldp_pages = pages, - .ldp_nr = page_count, - .ldp_size = size, - .ldp_offsets = NULL, - .ldp_start_offset = file_offset - }; - - return ll_direct_rw_pages(env, io, rw, inode, &pvec); -} - -/* This is the maximum size of a single O_DIRECT request, based on the - * kmalloc limit. We need to fit all of the brw_page structs, each one - * representing PAGE_SIZE worth of user data, into a single buffer, and - * then truncate this to be a full-sized RPC. For 4kB PAGE_SIZE this is - * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. - */ -#define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) * \ - PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1)) -static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter) -{ - struct ll_cl_context *lcc; - const struct lu_env *env; - struct cl_io *io; - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - loff_t file_offset = iocb->ki_pos; - ssize_t count = iov_iter_count(iter); - ssize_t tot_bytes = 0, result = 0; - long size = MAX_DIO_SIZE; - - /* Check EOF by ourselves */ - if (iov_iter_rw(iter) == READ && file_offset >= i_size_read(inode)) - return 0; - - /* FIXME: io smaller than PAGE_SIZE is broken on ia64 ??? */ - if ((file_offset & ~PAGE_MASK) || (count & ~PAGE_MASK)) - return -EINVAL; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), size=%zd (max %lu), offset=%lld=%llx, pages %zd (max %lu)\n", - PFID(ll_inode2fid(inode)), inode, count, MAX_DIO_SIZE, - file_offset, file_offset, count >> PAGE_SHIFT, - MAX_DIO_SIZE >> PAGE_SHIFT); - - /* Check that all user buffers are aligned as well */ - if (iov_iter_alignment(iter) & ~PAGE_MASK) - return -EINVAL; - - lcc = ll_cl_find(file); - if (!lcc) - return -EIO; - - env = lcc->lcc_env; - LASSERT(!IS_ERR(env)); - io = lcc->lcc_io; - LASSERT(io); - - while (iov_iter_count(iter)) { - struct page **pages; - size_t offs; - - count = min_t(size_t, iov_iter_count(iter), size); - if (iov_iter_rw(iter) == READ) { - if (file_offset >= i_size_read(inode)) - break; - if (file_offset + count > i_size_read(inode)) - count = i_size_read(inode) - file_offset; - } - - result = iov_iter_get_pages_alloc(iter, &pages, count, &offs); - if (likely(result > 0)) { - int n = DIV_ROUND_UP(result + offs, PAGE_SIZE); - - result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter), - inode, file->f_mapping, - result, file_offset, pages, - n); - ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ); - } - if (unlikely(result <= 0)) { - /* If we can't allocate a large enough buffer - * for the request, shrink it to a smaller - * PAGE_SIZE multiple and try again. - * We should always be able to kmalloc for a - * page worth of page pointers = 4MB on i386. 
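
The MAX_DIO_SIZE comment above can be checked with a little arithmetic: one struct brw_page descriptor is needed per PAGE_SIZE of user data, so the largest O_DIRECT request is (kmalloc limit / sizeof(struct brw_page)) * PAGE_SIZE, then rounded down to a full RPC (a multiple of DT_MAX_BRW_SIZE). A standalone sketch, assuming a 4kB PAGE_SIZE and a 24-byte brw_page (a 64-bit { __u64 offset, page pointer, __u32 count, __u32 flag } descriptor; the struct layout here is an assumption, not copied from Lustre):

    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    #define BRW_PAGE_SZ 24UL   /* assumed sizeof(struct brw_page) on 64-bit */

    static unsigned long max_dio_bytes(unsigned long kmalloc_max)
    {
        /* one 24-byte descriptor per 4kB of user data */
        return kmalloc_max / BRW_PAGE_SZ * PAGE_SIZE;
    }

    int main(void)
    {
        printf("128kB kmalloc -> %lu bytes (~%lu MiB)\n",
               max_dio_bytes(128UL << 10), max_dio_bytes(128UL << 10) >> 20);
        printf("4MB kmalloc   -> %lu bytes (~%lu MiB)\n",
               max_dio_bytes(4UL << 20), max_dio_bytes(4UL << 20) >> 20);
        return 0;
    }

This prints 22,368,256 bytes (~22 decimal MB, the comment's "22MB") for the 128kB case and 715,825,152 bytes (682 MiB, the comment's "682MB") for the 4MB case, so the comment's figures check out.
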
- */ - if (result == -ENOMEM && - size > (PAGE_SIZE / sizeof(*pages)) * - PAGE_SIZE) { - size = ((((size / 2) - 1) | - ~PAGE_MASK) + 1) & - PAGE_MASK; - CDEBUG(D_VFSTRACE, "DIO size now %lu\n", - size); - continue; - } - - goto out; - } - iov_iter_advance(iter, result); - tot_bytes += result; - file_offset += result; - } -out: - if (tot_bytes > 0) { - struct vvp_io *vio = vvp_env_io(env); - - /* no commit async for direct IO */ - vio->u.write.vui_written += tot_bytes; - } - - return tot_bytes ? tot_bytes : result; -} - -/** - * Prepare partially written-to page for a write. - */ -static int ll_prepare_partial_page(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg) -{ - struct cl_attr *attr = vvp_env_thread_attr(env); - struct cl_object *obj = io->ci_obj; - struct vvp_page *vpg = cl_object_page_slice(obj, pg); - loff_t offset = cl_offset(obj, vvp_index(vpg)); - int result; - - cl_object_attr_lock(obj); - result = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - if (result == 0) { - /* - * If are writing to a new page, no need to read old data. - * The extent locking will have updated the KMS, and for our - * purposes here we can treat it like i_size. - */ - if (attr->cat_kms <= offset) { - char *kaddr = kmap_atomic(vpg->vpg_page); - - memset(kaddr, 0, cl_page_size(obj)); - kunmap_atomic(kaddr); - } else if (vpg->vpg_defer_uptodate) { - vpg->vpg_ra_used = 1; - } else { - result = ll_page_sync_io(env, io, pg, CRT_READ); - } - } - return result; -} - -static int ll_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int flags, - struct page **pagep, void **fsdata) -{ - struct ll_cl_context *lcc; - const struct lu_env *env = NULL; - struct cl_io *io; - struct cl_page *page = NULL; - struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; - pgoff_t index = pos >> PAGE_SHIFT; - struct page *vmpage = NULL; - unsigned int from = pos & (PAGE_SIZE - 1); - unsigned int to = from + len; - int result = 0; - - CDEBUG(D_VFSTRACE, "Writing %lu of %d to %d bytes\n", index, from, len); - - lcc = ll_cl_find(file); - if (!lcc) { - io = NULL; - result = -EIO; - goto out; - } - - env = lcc->lcc_env; - io = lcc->lcc_io; - - /* To avoid deadlock, try to lock page first. */ - vmpage = grab_cache_page_nowait(mapping, index); - if (unlikely(!vmpage || PageDirty(vmpage) || PageWriteback(vmpage))) { - struct vvp_io *vio = vvp_env_io(env); - struct cl_page_list *plist = &vio->u.write.vui_queue; - - /* if the page is already in dirty cache, we have to commit - * the pages right now; otherwise, it may cause deadlock - * because it holds page lock of a dirty page and request for - * more grants. It's okay for the dirty page to be the first - * one in commit page list, though. 
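
ll_write_begin() below deliberately starts with grab_cache_page_nowait(): blocking on a page lock while holding queued dirty pages could deadlock against the thread that has to write those pages back. The shape of the workaround is try-lock, and on failure release what you hold, flush, then take the blocking path. A userspace pthread sketch of that ordering discipline (names are illustrative):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static int queued_pages;   /* stands in for vui_queue->pl_nr */

    static void commit_queued(void)   /* stands in for vvp_io_write_commit() */
    {
        printf("committing %d queued pages\n", queued_pages);
        queued_pages = 0;
    }

    static void begin_write(void)
    {
        if (pthread_mutex_trylock(&page_lock) != 0) {
            /*
             * Lock is busy: drop our queued work first so nobody can be
             * stuck waiting on it, then block for the lock.
             */
            if (queued_pages)
                commit_queued();
            pthread_mutex_lock(&page_lock);
        }
        /* ... prepare the page under the lock ... */
        pthread_mutex_unlock(&page_lock);
    }

    int main(void)
    {
        queued_pages = 3;
        begin_write();
        return 0;
    }
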
- */ - if (vmpage && plist->pl_nr > 0) { - unlock_page(vmpage); - put_page(vmpage); - vmpage = NULL; - } - - /* commit pages and then wait for page lock */ - result = vvp_io_write_commit(env, io); - if (result < 0) - goto out; - - if (!vmpage) { - vmpage = grab_cache_page_write_begin(mapping, index, - flags); - if (!vmpage) { - result = -ENOMEM; - goto out; - } - } - } - - page = cl_page_find(env, clob, vmpage->index, vmpage, CPT_CACHEABLE); - if (IS_ERR(page)) { - result = PTR_ERR(page); - goto out; - } - - lcc->lcc_page = page; - lu_ref_add(&page->cp_reference, "cl_io", io); - - cl_page_assume(env, io, page); - if (!PageUptodate(vmpage)) { - /* - * We're completely overwriting an existing page, - * so _don't_ set it up to date until commit_write - */ - if (from == 0 && to == PAGE_SIZE) { - CL_PAGE_HEADER(D_PAGE, env, page, "full page write\n"); - POISON_PAGE(vmpage, 0x11); - } else { - /* TODO: can be optimized at OSC layer to check if it - * is a lockless IO. In that case, it's not necessary - * to read the data. - */ - result = ll_prepare_partial_page(env, io, page); - if (result == 0) - SetPageUptodate(vmpage); - } - } - if (result < 0) - cl_page_unassume(env, io, page); -out: - if (result < 0) { - if (vmpage) { - unlock_page(vmpage); - put_page(vmpage); - } - if (!IS_ERR_OR_NULL(page)) { - lu_ref_del(&page->cp_reference, "cl_io", io); - cl_page_put(env, page); - } - if (io) - io->ci_result = result; - } else { - *pagep = vmpage; - *fsdata = lcc; - } - return result; -} - -static int ll_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int copied, - struct page *vmpage, void *fsdata) -{ - struct ll_cl_context *lcc = fsdata; - const struct lu_env *env; - struct cl_io *io; - struct vvp_io *vio; - struct cl_page *page; - unsigned int from = pos & (PAGE_SIZE - 1); - bool unplug = false; - int result = 0; - - put_page(vmpage); - - env = lcc->lcc_env; - page = lcc->lcc_page; - io = lcc->lcc_io; - vio = vvp_env_io(env); - - LASSERT(cl_page_is_owned(page, io)); - if (copied > 0) { - struct cl_page_list *plist = &vio->u.write.vui_queue; - - lcc->lcc_page = NULL; /* page will be queued */ - - /* Add it into write queue */ - cl_page_list_add(plist, page); - if (plist->pl_nr == 1) /* first page */ - vio->u.write.vui_from = from; - else - LASSERT(from == 0); - vio->u.write.vui_to = from + copied; - - /* - * To address the deadlock in balance_dirty_pages() where - * this dirty page may be written back in the same thread. - */ - if (PageDirty(vmpage)) - unplug = true; - - /* We may have one full RPC, commit it soon */ - if (plist->pl_nr >= PTLRPC_MAX_BRW_PAGES) - unplug = true; - - CL_PAGE_DEBUG(D_VFSTRACE, env, page, - "queued page: %d.\n", plist->pl_nr); - } else { - cl_page_disown(env, io, page); - - lcc->lcc_page = NULL; - lu_ref_del(&page->cp_reference, "cl_io", io); - cl_page_put(env, page); - - /* page list is not contiguous now, commit it now */ - unplug = true; - } - - if (unplug || - file->f_flags & O_SYNC || IS_SYNC(file_inode(file))) - result = vvp_io_write_commit(env, io); - - if (result < 0) - io->ci_result = result; - return result >= 0 ? 
copied : result; -} - -#ifdef CONFIG_MIGRATION -static int ll_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode - ) -{ - /* Always fail page migration until we have a proper implementation */ - return -EIO; -} -#endif - -const struct address_space_operations ll_aops = { - .readpage = ll_readpage, - .direct_IO = ll_direct_IO_26, - .writepage = ll_writepage, - .writepages = ll_writepages, - .set_page_dirty = __set_page_dirty_nobuffers, - .write_begin = ll_write_begin, - .write_end = ll_write_end, - .invalidatepage = ll_invalidatepage, - .releasepage = (void *)ll_releasepage, -#ifdef CONFIG_MIGRATION - .migratepage = ll_migratepage, -#endif -}; diff --git a/drivers/staging/lustre/lustre/llite/statahead.c b/drivers/staging/lustre/lustre/llite/statahead.c deleted file mode 100644 index d864f5f36d851b0d328fb116a271a95aa588ec0d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/statahead.c +++ /dev/null @@ -1,1577 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include "llite_internal.h" - -#define SA_OMITTED_ENTRY_MAX 8ULL - -enum se_stat { - /** negative values are for error cases */ - SA_ENTRY_INIT = 0, /** init entry */ - SA_ENTRY_SUCC = 1, /** stat succeed */ - SA_ENTRY_INVA = 2, /** invalid entry */ -}; - -/* - * sa_entry is not refcounted: statahead thread allocates it and do async stat, - * and in async stat callback ll_statahead_interpret() will add it into - * sai_interim_entries, later statahead thread will call sa_handle_callback() to - * instantiate entry and move it into sai_entries, and then only scanner process - * can access and free it. 
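
The sa_entry lifecycle described above is a single-producer handoff: the statahead thread fills the entry, the RPC callback parks it on sai_interim_entries, sa_handle_callback() instantiates it onto sai_entries, and only the scanner frees it. The deleted code enforces the ordering with plain stores plus smp_rmb() in sa_ready(); the same publish/consume contract can be expressed in C11 release/acquire atomics, shown here as a hedged userspace analogue (not the kernel's barrier style):

    #include <stdatomic.h>
    #include <stdio.h>

    enum se_stat { SA_ENTRY_INIT, SA_ENTRY_SUCC, SA_ENTRY_INVA };

    struct entry {
        int size;                     /* payload, filled in before publish */
        _Atomic enum se_stat state;
    };

    /* producer: fill the entry, then publish the state with release order */
    static void make_ready(struct entry *e, int ok)
    {
        e->size = 42;
        atomic_store_explicit(&e->state, ok ? SA_ENTRY_SUCC : SA_ENTRY_INVA,
                              memory_order_release);
    }

    /* consumer: the acquire load pairs with the release store, so once the
     * state reads as non-INIT the payload stores are visible too */
    static int entry_ready(const struct entry *e)
    {
        return atomic_load_explicit(&e->state, memory_order_acquire)
                != SA_ENTRY_INIT;
    }

    int main(void)
    {
        struct entry e = { .state = SA_ENTRY_INIT };

        make_ready(&e, 1);
        if (entry_ready(&e))
            printf("entry size %d\n", e.size);
        return 0;
    }
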
- */ -struct sa_entry { - /* link into sai_interim_entries or sai_entries */ - struct list_head se_list; - /* link into sai hash table locally */ - struct list_head se_hash; - /* entry index in the sai */ - __u64 se_index; - /* low layer ldlm lock handle */ - __u64 se_handle; - /* entry status */ - enum se_stat se_state; - /* entry size, contains name */ - int se_size; - /* pointer to async getattr enqueue info */ - struct md_enqueue_info *se_minfo; - /* pointer to the async getattr request */ - struct ptlrpc_request *se_req; - /* pointer to the target inode */ - struct inode *se_inode; - /* entry name */ - struct qstr se_qstr; - /* entry fid */ - struct lu_fid se_fid; -}; - -static unsigned int sai_generation; -static DEFINE_SPINLOCK(sai_generation_lock); - -/* sa_entry is ready to use */ -static inline int sa_ready(struct sa_entry *entry) -{ - smp_rmb(); - return (entry->se_state != SA_ENTRY_INIT); -} - -/* hash value to put in sai_cache */ -static inline int sa_hash(int val) -{ - return val & LL_SA_CACHE_MASK; -} - -/* hash entry into sai_cache */ -static inline void -sa_rehash(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - int i = sa_hash(entry->se_qstr.hash); - - spin_lock(&sai->sai_cache_lock[i]); - list_add_tail(&entry->se_hash, &sai->sai_cache[i]); - spin_unlock(&sai->sai_cache_lock[i]); -} - -/* - * Remove entry from SA table. - */ -static inline void -sa_unhash(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - int i = sa_hash(entry->se_qstr.hash); - - spin_lock(&sai->sai_cache_lock[i]); - list_del_init(&entry->se_hash); - spin_unlock(&sai->sai_cache_lock[i]); -} - -static inline int agl_should_run(struct ll_statahead_info *sai, - struct inode *inode) -{ - return (inode && S_ISREG(inode->i_mode) && sai->sai_agl_valid); -} - -/* statahead window is full */ -static inline int sa_sent_full(struct ll_statahead_info *sai) -{ - return atomic_read(&sai->sai_cache_count) >= sai->sai_max; -} - -/* got async stat replies */ -static inline int sa_has_callback(struct ll_statahead_info *sai) -{ - return !list_empty(&sai->sai_interim_entries); -} - -static inline int agl_list_empty(struct ll_statahead_info *sai) -{ - return list_empty(&sai->sai_agls); -} - -/** - * (1) hit ratio less than 80% - * or - * (2) consecutive miss more than 8 - * then means low hit. - */ -static inline int sa_low_hit(struct ll_statahead_info *sai) -{ - return ((sai->sai_hit > 7 && sai->sai_hit < 4 * sai->sai_miss) || - (sai->sai_consecutive_miss > 8)); -} - -/* - * if the given index is behind of statahead window more than - * SA_OMITTED_ENTRY_MAX, then it is old. 
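
The sa_low_hit() heuristic above encodes the "hit ratio less than 80%" rule as hit < 4 * miss, which is the same inequality: hit / (hit + miss) < 4/5 iff 5*hit < 4*hit + 4*miss iff hit < 4*miss. The sai_hit > 7 guard requires at least eight hits before the ratio test can fire, so a handful of early misses cannot kill the thread. A tiny self-check that mirrors the logic (this is an illustration, not Lustre code):

    #include <stdio.h>

    /* mirror of sa_low_hit(): low when hit ratio < 80% or >8 misses in a row */
    static int low_hit(unsigned long hit, unsigned long miss, unsigned long consec)
    {
        return (hit > 7 && hit < 4 * miss) || consec > 8;
    }

    int main(void)
    {
        printf("%d\n", low_hit(100, 10, 0));   /* 0: ~91% hits, keep going */
        printf("%d\n", low_hit(100, 30, 0));   /* 1: ~77% hits, give up */
        printf("%d\n", low_hit(100, 10, 9));   /* 1: 9 misses in a row */
        return 0;
    }
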
- */ -static inline int is_omitted_entry(struct ll_statahead_info *sai, __u64 index) -{ - return ((__u64)sai->sai_max + index + SA_OMITTED_ENTRY_MAX < - sai->sai_index); -} - -/* allocate sa_entry and hash it to allow scanner process to find it */ -static struct sa_entry * -sa_alloc(struct dentry *parent, struct ll_statahead_info *sai, __u64 index, - const char *name, int len, const struct lu_fid *fid) -{ - struct ll_inode_info *lli; - struct sa_entry *entry; - int entry_size; - char *dname; - - entry_size = sizeof(struct sa_entry) + (len & ~3) + 4; - entry = kzalloc(entry_size, GFP_NOFS); - if (unlikely(!entry)) - return ERR_PTR(-ENOMEM); - - CDEBUG(D_READA, "alloc sa entry %.*s(%p) index %llu\n", - len, name, entry, index); - - entry->se_index = index; - entry->se_state = SA_ENTRY_INIT; - entry->se_size = entry_size; - dname = (char *)entry + sizeof(struct sa_entry); - memcpy(dname, name, len); - dname[len] = 0; - - entry->se_qstr.hash = full_name_hash(parent, name, len); - entry->se_qstr.len = len; - entry->se_qstr.name = dname; - entry->se_fid = *fid; - - lli = ll_i2info(sai->sai_dentry->d_inode); - spin_lock(&lli->lli_sa_lock); - INIT_LIST_HEAD(&entry->se_list); - sa_rehash(sai, entry); - spin_unlock(&lli->lli_sa_lock); - - atomic_inc(&sai->sai_cache_count); - - return entry; -} - -/* free sa_entry, which should have been unhashed and not in any list */ -static void sa_free(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - CDEBUG(D_READA, "free sa entry %.*s(%p) index %llu\n", - entry->se_qstr.len, entry->se_qstr.name, entry, - entry->se_index); - - LASSERT(list_empty(&entry->se_list)); - LASSERT(list_empty(&entry->se_hash)); - - kfree(entry); - atomic_dec(&sai->sai_cache_count); -} - -/* - * find sa_entry by name, used by directory scanner, lock is not needed because - * only scanner can remove the entry from cache. 
- */ -static struct sa_entry * -sa_get(struct ll_statahead_info *sai, const struct qstr *qstr) -{ - struct sa_entry *entry; - int i = sa_hash(qstr->hash); - - list_for_each_entry(entry, &sai->sai_cache[i], se_hash) { - if (entry->se_qstr.hash == qstr->hash && - entry->se_qstr.len == qstr->len && - memcmp(entry->se_qstr.name, qstr->name, qstr->len) == 0) - return entry; - } - return NULL; -} - -/* unhash and unlink sa_entry, and then free it */ -static inline void -sa_kill(struct ll_statahead_info *sai, struct sa_entry *entry) -{ - struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); - - LASSERT(!list_empty(&entry->se_hash)); - LASSERT(!list_empty(&entry->se_list)); - LASSERT(sa_ready(entry)); - - sa_unhash(sai, entry); - - spin_lock(&lli->lli_sa_lock); - list_del_init(&entry->se_list); - spin_unlock(&lli->lli_sa_lock); - - if (entry->se_inode) - iput(entry->se_inode); - - sa_free(sai, entry); -} - -/* called by scanner after use, sa_entry will be killed */ -static void -sa_put(struct ll_statahead_info *sai, struct sa_entry *entry, struct ll_inode_info *lli) -{ - struct sa_entry *tmp, *next; - - if (entry && entry->se_state == SA_ENTRY_SUCC) { - struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); - - sai->sai_hit++; - sai->sai_consecutive_miss = 0; - sai->sai_max = min(2 * sai->sai_max, sbi->ll_sa_max); - } else { - sai->sai_miss++; - sai->sai_consecutive_miss++; - } - - if (entry) - sa_kill(sai, entry); - - /* - * kill old completed entries, only scanner process does this, no need - * to lock - */ - list_for_each_entry_safe(tmp, next, &sai->sai_entries, se_list) { - if (!is_omitted_entry(sai, tmp->se_index)) - break; - sa_kill(sai, tmp); - } - - spin_lock(&lli->lli_sa_lock); - if (sai->sai_task) - wake_up_process(sai->sai_task); - spin_unlock(&lli->lli_sa_lock); - -} - -/* - * update state and sort add entry to sai_entries by index, return true if - * scanner is waiting on this entry. - */ -static bool -__sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) -{ - struct list_head *pos = &sai->sai_entries; - __u64 index = entry->se_index; - struct sa_entry *se; - - LASSERT(!sa_ready(entry)); - LASSERT(list_empty(&entry->se_list)); - - list_for_each_entry_reverse(se, &sai->sai_entries, se_list) { - if (se->se_index < entry->se_index) { - pos = &se->se_list; - break; - } - } - list_add(&entry->se_list, pos); - entry->se_state = ret < 0 ? SA_ENTRY_INVA : SA_ENTRY_SUCC; - - return (index == sai->sai_index_wait); -} - -/* - * release resources used in async stat RPC, update entry state and wakeup if - * scanner process it waiting on this entry. - */ -static void -sa_make_ready(struct ll_statahead_info *sai, struct sa_entry *entry, int ret) -{ - struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); - struct md_enqueue_info *minfo = entry->se_minfo; - struct ptlrpc_request *req = entry->se_req; - bool wakeup; - - /* release resources used in RPC */ - if (minfo) { - entry->se_minfo = NULL; - ll_intent_release(&minfo->mi_it); - iput(minfo->mi_dir); - kfree(minfo); - } - - if (req) { - entry->se_req = NULL; - ptlrpc_req_finished(req); - } - - spin_lock(&lli->lli_sa_lock); - wakeup = __sa_make_ready(sai, entry, ret); - spin_unlock(&lli->lli_sa_lock); - - if (wakeup) - wake_up(&sai->sai_waitq); -} - -/* Insert inode into the list of sai_agls. 
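
__sa_make_ready() above keeps sai_entries sorted by se_index and searches for the insertion point backwards with list_for_each_entry_reverse(), which is O(1) in the common case because completions usually arrive in roughly ascending index order. A generic userspace version of that reverse sorted insert, using a kernel-style circular doubly-linked list (names are illustrative):

    #include <stdio.h>

    struct node {
        unsigned long long index;
        struct node *prev, *next;
    };

    /* the head is a sentinel in a circular list, like struct list_head */
    static void list_init(struct node *head)
    {
        head->prev = head;
        head->next = head;
    }

    /* insert 'n' keeping the list sorted by index; walk backwards because
     * new entries almost always carry the highest index so far */
    static void sorted_insert(struct node *head, struct node *n)
    {
        struct node *pos = head;
        struct node *p;

        for (p = head->prev; p != head; p = p->prev)
            if (p->index < n->index) {
                pos = p;
                break;
            }
        n->prev = pos;
        n->next = pos->next;
        pos->next->prev = n;
        pos->next = n;
    }

    int main(void)
    {
        struct node head, a = { .index = 1 }, b = { .index = 3 }, c = { .index = 2 };
        struct node *p;

        list_init(&head);
        sorted_insert(&head, &a);
        sorted_insert(&head, &b);
        sorted_insert(&head, &c);
        for (p = head.next; p != &head; p = p->next)
            printf("%llu ", p->index);   /* prints: 1 2 3 */
        printf("\n");
        return 0;
    }

Walking forward from the head would make the common append-at-tail case O(n); the reverse walk is the whole point of the kernel helper chosen here.
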
*/ -static void ll_agl_add(struct ll_statahead_info *sai, - struct inode *inode, int index) -{ - struct ll_inode_info *child = ll_i2info(inode); - struct ll_inode_info *parent = ll_i2info(sai->sai_dentry->d_inode); - int added = 0; - - spin_lock(&child->lli_agl_lock); - if (child->lli_agl_index == 0) { - child->lli_agl_index = index; - spin_unlock(&child->lli_agl_lock); - - LASSERT(list_empty(&child->lli_agl_list)); - - igrab(inode); - spin_lock(&parent->lli_agl_lock); - if (list_empty(&sai->sai_agls)) - added = 1; - list_add_tail(&child->lli_agl_list, &sai->sai_agls); - spin_unlock(&parent->lli_agl_lock); - } else { - spin_unlock(&child->lli_agl_lock); - } - - if (added > 0) - wake_up_process(sai->sai_agl_task); -} - -/* allocate sai */ -static struct ll_statahead_info *ll_sai_alloc(struct dentry *dentry) -{ - struct ll_inode_info *lli = ll_i2info(dentry->d_inode); - struct ll_statahead_info *sai; - int i; - - sai = kzalloc(sizeof(*sai), GFP_NOFS); - if (!sai) - return NULL; - - sai->sai_dentry = dget(dentry); - atomic_set(&sai->sai_refcount, 1); - - sai->sai_max = LL_SA_RPC_MIN; - sai->sai_index = 1; - init_waitqueue_head(&sai->sai_waitq); - - INIT_LIST_HEAD(&sai->sai_interim_entries); - INIT_LIST_HEAD(&sai->sai_entries); - INIT_LIST_HEAD(&sai->sai_agls); - - for (i = 0; i < LL_SA_CACHE_SIZE; i++) { - INIT_LIST_HEAD(&sai->sai_cache[i]); - spin_lock_init(&sai->sai_cache_lock[i]); - } - atomic_set(&sai->sai_cache_count, 0); - - spin_lock(&sai_generation_lock); - lli->lli_sa_generation = ++sai_generation; - if (unlikely(!sai_generation)) - lli->lli_sa_generation = ++sai_generation; - spin_unlock(&sai_generation_lock); - - return sai; -} - -/* free sai */ -static inline void ll_sai_free(struct ll_statahead_info *sai) -{ - LASSERT(sai->sai_dentry); - dput(sai->sai_dentry); - kfree(sai); -} - -/* - * take refcount of sai if sai for @dir exists, which means statahead is on for - * this directory. - */ -static inline struct ll_statahead_info *ll_sai_get(struct inode *dir) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = NULL; - - spin_lock(&lli->lli_sa_lock); - sai = lli->lli_sai; - if (sai) - atomic_inc(&sai->sai_refcount); - spin_unlock(&lli->lli_sa_lock); - - return sai; -} - -/* - * put sai refcount after use, if refcount reaches zero, free sai and sa_entries - * attached to it. - */ -static void ll_sai_put(struct ll_statahead_info *sai) -{ - struct ll_inode_info *lli = ll_i2info(sai->sai_dentry->d_inode); - - if (atomic_dec_and_lock(&sai->sai_refcount, &lli->lli_sa_lock)) { - struct ll_sb_info *sbi = ll_i2sbi(sai->sai_dentry->d_inode); - struct sa_entry *entry, *next; - - lli->lli_sai = NULL; - spin_unlock(&lli->lli_sa_lock); - - LASSERT(sai->sai_task == NULL); - LASSERT(sai->sai_agl_task == NULL); - LASSERT(sai->sai_sent == sai->sai_replied); - LASSERT(!sa_has_callback(sai)); - - list_for_each_entry_safe(entry, next, &sai->sai_entries, - se_list) - sa_kill(sai, entry); - - LASSERT(atomic_read(&sai->sai_cache_count) == 0); - LASSERT(list_empty(&sai->sai_agls)); - - ll_sai_free(sai); - atomic_dec(&sbi->ll_sa_running); - } -} - -/* Do NOT forget to drop inode refcount when into sai_agls. 
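
ll_sai_put() above hinges on atomic_dec_and_lock(): the refcount drops locklessly unless this is the final reference, in which case the spinlock is taken first and the function returns true with it held, so teardown and the lli_sai = NULL store are atomic with respect to ll_sai_get(). A userspace approximation of that contract with C11 atomics and a mutex (a sketch of the semantics, not the kernel implementation):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* decrement 'cnt'; if it would reach zero, take 'lk' first and return
     * true with the lock held so the caller can free under the lock */
    static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lk)
    {
        int v = atomic_load(cnt);

        while (v > 1)   /* fast path: not the last reference */
            if (atomic_compare_exchange_weak(cnt, &v, v - 1))
                return false;

        pthread_mutex_lock(lk);
        if (atomic_fetch_sub(cnt, 1) == 1)
            return true;   /* last ref: caller frees, lock still held */
        pthread_mutex_unlock(lk);
        return false;
    }

    int main(void)
    {
        atomic_int ref = 1;
        pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

        if (dec_and_lock(&ref, &lk)) {
            printf("last reference: tearing down under the lock\n");
            pthread_mutex_unlock(&lk);
        }
        return 0;
    }
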
*/ -static void ll_agl_trigger(struct inode *inode, struct ll_statahead_info *sai) -{ - struct ll_inode_info *lli = ll_i2info(inode); - __u64 index = lli->lli_agl_index; - int rc; - - LASSERT(list_empty(&lli->lli_agl_list)); - - /* AGL maybe fall behind statahead with one entry */ - if (is_omitted_entry(sai, index + 1)) { - lli->lli_agl_index = 0; - iput(inode); - return; - } - - /* Someone is in glimpse (sync or async), do nothing. */ - rc = down_write_trylock(&lli->lli_glimpse_sem); - if (rc == 0) { - lli->lli_agl_index = 0; - iput(inode); - return; - } - - /* - * Someone triggered glimpse within 1 sec before. - * 1) The former glimpse succeeded with glimpse lock granted by OST, and - * if the lock is still cached on client, AGL needs to do nothing. If - * it is cancelled by other client, AGL maybe cannot obtain new lock - * for no glimpse callback triggered by AGL. - * 2) The former glimpse succeeded, but OST did not grant glimpse lock. - * Under such case, it is quite possible that the OST will not grant - * glimpse lock for AGL also. - * 3) The former glimpse failed, compared with other two cases, it is - * relative rare. AGL can ignore such case, and it will not muchly - * affect the performance. - */ - if (lli->lli_glimpse_time != 0 && - time_before(jiffies - 1 * HZ, lli->lli_glimpse_time)) { - up_write(&lli->lli_glimpse_sem); - lli->lli_agl_index = 0; - iput(inode); - return; - } - - CDEBUG(D_READA, "Handling (init) async glimpse: inode = " - DFID ", idx = %llu\n", PFID(&lli->lli_fid), index); - - cl_agl(inode); - lli->lli_agl_index = 0; - lli->lli_glimpse_time = jiffies; - up_write(&lli->lli_glimpse_sem); - - CDEBUG(D_READA, "Handled (init) async glimpse: inode= " - DFID ", idx = %llu, rc = %d\n", - PFID(&lli->lli_fid), index, rc); - - iput(inode); -} - -/* - * prepare inode for sa entry, add it into agl list, now sa_entry is ready - * to be used by scanner process. - */ -static void sa_instantiate(struct ll_statahead_info *sai, - struct sa_entry *entry) -{ - struct inode *dir = sai->sai_dentry->d_inode; - struct inode *child; - struct md_enqueue_info *minfo; - struct lookup_intent *it; - struct ptlrpc_request *req; - struct mdt_body *body; - int rc = 0; - - LASSERT(entry->se_handle != 0); - - minfo = entry->se_minfo; - it = &minfo->mi_it; - req = entry->se_req; - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - rc = -EFAULT; - goto out; - } - - child = entry->se_inode; - if (child) { - /* revalidate; unlinked and re-created with the same name */ - if (unlikely(!lu_fid_eq(&minfo->mi_data.op_fid2, &body->mbo_fid1))) { - entry->se_inode = NULL; - iput(child); - child = NULL; - } - } - - it->it_lock_handle = entry->se_handle; - rc = md_revalidate_lock(ll_i2mdexp(dir), it, ll_inode2fid(dir), NULL); - if (rc != 1) { - rc = -EAGAIN; - goto out; - } - - rc = ll_prep_inode(&child, req, dir->i_sb, it); - if (rc) - goto out; - - CDEBUG(D_READA, "%s: setting %.*s" DFID " l_data to inode %p\n", - ll_get_fsname(child->i_sb, NULL, 0), - entry->se_qstr.len, entry->se_qstr.name, - PFID(ll_inode2fid(child)), child); - ll_set_lock_data(ll_i2sbi(dir)->ll_md_exp, child, it, NULL); - - entry->se_inode = child; - - if (agl_should_run(sai, child)) - ll_agl_add(sai, child, entry->se_index); - -out: - /* - * sa_make_ready() will drop ldlm ibits lock refcount by calling - * ll_intent_drop_lock() in spite of failures. Do not worry about - * calling ll_intent_drop_lock() more than once. 
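
ll_agl_trigger() above also rate-limits async glimpses: time_before(jiffies - HZ, lli_glimpse_time) is true when someone glimpsed this inode within the last second, and AGL then backs off rather than re-issuing a request that cases 1)-3) in the comment show would rarely help. The same one-second throttle in userspace terms, using a monotonic clock (names are illustrative):

    #include <stdio.h>
    #include <time.h>

    static double last_glimpse;   /* stands in for lli_glimpse_time */

    static double now_sec(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec + ts.tv_nsec / 1e9;
    }

    /* return 1 and record the time if a glimpse should be sent now */
    static int glimpse_allowed(void)
    {
        double t = now_sec();

        if (last_glimpse != 0 && t - last_glimpse < 1.0)
            return 0;   /* glimpsed within the last second: skip */
        last_glimpse = t;
        return 1;
    }

    int main(void)
    {
        printf("first:  %d\n", glimpse_allowed());   /* 1 */
        printf("second: %d\n", glimpse_allowed());   /* 0, throttled */
        return 0;
    }
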
- */ - sa_make_ready(sai, entry, rc); -} - -/* once there are async stat replies, instantiate sa_entry from replies */ -static void sa_handle_callback(struct ll_statahead_info *sai) -{ - struct ll_inode_info *lli; - - lli = ll_i2info(sai->sai_dentry->d_inode); - - while (sa_has_callback(sai)) { - struct sa_entry *entry; - - spin_lock(&lli->lli_sa_lock); - if (unlikely(!sa_has_callback(sai))) { - spin_unlock(&lli->lli_sa_lock); - break; - } - entry = list_entry(sai->sai_interim_entries.next, - struct sa_entry, se_list); - list_del_init(&entry->se_list); - spin_unlock(&lli->lli_sa_lock); - - sa_instantiate(sai, entry); - } -} - -/* - * callback for async stat, because this is called in ptlrpcd context, we only - * put sa_entry in sai_cb_entries list, and let sa_handle_callback() to really - * prepare inode and instantiate sa_entry later. - */ -static int ll_statahead_interpret(struct ptlrpc_request *req, - struct md_enqueue_info *minfo, int rc) -{ - struct lookup_intent *it = &minfo->mi_it; - struct inode *dir = minfo->mi_dir; - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct sa_entry *entry = (struct sa_entry *)minfo->mi_cbdata; - __u64 handle = 0; - - if (it_disposition(it, DISP_LOOKUP_NEG)) - rc = -ENOENT; - - /* - * because statahead thread will wait for all inflight RPC to finish, - * sai should be always valid, no need to refcount - */ - LASSERT(sai); - LASSERT(entry); - - CDEBUG(D_READA, "sa_entry %.*s rc %d\n", - entry->se_qstr.len, entry->se_qstr.name, rc); - - if (rc) { - ll_intent_release(it); - iput(dir); - kfree(minfo); - } else { - /* - * release ibits lock ASAP to avoid deadlock when statahead - * thread enqueues lock on parent in readdir and another - * process enqueues lock on child with parent lock held, eg. - * unlink. - */ - handle = it->it_lock_handle; - ll_intent_drop_lock(it); - } - - spin_lock(&lli->lli_sa_lock); - if (rc) { - if (__sa_make_ready(sai, entry, rc)) - wake_up(&sai->sai_waitq); - } else { - int first = 0; - entry->se_minfo = minfo; - entry->se_req = ptlrpc_request_addref(req); - /* - * Release the async ibits lock ASAP to avoid deadlock - * when statahead thread tries to enqueue lock on parent - * for readpage and other tries to enqueue lock on child - * with parent's lock held, for example: unlink. - */ - entry->se_handle = handle; - if (!sa_has_callback(sai)) - first = 1; - - list_add_tail(&entry->se_list, &sai->sai_interim_entries); - - if (first && sai->sai_task) - wake_up_process(sai->sai_task); - } - sai->sai_replied++; - - spin_unlock(&lli->lli_sa_lock); - - return rc; -} - -/* finish async stat RPC arguments */ -static void sa_fini_data(struct md_enqueue_info *minfo) -{ - iput(minfo->mi_dir); - kfree(minfo); -} - -/** - * prepare arguments for async stat RPC. 
- */ -static struct md_enqueue_info * -sa_prep_data(struct inode *dir, struct inode *child, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - struct ldlm_enqueue_info *einfo; - struct md_op_data *op_data; - - minfo = kzalloc(sizeof(*minfo), GFP_NOFS); - if (!minfo) - return ERR_PTR(-ENOMEM); - - op_data = ll_prep_md_op_data(&minfo->mi_data, dir, child, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - kfree(minfo); - return (struct md_enqueue_info *)op_data; - } - - if (!child) - op_data->op_fid2 = entry->se_fid; - - minfo->mi_it.it_op = IT_GETATTR; - minfo->mi_dir = igrab(dir); - minfo->mi_cb = ll_statahead_interpret; - minfo->mi_cbdata = entry; - - einfo = &minfo->mi_einfo; - einfo->ei_type = LDLM_IBITS; - einfo->ei_mode = it_to_lock_mode(&minfo->mi_it); - einfo->ei_cb_bl = ll_md_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = NULL; - einfo->ei_cbdata = NULL; - - return minfo; -} - -/* async stat for file not found in dcache */ -static int sa_lookup(struct inode *dir, struct sa_entry *entry) -{ - struct md_enqueue_info *minfo; - int rc; - - minfo = sa_prep_data(dir, NULL, entry); - if (IS_ERR(minfo)) - return PTR_ERR(minfo); - - rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); - if (rc) - sa_fini_data(minfo); - - return rc; -} - -/** - * async stat for file found in dcache, similar to .revalidate - * - * \retval 1 dentry valid, no RPC sent - * \retval 0 dentry invalid, will send async stat RPC - * \retval negative number upon error - */ -static int sa_revalidate(struct inode *dir, struct sa_entry *entry, - struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - struct lookup_intent it = { .it_op = IT_GETATTR, - .it_lock_handle = 0 }; - struct md_enqueue_info *minfo; - int rc; - - if (unlikely(!inode)) - return 1; - - if (d_mountpoint(dentry)) - return 1; - - entry->se_inode = igrab(inode); - rc = md_revalidate_lock(ll_i2mdexp(dir), &it, ll_inode2fid(inode), - NULL); - if (rc == 1) { - entry->se_handle = it.it_lock_handle; - ll_intent_release(&it); - return 1; - } - - minfo = sa_prep_data(dir, inode, entry); - if (IS_ERR(minfo)) { - entry->se_inode = NULL; - iput(inode); - return PTR_ERR(minfo); - } - - rc = md_intent_getattr_async(ll_i2mdexp(dir), minfo); - if (rc) { - entry->se_inode = NULL; - iput(inode); - sa_fini_data(minfo); - } - - return rc; -} - -/* async stat for file with @name */ -static void sa_statahead(struct dentry *parent, const char *name, int len, - const struct lu_fid *fid) -{ - struct inode *dir = d_inode(parent); - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct dentry *dentry = NULL; - struct sa_entry *entry; - int rc; - - entry = sa_alloc(parent, sai, sai->sai_index, name, len, fid); - if (IS_ERR(entry)) - return; - - dentry = d_lookup(parent, &entry->se_qstr); - if (!dentry) { - rc = sa_lookup(dir, entry); - } else { - rc = sa_revalidate(dir, entry, dentry); - if (rc == 1 && agl_should_run(sai, d_inode(dentry))) - ll_agl_add(sai, d_inode(dentry), entry->se_index); - } - - if (dentry) - dput(dentry); - - if (rc) - sa_make_ready(sai, entry, rc); - else - sai->sai_sent++; - - sai->sai_index++; -} - -/* async glimpse (agl) thread main function */ -static int ll_agl_thread(void *arg) -{ - struct dentry *parent = arg; - struct inode *dir = d_inode(parent); - struct ll_inode_info *plli = ll_i2info(dir); - struct ll_inode_info *clli; - /* We already own this reference, so it is safe to take it without a lock. 
*/ - struct ll_statahead_info *sai = plli->lli_sai; - - CDEBUG(D_READA, "agl thread started: sai %p, parent %pd\n", - sai, parent); - - while (!kthread_should_stop()) { - - spin_lock(&plli->lli_agl_lock); - /* The statahead thread maybe help to process AGL entries, - * so check whether list empty again. - */ - if (!list_empty(&sai->sai_agls)) { - clli = list_entry(sai->sai_agls.next, - struct ll_inode_info, lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&plli->lli_agl_lock); - ll_agl_trigger(&clli->lli_vfs_inode, sai); - } else { - spin_unlock(&plli->lli_agl_lock); - } - - set_current_state(TASK_IDLE); - if (list_empty(&sai->sai_agls) && - !kthread_should_stop()) - schedule(); - __set_current_state(TASK_RUNNING); - } - - spin_lock(&plli->lli_agl_lock); - sai->sai_agl_valid = 0; - while (!list_empty(&sai->sai_agls)) { - clli = list_entry(sai->sai_agls.next, - struct ll_inode_info, lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&plli->lli_agl_lock); - clli->lli_agl_index = 0; - iput(&clli->lli_vfs_inode); - spin_lock(&plli->lli_agl_lock); - } - spin_unlock(&plli->lli_agl_lock); - CDEBUG(D_READA, "agl thread stopped: sai %p, parent %pd\n", - sai, parent); - ll_sai_put(sai); - return 0; -} - -/* start agl thread */ -static void ll_start_agl(struct dentry *parent, struct ll_statahead_info *sai) -{ - struct ll_inode_info *plli; - struct task_struct *task; - - CDEBUG(D_READA, "start agl thread: sai %p, parent %pd\n", - sai, parent); - - plli = ll_i2info(d_inode(parent)); - task = kthread_create(ll_agl_thread, parent, "ll_agl_%u", - plli->lli_opendir_pid); - if (IS_ERR(task)) { - CERROR("can't start ll_agl thread, rc: %ld\n", PTR_ERR(task)); - return; - } - - sai->sai_agl_task = task; - atomic_inc(&ll_i2sbi(d_inode(parent))->ll_agl_total); - spin_lock(&plli->lli_agl_lock); - sai->sai_agl_valid = 1; - spin_unlock(&plli->lli_agl_lock); - /* Get an extra reference that the thread holds */ - ll_sai_get(d_inode(parent)); - - wake_up_process(task); -} - -/* statahead thread main function */ -static int ll_statahead_thread(void *arg) -{ - struct dentry *parent = arg; - struct inode *dir = d_inode(parent); - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ll_statahead_info *sai = lli->lli_sai; - struct page *page = NULL; - __u64 pos = 0; - int first = 0; - int rc = 0; - struct md_op_data *op_data; - - CDEBUG(D_READA, "statahead thread starting: sai %p, parent %pd\n", - sai, parent); - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) { - rc = PTR_ERR(op_data); - goto out; - } - - op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; - - while (pos != MDS_DIR_END_OFF && sai->sai_task) { - struct lu_dirpage *dp; - struct lu_dirent *ent; - - sai->sai_in_readpage = 1; - page = ll_get_dir_page(dir, op_data, pos); - sai->sai_in_readpage = 0; - if (IS_ERR(page)) { - rc = PTR_ERR(page); - CDEBUG(D_READA, "error reading dir " DFID " at %llu/%llu: opendir_pid = %u: rc = %d\n", - PFID(ll_inode2fid(dir)), pos, sai->sai_index, - lli->lli_opendir_pid, rc); - break; - } - - dp = page_address(page); - for (ent = lu_dirent_start(dp); - ent && sai->sai_task && !sa_low_hit(sai); - ent = lu_dirent_next(ent)) { - struct lu_fid fid; - __u64 hash; - int namelen; - char *name; - - hash = le64_to_cpu(ent->lde_hash); - if (unlikely(hash < pos)) - /* - * Skip until we find target hash value. 
- */ - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (unlikely(namelen == 0)) - /* - * Skip dummy record. - */ - continue; - - name = ent->lde_name; - if (name[0] == '.') { - if (namelen == 1) { - /* - * skip "." - */ - continue; - } else if (name[1] == '.' && namelen == 2) { - /* - * skip ".." - */ - continue; - } else if (!sai->sai_ls_all) { - /* - * skip hidden files. - */ - sai->sai_skip_hidden++; - continue; - } - } - - /* - * don't stat-ahead first entry. - */ - if (unlikely(++first == 1)) - continue; - - fid_le_to_cpu(&fid, &ent->lde_fid); - - do { - sa_handle_callback(sai); - - spin_lock(&lli->lli_agl_lock); - while (sa_sent_full(sai) && - !agl_list_empty(sai)) { - struct ll_inode_info *clli; - - clli = list_entry(sai->sai_agls.next, - struct ll_inode_info, - lli_agl_list); - list_del_init(&clli->lli_agl_list); - spin_unlock(&lli->lli_agl_lock); - - ll_agl_trigger(&clli->lli_vfs_inode, - sai); - - spin_lock(&lli->lli_agl_lock); - } - spin_unlock(&lli->lli_agl_lock); - - set_current_state(TASK_IDLE); - if (sa_sent_full(sai) && - !sa_has_callback(sai) && - agl_list_empty(sai) && - sai->sai_task) - /* wait for spare statahead window */ - schedule(); - __set_current_state(TASK_RUNNING); - } while (sa_sent_full(sai) && sai->sai_task); - - sa_statahead(parent, name, namelen, &fid); - } - - pos = le64_to_cpu(dp->ldp_hash_end); - ll_release_page(dir, page, - le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - - if (sa_low_hit(sai)) { - rc = -EFAULT; - atomic_inc(&sbi->ll_sa_wrong); - CDEBUG(D_READA, "Statahead for dir " DFID " hit ratio too low: hit/miss %llu/%llu, sent/replied %llu/%llu, stopping statahead thread: pid %d\n", - PFID(&lli->lli_fid), sai->sai_hit, - sai->sai_miss, sai->sai_sent, - sai->sai_replied, current->pid); - break; - } - } - ll_finish_md_op_data(op_data); - - if (rc < 0) { - spin_lock(&lli->lli_sa_lock); - sai->sai_task = NULL; - lli->lli_sa_enabled = 0; - spin_unlock(&lli->lli_sa_lock); - } - - /* - * statahead is finished, but statahead entries need to be cached, wait - * for file release to stop me. - */ - while (sai->sai_task) { - sa_handle_callback(sai); - - set_current_state(TASK_IDLE); - if (!sa_has_callback(sai) && - sai->sai_task) - schedule(); - __set_current_state(TASK_RUNNING); - } -out: - if (sai->sai_agl_task) { - kthread_stop(sai->sai_agl_task); - - CDEBUG(D_READA, "stop agl thread: sai %p pid %u\n", - sai, (unsigned int)sai->sai_agl_task->pid); - sai->sai_agl_task = NULL; - } - /* - * wait for inflight statahead RPCs to finish, and then we can free sai - * safely because statahead RPC will access sai data - */ - while (sai->sai_sent != sai->sai_replied) { - /* in case we're not woken up, timeout wait */ - schedule_timeout_idle(HZ>>3); - } - - /* release resources held by statahead RPCs */ - sa_handle_callback(sai); - - CDEBUG(D_READA, "statahead thread stopped: sai %p, parent %pd\n", - sai, parent); - - spin_lock(&lli->lli_sa_lock); - sai->sai_task = NULL; - spin_unlock(&lli->lli_sa_lock); - - wake_up(&sai->sai_waitq); - ll_sai_put(sai); - - do_exit(rc); -} - -/* authorize opened dir handle @key to statahead */ -void ll_authorize_statahead(struct inode *dir, void *key) -{ - struct ll_inode_info *lli = ll_i2info(dir); - - spin_lock(&lli->lli_sa_lock); - if (!lli->lli_opendir_key && !lli->lli_sai) { - /* - * if lli_sai is not NULL, it means previous statahead is not - * finished yet, we'd better not start a new statahead for now. 
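
The wait loops above all follow the classic kthread protocol: set_current_state(TASK_IDLE), re-check the predicate, and only then schedule(), so a wake_up_process() that races with the check turns the schedule() into a no-op instead of a lost wakeup. The pthread equivalent is a condition variable whose predicate is always re-tested under the mutex; a minimal sketch with illustrative names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int have_work;
    static int should_stop;

    static void *worker(void *arg)
    {
        pthread_mutex_lock(&lock);
        while (!should_stop) {
            /* re-check the predicate before sleeping: no lost wakeups */
            while (!have_work && !should_stop)
                pthread_cond_wait(&cond, &lock);
            if (have_work) {
                have_work = 0;
                printf("processed one work item\n");
            }
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);

        pthread_mutex_lock(&lock);
        have_work = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_mutex_lock(&lock);
        should_stop = 1;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
    }
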
- */ - LASSERT(!lli->lli_opendir_pid); - lli->lli_opendir_key = key; - lli->lli_opendir_pid = current->pid; - lli->lli_sa_enabled = 1; - } - spin_unlock(&lli->lli_sa_lock); -} - -/* - * deauthorize opened dir handle @key to statahead, but statahead thread may - * still be running, notify it to quit. - */ -void ll_deauthorize_statahead(struct inode *dir, void *key) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai; - - LASSERT(lli->lli_opendir_key == key); - LASSERT(lli->lli_opendir_pid); - - CDEBUG(D_READA, "deauthorize statahead for " DFID "\n", - PFID(&lli->lli_fid)); - - spin_lock(&lli->lli_sa_lock); - lli->lli_opendir_key = NULL; - lli->lli_opendir_pid = 0; - lli->lli_sa_enabled = 0; - sai = lli->lli_sai; - if (sai && sai->sai_task) { - /* - * statahead thread may not quit yet because it needs to cache - * entries, now it's time to tell it to quit. - */ - wake_up_process(sai->sai_task); - sai->sai_task = NULL; - } - spin_unlock(&lli->lli_sa_lock); -} - -enum { - /** - * not first dirent, or is "." - */ - LS_NOT_FIRST_DE = 0, - /** - * the first non-hidden dirent - */ - LS_FIRST_DE, - /** - * the first hidden dirent, that is "." - */ - LS_FIRST_DOT_DE -}; - -/* file is first dirent under @dir */ -static int is_first_dirent(struct inode *dir, struct dentry *dentry) -{ - const struct qstr *target = &dentry->d_name; - struct md_op_data *op_data; - struct page *page; - __u64 pos = 0; - int dot_de; - int rc = LS_NOT_FIRST_DE; - - op_data = ll_prep_md_op_data(NULL, dir, dir, NULL, 0, 0, - LUSTRE_OPC_ANY, dir); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - /** - * FIXME choose the start offset of the readdir - */ - op_data->op_max_pages = ll_i2sbi(dir)->ll_md_brw_pages; - - page = ll_get_dir_page(dir, op_data, pos); - - while (1) { - struct lu_dirpage *dp; - struct lu_dirent *ent; - - if (IS_ERR(page)) { - struct ll_inode_info *lli = ll_i2info(dir); - - rc = PTR_ERR(page); - CERROR("%s: error reading dir " DFID " at %llu: opendir_pid = %u : rc = %d\n", - ll_get_fsname(dir->i_sb, NULL, 0), - PFID(ll_inode2fid(dir)), pos, - lli->lli_opendir_pid, rc); - break; - } - - dp = page_address(page); - for (ent = lu_dirent_start(dp); ent; - ent = lu_dirent_next(ent)) { - __u64 hash; - int namelen; - char *name; - - hash = le64_to_cpu(ent->lde_hash); - /* The ll_get_dir_page() can return any page containing - * the given hash which may be not the start hash. - */ - if (unlikely(hash < pos)) - continue; - - namelen = le16_to_cpu(ent->lde_namelen); - if (unlikely(namelen == 0)) - /* - * skip dummy record. - */ - continue; - - name = ent->lde_name; - if (name[0] == '.') { - if (namelen == 1) - /* - * skip "." - */ - continue; - else if (name[1] == '.' && namelen == 2) - /* - * skip ".." - */ - continue; - else - dot_de = 1; - } else { - dot_de = 0; - } - - if (dot_de && target->name[0] != '.') { - CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", - target->len, target->name, - namelen, name); - continue; - } - - if (target->len != namelen || - memcmp(target->name, name, namelen) != 0) - rc = LS_NOT_FIRST_DE; - else if (!dot_de) - rc = LS_FIRST_DE; - else - rc = LS_FIRST_DOT_DE; - - ll_release_page(dir, page, false); - goto out; - } - pos = le64_to_cpu(dp->ldp_hash_end); - if (pos == MDS_DIR_END_OFF) { - /* - * End of directory reached. - */ - ll_release_page(dir, page, false); - goto out; - } else { - /* - * chain is exhausted - * Normal case: continue to the next page. 
- */ - ll_release_page(dir, page, - le32_to_cpu(dp->ldp_flags) & - LDF_COLLIDE); - page = ll_get_dir_page(dir, op_data, pos); - } - } -out: - ll_finish_md_op_data(op_data); - return rc; -} - -/** - * revalidate @dentryp from statahead cache - * - * \param[in] dir parent directory - * \param[in] sai sai structure - * \param[out] dentryp pointer to dentry which will be revalidated - * \param[in] unplug unplug statahead window only (normally for negative - * dentry) - * \retval 1 on success, dentry is saved in @dentryp - * \retval 0 if revalidation failed (no proper lock on client) - * \retval negative number upon error - */ -static int revalidate_statahead_dentry(struct inode *dir, - struct ll_statahead_info *sai, - struct dentry **dentryp, - bool unplug) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct sa_entry *entry = NULL; - struct ll_dentry_data *ldd; - int rc = 0; - - if ((*dentryp)->d_name.name[0] == '.') { - if (sai->sai_ls_all || - sai->sai_miss_hidden >= sai->sai_skip_hidden) { - /* - * Hidden dentry is the first one, or statahead - * thread does not skip so many hidden dentries - * before "sai_ls_all" enabled as below. - */ - } else { - if (!sai->sai_ls_all) - /* - * It maybe because hidden dentry is not - * the first one, "sai_ls_all" was not - * set, then "ls -al" missed. Enable - * "sai_ls_all" for such case. - */ - sai->sai_ls_all = 1; - - /* - * Such "getattr" has been skipped before - * "sai_ls_all" enabled as above. - */ - sai->sai_miss_hidden++; - return -EAGAIN; - } - } - - if (unplug) { - rc = 1; - goto out_unplug; - } - - entry = sa_get(sai, &(*dentryp)->d_name); - if (!entry) { - rc = -EAGAIN; - goto out_unplug; - } - - /* if statahead is busy in readdir, help it do post-work */ - if (!sa_ready(entry) && sai->sai_in_readpage) - sa_handle_callback(sai); - - if (!sa_ready(entry)) { - spin_lock(&lli->lli_sa_lock); - sai->sai_index_wait = entry->se_index; - spin_unlock(&lli->lli_sa_lock); - if (0 == wait_event_idle_timeout(sai->sai_waitq, - sa_ready(entry), 30 * HZ)) { - /* - * entry may not be ready, so it may be used by inflight - * statahead RPC, don't free it. 
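
The 30-second bound above is wait_event_idle_timeout(): sleep until sa_ready(entry) or the timeout expires, and on timeout leave the entry alone because the in-flight RPC still owns it. In POSIX terms that is pthread_cond_timedwait() against an absolute deadline; a sketch where the predicate and names are placeholders:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
    static int entry_ready;   /* stands in for sa_ready(entry) */

    /* wait up to 'secs' for the entry; 1 = ready, 0 = timed out */
    static int wait_for_entry(int secs)
    {
        struct timespec deadline;
        int rc;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec += secs;

        pthread_mutex_lock(&lock);
        while (!entry_ready)
            if (pthread_cond_timedwait(&cond, &lock, &deadline) == ETIMEDOUT)
                break;
        rc = entry_ready;
        pthread_mutex_unlock(&lock);
        return rc;
    }

    int main(void)
    {
        /* nobody signals here, so this times out like the 30 * HZ case */
        printf("ready: %d\n", wait_for_entry(1));
        return 0;
    }
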
- */ - entry = NULL; - rc = -EAGAIN; - goto out_unplug; - } - } - - if (entry->se_state == SA_ENTRY_SUCC && entry->se_inode) { - struct inode *inode = entry->se_inode; - struct lookup_intent it = { .it_op = IT_GETATTR, - .it_lock_handle = entry->se_handle }; - __u64 bits; - - rc = md_revalidate_lock(ll_i2mdexp(dir), &it, - ll_inode2fid(inode), &bits); - if (rc == 1) { - if (!(*dentryp)->d_inode) { - struct dentry *alias; - - alias = ll_splice_alias(inode, *dentryp); - if (IS_ERR(alias)) { - ll_intent_release(&it); - rc = PTR_ERR(alias); - goto out_unplug; - } - *dentryp = alias; - /** - * statahead prepared this inode, transfer inode - * refcount from sa_entry to dentry - */ - entry->se_inode = NULL; - } else if ((*dentryp)->d_inode != inode) { - /* revalidate, but inode is recreated */ - CDEBUG(D_READA, - "%s: stale dentry %pd inode " DFID ", statahead inode " DFID "\n", - ll_get_fsname((*dentryp)->d_inode->i_sb, - NULL, 0), - *dentryp, - PFID(ll_inode2fid((*dentryp)->d_inode)), - PFID(ll_inode2fid(inode))); - ll_intent_release(&it); - rc = -ESTALE; - goto out_unplug; - } - - if ((bits & MDS_INODELOCK_LOOKUP) && - d_lustre_invalid(*dentryp)) - d_lustre_revalidate(*dentryp); - ll_intent_release(&it); - } - } -out_unplug: - /* - * statahead cached sa_entry can be used only once, and will be killed - * right after use, so if lookup/revalidate accessed statahead cache, - * set dentry ldd_sa_generation to parent lli_sa_generation, later if we - * stat this file again, we know we've done statahead before, see - * dentry_may_statahead(). - */ - ldd = ll_d2d(*dentryp); - ldd->lld_sa_generation = lli->lli_sa_generation; - sa_put(sai, entry, lli); - return rc; -} - -/** - * start statahead thread - * - * \param[in] dir parent directory - * \param[in] dentry dentry that triggers statahead, normally the first - * dirent under @dir - * \retval -EAGAIN on success, because when this function is - * called, it's already in lookup call, so client should - * do it itself instead of waiting for statahead thread - * to do it asynchronously. - * \retval negative number upon error - */ -static int start_statahead_thread(struct inode *dir, struct dentry *dentry) -{ - struct ll_inode_info *lli = ll_i2info(dir); - struct ll_statahead_info *sai = NULL; - struct task_struct *task; - struct dentry *parent = dentry->d_parent; - int rc; - - /* I am the "lli_opendir_pid" owner, only me can set "lli_sai". */ - rc = is_first_dirent(dir, dentry); - if (rc == LS_NOT_FIRST_DE) { - /* It is not "ls -{a}l" operation, no need statahead for it. */ - rc = -EFAULT; - goto out; - } - - sai = ll_sai_alloc(parent); - if (!sai) { - rc = -ENOMEM; - goto out; - } - - sai->sai_ls_all = (rc == LS_FIRST_DOT_DE); - /* - * if current lli_opendir_key was deauthorized, or dir re-opened by - * another process, don't start statahead, otherwise the newly spawned - * statahead thread won't be notified to quit. 
- */ - spin_lock(&lli->lli_sa_lock); - if (unlikely(lli->lli_sai || lli->lli_opendir_key || - lli->lli_opendir_pid != current->pid)) { - spin_unlock(&lli->lli_sa_lock); - rc = -EPERM; - goto out; - } - lli->lli_sai = sai; - spin_unlock(&lli->lli_sa_lock); - - atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_running); - - CDEBUG(D_READA, "start statahead thread: [pid %d] [parent %pd]\n", - current->pid, parent); - - task = kthread_create(ll_statahead_thread, parent, "ll_sa_%u", - lli->lli_opendir_pid); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("can't start ll_sa thread, rc : %d\n", rc); - goto out; - } - - if (ll_i2sbi(parent->d_inode)->ll_flags & LL_SBI_AGL_ENABLED) - ll_start_agl(parent, sai); - - atomic_inc(&ll_i2sbi(parent->d_inode)->ll_sa_total); - sai->sai_task = task; - - wake_up_process(task); - - /* - * We don't stat-ahead for the first dirent since we are already in - * lookup. - */ - return -EAGAIN; - -out: - /* - * once we start statahead thread failed, disable statahead so - * that subsequent stat won't waste time to try it. - */ - spin_lock(&lli->lli_sa_lock); - lli->lli_sa_enabled = 0; - lli->lli_sai = NULL; - spin_unlock(&lli->lli_sa_lock); - if (sai) - ll_sai_free(sai); - return rc; -} - -/** - * statahead entry function, this is called when client getattr on a file, it - * will start statahead thread if this is the first dir entry, else revalidate - * dentry from statahead cache. - * - * \param[in] dir parent directory - * \param[out] dentryp dentry to getattr - * \param[in] unplug unplug statahead window only (normally for negative - * dentry) - * \retval 1 on success - * \retval 0 revalidation from statahead cache failed, caller needs - * to getattr from server directly - * \retval negative number on error, caller often ignores this and - * then getattr from server - */ -int ll_statahead(struct inode *dir, struct dentry **dentryp, bool unplug) -{ - struct ll_statahead_info *sai; - - sai = ll_sai_get(dir); - if (sai) { - int rc; - - rc = revalidate_statahead_dentry(dir, sai, dentryp, unplug); - CDEBUG(D_READA, "revalidate statahead %pd: %d.\n", - *dentryp, rc); - ll_sai_put(sai); - return rc; - } - return start_statahead_thread(dir, *dentryp); -} diff --git a/drivers/staging/lustre/lustre/llite/super25.c b/drivers/staging/lustre/lustre/llite/super25.c deleted file mode 100644 index d335f29556c23f546fe1797d528df97f6f8eddc4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/super25.c +++ /dev/null @@ -1,189 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include -#include -#include -#include -#include "llite_internal.h" - -static struct kmem_cache *ll_inode_cachep; - -static struct inode *ll_alloc_inode(struct super_block *sb) -{ - struct ll_inode_info *lli; - - ll_stats_ops_tally(ll_s2sbi(sb), LPROC_LL_ALLOC_INODE, 1); - lli = kmem_cache_zalloc(ll_inode_cachep, GFP_NOFS); - if (!lli) - return NULL; - - inode_init_once(&lli->lli_vfs_inode); - return &lli->lli_vfs_inode; -} - -static void ll_inode_destroy_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - struct ll_inode_info *ptr = ll_i2info(inode); - - kmem_cache_free(ll_inode_cachep, ptr); -} - -static void ll_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, ll_inode_destroy_callback); -} - -/* exported operations */ -struct super_operations lustre_super_operations = { - .alloc_inode = ll_alloc_inode, - .destroy_inode = ll_destroy_inode, - .evict_inode = ll_delete_inode, - .put_super = ll_put_super, - .statfs = ll_statfs, - .umount_begin = ll_umount_begin, - .remount_fs = ll_remount_fs, - .show_options = ll_show_options, -}; -MODULE_ALIAS_FS("lustre"); - -static int __init lustre_init(void) -{ - int rc; - - BUILD_BUG_ON(sizeof(LUSTRE_VOLATILE_HDR) != - LUSTRE_VOLATILE_HDR_LEN + 1); - - rc = libcfs_setup(); - if (rc) - return rc; - - /* print an address of _any_ initialized kernel symbol from this - * module, to allow debugging with gdb that doesn't support data - * symbols from modules. - */ - CDEBUG(D_INFO, "Lustre client module (%p).\n", - &lustre_super_operations); - - rc = -ENOMEM; - ll_inode_cachep = kmem_cache_create("lustre_inode_cache", - sizeof(struct ll_inode_info), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, - NULL); - if (!ll_inode_cachep) - goto out_cache; - - ll_file_data_slab = kmem_cache_create("ll_file_data", - sizeof(struct ll_file_data), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ll_file_data_slab) - goto out_cache; - - llite_root = debugfs_create_dir("llite", debugfs_lustre_root); - if (IS_ERR_OR_NULL(llite_root)) { - rc = llite_root ? 
PTR_ERR(llite_root) : -ENOMEM; - llite_root = NULL; - goto out_cache; - } - - llite_kset = kset_create_and_add("llite", NULL, lustre_kobj); - if (!llite_kset) { - rc = -ENOMEM; - goto out_debugfs; - } - - rc = vvp_global_init(); - if (rc != 0) - goto out_sysfs; - - cl_inode_fini_env = cl_env_alloc(&cl_inode_fini_refcheck, - LCT_REMEMBER | LCT_NOREF); - if (IS_ERR(cl_inode_fini_env)) { - rc = PTR_ERR(cl_inode_fini_env); - goto out_vvp; - } - - cl_inode_fini_env->le_ctx.lc_cookie = 0x4; - - rc = ll_xattr_init(); - if (rc != 0) - goto out_inode_fini_env; - - lustre_register_super_ops(THIS_MODULE, ll_fill_super, ll_kill_super); - lustre_register_client_process_config(ll_process_config); - - return 0; - -out_inode_fini_env: - cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); -out_vvp: - vvp_global_fini(); -out_sysfs: - kset_unregister(llite_kset); -out_debugfs: - debugfs_remove(llite_root); -out_cache: - kmem_cache_destroy(ll_inode_cachep); - kmem_cache_destroy(ll_file_data_slab); - return rc; -} - -static void __exit lustre_exit(void) -{ - lustre_register_super_ops(NULL, NULL, NULL); - lustre_register_client_process_config(NULL); - - debugfs_remove(llite_root); - kset_unregister(llite_kset); - - ll_xattr_fini(); - cl_env_put(cl_inode_fini_env, &cl_inode_fini_refcheck); - vvp_global_fini(); - - kmem_cache_destroy(ll_inode_cachep); - kmem_cache_destroy(ll_file_data_slab); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Client File System"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(lustre_init); -module_exit(lustre_exit); diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c deleted file mode 100644 index 0690fdbf49f514065b36b5e2c285e25556e53704..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include -#include -#include -#define DEBUG_SUBSYSTEM S_LLITE - -#include "llite_internal.h" - -static int ll_readlink_internal(struct inode *inode, - struct ptlrpc_request **request, char **symname) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc, symlen = i_size_read(inode) + 1; - struct mdt_body *body; - struct md_op_data *op_data; - - *request = NULL; - - if (lli->lli_symlink_name) { - int print_limit = min_t(int, PAGE_SIZE - 128, symlen); - - *symname = lli->lli_symlink_name; - /* If the total CDEBUG() size is larger than a page, it - * will print a warning to the console, avoid this by - * printing just the last part of the symlink. - */ - CDEBUG(D_INODE, "using cached symlink %s%.*s, len = %d\n", - print_limit < symlen ? "..." : "", print_limit, - (*symname) + symlen - print_limit, symlen); - return 0; - } - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, symlen, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) - return PTR_ERR(op_data); - - op_data->op_valid = OBD_MD_LINKNAME; - rc = md_getattr(sbi->ll_md_exp, op_data, request); - ll_finish_md_op_data(op_data); - if (rc) { - if (rc != -ENOENT) - CERROR("%s: inode " DFID ": rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), rc); - goto failed; - } - - body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); - if ((body->mbo_valid & OBD_MD_LINKNAME) == 0) { - CERROR("OBD_MD_LINKNAME not set on reply\n"); - rc = -EPROTO; - goto failed; - } - - LASSERT(symlen != 0); - if (body->mbo_eadatasize != symlen) { - CERROR("%s: inode " DFID ": symlink length %d not expected %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(ll_inode2fid(inode)), body->mbo_eadatasize - 1, - symlen - 1); - rc = -EPROTO; - goto failed; - } - - *symname = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_MD); - if (!*symname || - strnlen(*symname, symlen) != symlen - 1) { - /* not full/NULL terminated */ - CERROR("inode %lu: symlink not NULL terminated string of length %d\n", - inode->i_ino, symlen - 1); - rc = -EPROTO; - goto failed; - } - - lli->lli_symlink_name = kzalloc(symlen, GFP_NOFS); - /* do not return an error if we cannot cache the symlink locally */ - if (lli->lli_symlink_name) { - memcpy(lli->lli_symlink_name, *symname, symlen); - *symname = lli->lli_symlink_name; - } - return 0; - -failed: - return rc; -} - -static void ll_put_link(void *p) -{ - ptlrpc_req_finished(p); -} - -static const char *ll_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct ptlrpc_request *request = NULL; - int rc; - char *symname = NULL; - - if (!dentry) - return ERR_PTR(-ECHILD); - - CDEBUG(D_VFSTRACE, "VFS Op\n"); - ll_inode_size_lock(inode); - rc = ll_readlink_internal(inode, &request, &symname); - ll_inode_size_unlock(inode); - if (rc) { - ptlrpc_req_finished(request); - return ERR_PTR(rc); - } - - /* symname may contain a pointer to the request message buffer, - * we delay request releasing then. 
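The release ordering described here is the standard VFS delayed_call pattern: ->get_link() may return a pointer into a buffer it does not own, and registers a callback that the caller invokes once it has finished with the string (exactly what the set_delayed_call() line just below does). A small userspace model of that contract, with set_delayed_call()/do_delayed_call() re-implemented locally and a heap buffer standing in for the ptlrpc reply:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace model of the kernel's struct delayed_call: the link string
 * returned by the get_link() analogue points into a buffer (here a heap
 * block standing in for the ptlrpc reply) that must stay alive until the
 * caller runs do_delayed_call() on the way out. */
struct delayed_call { void (*fn)(void *); void *arg; };

static void set_delayed_call(struct delayed_call *dc, void (*fn)(void *), void *arg)
{ dc->fn = fn; dc->arg = arg; }

static void do_delayed_call(struct delayed_call *dc)
{ if (dc->fn) dc->fn(dc->arg); }

static void put_reply(void *p)
{ printf("releasing reply buffer\n"); free(p); }

/* get_link() analogue: hand back a pointer into the reply, defer the free. */
static const char *get_link(struct delayed_call *done)
{
	char *reply = strdup("/lustre/target/of/symlink");
	set_delayed_call(done, put_reply, reply);
	return reply;
}

int main(void)
{
	struct delayed_call done = { 0 };
	const char *symname = get_link(&done);

	printf("following %s\n", symname);
	do_delayed_call(&done);	/* caller releases only after use */
	return 0;
}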
- */ - set_delayed_call(done, ll_put_link, request); - return symname; -} - -const struct inode_operations ll_fast_symlink_inode_operations = { - .setattr = ll_setattr, - .get_link = ll_get_link, - .getattr = ll_getattr, - .permission = ll_inode_permission, - .listxattr = ll_listxattr, -}; diff --git a/drivers/staging/lustre/lustre/llite/vvp_dev.c b/drivers/staging/lustre/lustre/llite/vvp_dev.c deleted file mode 100644 index 31dc3c0ade01172fae2ce6c24a937e586aed900d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_dev.c +++ /dev/null @@ -1,640 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl_device and cl_device_type implementation for VVP layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include "llite_internal.h" -#include "vvp_internal.h" - -/***************************************************************************** - * - * Vvp device and device type functions. - * - */ - -/* - * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical - * "llite_" (var. "ll_") prefix. 
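lustre_init() above is a textbook goto-unwind constructor: each acquired resource has exactly one out_* label, failures jump to the label covering everything acquired so far, and teardown runs in reverse order. A compilable userspace sketch of just that shape; acquire()/release() are placeholders, not Lustre or kernel calls.

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the module's resources (caches, debugfs dir, kset, ...). */
static void *acquire(const char *what)
{ printf("init %s\n", what); return malloc(1); }
static void release(const char *what, void *p)
{ printf("fini %s\n", what); free(p); }

/* The error path releases only what was set up so far, in reverse order,
 * mirroring lustre_init()'s out_* labels.  On success the resources stay
 * held for the module's lifetime, as they would in a loaded module. */
static int module_init_sketch(void)
{
	void *cache = NULL, *debugfs = NULL, *kset = NULL;
	int rc = -1;

	cache = acquire("inode cache");
	if (!cache)
		goto out;
	debugfs = acquire("debugfs dir");
	if (!debugfs)
		goto out_cache;
	kset = acquire("sysfs kset");
	if (!kset)
		goto out_debugfs;
	return 0;

out_debugfs:
	release("debugfs dir", debugfs);
out_cache:
	release("inode cache", cache);
out:
	return rc;
}

int main(void) { return module_init_sketch() ? 1 : 0; }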
- */ - -static struct kmem_cache *ll_thread_kmem; -struct kmem_cache *vvp_lock_kmem; -struct kmem_cache *vvp_object_kmem; -static struct kmem_cache *vvp_session_kmem; -static struct kmem_cache *vvp_thread_kmem; - -static struct lu_kmem_descr vvp_caches[] = { - { - .ckd_cache = &ll_thread_kmem, - .ckd_name = "ll_thread_kmem", - .ckd_size = sizeof(struct ll_thread_info), - }, - { - .ckd_cache = &vvp_lock_kmem, - .ckd_name = "vvp_lock_kmem", - .ckd_size = sizeof(struct vvp_lock), - }, - { - .ckd_cache = &vvp_object_kmem, - .ckd_name = "vvp_object_kmem", - .ckd_size = sizeof(struct vvp_object), - }, - { - .ckd_cache = &vvp_session_kmem, - .ckd_name = "vvp_session_kmem", - .ckd_size = sizeof(struct vvp_session) - }, - { - .ckd_cache = &vvp_thread_kmem, - .ckd_name = "vvp_thread_kmem", - .ckd_size = sizeof(struct vvp_thread_info), - }, - { - .ckd_cache = NULL - } -}; - -static void *ll_thread_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct vvp_thread_info *info; - - info = kmem_cache_zalloc(ll_thread_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void ll_thread_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct vvp_thread_info *info = data; - - kmem_cache_free(ll_thread_kmem, info); -} - -struct lu_context_key ll_thread_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = ll_thread_key_init, - .lct_fini = ll_thread_key_fini -}; - -static void *vvp_session_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct vvp_session *session; - - session = kmem_cache_zalloc(vvp_session_kmem, GFP_NOFS); - if (!session) - session = ERR_PTR(-ENOMEM); - return session; -} - -static void vvp_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct vvp_session *session = data; - - kmem_cache_free(vvp_session_kmem, session); -} - -struct lu_context_key vvp_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = vvp_session_key_init, - .lct_fini = vvp_session_key_fini -}; - -static void *vvp_thread_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct vvp_thread_info *vti; - - vti = kmem_cache_zalloc(vvp_thread_kmem, GFP_NOFS); - if (!vti) - vti = ERR_PTR(-ENOMEM); - return vti; -} - -static void vvp_thread_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct vvp_thread_info *vti = data; - - kmem_cache_free(vvp_thread_kmem, vti); -} - -struct lu_context_key vvp_thread_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = vvp_thread_key_init, - .lct_fini = vvp_thread_key_fini -}; - -/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). 
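The three *_key_init()/*_key_fini() pairs above all follow the lu_context_key pattern: a key bundles constructor and destructor callbacks, and each context lazily instantiates its own private value for that key. A simplified userspace model of the idea, using a fixed-size slot table instead of the real key registration machinery:

#include <stdio.h>
#include <stdlib.h>

/* Userspace model of the lu_context_key idea: each key carries init/fini
 * callbacks, and a context lazily instantiates per-key private state. */
struct ctx_key {
	void *(*init)(void);
	void (*fini)(void *);
};

struct context {
	void *slots[8];		/* one slot per registered key index */
};

static void *ctx_key_get(struct context *ctx, int idx, const struct ctx_key *key)
{
	if (!ctx->slots[idx])
		ctx->slots[idx] = key->init();
	return ctx->slots[idx];
}

static void *session_init(void) { return calloc(1, 64); }
static void session_fini(void *p) { free(p); }

static const struct ctx_key session_key = { session_init, session_fini };

int main(void)
{
	struct context ctx = { { 0 } };
	void *a = ctx_key_get(&ctx, 0, &session_key);
	void *b = ctx_key_get(&ctx, 0, &session_key);

	printf("same per-context value: %s\n", a == b ? "yes" : "no");
	session_key.fini(ctx.slots[0]);
	return 0;
}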
*/ -LU_TYPE_INIT_FINI(vvp, &vvp_thread_key, &ll_thread_key, &vvp_session_key); - -static const struct lu_device_operations vvp_lu_ops = { - .ldo_object_alloc = vvp_object_alloc -}; - -static struct lu_device *vvp_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct vvp_device *vdv = lu2vvp_dev(d); - struct cl_site *site = lu2cl_site(d->ld_site); - struct lu_device *next = cl2lu_dev(vdv->vdv_next); - - if (d->ld_site) { - cl_site_fini(site); - kfree(site); - } - cl_device_fini(lu2cl_dev(d)); - kfree(vdv); - return next; -} - -static struct lu_device *vvp_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct vvp_device *vdv; - struct lu_device *lud; - struct cl_site *site; - int rc; - - vdv = kzalloc(sizeof(*vdv), GFP_NOFS); - if (!vdv) - return ERR_PTR(-ENOMEM); - - lud = &vdv->vdv_cl.cd_lu_dev; - cl_device_init(&vdv->vdv_cl, t); - vvp2lu_dev(vdv)->ld_ops = &vvp_lu_ops; - - site = kzalloc(sizeof(*site), GFP_NOFS); - if (site) { - rc = cl_site_init(site, &vdv->vdv_cl); - if (rc == 0) { - rc = lu_site_init_finish(&site->cs_lu); - } else { - LASSERT(!lud->ld_site); - CERROR("Cannot init lu_site, rc %d.\n", rc); - kfree(site); - } - } else { - rc = -ENOMEM; - } - if (rc != 0) { - vvp_device_free(env, lud); - lud = ERR_PTR(rc); - } - return lud; -} - -static int vvp_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) -{ - struct vvp_device *vdv; - int rc; - - vdv = lu2vvp_dev(d); - vdv->vdv_next = lu2cl_dev(next); - - LASSERT(d->ld_site && next->ld_type); - next->ld_site = d->ld_site; - rc = next->ld_type->ldt_ops->ldto_device_init(env, next, - next->ld_type->ldt_name, - NULL); - if (rc == 0) { - lu_device_get(next); - lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); - } - return rc; -} - -static struct lu_device *vvp_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - return cl2lu_dev(lu2vvp_dev(d)->vdv_next); -} - -static const struct lu_device_type_operations vvp_device_type_ops = { - .ldto_init = vvp_type_init, - .ldto_fini = vvp_type_fini, - - .ldto_start = vvp_type_start, - .ldto_stop = vvp_type_stop, - - .ldto_device_alloc = vvp_device_alloc, - .ldto_device_free = vvp_device_free, - .ldto_device_init = vvp_device_init, - .ldto_device_fini = vvp_device_fini, -}; - -struct lu_device_type vvp_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_VVP_NAME, - .ldt_ops = &vvp_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD -}; - -/** - * A mutex serializing calls to vvp_inode_fini() under extreme memory - * pressure, when environments cannot be allocated. - */ -int vvp_global_init(void) -{ - int rc; - - rc = lu_kmem_init(vvp_caches); - if (rc != 0) - return rc; - - rc = lu_device_type_init(&vvp_device_type); - if (rc != 0) - goto out_kmem; - - return 0; - -out_kmem: - lu_kmem_fini(vvp_caches); - - return rc; -} - -void vvp_global_fini(void) -{ - lu_device_type_fini(&vvp_device_type); - lu_kmem_fini(vvp_caches); -} - -/***************************************************************************** - * - * mirror obd-devices into cl devices. 
- * - */ - -int cl_sb_init(struct super_block *sb) -{ - struct ll_sb_info *sbi; - struct cl_device *cl; - struct lu_env *env; - int rc = 0; - u16 refcheck; - - sbi = ll_s2sbi(sb); - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - cl = cl_type_setup(env, NULL, &vvp_device_type, - sbi->ll_dt_exp->exp_obd->obd_lu_dev); - if (!IS_ERR(cl)) { - sbi->ll_cl = cl; - sbi->ll_site = cl2lu_dev(cl)->ld_site; - } - cl_env_put(env, &refcheck); - } else { - rc = PTR_ERR(env); - } - return rc; -} - -int cl_sb_fini(struct super_block *sb) -{ - struct ll_sb_info *sbi; - struct lu_env *env; - struct cl_device *cld; - u16 refcheck; - int result; - - sbi = ll_s2sbi(sb); - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - cld = sbi->ll_cl; - - if (cld) { - cl_stack_fini(env, cld); - sbi->ll_cl = NULL; - sbi->ll_site = NULL; - } - cl_env_put(env, &refcheck); - result = 0; - } else { - CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); - result = PTR_ERR(env); - } - return result; -} - -/**************************************************************************** - * - * debugfs/lustre/llite/$MNT/dump_page_cache - * - ****************************************************************************/ - -/* - * To represent contents of a page cache as a byte stream, following - * information if encoded in 64bit offset: - * - * - file hash bucket in lu_site::ls_hash[] 28bits - * - * - how far file is from bucket head 4bits - * - * - page index 32bits - * - * First two data identify a file in the cache uniquely. - */ - -#define PGC_OBJ_SHIFT (32 + 4) -#define PGC_DEPTH_SHIFT (32) - -struct vvp_pgcache_id { - unsigned int vpi_bucket; - unsigned int vpi_depth; - u32 vpi_index; - - unsigned int vpi_curdep; - struct lu_object_header *vpi_obj; -}; - -struct seq_private { - struct ll_sb_info *sbi; - struct lu_env *env; - u16 refcheck; - struct cl_object *clob; -}; - -static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) -{ - BUILD_BUG_ON(sizeof(pos) != sizeof(__u64)); - - id->vpi_index = pos & 0xffffffff; - id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; - id->vpi_bucket = (unsigned long long)pos >> PGC_OBJ_SHIFT; -} - -static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) -{ - return - ((__u64)id->vpi_index) | - ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | - ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); -} - -static int vvp_pgcache_obj_get(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *data) -{ - struct vvp_pgcache_id *id = data; - struct lu_object_header *hdr = cfs_hash_object(hs, hnode); - - if (id->vpi_curdep-- > 0) - return 0; /* continue */ - - if (lu_object_is_dying(hdr)) - return 1; - - cfs_hash_get(hs, hnode); - id->vpi_obj = hdr; - return 1; -} - -static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, - struct lu_device *dev, - struct vvp_pgcache_id *id) -{ - LASSERT(lu_device_is_cl(dev)); - - id->vpi_depth &= 0xf; - id->vpi_obj = NULL; - id->vpi_curdep = id->vpi_depth; - - cfs_hash_hlist_for_each(dev->ld_site->ls_obj_hash, id->vpi_bucket, - vvp_pgcache_obj_get, id); - if (id->vpi_obj) { - struct lu_object *lu_obj; - - lu_obj = lu_object_locate(id->vpi_obj, dev->ld_type); - if (lu_obj) { - lu_object_ref_add(lu_obj, "dump", current); - return lu2cl(lu_obj); - } - lu_object_put(env, lu_object_top(id->vpi_obj)); - - } else if (id->vpi_curdep > 0) { - id->vpi_depth = 0xf; - } - return NULL; -} - -static struct page *vvp_pgcache_find(const struct lu_env *env, - struct lu_device *dev, - struct cl_object **clobp, loff_t *pos) -{ - struct 
cl_object *clob;
-	struct lu_site *site;
-	struct vvp_pgcache_id id;
-
-	site = dev->ld_site;
-	vvp_pgcache_id_unpack(*pos, &id);
-
-	while (1) {
-		if (id.vpi_bucket >= CFS_HASH_NHLIST(site->ls_obj_hash))
-			return NULL;
-		clob = vvp_pgcache_obj(env, dev, &id);
-		if (clob) {
-			struct inode *inode = vvp_object_inode(clob);
-			struct page *vmpage;
-			int nr;
-
-			nr = find_get_pages_contig(inode->i_mapping,
-						   id.vpi_index, 1, &vmpage);
-			if (nr > 0) {
-				id.vpi_index = vmpage->index;
-				/* Can't support files over 16TB */
-				if (vmpage->index <= 0xffffffff) {
-					*clobp = clob;
-					*pos = vvp_pgcache_id_pack(&id);
-					return vmpage;
-				}
-				put_page(vmpage);
-			}
-
-			lu_object_ref_del(&clob->co_lu, "dump", current);
-			cl_object_put(env, clob);
-		}
-		/* to the next object. */
-		++id.vpi_depth;
-		id.vpi_depth &= 0xf;
-		if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
-			return NULL;
-		id.vpi_index = 0;
-	}
-}
-
-#define seq_page_flag(seq, page, flag, has_flags) do {			\
-	if (test_bit(PG_##flag, &(page)->flags)) {			\
-		seq_printf(seq, "%s"#flag, has_flags ? "|" : "");	\
-		has_flags = 1;						\
-	}								\
-} while (0)
-
-static void vvp_pgcache_page_show(const struct lu_env *env,
-				  struct seq_file *seq, struct cl_page *page)
-{
-	struct vvp_page *vpg;
-	struct page *vmpage;
-	int has_flags;
-
-	vpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
-	vmpage = vpg->vpg_page;
-	seq_printf(seq, " %5i | %p %p %s %s %s | %p " DFID "(%p) %lu %u [",
-		   0 /* gen */,
-		   vpg, page,
-		   "none",
-		   vpg->vpg_defer_uptodate ? "du" : "- ",
-		   PageWriteback(vmpage) ? "wb" : "-",
-		   vmpage, PFID(ll_inode2fid(vmpage->mapping->host)),
-		   vmpage->mapping->host, vmpage->index,
-		   page_count(vmpage));
-	has_flags = 0;
-	seq_page_flag(seq, vmpage, locked, has_flags);
-	seq_page_flag(seq, vmpage, error, has_flags);
-	seq_page_flag(seq, vmpage, referenced, has_flags);
-	seq_page_flag(seq, vmpage, uptodate, has_flags);
-	seq_page_flag(seq, vmpage, dirty, has_flags);
-	seq_page_flag(seq, vmpage, writeback, has_flags);
-	seq_printf(seq, "%s]\n", has_flags ?
"" : "-"); -} - -static int vvp_pgcache_show(struct seq_file *f, void *v) -{ - struct seq_private *priv = f->private; - struct page *vmpage = v; - struct cl_page *page; - - seq_printf(f, "%8lx@" DFID ": ", vmpage->index, - PFID(lu_object_fid(&priv->clob->co_lu))); - lock_page(vmpage); - page = cl_vmpage_page(vmpage, priv->clob); - unlock_page(vmpage); - put_page(vmpage); - - if (page) { - vvp_pgcache_page_show(priv->env, f, page); - cl_page_put(priv->env, page); - } else { - seq_puts(f, "missing\n"); - } - lu_object_ref_del(&priv->clob->co_lu, "dump", current); - cl_object_put(priv->env, priv->clob); - - return 0; -} - -static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) -{ - struct seq_private *priv = f->private; - struct page *ret; - - if (priv->sbi->ll_site->ls_obj_hash->hs_cur_bits > - 64 - PGC_OBJ_SHIFT) - ret = ERR_PTR(-EFBIG); - else - ret = vvp_pgcache_find(priv->env, &priv->sbi->ll_cl->cd_lu_dev, - &priv->clob, pos); - - return ret; -} - -static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) -{ - struct seq_private *priv = f->private; - struct page *ret; - - *pos += 1; - ret = vvp_pgcache_find(priv->env, &priv->sbi->ll_cl->cd_lu_dev, - &priv->clob, pos); - return ret; -} - -static void vvp_pgcache_stop(struct seq_file *f, void *v) -{ - /* Nothing to do */ -} - -static const struct seq_operations vvp_pgcache_ops = { - .start = vvp_pgcache_start, - .next = vvp_pgcache_next, - .stop = vvp_pgcache_stop, - .show = vvp_pgcache_show -}; - -static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) -{ - struct seq_private *priv; - - priv = __seq_open_private(filp, &vvp_pgcache_ops, sizeof(*priv)); - if (!priv) - return -ENOMEM; - - priv->sbi = inode->i_private; - priv->env = cl_env_get(&priv->refcheck); - if (IS_ERR(priv->env)) { - int err = PTR_ERR(priv->env); - - seq_release_private(inode, filp); - return err; - } - return 0; -} - -static int vvp_dump_pgcache_seq_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct seq_private *priv = seq->private; - - cl_env_put(priv->env, &priv->refcheck); - return seq_release_private(inode, file); -} - -const struct file_operations vvp_dump_pgcache_file_ops = { - .owner = THIS_MODULE, - .open = vvp_dump_pgcache_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = vvp_dump_pgcache_seq_release, -}; diff --git a/drivers/staging/lustre/lustre/llite/vvp_internal.h b/drivers/staging/lustre/lustre/llite/vvp_internal.h deleted file mode 100644 index 7d3abb43584a0b536087819ed910cba96a9c4ae1..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_internal.h +++ /dev/null @@ -1,321 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.gnu.org/licenses/gpl-2.0.html
- *
- * GPL HEADER END
- */
-/*
- * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms.
- *
- * Copyright (c) 2013, 2015, Intel Corporation.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * Internal definitions for VVP layer.
- *
- * Author: Nikita Danilov
- */
-
-#ifndef VVP_INTERNAL_H
-#define VVP_INTERNAL_H
-
-#include
-#include
-
-enum obd_notify_event;
-struct inode;
-struct lustre_md;
-struct obd_device;
-struct obd_export;
-struct page;
-
-/**
- * IO state private to the VVP layer.
- */
-struct vvp_io {
-	/** super class */
-	struct cl_io_slice vui_cl;
-	struct cl_io_lock_link vui_link;
-	/**
-	 * I/O vector information to or from which read/write is going.
-	 */
-	struct iov_iter *vui_iter;
-	/**
-	 * Total size of the remaining IO.
-	 */
-	size_t vui_tot_count;
-
-	union {
-		struct vvp_fault_io {
-			/**
-			 * Inode modification time that is checked across DLM
-			 * lock request.
-			 */
-			time64_t ft_mtime;
-			struct vm_area_struct *ft_vma;
-			/**
-			 * locked page returned from vvp_io
-			 */
-			struct page *ft_vmpage;
-			/**
-			 * kernel fault info
-			 */
-			struct vm_fault *ft_vmf;
-			/**
-			 * fault API used bitflags for return code.
-			 */
-			unsigned int ft_flags;
-			/**
-			 * check that flags are from filemap_fault
-			 */
-			bool ft_flags_valid;
-		} fault;
-		struct {
-			struct cl_page_list vui_queue;
-			unsigned long vui_written;
-			int vui_from;
-			int vui_to;
-		} write;
-	} u;
-
-	/**
-	 * Layout version when this IO is initialized
-	 */
-	__u32 vui_layout_gen;
-	/**
-	 * File descriptor against which IO is done.
-	 */
-	struct ll_file_data *vui_fd;
-	struct kiocb *vui_iocb;
-
-	/* Readahead state. */
-	pgoff_t vui_ra_start;
-	pgoff_t vui_ra_count;
-	/* Set when vui_ra_{start,count} have been initialized.
*/ - bool vui_ra_valid; -}; - -extern struct lu_device_type vvp_device_type; - -extern struct lu_context_key vvp_session_key; -extern struct lu_context_key vvp_thread_key; - -extern struct kmem_cache *vvp_lock_kmem; -extern struct kmem_cache *vvp_object_kmem; - -struct vvp_thread_info { - struct cl_lock vti_lock; - struct cl_lock_descr vti_descr; - struct cl_io vti_io; - struct cl_attr vti_attr; -}; - -static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env) -{ - struct vvp_thread_info *vti; - - vti = lu_context_key_get(&env->le_ctx, &vvp_thread_key); - LASSERT(vti); - - return vti; -} - -static inline struct cl_lock *vvp_env_lock(const struct lu_env *env) -{ - struct cl_lock *lock = &vvp_env_info(env)->vti_lock; - - memset(lock, 0, sizeof(*lock)); - return lock; -} - -static inline struct cl_attr *vvp_env_thread_attr(const struct lu_env *env) -{ - struct cl_attr *attr = &vvp_env_info(env)->vti_attr; - - memset(attr, 0, sizeof(*attr)); - - return attr; -} - -static inline struct cl_io *vvp_env_thread_io(const struct lu_env *env) -{ - struct cl_io *io = &vvp_env_info(env)->vti_io; - - memset(io, 0, sizeof(*io)); - - return io; -} - -struct vvp_session { - struct vvp_io cs_ios; -}; - -static inline struct vvp_session *vvp_env_session(const struct lu_env *env) -{ - struct vvp_session *ses; - - ses = lu_context_key_get(env->le_ses, &vvp_session_key); - LASSERT(ses); - - return ses; -} - -static inline struct vvp_io *vvp_env_io(const struct lu_env *env) -{ - return &vvp_env_session(env)->cs_ios; -} - -/** - * ccc-private object state. - */ -struct vvp_object { - struct cl_object_header vob_header; - struct cl_object vob_cl; - struct inode *vob_inode; - - /** - * Number of transient pages. This is no longer protected by i_sem, - * and needs to be atomic. This is not actually used for anything, - * and can probably be removed. - */ - atomic_t vob_transient_pages; - - /** - * Number of outstanding mmaps on this file. - * - * \see ll_vm_open(), ll_vm_close(). - */ - atomic_t vob_mmap_cnt; - - /** - * various flags - * vob_discard_page_warned - * if pages belonging to this object are discarded when a client - * is evicted, some debug info will be printed, this flag will be set - * during processing the first discarded page, then avoid flooding - * debug message for lots of discarded pages. - * - * \see ll_dirty_page_discard_warn. - */ - unsigned int vob_discard_page_warned:1; -}; - -/** - * VVP-private page state. 
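Accessors such as vvp_env_thread_io() and vvp_env_thread_attr() below hand out a per-thread scratch object and memset() it before each use, so the large cl_io/cl_attr structures never have to live on the kernel stack. A userspace model of that reuse, with C11 _Thread_local standing in for the lu_env lookup (names are invented):

#include <stdio.h>
#include <string.h>

/* Userspace model of vvp_env_thread_io(): a large per-thread scratch
 * object is zeroed and handed out on each use instead of being
 * stack-allocated. */
struct big_io { char state[256]; int flags; };

static _Thread_local struct big_io scratch_io;

static struct big_io *env_thread_io(void)
{
	memset(&scratch_io, 0, sizeof(scratch_io));
	return &scratch_io;
}

int main(void)
{
	struct big_io *io = env_thread_io();

	io->flags = 1;
	/* the next call re-zeroes the same storage */
	printf("fresh scratch each time: %d\n", env_thread_io()->flags);
	return 0;
}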
- */ -struct vvp_page { - struct cl_page_slice vpg_cl; - unsigned int vpg_defer_uptodate:1, - vpg_ra_used:1; - /** VM page */ - struct page *vpg_page; -}; - -static inline struct vvp_page *cl2vvp_page(const struct cl_page_slice *slice) -{ - return container_of(slice, struct vvp_page, vpg_cl); -} - -static inline pgoff_t vvp_index(struct vvp_page *vvp) -{ - return vvp->vpg_cl.cpl_index; -} - -struct vvp_device { - struct cl_device vdv_cl; - struct cl_device *vdv_next; -}; - -struct vvp_lock { - struct cl_lock_slice vlk_cl; -}; - -void *ccc_key_init(const struct lu_context *ctx, - struct lu_context_key *key); -void ccc_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data); - -void ccc_umount(const struct lu_env *env, struct cl_device *dev); - -static inline struct lu_device *vvp2lu_dev(struct vvp_device *vdv) -{ - return &vdv->vdv_cl.cd_lu_dev; -} - -static inline struct vvp_device *lu2vvp_dev(const struct lu_device *d) -{ - return container_of_safe(d, struct vvp_device, vdv_cl.cd_lu_dev); -} - -static inline struct vvp_device *cl2vvp_dev(const struct cl_device *d) -{ - return container_of_safe(d, struct vvp_device, vdv_cl); -} - -static inline struct vvp_object *cl2vvp(const struct cl_object *obj) -{ - return container_of_safe(obj, struct vvp_object, vob_cl); -} - -static inline struct vvp_object *lu2vvp(const struct lu_object *obj) -{ - return container_of_safe(obj, struct vvp_object, vob_cl.co_lu); -} - -static inline struct inode *vvp_object_inode(const struct cl_object *obj) -{ - return cl2vvp(obj)->vob_inode; -} - -int vvp_object_invariant(const struct cl_object *obj); -struct vvp_object *cl_inode2vvp(struct inode *inode); - -static inline struct page *cl2vm_page(const struct cl_page_slice *slice) -{ - return cl2vvp_page(slice)->vpg_page; -} - -static inline struct vvp_lock *cl2vvp_lock(const struct cl_lock_slice *slice) -{ - return container_of(slice, struct vvp_lock, vlk_cl); -} - -# define CLOBINVRNT(env, clob, expr) \ - ((void)sizeof(env), (void)sizeof(clob), (void)sizeof(!!(expr))) - -int vvp_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); -int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io); -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); -int vvp_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); -struct lu_object *vvp_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); - -int vvp_global_init(void); -void vvp_global_fini(void); - -extern const struct file_operations vvp_dump_pgcache_file_ops; - -#endif /* VVP_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c deleted file mode 100644 index e7a4778e02e4f53559083233680e6bd6fb6c664e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ /dev/null @@ -1,1374 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_io for VVP layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include - -#include "llite_internal.h" -#include "vvp_internal.h" - -static struct vvp_io *cl2vvp_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct vvp_io *vio; - - vio = container_of(slice, struct vvp_io, vui_cl); - LASSERT(vio == vvp_env_io(env)); - - return vio; -} - -/** - * For swapping layout. The file's layout may have changed. - * To avoid populating pages to a wrong stripe, we have to verify the - * correctness of layout. It works because swapping layout processes - * have to acquire group lock. - */ -static bool can_populate_pages(const struct lu_env *env, struct cl_io *io, - struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct vvp_io *vio = vvp_env_io(env); - bool rc = true; - - switch (io->ci_type) { - case CIT_READ: - case CIT_WRITE: - /* don't need lock here to check lli_layout_gen as we have held - * extent lock and GROUP lock has to hold to swap layout - */ - if (ll_layout_version_get(lli) != vio->vui_layout_gen || - OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_LOST_LAYOUT, 0)) { - io->ci_need_restart = 1; - /* this will cause a short read/write */ - io->ci_continue = 0; - rc = false; - } - case CIT_FAULT: - /* fault is okay because we've already had a page. */ - default: - break; - } - - return rc; -} - -static void vvp_object_size_lock(struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - - ll_inode_size_lock(inode); - cl_object_attr_lock(obj); -} - -static void vvp_object_size_unlock(struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - - cl_object_attr_unlock(obj); - ll_inode_size_unlock(inode); -} - -/** - * Helper function that if necessary adjusts file size (inode->i_size), when - * position at the offset \a pos is accessed. File size can be arbitrary stale - * on a Lustre client, but client at least knows KMS. If accessed area is - * inside [0, KMS], set file size to KMS, otherwise glimpse file size. - * - * Locking: cl_isize_lock is used to serialize changes to inode size and to - * protect consistency between inode size and cl_object - * attributes. cl_object_size_lock() protects consistency between cl_attr's of - * top-object and sub-objects. 
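Reduced to its arithmetic, the policy implemented below is: a read ending inside [0, kms) only needs i_size raised to kms, while a read reaching past kms forces a glimpse. A toy userspace rendering of that decision; glimpse() just returns a pretend server answer, and none of the real locking is modelled.

#include <stdio.h>

/* Toy model of vvp_prep_size()'s size handling: KMS (known minimum size)
 * is what this client can trust without asking the servers.  Reads that
 * end inside [0, kms) only need i_size raised to kms; reads past kms need
 * a "glimpse" (server round trip, modelled as returning the true size). */
static long long glimpse(void) { return 9000; /* pretend server answer */ }

static long long prep_size(long long *i_size, long long kms,
			   long long pos, long long count)
{
	long long end = pos + count - 1;

	if (end >= kms)
		return glimpse();	/* short read vs. zeroes decided by real size */
	if (*i_size < kms)
		*i_size = kms;		/* a minimum, not necessarily exact */
	return *i_size;
}

int main(void)
{
	long long i_size = 4096, kms = 8192;

	printf("read in [0,kms): size %lld\n", prep_size(&i_size, kms, 0, 4096));
	printf("read past kms:   size %lld\n", prep_size(&i_size, kms, 8192, 4096));
	return 0;
}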
- */ -static int vvp_prep_size(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io, loff_t start, size_t count, - int *exceed) -{ - struct cl_attr *attr = vvp_env_thread_attr(env); - struct inode *inode = vvp_object_inode(obj); - loff_t pos = start + count - 1; - loff_t kms; - int result; - - /* - * Consistency guarantees: following possibilities exist for the - * relation between region being accessed and real file size at this - * moment: - * - * (A): the region is completely inside of the file; - * - * (B-x): x bytes of region are inside of the file, the rest is - * outside; - * - * (C): the region is completely outside of the file. - * - * This classification is stable under DLM lock already acquired by - * the caller, because to change the class, other client has to take - * DLM lock conflicting with our lock. Also, any updates to ->i_size - * by other threads on this client are serialized by - * ll_inode_size_lock(). This guarantees that short reads are handled - * correctly in the face of concurrent writes and truncates. - */ - vvp_object_size_lock(obj); - result = cl_object_attr_get(env, obj, attr); - if (result == 0) { - kms = attr->cat_kms; - if (pos > kms) { - /* - * A glimpse is necessary to determine whether we - * return a short read (B) or some zeroes at the end - * of the buffer (C) - */ - vvp_object_size_unlock(obj); - result = cl_glimpse_lock(env, io, inode, obj, 0); - if (result == 0 && exceed) { - /* If objective page index exceed end-of-file - * page index, return directly. Do not expect - * kernel will check such case correctly. - * linux-2.6.18-128.1.1 miss to do that. - * --bug 17336 - */ - loff_t size = i_size_read(inode); - loff_t cur_index = start >> PAGE_SHIFT; - loff_t size_index = (size - 1) >> PAGE_SHIFT; - - if ((size == 0 && cur_index != 0) || - size_index < cur_index) - *exceed = 1; - } - return result; - } - /* - * region is within kms and, hence, within real file - * size (A). We need to increase i_size to cover the - * read region so that generic_file_read() will do its - * job, but that doesn't mean the kms size is - * _correct_, it is only the _minimum_ size. If - * someone does a stat they will get the correct size - * which will always be >= the kms value here. - * b=11081 - */ - if (i_size_read(inode) < kms) { - i_size_write(inode, kms); - CDEBUG(D_VFSTRACE, DFID " updating i_size %llu\n", - PFID(lu_object_fid(&obj->co_lu)), - (__u64)i_size_read(inode)); - } - } - - vvp_object_size_unlock(obj); - - return result; -} - -/***************************************************************************** - * - * io operations. 
- * - */ - -static int vvp_io_one_lock_index(const struct lu_env *env, struct cl_io *io, - __u32 enqflags, enum cl_lock_mode mode, - pgoff_t start, pgoff_t end) -{ - struct vvp_io *vio = vvp_env_io(env); - struct cl_lock_descr *descr = &vio->vui_link.cill_descr; - struct cl_object *obj = io->ci_obj; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end); - - memset(&vio->vui_link, 0, sizeof(vio->vui_link)); - - if (vio->vui_fd && (vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - descr->cld_mode = CLM_GROUP; - descr->cld_gid = vio->vui_fd->fd_grouplock.lg_gid; - enqflags |= CEF_LOCK_MATCH; - } else { - descr->cld_mode = mode; - } - descr->cld_obj = obj; - descr->cld_start = start; - descr->cld_end = end; - descr->cld_enq_flags = enqflags; - - cl_io_lock_add(env, io, &vio->vui_link); - return 0; -} - -static int vvp_io_one_lock(const struct lu_env *env, struct cl_io *io, - __u32 enqflags, enum cl_lock_mode mode, - loff_t start, loff_t end) -{ - struct cl_object *obj = io->ci_obj; - - return vvp_io_one_lock_index(env, io, enqflags, mode, - cl_index(obj, start), cl_index(obj, end)); -} - -static int vvp_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - - cl_page_list_init(&vio->u.write.vui_queue); - vio->u.write.vui_written = 0; - vio->u.write.vui_from = 0; - vio->u.write.vui_to = PAGE_SIZE; - - return 0; -} - -static void vvp_io_write_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - - LASSERT(vio->u.write.vui_queue.pl_nr == 0); -} - -static int vvp_io_fault_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct inode *inode = vvp_object_inode(ios->cis_obj); - - LASSERT(inode == file_inode(vio->vui_fd->fd_file)); - vio->u.fault.ft_mtime = inode->i_mtime.tv_sec; - return 0; -} - -static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct vvp_io *vio = cl2vvp_io(env, ios); - struct inode *inode = vvp_object_inode(obj); - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, DFID - " ignore/verify layout %d/%d, layout version %d restore needed %d\n", - PFID(lu_object_fid(&obj->co_lu)), - io->ci_ignore_layout, io->ci_verify_layout, - vio->vui_layout_gen, io->ci_restore_needed); - - if (io->ci_restore_needed) { - int rc; - - /* file was detected release, we need to restore it - * before finishing the io - */ - rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF); - /* if restore registration failed, no restart, - * we will return -ENODATA - */ - /* The layout will change after restore, so we need to - * block on layout lock hold by the MDT - * as MDT will not send new layout in lvb (see LU-3124) - * we have to explicitly fetch it, all this will be done - * by ll_layout_refresh() - */ - if (rc == 0) { - io->ci_restore_needed = 0; - io->ci_need_restart = 1; - io->ci_verify_layout = 1; - } else { - io->ci_restore_needed = 1; - io->ci_need_restart = 0; - io->ci_verify_layout = 0; - io->ci_result = rc; - } - } - - if (!io->ci_ignore_layout && io->ci_verify_layout) { - __u32 gen = 0; - - /* check layout version */ - ll_layout_refresh(inode, &gen); - io->ci_need_restart = vio->vui_layout_gen != gen; - if (io->ci_need_restart) { - CDEBUG(D_VFSTRACE, - DFID " layout changed from %d to %d.\n", - 
PFID(lu_object_fid(&obj->co_lu)), - vio->vui_layout_gen, gen); - /* today successful restore is the only possible case */ - /* restore was done, clear restoring state */ - clear_bit(LLIF_FILE_RESTORING, - &ll_i2info(inode)->lli_flags); - } - } -} - -static void vvp_io_fault_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_page *page = io->u.ci_fault.ft_page; - - CLOBINVRNT(env, io->ci_obj, vvp_object_invariant(io->ci_obj)); - - if (page) { - lu_ref_del(&page->cp_reference, "fault", io); - cl_page_put(env, page); - io->u.ci_fault.ft_page = NULL; - } - vvp_io_fini(env, ios); -} - -static enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) -{ - /* - * we only want to hold PW locks if the mmap() can generate - * writes back to the file and that only happens in shared - * writable vmas - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return CLM_WRITE; - return CLM_READ; -} - -static int vvp_mmap_locks(const struct lu_env *env, - struct vvp_io *vio, struct cl_io *io) -{ - struct vvp_thread_info *cti = vvp_env_info(env); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - struct cl_lock_descr *descr = &cti->vti_descr; - union ldlm_policy_data policy; - unsigned long addr; - ssize_t count; - int result = 0; - struct iov_iter i; - struct iovec iov; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - if (!vio->vui_iter) /* nfs or loop back device write */ - return 0; - - /* No MM (e.g. NFS)? No vmas too. */ - if (!mm) - return 0; - - iov_for_each(iov, i, *vio->vui_iter) { - addr = (unsigned long)iov.iov_base; - count = iov.iov_len; - if (count == 0) - continue; - - count += addr & (~PAGE_MASK); - addr &= PAGE_MASK; - - down_read(&mm->mmap_sem); - while ((vma = our_vma(mm, addr, count)) != NULL) { - struct inode *inode = file_inode(vma->vm_file); - int flags = CEF_MUST; - - if (ll_file_nolock(vma->vm_file)) { - /* - * For no lock case is not allowed for mmap - */ - result = -EINVAL; - break; - } - - /* - * XXX: Required lock mode can be weakened: CIT_WRITE - * io only ever reads user level buffer, and CIT_READ - * only writes on it. 
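Before matching VMAs, vvp_mmap_locks() widens each iovec to whole pages so the requested DLM extent covers every page the user buffer touches; the two-line alignment step above is worth seeing with concrete numbers. A standalone example with PAGE_SIZE hardcoded to 4096:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* The same alignment step vvp_mmap_locks() applies to each iovec: widen
 * the range to whole pages so the lock extent covers every page the user
 * buffer touches. */
int main(void)
{
	unsigned long addr = 0x1003;	/* user buffer start, mid-page */
	unsigned long count = 100;

	count += addr & ~PAGE_MASK;	/* add the offset into the first page */
	addr &= PAGE_MASK;		/* round the start down */

	printf("aligned addr 0x%lx, count %lu\n", addr, count);
	/* -> addr 0x1000, count 103: still within one page here, but a
	 *    buffer crossing a page boundary would now span both pages. */
	return 0;
}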
- */ - policy_from_vma(&policy, vma, addr, count); - descr->cld_mode = vvp_mode_from_vma(vma); - descr->cld_obj = ll_i2info(inode)->lli_clob; - descr->cld_start = cl_index(descr->cld_obj, - policy.l_extent.start); - descr->cld_end = cl_index(descr->cld_obj, - policy.l_extent.end); - descr->cld_enq_flags = flags; - result = cl_io_lock_alloc_add(env, io, descr); - - CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", - descr->cld_mode, descr->cld_start, - descr->cld_end); - - if (result < 0) - break; - - if (vma->vm_end - addr >= count) - break; - - count -= vma->vm_end - addr; - addr = vma->vm_end; - } - up_read(&mm->mmap_sem); - if (result < 0) - break; - } - return result; -} - -static void vvp_io_advance(const struct lu_env *env, - const struct cl_io_slice *ios, - size_t nob) -{ - struct cl_object *obj = ios->cis_io->ci_obj; - struct vvp_io *vio = cl2vvp_io(env, ios); - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - vio->vui_tot_count -= nob; - iov_iter_reexpand(vio->vui_iter, vio->vui_tot_count); -} - -static void vvp_io_update_iov(const struct lu_env *env, - struct vvp_io *vio, struct cl_io *io) -{ - size_t size = io->u.ci_rw.crw_count; - - if (!vio->vui_iter) - return; - - iov_iter_truncate(vio->vui_iter, size); -} - -static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, - enum cl_lock_mode mode, loff_t start, loff_t end) -{ - struct vvp_io *vio = vvp_env_io(env); - int result; - int ast_flags = 0; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - vvp_io_update_iov(env, vio, io); - - if (io->u.ci_rw.crw_nonblock) - ast_flags |= CEF_NONBLOCK; - result = vvp_mmap_locks(env, vio, io); - if (result == 0) - result = vvp_io_one_lock(env, io, ast_flags, mode, start, end); - return result; -} - -static int vvp_io_read_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_io_rw_common *rd = &io->u.ci_rd.rd; - int result; - - result = vvp_io_rw_lock(env, io, CLM_READ, rd->crw_pos, - rd->crw_pos + rd->crw_count - 1); - - return result; -} - -static int vvp_io_fault_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct vvp_io *vio = cl2vvp_io(env, ios); - /* - * XXX LDLM_FL_CBPENDING - */ - return vvp_io_one_lock_index(env, - io, 0, - vvp_mode_from_vma(vio->u.fault.ft_vma), - io->u.ci_fault.ft_index, - io->u.ci_fault.ft_index); -} - -static int vvp_io_write_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - loff_t start; - loff_t end; - - if (io->u.ci_wr.wr_append) { - start = 0; - end = OBD_OBJECT_EOF; - } else { - start = io->u.ci_wr.wr.crw_pos; - end = start + io->u.ci_wr.wr.crw_count - 1; - } - return vvp_io_rw_lock(env, io, CLM_WRITE, start, end); -} - -static int vvp_io_setattr_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - return 0; -} - -/** - * Implementation of cl_io_operations::vio_lock() method for CIT_SETATTR io. - * - * Handles "lockless io" mode when extent locking is done by server. 
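The range selection in vvp_io_write_lock() above summarizes the locking rule neatly: an append cannot know its final offset until the file size is stable, so it locks the whole object, while a positioned write locks only the byte range it touches. A self-contained sketch of just that decision:

#include <stdio.h>

#define OBD_OBJECT_EOF (~0ULL)	/* "to end of object", as in the Lustre code */

/* vvp_io_write_lock()'s range choice: append locks [0, EOF], a normal
 * write locks [pos, pos + count - 1]. */
static void write_lock_range(int append, unsigned long long pos,
			     unsigned long long count,
			     unsigned long long *start, unsigned long long *end)
{
	if (append) {
		*start = 0;
		*end = OBD_OBJECT_EOF;
	} else {
		*start = pos;
		*end = pos + count - 1;
	}
}

int main(void)
{
	unsigned long long s, e;

	write_lock_range(0, 4096, 8192, &s, &e);
	printf("write:  [%llu, %llu]\n", s, e);
	write_lock_range(1, 0, 8192, &s, &e);
	printf("append: [%llu, %llu]\n", s, e);
	return 0;
}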
- */ -static int vvp_io_setattr_lock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - __u64 new_size; - __u32 enqflags = 0; - - if (cl_io_is_trunc(io)) { - new_size = io->u.ci_setattr.sa_attr.lvb_size; - if (new_size == 0) - enqflags = CEF_DISCARD_DATA; - } else { - unsigned int valid = io->u.ci_setattr.sa_valid; - - if (!(valid & TIMES_SET_FLAGS)) - return 0; - - if ((!(valid & ATTR_MTIME) || - io->u.ci_setattr.sa_attr.lvb_mtime >= - io->u.ci_setattr.sa_attr.lvb_ctime) && - (!(valid & ATTR_ATIME) || - io->u.ci_setattr.sa_attr.lvb_atime >= - io->u.ci_setattr.sa_attr.lvb_ctime)) - return 0; - new_size = 0; - } - - return vvp_io_one_lock(env, io, enqflags, CLM_WRITE, - new_size, OBD_OBJECT_EOF); -} - -static int vvp_do_vmtruncate(struct inode *inode, size_t size) -{ - int result; - /* - * Only ll_inode_size_lock is taken at this level. - */ - ll_inode_size_lock(inode); - result = inode_newsize_ok(inode, size); - if (result < 0) { - ll_inode_size_unlock(inode); - return result; - } - truncate_setsize(inode, size); - ll_inode_size_unlock(inode); - return result; -} - -static int vvp_io_setattr_time(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct cl_attr *attr = vvp_env_thread_attr(env); - int result; - unsigned valid = CAT_CTIME; - - cl_object_attr_lock(obj); - attr->cat_ctime = io->u.ci_setattr.sa_attr.lvb_ctime; - if (io->u.ci_setattr.sa_valid & ATTR_ATIME_SET) { - attr->cat_atime = io->u.ci_setattr.sa_attr.lvb_atime; - valid |= CAT_ATIME; - } - if (io->u.ci_setattr.sa_valid & ATTR_MTIME_SET) { - attr->cat_mtime = io->u.ci_setattr.sa_attr.lvb_mtime; - valid |= CAT_MTIME; - } - result = cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); - - return result; -} - -static int vvp_io_setattr_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct inode *inode = vvp_object_inode(io->ci_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - if (cl_io_is_trunc(io)) { - down_write(&lli->lli_trunc_sem); - inode_lock(inode); - inode_dio_wait(inode); - } else { - inode_lock(inode); - } - - if (io->u.ci_setattr.sa_valid & TIMES_SET_FLAGS) - return vvp_io_setattr_time(env, ios); - - return 0; -} - -static void vvp_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct inode *inode = vvp_object_inode(io->ci_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - if (cl_io_is_trunc(io)) { - /* Truncate in memory pages - they must be clean pages - * because osc has already notified to destroy osc_extents. 
- */ - vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); - inode_unlock(inode); - up_write(&lli->lli_trunc_sem); - } else { - inode_unlock(inode); - } -} - -static void vvp_io_setattr_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - bool restore_needed = ios->cis_io->ci_restore_needed; - struct inode *inode = vvp_object_inode(ios->cis_obj); - - vvp_io_fini(env, ios); - - if (restore_needed && !ios->cis_io->ci_restore_needed) { - /* restore finished, set data modified flag for HSM */ - set_bit(LLIF_DATA_MODIFIED, &(ll_i2info(inode))->lli_flags); - } -} - -static int vvp_io_read_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - struct file *file = vio->vui_fd->fd_file; - - int result; - loff_t pos = io->u.ci_rd.rd.crw_pos; - long cnt = io->u.ci_rd.rd.crw_count; - long tot = vio->vui_tot_count; - int exceed = 0; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt); - - down_read(&lli->lli_trunc_sem); - - if (!can_populate_pages(env, io, inode)) - return 0; - - result = vvp_prep_size(env, obj, io, pos, tot, &exceed); - if (result != 0) - return result; - if (exceed != 0) - goto out; - - LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, - "Read ino %lu, %lu bytes, offset %lld, size %llu\n", - inode->i_ino, cnt, pos, i_size_read(inode)); - - /* turn off the kernel's read-ahead */ - vio->vui_fd->fd_file->f_ra.ra_pages = 0; - - /* initialize read-ahead window once per syscall */ - if (!vio->vui_ra_valid) { - vio->vui_ra_valid = true; - vio->vui_ra_start = cl_index(obj, pos); - vio->vui_ra_count = cl_index(obj, tot + PAGE_SIZE - 1); - ll_ras_enter(file); - } - - /* BUG: 5972 */ - file_accessed(file); - LASSERT(vio->vui_iocb->ki_pos == pos); - result = generic_file_read_iter(vio->vui_iocb, vio->vui_iter); - -out: - if (result >= 0) { - if (result < cnt) - io->ci_continue = 0; - io->ci_nob += result; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, pos, result, READ); - result = 0; - } - return result; -} - -static int vvp_io_commit_sync(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *plist, int from, int to) -{ - struct cl_2queue *queue = &io->ci_queue; - struct cl_page *page; - unsigned int bytes = 0; - int rc = 0; - - if (plist->pl_nr == 0) - return 0; - - if (from > 0 || to != PAGE_SIZE) { - page = cl_page_list_first(plist); - if (plist->pl_nr == 1) { - cl_page_clip(env, page, from, to); - } else { - if (from > 0) - cl_page_clip(env, page, from, PAGE_SIZE); - if (to != PAGE_SIZE) { - page = cl_page_list_last(plist); - cl_page_clip(env, page, 0, to); - } - } - } - - cl_2queue_init(queue); - cl_page_list_splice(plist, &queue->c2_qin); - rc = cl_io_submit_sync(env, io, CRT_WRITE, queue, 0); - - /* plist is not sorted any more */ - cl_page_list_splice(&queue->c2_qin, plist); - cl_page_list_splice(&queue->c2_qout, plist); - cl_2queue_fini(env, queue); - - if (rc == 0) { - /* calculate bytes */ - bytes = plist->pl_nr << PAGE_SHIFT; - bytes -= from + PAGE_SIZE - to; - - while (plist->pl_nr > 0) { - page = cl_page_list_first(plist); - cl_page_list_del(env, plist, page); - - cl_page_clip(env, page, 0, PAGE_SIZE); - - SetPageUptodate(cl_page_vmpage(page)); - cl_page_disown(env, io, page); - - /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, 
"cl_io", io); - cl_page_put(env, page); - } - } - - return bytes > 0 ? bytes : rc; -} - -static void write_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - struct page *vmpage = page->cp_vmpage; - - SetPageUptodate(vmpage); - set_page_dirty(vmpage); - - cl_page_disown(env, io, page); - - /* held in ll_cl_init() */ - lu_ref_del(&page->cp_reference, "cl_io", cl_io_top(io)); - cl_page_put(env, page); -} - -/* make sure the page list is contiguous */ -static bool page_list_sanity_check(struct cl_object *obj, - struct cl_page_list *plist) -{ - struct cl_page *page; - pgoff_t index = CL_PAGE_EOF; - - cl_page_list_for_each(page, plist) { - struct vvp_page *vpg = cl_object_page_slice(obj, page); - - if (index == CL_PAGE_EOF) { - index = vvp_index(vpg); - continue; - } - - ++index; - if (index == vvp_index(vpg)) - continue; - - return false; - } - return true; -} - -/* Return how many bytes have queued or written */ -int vvp_io_write_commit(const struct lu_env *env, struct cl_io *io) -{ - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct vvp_io *vio = vvp_env_io(env); - struct cl_page_list *queue = &vio->u.write.vui_queue; - struct cl_page *page; - int rc = 0; - int bytes = 0; - unsigned int npages = vio->u.write.vui_queue.pl_nr; - - if (npages == 0) - return 0; - - CDEBUG(D_VFSTRACE, "commit async pages: %d, from %d, to %d\n", - npages, vio->u.write.vui_from, vio->u.write.vui_to); - - LASSERT(page_list_sanity_check(obj, queue)); - - /* submit IO with async write */ - rc = cl_io_commit_async(env, io, queue, - vio->u.write.vui_from, vio->u.write.vui_to, - write_commit_callback); - npages -= queue->pl_nr; /* already committed pages */ - if (npages > 0) { - /* calculate how many bytes were written */ - bytes = npages << PAGE_SHIFT; - - /* first page */ - bytes -= vio->u.write.vui_from; - if (queue->pl_nr == 0) /* last page */ - bytes -= PAGE_SIZE - vio->u.write.vui_to; - LASSERTF(bytes > 0, "bytes = %d, pages = %d\n", bytes, npages); - - vio->u.write.vui_written += bytes; - - CDEBUG(D_VFSTRACE, "Committed %d pages %d bytes, tot: %ld\n", - npages, bytes, vio->u.write.vui_written); - - /* the first page must have been written. */ - vio->u.write.vui_from = 0; - } - LASSERT(page_list_sanity_check(obj, queue)); - LASSERT(ergo(rc == 0, queue->pl_nr == 0)); - - /* out of quota, try sync write */ - if (rc == -EDQUOT && !cl_io_is_mkwrite(io)) { - rc = vvp_io_commit_sync(env, io, queue, - vio->u.write.vui_from, - vio->u.write.vui_to); - if (rc > 0) { - vio->u.write.vui_written += rc; - rc = 0; - } - } - - /* update inode size */ - ll_merge_attr(env, inode); - - /* Now the pages in queue were failed to commit, discard them - * unless they were dirtied before. 
 */
-	while (queue->pl_nr > 0) {
-		page = cl_page_list_first(queue);
-		cl_page_list_del(env, queue, page);
-
-		if (!PageDirty(cl_page_vmpage(page)))
-			cl_page_discard(env, io, page);
-
-		cl_page_disown(env, io, page);
-
-		/* held in ll_cl_init() */
-		lu_ref_del(&page->cp_reference, "cl_io", io);
-		cl_page_put(env, page);
-	}
-	cl_page_list_fini(env, queue);
-
-	return rc;
-}
-
-static int vvp_io_write_start(const struct lu_env *env,
-			      const struct cl_io_slice *ios)
-{
-	struct vvp_io *vio = cl2vvp_io(env, ios);
-	struct cl_io *io = ios->cis_io;
-	struct cl_object *obj = io->ci_obj;
-	struct inode *inode = vvp_object_inode(obj);
-	struct ll_inode_info *lli = ll_i2info(inode);
-	ssize_t result = 0;
-	loff_t pos = io->u.ci_wr.wr.crw_pos;
-	size_t cnt = io->u.ci_wr.wr.crw_count;
-
-	down_read(&lli->lli_trunc_sem);
-
-	if (!can_populate_pages(env, io, inode))
-		return 0;
-
-	if (cl_io_is_append(io)) {
-		/*
-		 * PARALLEL IO: this has to be changed for parallel IO doing
-		 * out-of-order writes.
-		 */
-		ll_merge_attr(env, inode);
-		pos = i_size_read(inode);
-		io->u.ci_wr.wr.crw_pos = pos;
-		vio->vui_iocb->ki_pos = pos;
-	} else {
-		LASSERT(vio->vui_iocb->ki_pos == pos);
-	}
-
-	CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + (long long)cnt);
-
-	/*
-	 * The maximum Lustre file size is variable, based on the OST maximum
-	 * object size and number of stripes. This needs another check in
-	 * addition to the VFS checks earlier.
-	 */
-	if (pos + cnt > ll_file_maxbytes(inode)) {
-		CDEBUG(D_INODE,
-		       "%s: file " DFID " offset %llu > maxbytes %llu\n",
-		       ll_get_fsname(inode->i_sb, NULL, 0),
-		       PFID(ll_inode2fid(inode)), pos + cnt,
-		       ll_file_maxbytes(inode));
-		return -EFBIG;
-	}
-
-	if (!vio->vui_iter) {
-		/* from a temp io in ll_cl_init(). */
-		result = 0;
-	} else {
-		/*
-		 * When using the locked AIO function (generic_file_aio_write())
-		 * testing has shown the inode mutex to be a limiting factor
-		 * with multi-threaded single shared file performance. To get
-		 * around this, we now use the lockless version. To maintain
-		 * consistency, proper locking to protect against writes,
-		 * truncates, etc. is handled in the higher layers of lustre.
- */ - bool lock_node = !IS_NOSEC(inode); - - if (lock_node) - inode_lock(inode); - result = __generic_file_write_iter(vio->vui_iocb, - vio->vui_iter); - if (lock_node) - inode_unlock(inode); - - if (result > 0 || result == -EIOCBQUEUED) - result = generic_write_sync(vio->vui_iocb, result); - } - - if (result > 0) { - result = vvp_io_write_commit(env, io); - if (vio->u.write.vui_written > 0) { - result = vio->u.write.vui_written; - io->ci_nob += result; - - CDEBUG(D_VFSTRACE, "write: nob %zd, result: %zd\n", - io->ci_nob, result); - } - } - if (result > 0) { - set_bit(LLIF_DATA_MODIFIED, &(ll_i2info(inode))->lli_flags); - - if (result < cnt) - io->ci_continue = 0; - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, - vio->vui_fd, pos, result, WRITE); - result = 0; - } - return result; -} - -static void vvp_io_rw_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct inode *inode = vvp_object_inode(ios->cis_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - up_read(&lli->lli_trunc_sem); -} - -static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) -{ - struct vm_fault *vmf = cfio->ft_vmf; - - cfio->ft_flags = filemap_fault(vmf); - cfio->ft_flags_valid = 1; - - if (vmf->page) { - CDEBUG(D_PAGE, - "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", - vmf->page, vmf->page->mapping, vmf->page->index, - (long)vmf->page->flags, page_count(vmf->page), - page_private(vmf->page), (void *)vmf->address); - if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { - lock_page(vmf->page); - cfio->ft_flags |= VM_FAULT_LOCKED; - } - - cfio->ft_vmpage = vmf->page; - return 0; - } - - if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { - CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); - return -EFAULT; - } - - if (cfio->ft_flags & VM_FAULT_OOM) { - CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); - return -ENOMEM; - } - - if (cfio->ft_flags & VM_FAULT_RETRY) - return -EAGAIN; - - CERROR("Unknown error in page fault %d!\n", cfio->ft_flags); - return -EINVAL; -} - -static void mkwrite_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - set_page_dirty(page->cp_vmpage); -} - -static int vvp_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct vvp_io *vio = cl2vvp_io(env, ios); - struct cl_io *io = ios->cis_io; - struct cl_object *obj = io->ci_obj; - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_fault_io *fio = &io->u.ci_fault; - struct vvp_fault_io *cfio = &vio->u.fault; - loff_t offset; - int result = 0; - struct page *vmpage = NULL; - struct cl_page *page; - loff_t size; - pgoff_t last_index; - - down_read(&lli->lli_trunc_sem); - - /* offset of the last byte on the page */ - offset = cl_offset(obj, fio->ft_index + 1) - 1; - LASSERT(cl_index(obj, offset) == fio->ft_index); - result = vvp_prep_size(env, obj, io, 0, offset + 1, NULL); - if (result != 0) - return result; - - /* must return locked page */ - if (fio->ft_mkwrite) { - LASSERT(cfio->ft_vmpage); - lock_page(cfio->ft_vmpage); - } else { - result = vvp_io_kernel_fault(cfio); - if (result != 0) - return result; - } - - vmpage = cfio->ft_vmpage; - LASSERT(PageLocked(vmpage)); - - if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_FAULT_TRUNC_RACE)) - ll_invalidate_page(vmpage); - - size = i_size_read(inode); - /* Though we have already held a cl_lock upon this page, but - * it still can be truncated locally. 
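-	 * (A local truncate can detach the vmpage from the mapping without
-	 * going through the cl_lock, so re-check the mapping and the offset
-	 * under the page lock and retry the fault if either has changed.)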
- */ - if (unlikely((vmpage->mapping != inode->i_mapping) || - (page_offset(vmpage) > size))) { - CDEBUG(D_PAGE, "llite: fault and truncate race happened!\n"); - - /* return +1 to stop cl_io_loop() and ll_fault() will catch - * and retry. - */ - result = 1; - goto out; - } - - last_index = cl_index(obj, size - 1); - - if (fio->ft_mkwrite) { - /* - * Capture the size while holding the lli_trunc_sem from above - * we want to make sure that we complete the mkwrite action - * while holding this lock. We need to make sure that we are - * not past the end of the file. - */ - if (last_index < fio->ft_index) { - CDEBUG(D_PAGE, - "llite: mkwrite and truncate race happened: %p: 0x%lx 0x%lx\n", - vmpage->mapping, fio->ft_index, last_index); - /* - * We need to return if we are - * passed the end of the file. This will propagate - * up the call stack to ll_page_mkwrite where - * we will return VM_FAULT_NOPAGE. Any non-negative - * value returned here will be silently - * converted to 0. If the vmpage->mapping is null - * the error code would be converted back to ENODATA - * in ll_page_mkwrite0. Thus we return -ENODATA - * to handle both cases - */ - result = -ENODATA; - goto out; - } - } - - page = cl_page_find(env, obj, fio->ft_index, vmpage, CPT_CACHEABLE); - if (IS_ERR(page)) { - result = PTR_ERR(page); - goto out; - } - - /* if page is going to be written, we should add this page into cache - * earlier. - */ - if (fio->ft_mkwrite) { - wait_on_page_writeback(vmpage); - if (!PageDirty(vmpage)) { - struct cl_page_list *plist = &io->ci_queue.c2_qin; - struct vvp_page *vpg = cl_object_page_slice(obj, page); - int to = PAGE_SIZE; - - /* vvp_page_assume() calls wait_on_page_writeback(). */ - cl_page_assume(env, io, page); - - cl_page_list_init(plist); - cl_page_list_add(plist, page); - - /* size fixup */ - if (last_index == vvp_index(vpg)) - to = size & ~PAGE_MASK; - - /* Do not set Dirty bit here so that in case IO is - * started before the page is really made dirty, we - * still have chance to detect it. - */ - result = cl_io_commit_async(env, io, plist, 0, to, - mkwrite_commit_callback); - LASSERT(cl_page_is_owned(page, io)); - cl_page_list_fini(env, plist); - - vmpage = NULL; - if (result < 0) { - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - - cl_page_put(env, page); - - /* we're in big trouble, what can we do now? */ - if (result == -EDQUOT) - result = -ENOSPC; - goto out; - } else { - cl_page_disown(env, io, page); - } - } - } - - /* - * The ft_index is only used in the case of - * a mkwrite action. We need to check - * our assertions are correct, since - * we should have caught this above - */ - LASSERT(!fio->ft_mkwrite || fio->ft_index <= last_index); - if (fio->ft_index == last_index) - /* - * Last page is mapped partially. 
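-		 * For example, assuming 4096-byte pages and i_size == 10000,
-		 * the last index is 2 and ft_nob below becomes
-		 * 10000 - 2 * 4096 = 1808: only the mapped head of the
-		 * final page counts toward the fault.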
- */ - fio->ft_nob = size - cl_offset(obj, fio->ft_index); - else - fio->ft_nob = cl_page_size(obj); - - lu_ref_add(&page->cp_reference, "fault", io); - fio->ft_page = page; - -out: - /* return unlocked vmpage to avoid deadlocking */ - if (vmpage) - unlock_page(vmpage); - - cfio->ft_flags &= ~VM_FAULT_LOCKED; - - return result; -} - -static void vvp_io_fault_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct inode *inode = vvp_object_inode(ios->cis_obj); - struct ll_inode_info *lli = ll_i2info(inode); - - CLOBINVRNT(env, ios->cis_io->ci_obj, - vvp_object_invariant(ios->cis_io->ci_obj)); - up_read(&lli->lli_trunc_sem); -} - -static int vvp_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - /* we should mark TOWRITE bit to each dirty page in radix tree to - * verify pages have been written, but this is difficult because of - * race. - */ - return 0; -} - -static int vvp_io_read_ahead(const struct lu_env *env, - const struct cl_io_slice *ios, - pgoff_t start, struct cl_read_ahead *ra) -{ - int result = 0; - - if (ios->cis_io->ci_type == CIT_READ || - ios->cis_io->ci_type == CIT_FAULT) { - struct vvp_io *vio = cl2vvp_io(env, ios); - - if (unlikely(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - ra->cra_end = CL_PAGE_EOF; - result = 1; /* no need to call down */ - } - } - - return result; -} - -static const struct cl_io_operations vvp_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = vvp_io_fini, - .cio_lock = vvp_io_read_lock, - .cio_start = vvp_io_read_start, - .cio_end = vvp_io_rw_end, - .cio_advance = vvp_io_advance, - }, - [CIT_WRITE] = { - .cio_fini = vvp_io_fini, - .cio_iter_init = vvp_io_write_iter_init, - .cio_iter_fini = vvp_io_write_iter_fini, - .cio_lock = vvp_io_write_lock, - .cio_start = vvp_io_write_start, - .cio_end = vvp_io_rw_end, - .cio_advance = vvp_io_advance, - }, - [CIT_SETATTR] = { - .cio_fini = vvp_io_setattr_fini, - .cio_iter_init = vvp_io_setattr_iter_init, - .cio_lock = vvp_io_setattr_lock, - .cio_start = vvp_io_setattr_start, - .cio_end = vvp_io_setattr_end - }, - [CIT_FAULT] = { - .cio_fini = vvp_io_fault_fini, - .cio_iter_init = vvp_io_fault_iter_init, - .cio_lock = vvp_io_fault_lock, - .cio_start = vvp_io_fault_start, - .cio_end = vvp_io_fault_end, - }, - [CIT_FSYNC] = { - .cio_start = vvp_io_fsync_start, - .cio_fini = vvp_io_fini - }, - [CIT_MISC] = { - .cio_fini = vvp_io_fini - } - }, - .cio_read_ahead = vvp_io_read_ahead, -}; - -int vvp_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - struct vvp_io *vio = vvp_env_io(env); - struct inode *inode = vvp_object_inode(obj); - int result; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - CDEBUG(D_VFSTRACE, DFID - " ignore/verify layout %d/%d, layout version %d restore needed %d\n", - PFID(lu_object_fid(&obj->co_lu)), - io->ci_ignore_layout, io->ci_verify_layout, - vio->vui_layout_gen, io->ci_restore_needed); - - CL_IO_SLICE_CLEAN(vio, vui_cl); - cl_io_slice_add(io, &vio->vui_cl, obj, &vvp_io_ops); - vio->vui_ra_valid = false; - result = 0; - if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { - size_t count; - struct ll_inode_info *lli = ll_i2info(inode); - - count = io->u.ci_rw.crw_count; - /* "If nbyte is 0, read() will return 0 and have no other - * results." -- Single Unix Spec - */ - if (count == 0) - result = 1; - else - vio->vui_tot_count = count; - - /* for read/write, we store the jobid in the inode, and - * it'll be fetched by osc when building RPC. 
- * - * it's not accurate if the file is shared by different - * jobs. - */ - lustre_get_jobid(lli->lli_jobid); - } else if (io->ci_type == CIT_SETATTR) { - if (!cl_io_is_trunc(io)) - io->ci_lockreq = CILR_MANDATORY; - } - - /* Enqueue layout lock and get layout version. We need to do this - * even for operations requiring to open file, such as read and write, - * because it might not grant layout lock in IT_OPEN. - */ - if (result == 0 && !io->ci_ignore_layout) { - result = ll_layout_refresh(inode, &vio->vui_layout_gen); - if (result == -ENOENT) - /* If the inode on MDS has been removed, but the objects - * on OSTs haven't been destroyed (async unlink), layout - * fetch will return -ENOENT, we'd ignore this error - * and continue with dirty flush. LU-3230. - */ - result = 0; - if (result < 0) - CERROR("%s: refresh file layout " DFID " error %d.\n", - ll_get_fsname(inode->i_sb, NULL, 0), - PFID(lu_object_fid(&obj->co_lu)), result); - } - - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/vvp_lock.c b/drivers/staging/lustre/lustre/llite/vvp_lock.c deleted file mode 100644 index 4b6c7143bd2cdeb483026f6b84aa1d13cbbc7751..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_lock.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for VVP layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include - -#include "vvp_internal.h" - -/***************************************************************************** - * - * Vvp lock functions. 
- * - */ - -static void vvp_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) -{ - struct vvp_lock *vlk = cl2vvp_lock(slice); - - kmem_cache_free(vvp_lock_kmem, vlk); -} - -static int vvp_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - CLOBINVRNT(env, slice->cls_obj, vvp_object_invariant(slice->cls_obj)); - - return 0; -} - -static const struct cl_lock_operations vvp_lock_ops = { - .clo_fini = vvp_lock_fini, - .clo_enqueue = vvp_lock_enqueue, -}; - -int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *unused) -{ - struct vvp_lock *vlk; - int result; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - vlk = kmem_cache_zalloc(vvp_lock_kmem, GFP_NOFS); - if (vlk) { - cl_lock_slice_add(lock, &vlk->vlk_cl, obj, &vvp_lock_ops); - result = 0; - } else { - result = -ENOMEM; - } - return result; -} diff --git a/drivers/staging/lustre/lustre/llite/vvp_object.c b/drivers/staging/lustre/lustre/llite/vvp_object.c deleted file mode 100644 index b2cb51c8f7f4739c2cea1099787ac48f557e30fe..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_object.c +++ /dev/null @@ -1,303 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * cl_object implementation for VVP layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include - -#include "llite_internal.h" -#include "vvp_internal.h" - -/***************************************************************************** - * - * Object operations. 
- * - */ - -int vvp_object_invariant(const struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - struct ll_inode_info *lli = ll_i2info(inode); - - return (S_ISREG(inode->i_mode) || inode->i_mode == 0) && - lli->lli_clob == obj; -} - -static int vvp_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - struct vvp_object *obj = lu2vvp(o); - struct inode *inode = obj->vob_inode; - struct ll_inode_info *lli; - - (*p)(env, cookie, "(%d %d) inode: %p ", - atomic_read(&obj->vob_transient_pages), - atomic_read(&obj->vob_mmap_cnt), inode); - if (inode) { - lli = ll_i2info(inode); - (*p)(env, cookie, "%lu/%u %o %u %d %p " DFID, - inode->i_ino, inode->i_generation, inode->i_mode, - inode->i_nlink, atomic_read(&inode->i_count), - lli->lli_clob, PFID(&lli->lli_fid)); - } - return 0; -} - -static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct inode *inode = vvp_object_inode(obj); - - /* - * lov overwrites most of these fields in - * lov_attr_get()->...lov_merge_lvb_kms(), except when inode - * attributes are newer. - */ - - attr->cat_size = i_size_read(inode); - attr->cat_mtime = inode->i_mtime.tv_sec; - attr->cat_atime = inode->i_atime.tv_sec; - attr->cat_ctime = inode->i_ctime.tv_sec; - attr->cat_blocks = inode->i_blocks; - attr->cat_uid = from_kuid(&init_user_ns, inode->i_uid); - attr->cat_gid = from_kgid(&init_user_ns, inode->i_gid); - /* KMS is not known by this layer */ - return 0; /* layers below have to fill in the rest */ -} - -static int vvp_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid) -{ - struct inode *inode = vvp_object_inode(obj); - - if (valid & CAT_UID) - inode->i_uid = make_kuid(&init_user_ns, attr->cat_uid); - if (valid & CAT_GID) - inode->i_gid = make_kgid(&init_user_ns, attr->cat_gid); - if (valid & CAT_ATIME) - inode->i_atime.tv_sec = attr->cat_atime; - if (valid & CAT_MTIME) - inode->i_mtime.tv_sec = attr->cat_mtime; - if (valid & CAT_CTIME) - inode->i_ctime.tv_sec = attr->cat_ctime; - if (0 && valid & CAT_SIZE) - i_size_write(inode, attr->cat_size); - /* not currently necessary */ - if (0 && valid & (CAT_UID | CAT_GID | CAT_SIZE)) - mark_inode_dirty(inode); - return 0; -} - -static int vvp_conf_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf) -{ - struct ll_inode_info *lli = ll_i2info(conf->coc_inode); - - if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { - CDEBUG(D_VFSTRACE, DFID ": losing layout lock\n", - PFID(&lli->lli_fid)); - - ll_layout_version_set(lli, CL_LAYOUT_GEN_NONE); - - /* Clean up page mmap for this inode. - * The reason for us to do this is that if the page has - * already been installed into memory space, the process - * can access it without interacting with lustre, so this - * page may be stale due to layout change, and the process - * will never be notified. - * This operation is expensive but mmap processes have to pay - * a price themselves. 
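-		 * (unmap_mapping_range() below drops the page-table entries
-		 * for every VMA of this file, so the next access faults and
-		 * re-fetches data under the new layout rather than silently
-		 * reusing stale pages.)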
- */ - unmap_mapping_range(conf->coc_inode->i_mapping, - 0, OBD_OBJECT_EOF, 0); - } - - return 0; -} - -static int vvp_prune(const struct lu_env *env, struct cl_object *obj) -{ - struct inode *inode = vvp_object_inode(obj); - int rc; - - rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_LOCAL, 1); - if (rc < 0) { - CDEBUG(D_VFSTRACE, DFID ": writeback failed: %d\n", - PFID(lu_object_fid(&obj->co_lu)), rc); - return rc; - } - - truncate_inode_pages(inode->i_mapping, 0); - return 0; -} - -static int vvp_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb) -{ - struct inode *inode = vvp_object_inode(obj); - - lvb->lvb_mtime = LTIME_S(inode->i_mtime); - lvb->lvb_atime = LTIME_S(inode->i_atime); - lvb->lvb_ctime = LTIME_S(inode->i_ctime); - /* - * LU-417: Add dirty pages block count lest i_blocks reports 0, some - * "cp" or "tar" on remote node may think it's a completely sparse file - * and skip it. - */ - if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0) - lvb->lvb_blocks = dirty_cnt(inode); - return 0; -} - -static void vvp_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - u64 valid_flags = OBD_MD_FLTYPE; - struct inode *inode; - struct obdo *oa; - - oa = attr->cra_oa; - inode = vvp_object_inode(obj); - - if (attr->cra_type == CRT_WRITE) - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLUID | OBD_MD_FLGID; - obdo_from_inode(oa, inode, valid_flags & attr->cra_flags); - obdo_set_parent_fid(oa, &ll_i2info(inode)->lli_fid); - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_INVALID_PFID)) - oa->o_parent_oid++; - memcpy(attr->cra_jobid, ll_i2info(inode)->lli_jobid, LUSTRE_JOBID_SIZE); -} - -static const struct cl_object_operations vvp_ops = { - .coo_page_init = vvp_page_init, - .coo_lock_init = vvp_lock_init, - .coo_io_init = vvp_io_init, - .coo_attr_get = vvp_attr_get, - .coo_attr_update = vvp_attr_update, - .coo_conf_set = vvp_conf_set, - .coo_prune = vvp_prune, - .coo_glimpse = vvp_object_glimpse, - .coo_req_attr_set = vvp_req_attr_set -}; - -static int vvp_object_init0(const struct lu_env *env, - struct vvp_object *vob, - const struct cl_object_conf *conf) -{ - vob->vob_inode = conf->coc_inode; - atomic_set(&vob->vob_transient_pages, 0); - cl_object_page_init(&vob->vob_cl, sizeof(struct vvp_page)); - return 0; -} - -static int vvp_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct vvp_device *dev = lu2vvp_dev(obj->lo_dev); - struct vvp_object *vob = lu2vvp(obj); - struct lu_object *below; - struct lu_device *under; - int result; - - under = &dev->vdv_next->cd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); - if (below) { - const struct cl_object_conf *cconf; - - cconf = lu2cl_conf(conf); - lu_object_add(obj, below); - result = vvp_object_init0(env, vob, cconf); - } else { - result = -ENOMEM; - } - - return result; -} - -static void vvp_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct vvp_object *vob = lu2vvp(obj); - - lu_object_fini(obj); - lu_object_header_fini(obj->lo_header); - kmem_cache_free(vvp_object_kmem, vob); -} - -static const struct lu_object_operations vvp_lu_obj_ops = { - .loo_object_init = vvp_object_init, - .loo_object_free = vvp_object_free, - .loo_object_print = vvp_object_print, -}; - -struct vvp_object *cl_inode2vvp(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct cl_object *obj = lli->lli_clob; - struct lu_object *lu; - - lu = 
lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); - LASSERT(lu); - return lu2vvp(lu); -} - -struct lu_object *vvp_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct vvp_object *vob; - struct lu_object *obj; - - vob = kmem_cache_zalloc(vvp_object_kmem, GFP_NOFS); - if (vob) { - struct cl_object_header *hdr; - - obj = &vob->vob_cl.co_lu; - hdr = &vob->vob_header; - cl_object_header_init(hdr); - hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); - - lu_object_init(obj, &hdr->coh_lu, dev); - lu_object_add_top(&hdr->coh_lu, obj); - - vob->vob_cl.co_ops = &vvp_ops; - obj->lo_ops = &vvp_lu_obj_ops; - } else { - obj = NULL; - } - return obj; -} diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c deleted file mode 100644 index 6eb0565ddc225a76f3ede27448aaf68d31292056..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ /dev/null @@ -1,523 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for VVP layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include -#include -#include - -#include "llite_internal.h" -#include "vvp_internal.h" - -/***************************************************************************** - * - * Page operations. - * - */ - -static void vvp_page_fini_common(struct vvp_page *vpg) -{ - struct page *vmpage = vpg->vpg_page; - - LASSERT(vmpage); - put_page(vmpage); -} - -static void vvp_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - - /* - * vmpage->private was already cleared when page was moved into - * VPG_FREEING state. 
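-	 * (The LASSERT below only checks that the back-pointer no longer
-	 * references this cl_page; the vmpage itself is released in
-	 * vvp_page_fini_common() via put_page().)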
- */ - LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); - vvp_page_fini_common(vpg); -} - -static int vvp_page_own(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io, - int nonblock) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - - LASSERT(vmpage); - if (nonblock) { - if (!trylock_page(vmpage)) - return -EAGAIN; - - if (unlikely(PageWriteback(vmpage))) { - unlock_page(vmpage); - return -EAGAIN; - } - - return 0; - } - - lock_page(vmpage); - wait_on_page_writeback(vmpage); - - return 0; -} - -static void vvp_page_assume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - wait_on_page_writeback(vmpage); -} - -static void vvp_page_unassume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); -} - -static void vvp_page_disown(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - - unlock_page(cl2vm_page(slice)); -} - -static void vvp_page_discard(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct page *vmpage = cl2vm_page(slice); - struct vvp_page *vpg = cl2vvp_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - - if (vpg->vpg_defer_uptodate && !vpg->vpg_ra_used) - ll_ra_stats_inc(vmpage->mapping->host, RA_STAT_DISCARDED); - - ll_invalidate_page(vmpage); -} - -static void vvp_page_delete(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct page *vmpage = cl2vm_page(slice); - struct inode *inode = vmpage->mapping->host; - struct cl_object *obj = slice->cpl_obj; - struct cl_page *page = slice->cpl_page; - int refc; - - LASSERT(PageLocked(vmpage)); - LASSERT((struct cl_page *)vmpage->private == page); - LASSERT(inode == vvp_object_inode(obj)); - - /* Drop the reference count held in vvp_page_init */ - refc = atomic_dec_return(&page->cp_ref); - LASSERTF(refc >= 1, "page = %p, refc = %d\n", page, refc); - - ClearPagePrivate(vmpage); - vmpage->private = 0; - /* - * Reference from vmpage to cl_page is removed, but the reference back - * is still here. It is removed later in vvp_page_fini(). - */ -} - -static void vvp_page_export(const struct lu_env *env, - const struct cl_page_slice *slice, - int uptodate) -{ - struct page *vmpage = cl2vm_page(slice); - - LASSERT(vmpage); - LASSERT(PageLocked(vmpage)); - if (uptodate) - SetPageUptodate(vmpage); - else - ClearPageUptodate(vmpage); -} - -static int vvp_page_is_vmlocked(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; -} - -static int vvp_page_prep_read(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - /* Skip the page already marked as PG_uptodate. */ - return PageUptodate(cl2vm_page(slice)) ? 
-EALREADY : 0;
-}
-
-static int vvp_page_prep_write(const struct lu_env *env,
-			       const struct cl_page_slice *slice,
-			       struct cl_io *unused)
-{
-	struct page *vmpage = cl2vm_page(slice);
-	struct cl_page *pg = slice->cpl_page;
-
-	LASSERT(PageLocked(vmpage));
-	LASSERT(!PageDirty(vmpage));
-
-	/* The ll_writepage path is not a sync write, so we need to set the
-	 * page writeback flag.
-	 */
-	if (!pg->cp_sync_io)
-		set_page_writeback(vmpage);
-
-	return 0;
-}
-
-/**
- * Handles page transfer errors at VM level.
- *
- * This takes the inode as a separate argument, because the inode on which
- * the error is to be set can differ from the \a vmpage inode in the case of
- * direct-io.
- */
-static void vvp_vmpage_error(struct inode *inode, struct page *vmpage,
-			     int ioret)
-{
-	struct vvp_object *obj = cl_inode2vvp(inode);
-
-	if (ioret == 0) {
-		ClearPageError(vmpage);
-		obj->vob_discard_page_warned = 0;
-	} else {
-		SetPageError(vmpage);
-		mapping_set_error(inode->i_mapping, ioret);
-
-		if ((ioret == -ESHUTDOWN || ioret == -EINTR) &&
-		    obj->vob_discard_page_warned == 0) {
-			obj->vob_discard_page_warned = 1;
-			ll_dirty_page_discard_warn(vmpage, ioret);
-		}
-	}
-}
-
-static void vvp_page_completion_read(const struct lu_env *env,
-				     const struct cl_page_slice *slice,
-				     int ioret)
-{
-	struct vvp_page *vpg = cl2vvp_page(slice);
-	struct page *vmpage = vpg->vpg_page;
-	struct cl_page *page = slice->cpl_page;
-	struct inode *inode = vvp_object_inode(page->cp_obj);
-
-	LASSERT(PageLocked(vmpage));
-	CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
-
-	if (vpg->vpg_defer_uptodate)
-		ll_ra_count_put(ll_i2sbi(inode), 1);
-
-	if (ioret == 0) {
-		if (!vpg->vpg_defer_uptodate)
-			cl_page_export(env, page, 1);
-	} else {
-		vpg->vpg_defer_uptodate = 0;
-	}
-
-	if (!page->cp_sync_io)
-		unlock_page(vmpage);
-}
-
-static void vvp_page_completion_write(const struct lu_env *env,
-				      const struct cl_page_slice *slice,
-				      int ioret)
-{
-	struct vvp_page *vpg = cl2vvp_page(slice);
-	struct cl_page *pg = slice->cpl_page;
-	struct page *vmpage = vpg->vpg_page;
-
-	CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
-
-	if (pg->cp_sync_io) {
-		LASSERT(PageLocked(vmpage));
-		LASSERT(!PageWriteback(vmpage));
-	} else {
-		LASSERT(PageWriteback(vmpage));
-		/*
-		 * Only mark the page in error for an async write, because
-		 * applications won't wait for IO to finish.
-		 */
-		vvp_vmpage_error(vvp_object_inode(pg->cp_obj), vmpage, ioret);
-
-		end_page_writeback(vmpage);
-	}
-}
-
-/**
- * Implements cl_page_operations::cpo_make_ready() method.
- *
- * This is called to yank a page from the transfer cache and to send it out
- * as a part of transfer. This function try-locks the page. If the try-lock
- * fails, the page is owned by some concurrent IO and should be skipped (a
- * bad but hopefully rare situation, as it usually results in the transfer
- * being shorter than possible).
- *
- * \retval 0		success, page can be placed into transfer
- *
- * \retval -EAGAIN	page is either used by a concurrent IO or has been
- *			truncated. Skip it.
- */
-static int vvp_page_make_ready(const struct lu_env *env,
-			       const struct cl_page_slice *slice)
-{
-	struct page *vmpage = cl2vm_page(slice);
-	struct cl_page *pg = slice->cpl_page;
-	int result = 0;
-
-	lock_page(vmpage);
-	if (clear_page_dirty_for_io(vmpage)) {
-		LASSERT(pg->cp_state == CPS_CACHED);
-		/* This actually clears the dirty bit in the radix tree.
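-		 * set_page_writeback() also drops PAGECACHE_TAG_DIRTY for a
-		 * page that is no longer PG_dirty, so a concurrent writeback
-		 * scan will not pick this page up a second time.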
*/ - set_page_writeback(vmpage); - CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); - } else if (pg->cp_state == CPS_PAGEOUT) { - /* is it possible for osc_flush_async_page() to already - * make it ready? - */ - result = -EALREADY; - } else { - CL_PAGE_DEBUG(D_ERROR, env, pg, "Unexpecting page state %d.\n", - pg->cp_state); - LBUG(); - } - unlock_page(vmpage); - return result; -} - -static int vvp_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct page *vmpage = vpg->vpg_page; - - (*printer)(env, cookie, LUSTRE_VVP_NAME "-page@%p(%d:%d) vm@%p ", - vpg, vpg->vpg_defer_uptodate, vpg->vpg_ra_used, vmpage); - if (vmpage) { - (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", - (long)vmpage->flags, page_count(vmpage), - page_mapcount(vmpage), vmpage->private, - vmpage->index, - list_empty(&vmpage->lru) ? "not-" : ""); - } - - (*printer)(env, cookie, "\n"); - - return 0; -} - -static int vvp_page_fail(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - /* - * Cached read? - */ - LBUG(); - - return 0; -} - -static const struct cl_page_operations vvp_page_ops = { - .cpo_own = vvp_page_own, - .cpo_assume = vvp_page_assume, - .cpo_unassume = vvp_page_unassume, - .cpo_disown = vvp_page_disown, - .cpo_discard = vvp_page_discard, - .cpo_delete = vvp_page_delete, - .cpo_export = vvp_page_export, - .cpo_is_vmlocked = vvp_page_is_vmlocked, - .cpo_fini = vvp_page_fini, - .cpo_print = vvp_page_print, - .io = { - [CRT_READ] = { - .cpo_prep = vvp_page_prep_read, - .cpo_completion = vvp_page_completion_read, - .cpo_make_ready = vvp_page_fail, - }, - [CRT_WRITE] = { - .cpo_prep = vvp_page_prep_write, - .cpo_completion = vvp_page_completion_write, - .cpo_make_ready = vvp_page_make_ready, - }, - }, -}; - -static int vvp_transient_page_prep(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - /* transient page should always be sent. */ - return 0; -} - -static int vvp_transient_page_own(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused, int nonblock) -{ - return 0; -} - -static void vvp_transient_page_assume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ -} - -static void vvp_transient_page_unassume(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ -} - -static void vvp_transient_page_disown(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ -} - -static void vvp_transient_page_discard(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - struct cl_page *page = slice->cpl_page; - - /* - * For transient pages, remove it from the radix tree. - */ - cl_page_delete(env, page); -} - -static int vvp_transient_page_is_vmlocked(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct inode *inode = vvp_object_inode(slice->cpl_obj); - int locked; - - locked = !inode_trylock(inode); - if (!locked) - inode_unlock(inode); - return locked ? 
-EBUSY : -ENODATA; -} - -static void -vvp_transient_page_completion(const struct lu_env *env, - const struct cl_page_slice *slice, - int ioret) -{ -} - -static void vvp_transient_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ - struct vvp_page *vpg = cl2vvp_page(slice); - struct cl_page *clp = slice->cpl_page; - struct vvp_object *clobj = cl2vvp(clp->cp_obj); - - vvp_page_fini_common(vpg); - atomic_dec(&clobj->vob_transient_pages); -} - -static const struct cl_page_operations vvp_transient_page_ops = { - .cpo_own = vvp_transient_page_own, - .cpo_assume = vvp_transient_page_assume, - .cpo_unassume = vvp_transient_page_unassume, - .cpo_disown = vvp_transient_page_disown, - .cpo_discard = vvp_transient_page_discard, - .cpo_fini = vvp_transient_page_fini, - .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, - .cpo_print = vvp_page_print, - .io = { - [CRT_READ] = { - .cpo_prep = vvp_transient_page_prep, - .cpo_completion = vvp_transient_page_completion, - }, - [CRT_WRITE] = { - .cpo_prep = vvp_transient_page_prep, - .cpo_completion = vvp_transient_page_completion, - } - } -}; - -int vvp_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct vvp_page *vpg = cl_object_page_slice(obj, page); - struct page *vmpage = page->cp_vmpage; - - CLOBINVRNT(env, obj, vvp_object_invariant(obj)); - - vpg->vpg_page = vmpage; - get_page(vmpage); - - if (page->cp_type == CPT_CACHEABLE) { - /* in cache, decref in vvp_page_delete */ - atomic_inc(&page->cp_ref); - SetPagePrivate(vmpage); - vmpage->private = (unsigned long)page; - cl_page_slice_add(page, &vpg->vpg_cl, obj, index, - &vvp_page_ops); - } else { - struct vvp_object *clobj = cl2vvp(obj); - - cl_page_slice_add(page, &vpg->vpg_cl, obj, index, - &vvp_transient_page_ops); - atomic_inc(&clobj->vob_transient_pages); - } - return 0; -} diff --git a/drivers/staging/lustre/lustre/llite/xattr.c b/drivers/staging/lustre/lustre/llite/xattr.c deleted file mode 100644 index 7fa0a419c094dfae8bdbf1d1dbeb43aff68f2f19..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/xattr.c +++ /dev/null @@ -1,665 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include - -#include "llite_internal.h" - -const struct xattr_handler *get_xattr_type(const char *name) -{ - int i; - - for (i = 0; ll_xattr_handlers[i]; i++) { - const char *prefix = xattr_prefix(ll_xattr_handlers[i]); - size_t prefix_len = strlen(prefix); - - if (!strncmp(prefix, name, prefix_len)) - return ll_xattr_handlers[i]; - } - - return NULL; -} - -static int xattr_type_filter(struct ll_sb_info *sbi, - const struct xattr_handler *handler) -{ - /* No handler means XATTR_OTHER_T */ - if (!handler) - return -EOPNOTSUPP; - - if ((handler->flags == XATTR_ACL_ACCESS_T || - handler->flags == XATTR_ACL_DEFAULT_T) && - !(sbi->ll_flags & LL_SBI_ACL)) - return -EOPNOTSUPP; - - if (handler->flags == XATTR_USER_T && - !(sbi->ll_flags & LL_SBI_USER_XATTR)) - return -EOPNOTSUPP; - - if (handler->flags == XATTR_TRUSTED_T && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - return 0; -} - -static int ll_xattr_set_common(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - const char *pv = value; - char *fullname; - u64 valid; - int rc; - - /* When setxattr() is called with a size of 0 the value is - * unconditionally replaced by "". When removexattr() is - * called we get a NULL value and XATTR_REPLACE for flags. - */ - if (!value && flags == XATTR_REPLACE) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_REMOVEXATTR, 1); - valid = OBD_MD_FLXATTRRM; - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_SETXATTR, 1); - valid = OBD_MD_FLXATTR; - } - - rc = xattr_type_filter(sbi, handler); - if (rc) - return rc; - - if ((handler->flags == XATTR_ACL_ACCESS_T || - handler->flags == XATTR_ACL_DEFAULT_T) && - !inode_owner_or_capable(inode)) - return -EPERM; - - /* b10667: ignore lustre special xattr for now */ - if (!strcmp(name, "hsm") || - ((handler->flags == XATTR_TRUSTED_T && !strcmp(name, "lov")) || - (handler->flags == XATTR_LUSTRE_T && !strcmp(name, "lov")))) - return 0; - - /* LU-549: Disable security.selinux when selinux is disabled */ - if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && - strcmp(name, "selinux") == 0) - return -EOPNOTSUPP; - - /*FIXME: enable IMA when the conditions are ready */ - if (handler->flags == XATTR_SECURITY_T && - (!strcmp(name, "ima") || !strcmp(name, "evm"))) - return -EOPNOTSUPP; - - /* - * In user.* namespace, only regular files and directories can have - * extended attributes. 
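-	 *
-	 * As a hypothetical userspace illustration (mount point and name
-	 * made up), assuming only the standard setxattr(2) call from
-	 * <sys/xattr.h>; against anything other than a regular file or
-	 * directory the same request fails with EPERM:
-	 *
-	 *	#include <stdio.h>
-	 *	#include <sys/xattr.h>
-	 *
-	 *	int main(void)
-	 *	{
-	 *		if (setxattr("/mnt/lustre/f", "user.tag", "1", 1, 0))
-	 *			perror("setxattr");
-	 *		return 0;
-	 *	}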
- */ - if (handler->flags == XATTR_USER_T) { - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) - return -EPERM; - } - - fullname = kasprintf(GFP_KERNEL, "%s%s", handler->prefix, name); - if (!fullname) - return -ENOMEM; - - rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, fullname, - pv, size, flags, ll_i2suppgid(inode), &req); - kfree(fullname); - if (rc) { - if (rc == -EOPNOTSUPP && handler->flags == XATTR_USER_T) { - LCONSOLE_INFO("Disabling user_xattr feature because it is not supported on the server\n"); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } - return rc; - } - - ptlrpc_req_finished(req); - return 0; -} - -static int get_hsm_state(struct inode *inode, u32 *hus_states) -{ - struct md_op_data *op_data; - struct hsm_user_state *hus; - int rc; - - hus = kzalloc(sizeof(*hus), GFP_NOFS); - if (!hus) - return -ENOMEM; - - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, hus); - if (!IS_ERR(op_data)) { - rc = obd_iocontrol(LL_IOC_HSM_STATE_GET, ll_i2mdexp(inode), - sizeof(*op_data), op_data, NULL); - if (!rc) - *hus_states = hus->hus_states; - else - CDEBUG(D_VFSTRACE, "obd_iocontrol failed. rc = %d\n", - rc); - - ll_finish_md_op_data(op_data); - } else { - rc = PTR_ERR(op_data); - CDEBUG(D_VFSTRACE, "Could not prepare the opdata. rc = %d\n", - rc); - } - kfree(hus); - return rc; -} - -static int ll_adjust_lum(struct inode *inode, struct lov_user_md *lump) -{ - int rc = 0; - - if (!lump) - return 0; - - /* Attributes that are saved via getxattr will always have - * the stripe_offset as 0. Instead, the MDS should be - * allowed to pick the starting OST index. b=17846 - */ - if (lump->lmm_stripe_offset == 0) - lump->lmm_stripe_offset = -1; - - /* Avoid anyone directly setting the RELEASED flag. */ - if (lump->lmm_pattern & LOV_PATTERN_F_RELEASED) { - /* Only if we have a released flag check if the file - * was indeed archived. - */ - u32 state = HS_NONE; - - rc = get_hsm_state(inode, &state); - if (rc) - return rc; - - if (!(state & HS_ARCHIVED)) { - CDEBUG(D_VFSTRACE, - "hus_states state = %x, pattern = %x\n", - state, lump->lmm_pattern); - /* - * Here the state is: real file is not - * archived but user is requesting to set - * the RELEASED flag so we mask off the - * released flag from the request - */ - lump->lmm_pattern ^= LOV_PATTERN_F_RELEASED; - } - } - - return rc; -} - -static int ll_setstripe_ea(struct dentry *dentry, struct lov_user_md *lump, - size_t size) -{ - struct inode *inode = d_inode(dentry); - int rc = 0; - - /* - * It is possible to set an xattr to a "" value of zero size. - * For this case we are going to treat it as a removal. - */ - if (!size && lump) - lump = NULL; - - rc = ll_adjust_lum(inode, lump); - if (rc) - return rc; - - if (lump && S_ISREG(inode->i_mode)) { - u64 it_flags = FMODE_WRITE; - ssize_t lum_size; - - lum_size = ll_lov_user_md_size(lump); - if (lum_size < 0 || size < lum_size) - return -ERANGE; - - rc = ll_lov_setstripe_ea_info(inode, dentry, it_flags, lump, - lum_size); - /** - * b=10667: ignore -EEXIST. - * Silently eat error on setting trusted.lov/lustre.lov - * attribute for platforms that added the default option - * to copy all attributes in 'cp' command. Both rsync and - * tar --xattrs also will try to set LOVEA for existing - * files. 
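-		 * (Returning 0 instead of -EEXIST keeps restores in the
-		 * style of "cp --preserve=xattr" and "rsync -X" working:
-		 * the file already has a layout, so the incoming
-		 * trusted.lov/lustre.lov value is simply ignored.)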
- */ - if (rc == -EEXIST) - rc = 0; - } else if (S_ISDIR(inode->i_mode)) { - if (size != 0 && size < sizeof(struct lov_user_md)) - return -EINVAL; - - rc = ll_dir_setstripe(inode, lump, 0); - } - - return rc; -} - -static int ll_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, const void *value, size_t size, - int flags) -{ - LASSERT(inode); - LASSERT(name); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", - PFID(ll_inode2fid(inode)), inode, name); - - /* lustre/trusted.lov.xxx would be passed through xattr API */ - if (!strcmp(name, "lov")) { - int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : - LPROC_LL_SETXATTR; - - ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); - - return ll_setstripe_ea(dentry, (struct lov_user_md *)value, - size); - } else if (!strcmp(name, "lma") || !strcmp(name, "link")) { - int op_type = flags == XATTR_REPLACE ? LPROC_LL_REMOVEXATTR : - LPROC_LL_SETXATTR; - - ll_stats_ops_tally(ll_i2sbi(inode), op_type, 1); - return 0; - } - - return ll_xattr_set_common(handler, dentry, inode, name, value, size, - flags); -} - -int ll_xattr_list(struct inode *inode, const char *name, int type, void *buffer, - size_t size, u64 valid) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - struct mdt_body *body; - void *xdata; - int rc; - - if (sbi->ll_xattr_cache_enabled && type != XATTR_ACL_ACCESS_T && - (type != XATTR_SECURITY_T || strcmp(name, "security.selinux"))) { - rc = ll_xattr_cache_get(inode, name, buffer, size, valid); - if (rc == -EAGAIN) - goto getxattr_nocache; - if (rc < 0) - goto out_xattr; - - /* Add "system.posix_acl_access" to the list */ - if (lli->lli_posix_acl && valid & OBD_MD_FLXATTRLS) { - if (size == 0) { - rc += sizeof(XATTR_NAME_ACL_ACCESS); - } else if (size - rc >= sizeof(XATTR_NAME_ACL_ACCESS)) { - memcpy(buffer + rc, XATTR_NAME_ACL_ACCESS, - sizeof(XATTR_NAME_ACL_ACCESS)); - rc += sizeof(XATTR_NAME_ACL_ACCESS); - } else { - rc = -ERANGE; - goto out_xattr; - } - } - } else { -getxattr_nocache: - rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), valid, - name, size, &req); - if (rc < 0) - goto out_xattr; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - LASSERT(body); - - /* only detect the xattr size */ - if (size == 0) { - rc = body->mbo_eadatasize; - goto out; - } - - if (size < body->mbo_eadatasize) { - CERROR("server bug: replied size %u > %u\n", - body->mbo_eadatasize, (int)size); - rc = -ERANGE; - goto out; - } - - if (body->mbo_eadatasize == 0) { - rc = -ENODATA; - goto out; - } - - /* do not need swab xattr data */ - xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); - if (!xdata) { - rc = -EFAULT; - goto out; - } - - memcpy(buffer, xdata, body->mbo_eadatasize); - rc = body->mbo_eadatasize; - } - -out_xattr: - if (rc == -EOPNOTSUPP && type == XATTR_USER_T) { - LCONSOLE_INFO( - "%s: disabling user_xattr feature because it is not supported on the server: rc = %d\n", - ll_get_fsname(inode->i_sb, NULL, 0), rc); - sbi->ll_flags &= ~LL_SBI_USER_XATTR; - } -out: - ptlrpc_req_finished(req); - return rc; -} - -static int ll_xattr_get_common(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - char *fullname; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), 
inode); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); - - rc = xattr_type_filter(sbi, handler); - if (rc) - return rc; - - /* LU-549: Disable security.selinux when selinux is disabled */ - if (handler->flags == XATTR_SECURITY_T && !selinux_is_enabled() && - !strcmp(name, "selinux")) - return -EOPNOTSUPP; - -#ifdef CONFIG_FS_POSIX_ACL - /* posix acl is under protection of LOOKUP lock. when calling to this, - * we just have path resolution to the target inode, so we have great - * chance that cached ACL is uptodate. - */ - if (handler->flags == XATTR_ACL_ACCESS_T) { - struct ll_inode_info *lli = ll_i2info(inode); - struct posix_acl *acl; - - spin_lock(&lli->lli_lock); - acl = posix_acl_dup(lli->lli_posix_acl); - spin_unlock(&lli->lli_lock); - - if (!acl) - return -ENODATA; - - rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); - posix_acl_release(acl); - return rc; - } - if (handler->flags == XATTR_ACL_DEFAULT_T && !S_ISDIR(inode->i_mode)) - return -ENODATA; -#endif - fullname = kasprintf(GFP_KERNEL, "%s%s", handler->prefix, name); - if (!fullname) - return -ENOMEM; - - rc = ll_xattr_list(inode, fullname, handler->flags, buffer, size, - OBD_MD_FLXATTR); - kfree(fullname); - return rc; -} - -static ssize_t ll_getxattr_lov(struct inode *inode, void *buf, size_t buf_size) -{ - ssize_t rc; - - if (S_ISREG(inode->i_mode)) { - struct cl_object *obj = ll_i2info(inode)->lli_clob; - struct cl_layout cl = { - .cl_buf.lb_buf = buf, - .cl_buf.lb_len = buf_size, - }; - struct lu_env *env; - u16 refcheck; - - if (!obj) - return -ENODATA; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - rc = cl_object_layout_get(env, obj, &cl); - if (rc < 0) - goto out_env; - - if (!cl.cl_size) { - rc = -ENODATA; - goto out_env; - } - - rc = cl.cl_size; - - if (!buf_size) - goto out_env; - - LASSERT(buf && rc <= buf_size); - - /* - * Do not return layout gen for getxattr() since - * otherwise it would confuse tar --xattr by - * recognizing layout gen as stripe offset when the - * file is restored. See LU-2809. 
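-		 * (struct lov_user_md overlays lmm_stripe_offset on the
-		 * same bytes, so a stale nonzero generation written out
-		 * by tar would be replayed on restore as an explicit
-		 * starting OST index; hence it is zeroed below.)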
- */ - ((struct lov_mds_md *)buf)->lmm_layout_gen = 0; -out_env: - cl_env_put(env, &refcheck); - - return rc; - } else if (S_ISDIR(inode->i_mode)) { - struct ptlrpc_request *req = NULL; - struct lov_mds_md *lmm = NULL; - int lmm_size = 0; - - rc = ll_dir_getstripe(inode, (void **)&lmm, &lmm_size, - &req, 0); - if (rc < 0) - goto out_req; - - if (!buf_size) { - rc = lmm_size; - goto out_req; - } - - if (buf_size < lmm_size) { - rc = -ERANGE; - goto out_req; - } - - memcpy(buf, lmm, lmm_size); - rc = lmm_size; -out_req: - if (req) - ptlrpc_req_finished(req); - - return rc; - } else { - return -ENODATA; - } -} - -static int ll_xattr_get(const struct xattr_handler *handler, - struct dentry *dentry, struct inode *inode, - const char *name, void *buffer, size_t size) -{ - LASSERT(inode); - LASSERT(name); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p), xattr %s\n", - PFID(ll_inode2fid(inode)), inode, name); - - if (!strcmp(name, "lov")) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR, 1); - - return ll_getxattr_lov(inode, buffer, size); - } - - return ll_xattr_get_common(handler, dentry, inode, name, buffer, size); -} - -ssize_t ll_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct inode *inode = d_inode(dentry); - struct ll_sb_info *sbi = ll_i2sbi(inode); - char *xattr_name; - ssize_t rc, rc2; - size_t len, rem; - - LASSERT(inode); - - CDEBUG(D_VFSTRACE, "VFS Op:inode=" DFID "(%p)\n", - PFID(ll_inode2fid(inode)), inode); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LISTXATTR, 1); - - rc = ll_xattr_list(inode, NULL, XATTR_OTHER_T, buffer, size, - OBD_MD_FLXATTRLS); - if (rc < 0) - return rc; - - /* - * If we're being called to get the size of the xattr list - * (size == 0) then just assume that a lustre.lov xattr - * exists. - */ - if (!size) - return rc + sizeof(XATTR_LUSTRE_LOV); - - xattr_name = buffer; - rem = rc; - - while (rem > 0) { - len = strnlen(xattr_name, rem - 1) + 1; - rem -= len; - if (!xattr_type_filter(sbi, get_xattr_type(xattr_name))) { - /* Skip OK xattr type, leave it in buffer. */ - xattr_name += len; - continue; - } - - /* - * Move up remaining xattrs in buffer - * removing the xattr that is not OK. 
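-		 * For example, if the server returned "user.a\0foo.bar\0"
-		 * and no handler matches the hypothetical "foo.bar" name,
-		 * the memmove() shifts the remaining rem bytes left over
-		 * the rejected name and rc shrinks by its length, leaving
-		 * a densely packed list for the caller.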
- */ - memmove(xattr_name, xattr_name + len, rem); - rc -= len; - } - - rc2 = ll_getxattr_lov(inode, NULL, 0); - if (rc2 == -ENODATA) - return rc; - - if (rc2 < 0) - return rc2; - - if (size < rc + sizeof(XATTR_LUSTRE_LOV)) - return -ERANGE; - - memcpy(buffer + rc, XATTR_LUSTRE_LOV, sizeof(XATTR_LUSTRE_LOV)); - - return rc + sizeof(XATTR_LUSTRE_LOV); -} - -static const struct xattr_handler ll_user_xattr_handler = { - .prefix = XATTR_USER_PREFIX, - .flags = XATTR_USER_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_trusted_xattr_handler = { - .prefix = XATTR_TRUSTED_PREFIX, - .flags = XATTR_TRUSTED_T, - .get = ll_xattr_get, - .set = ll_xattr_set, -}; - -static const struct xattr_handler ll_security_xattr_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .flags = XATTR_SECURITY_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_acl_access_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_ACCESS, - .flags = XATTR_ACL_ACCESS_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_acl_default_xattr_handler = { - .name = XATTR_NAME_POSIX_ACL_DEFAULT, - .flags = XATTR_ACL_DEFAULT_T, - .get = ll_xattr_get_common, - .set = ll_xattr_set_common, -}; - -static const struct xattr_handler ll_lustre_xattr_handler = { - .prefix = XATTR_LUSTRE_PREFIX, - .flags = XATTR_LUSTRE_T, - .get = ll_xattr_get, - .set = ll_xattr_set, -}; - -const struct xattr_handler *ll_xattr_handlers[] = { - &ll_user_xattr_handler, - &ll_trusted_xattr_handler, - &ll_security_xattr_handler, -#ifdef CONFIG_FS_POSIX_ACL - &ll_acl_access_xattr_handler, - &ll_acl_default_xattr_handler, -#endif - &ll_lustre_xattr_handler, - NULL, -}; diff --git a/drivers/staging/lustre/lustre/llite/xattr_cache.c b/drivers/staging/lustre/lustre/llite/xattr_cache.c deleted file mode 100644 index 5da69ba088c4051fbc307749505868e1296f5387..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/xattr_cache.c +++ /dev/null @@ -1,504 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2012 Xyratex Technology Limited - * - * Copyright (c) 2013, 2015, Intel Corporation. - * - * Author: Andrew Perepechko - * - */ - -#define DEBUG_SUBSYSTEM S_LLITE - -#include -#include -#include -#include -#include -#include "llite_internal.h" - -/* If we ever have hundreds of extended attributes, we might want to consider - * using a hash or a tree structure instead of list for faster lookups. - */ -struct ll_xattr_entry { - struct list_head xe_list; /* protected with - * lli_xattrs_list_rwsem - */ - char *xe_name; /* xattr name, \0-terminated */ - char *xe_value; /* xattr value */ - unsigned int xe_namelen; /* strlen(xe_name) + 1 */ - unsigned int xe_vallen; /* xattr value length */ -}; - -static struct kmem_cache *xattr_kmem; -static struct lu_kmem_descr xattr_caches[] = { - { - .ckd_cache = &xattr_kmem, - .ckd_name = "xattr_kmem", - .ckd_size = sizeof(struct ll_xattr_entry) - }, - { - .ckd_cache = NULL - } -}; - -int ll_xattr_init(void) -{ - return lu_kmem_init(xattr_caches); -} - -void ll_xattr_fini(void) -{ - lu_kmem_fini(xattr_caches); -} - -/** - * Initializes xattr cache for an inode. - * - * This initializes the xattr list and marks cache presence. - */ -static void ll_xattr_cache_init(struct ll_inode_info *lli) -{ - INIT_LIST_HEAD(&lli->lli_xattrs); - set_bit(LLIF_XATTR_CACHE, &lli->lli_flags); -} - -/** - * This looks for a specific extended attribute. 
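- * (A linear walk of the list under lli_xattrs_list_rwsem; adequate
- * while inodes carry only a handful of xattrs, see the note above
- * about switching to a hash or a tree if that ever changes.)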
- * - * Find in @cache and return @xattr_name attribute in @xattr, - * for the NULL @xattr_name return the first cached @xattr. - * - * \retval 0 success - * \retval -ENODATA if not found - */ -static int ll_xattr_cache_find(struct list_head *cache, - const char *xattr_name, - struct ll_xattr_entry **xattr) -{ - struct ll_xattr_entry *entry; - - list_for_each_entry(entry, cache, xe_list) { - /* xattr_name == NULL means look for any entry */ - if (!xattr_name || strcmp(xattr_name, entry->xe_name) == 0) { - *xattr = entry; - CDEBUG(D_CACHE, "find: [%s]=%.*s\n", - entry->xe_name, entry->xe_vallen, - entry->xe_value); - return 0; - } - } - - return -ENODATA; -} - -/** - * This adds an xattr. - * - * Add @xattr_name attr with @xattr_val value and @xattr_val_len length, - * - * \retval 0 success - * \retval -ENOMEM if no memory could be allocated for the cached attr - * \retval -EPROTO if duplicate xattr is being added - */ -static int ll_xattr_cache_add(struct list_head *cache, - const char *xattr_name, - const char *xattr_val, - unsigned int xattr_val_len) -{ - struct ll_xattr_entry *xattr; - - if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { - CDEBUG(D_CACHE, "duplicate xattr: [%s]\n", xattr_name); - return -EPROTO; - } - - xattr = kmem_cache_zalloc(xattr_kmem, GFP_NOFS); - if (!xattr) { - CDEBUG(D_CACHE, "failed to allocate xattr\n"); - return -ENOMEM; - } - - xattr->xe_name = kstrdup(xattr_name, GFP_NOFS); - if (!xattr->xe_name) { - CDEBUG(D_CACHE, "failed to alloc xattr name %s\n", - xattr_name); - goto err_name; - } - xattr->xe_namelen = strlen(xattr_name) + 1; - - xattr->xe_value = kmemdup(xattr_val, xattr_val_len, GFP_NOFS); - if (!xattr->xe_value) - goto err_value; - - xattr->xe_vallen = xattr_val_len; - list_add(&xattr->xe_list, cache); - - CDEBUG(D_CACHE, "set: [%s]=%.*s\n", xattr_name, xattr_val_len, - xattr_val); - - return 0; -err_value: - kfree(xattr->xe_name); -err_name: - kmem_cache_free(xattr_kmem, xattr); - - return -ENOMEM; -} - -/** - * This removes an extended attribute from cache. - * - * Remove @xattr_name attribute from @cache. - * - * \retval 0 success - * \retval -ENODATA if @xattr_name is not cached - */ -static int ll_xattr_cache_del(struct list_head *cache, - const char *xattr_name) -{ - struct ll_xattr_entry *xattr; - - CDEBUG(D_CACHE, "del xattr: %s\n", xattr_name); - - if (ll_xattr_cache_find(cache, xattr_name, &xattr) == 0) { - list_del(&xattr->xe_list); - kfree(xattr->xe_name); - kfree(xattr->xe_value); - kmem_cache_free(xattr_kmem, xattr); - - return 0; - } - - return -ENODATA; -} - -/** - * This iterates cached extended attributes. - * - * Walk over cached attributes in @cache and - * fill in @xld_buffer or only calculate buffer - * size if @xld_buffer is NULL. - * - * \retval >= 0 buffer list size - * \retval -ENODATA if the list cannot fit @xld_size buffer - */ -static int ll_xattr_cache_list(struct list_head *cache, - char *xld_buffer, - int xld_size) -{ - struct ll_xattr_entry *xattr, *tmp; - int xld_tail = 0; - - list_for_each_entry_safe(xattr, tmp, cache, xe_list) { - CDEBUG(D_CACHE, "list: buffer=%p[%d] name=%s\n", - xld_buffer, xld_tail, xattr->xe_name); - - if (xld_buffer) { - xld_size -= xattr->xe_namelen; - if (xld_size < 0) - break; - memcpy(&xld_buffer[xld_tail], - xattr->xe_name, xattr->xe_namelen); - } - xld_tail += xattr->xe_namelen; - } - - if (xld_size < 0) - return -ERANGE; - - return xld_tail; -} - -/** - * Check if the xattr cache is initialized (filled). 
- * - * \retval 0 @cache is not initialized - * \retval 1 @cache is initialized - */ -static int ll_xattr_cache_valid(struct ll_inode_info *lli) -{ - return test_bit(LLIF_XATTR_CACHE, &lli->lli_flags); -} - -/** - * This finalizes the xattr cache. - * - * Free all xattr memory. @lli is the inode info pointer. - * - * \retval 0 no error occurred - */ -static int ll_xattr_cache_destroy_locked(struct ll_inode_info *lli) -{ - if (!ll_xattr_cache_valid(lli)) - return 0; - - while (ll_xattr_cache_del(&lli->lli_xattrs, NULL) == 0) - ; /* empty loop */ - - clear_bit(LLIF_XATTR_CACHE, &lli->lli_flags); - - return 0; -} - -int ll_xattr_cache_destroy(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - int rc; - - down_write(&lli->lli_xattrs_list_rwsem); - rc = ll_xattr_cache_destroy_locked(lli); - up_write(&lli->lli_xattrs_list_rwsem); - - return rc; -} - -/** - * Match or enqueue a PR lock. - * - * Find or request an LDLM lock with xattr data. - * Since LDLM does not provide API for atomic match_or_enqueue, - * the function handles it with a separate enq lock. - * If successful, the function exits with the list lock held. - * - * \retval 0 no error occurred - * \retval -ENOMEM not enough memory - */ -static int ll_xattr_find_get_lock(struct inode *inode, - struct lookup_intent *oit, - struct ptlrpc_request **req) -{ - enum ldlm_mode mode; - struct lustre_handle lockh = { 0 }; - struct md_op_data *op_data; - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct obd_export *exp = sbi->ll_md_exp; - int rc; - - mutex_lock(&lli->lli_xattrs_enq_lock); - /* inode may have been shrunk and recreated, so data is gone, match lock - * only when data exists. - */ - if (ll_xattr_cache_valid(lli)) { - /* Try matching first. */ - mode = ll_take_md_lock(inode, MDS_INODELOCK_XATTR, &lockh, 0, - LCK_PR); - if (mode != 0) { - /* fake oit in mdc_revalidate_lock() manner */ - oit->it_lock_handle = lockh.cookie; - oit->it_lock_mode = mode; - goto out; - } - } - - /* Enqueue if the lock isn't cached locally. */ - op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0, - LUSTRE_OPC_ANY, NULL); - if (IS_ERR(op_data)) { - mutex_unlock(&lli->lli_xattrs_enq_lock); - return PTR_ERR(op_data); - } - - op_data->op_valid = OBD_MD_FLXATTR | OBD_MD_FLXATTRLS; - - rc = md_intent_lock(exp, op_data, oit, req, &ll_md_blocking_ast, 0); - ll_finish_md_op_data(op_data); - *req = oit->it_request; - - if (rc < 0) { - CDEBUG(D_CACHE, - "md_intent_lock failed with %d for fid " DFID "\n", - rc, PFID(ll_inode2fid(inode))); - mutex_unlock(&lli->lli_xattrs_enq_lock); - return rc; - } - -out: - down_write(&lli->lli_xattrs_list_rwsem); - mutex_unlock(&lli->lli_xattrs_enq_lock); - - return 0; -} - -/** - * Refill the xattr cache. - * - * Fetch and cache the whole of xattrs for @inode, acquiring a read lock. - * - * \retval 0 no error occurred - * \retval -EPROTO network protocol error - * \retval -ENOMEM not enough memory for the cache - */ -static int ll_xattr_cache_refill(struct inode *inode) -{ - struct lookup_intent oit = { .it_op = IT_GETXATTR }; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ptlrpc_request *req = NULL; - const char *xdata, *xval, *xtail, *xvtail; - struct ll_inode_info *lli = ll_i2info(inode); - struct mdt_body *body; - __u32 *xsizes; - int rc, i; - - rc = ll_xattr_find_get_lock(inode, &oit, &req); - if (rc) - goto err_req; - - /* Do we have the data at this point? 
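The refill path below unpacks three server buffers: concatenated NUL-terminated names (RMF_EADATA), a blob of values (RMF_EAVALS), and a parallel array of value lengths (RMF_EAVALS_LENS), validating each entry before caching it. Here is a standalone sketch of that validation with made-up sample data; this is not Lustre code, just the same bounds checks in isolation.

/* Validate a packed xattr reply: concatenated NUL-terminated names,
 * a value blob, and a parallel array of value lengths, mirroring the
 * checks the refill loop performs on xdata/xval/xsizes. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int parse_packed_xattrs(const char *xdata, size_t name_bytes,
			       const char *xval, size_t val_bytes,
			       const uint32_t *xsizes, unsigned int count)
{
	const char *xtail = xdata + name_bytes;
	const char *xvtail = xval + val_bytes;
	unsigned int i;

	for (i = 0; i < count; i++) {
		/* Each name must be NUL-terminated inside the buffer. */
		if (!memchr(xdata, 0, xtail - xdata))
			return -1;	/* names are broken */
		/* Each value must fit in the value buffer. */
		if (xval + xsizes[i] > xvtail)
			return -1;	/* values are broken */

		printf("[%s] = %.*s\n", xdata, (int)xsizes[i], xval);

		xdata += strlen(xdata) + 1;
		xval += xsizes[i];
	}
	/* Anything left over would be "a hole in xattr data". */
	return (xdata == xtail && xval == xvtail) ? 0 : -1;
}

int main(void)
{
	const char names[] = "user.a\0user.bb";	/* includes both NULs */
	const char vals[] = "onetwo";
	const uint32_t sizes[] = { 3, 3 };

	return parse_packed_xattrs(names, sizeof(names), vals,
				   sizeof(vals) - 1, sizes, 2);
}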
*/ - if (ll_xattr_cache_valid(lli)) { - ll_stats_ops_tally(sbi, LPROC_LL_GETXATTR_HITS, 1); - ll_intent_drop_lock(&oit); - rc = 0; - goto err_req; - } - - /* Matched but no cache? Cancelled on error by a parallel refill. */ - if (unlikely(!req)) { - CDEBUG(D_CACHE, "cancelled by a parallel getxattr\n"); - ll_intent_drop_lock(&oit); - rc = -EAGAIN; - goto err_unlock; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - CERROR("no MDT BODY in the refill xattr reply\n"); - rc = -EPROTO; - goto err_cancel; - } - /* do not need swab xattr data */ - xdata = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, - body->mbo_eadatasize); - xval = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS, - body->mbo_aclsize); - xsizes = req_capsule_server_sized_get(&req->rq_pill, &RMF_EAVALS_LENS, - body->mbo_max_mdsize * sizeof(__u32)); - if (!xdata || !xval || !xsizes) { - CERROR("wrong setxattr reply\n"); - rc = -EPROTO; - goto err_cancel; - } - - xtail = xdata + body->mbo_eadatasize; - xvtail = xval + body->mbo_aclsize; - - CDEBUG(D_CACHE, "caching: xdata=%p xtail=%p\n", xdata, xtail); - - ll_xattr_cache_init(lli); - - for (i = 0; i < body->mbo_max_mdsize; i++) { - CDEBUG(D_CACHE, "caching [%s]=%.*s\n", xdata, *xsizes, xval); - /* Perform consistency checks: attr names and vals in pill */ - if (!memchr(xdata, 0, xtail - xdata)) { - CERROR("xattr protocol violation (names are broken)\n"); - rc = -EPROTO; - } else if (xval + *xsizes > xvtail) { - CERROR("xattr protocol violation (vals are broken)\n"); - rc = -EPROTO; - } else if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_XATTR_ENOMEM)) { - rc = -ENOMEM; - } else if (!strcmp(xdata, XATTR_NAME_ACL_ACCESS)) { - /* Filter out ACL ACCESS since it's cached separately */ - CDEBUG(D_CACHE, "not caching %s\n", - XATTR_NAME_ACL_ACCESS); - rc = 0; - } else if (!strcmp(xdata, "security.selinux")) { - /* Filter out security.selinux, it is cached in slab */ - CDEBUG(D_CACHE, "not caching security.selinux\n"); - rc = 0; - } else { - rc = ll_xattr_cache_add(&lli->lli_xattrs, xdata, xval, - *xsizes); - } - if (rc < 0) { - ll_xattr_cache_destroy_locked(lli); - goto err_cancel; - } - xdata += strlen(xdata) + 1; - xval += *xsizes; - xsizes++; - } - - if (xdata != xtail || xval != xvtail) - CERROR("a hole in xattr data\n"); - - ll_set_lock_data(sbi->ll_md_exp, inode, &oit, NULL); - ll_intent_drop_lock(&oit); - - ptlrpc_req_finished(req); - return rc; - -err_cancel: - ldlm_lock_decref_and_cancel((struct lustre_handle *) - &oit.it_lock_handle, - oit.it_lock_mode); -err_unlock: - up_write(&lli->lli_xattrs_list_rwsem); -err_req: - if (rc == -ERANGE) - rc = -EAGAIN; - - ptlrpc_req_finished(req); - return rc; -} - -/** - * Get an xattr value or list xattrs using the write-through cache. - * - * Get the xattr value (@valid has OBD_MD_FLXATTR set) of @name or - * list xattr names (@valid has OBD_MD_FLXATTRLS set) for @inode. - * The resulting value/list is stored in @buffer if the former - * is not larger than @size. 
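ll_xattr_cache_get below follows a read-mostly pattern: check the cache under the list rwsem held for read, and on a miss refill it under the write lock, which the kernel then converts back with downgrade_write(). A rough pthreads sketch of the same idea follows; POSIX rwlocks cannot downgrade write-to-read atomically, so this version simply re-checks the state after each reacquisition. All names are invented for the sketch.

/* Read-mostly cache: check under a read lock, fill under a write lock. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t cache_lock = PTHREAD_RWLOCK_INITIALIZER;
static int cache_valid;
static int cache_value;

static void refill_cache(void)
{
	/* Stand-in for the expensive fetch, e.g. an RPC to the server. */
	cache_value = 42;
	cache_valid = 1;
}

static int cache_get(int *out)
{
	pthread_rwlock_rdlock(&cache_lock);
	while (!cache_valid) {
		/* Miss: drop the read lock, refill under the write lock. */
		pthread_rwlock_unlock(&cache_lock);
		pthread_rwlock_wrlock(&cache_lock);
		if (!cache_valid)	/* may have raced with another filler */
			refill_cache();
		pthread_rwlock_unlock(&cache_lock);
		pthread_rwlock_rdlock(&cache_lock);
	}
	*out = cache_value;
	pthread_rwlock_unlock(&cache_lock);
	return 0;
}

int main(void)
{
	int v;

	cache_get(&v);
	printf("cached value: %d\n", v);
	return 0;
}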
- * - * \retval 0 no error occurred - * \retval -EPROTO network protocol error - * \retval -ENOMEM not enough memory for the cache - * \retval -ERANGE the buffer is not large enough - * \retval -ENODATA no such attr or the list is empty - */ -int ll_xattr_cache_get(struct inode *inode, const char *name, char *buffer, - size_t size, __u64 valid) -{ - struct ll_inode_info *lli = ll_i2info(inode); - int rc = 0; - - LASSERT(!!(valid & OBD_MD_FLXATTR) ^ !!(valid & OBD_MD_FLXATTRLS)); - - down_read(&lli->lli_xattrs_list_rwsem); - if (!ll_xattr_cache_valid(lli)) { - up_read(&lli->lli_xattrs_list_rwsem); - rc = ll_xattr_cache_refill(inode); - if (rc) - return rc; - downgrade_write(&lli->lli_xattrs_list_rwsem); - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_GETXATTR_HITS, 1); - } - - if (valid & OBD_MD_FLXATTR) { - struct ll_xattr_entry *xattr; - - rc = ll_xattr_cache_find(&lli->lli_xattrs, name, &xattr); - if (rc == 0) { - rc = xattr->xe_vallen; - /* zero size means we are only requested size in rc */ - if (size != 0) { - if (size >= xattr->xe_vallen) - memcpy(buffer, xattr->xe_value, - xattr->xe_vallen); - else - rc = -ERANGE; - } - } - } else if (valid & OBD_MD_FLXATTRLS) { - rc = ll_xattr_cache_list(&lli->lli_xattrs, - size ? buffer : NULL, size); - } - - goto out; -out: - up_read(&lli->lli_xattrs_list_rwsem); - - return rc; -} diff --git a/drivers/staging/lustre/lustre/llite/xattr_security.c b/drivers/staging/lustre/lustre/llite/xattr_security.c deleted file mode 100644 index 93ec07531ac7afa1c94beab87f2c26eebc82d1b9..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/llite/xattr_security.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see http://www.gnu.org/licenses - * - * GPL HEADER END - */ - -/* - * Copyright (c) 2014 Bull SAS - * Author: Sebastien Buisson sebastien.buisson@bull.net - */ - -/* - * lustre/llite/xattr_security.c - * Handler for storing security labels as extended attributes. - */ - -#include -#include -#include -#include -#include "llite_internal.h" - -/** - * A helper function for ll_security_inode_init_security() - * that takes care of setting xattrs - * - * Get security context of @inode from @xattr_array, - * and put it in 'security.xxx' xattr of dentry - * stored in @fs_info. 
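ll_initxattrs below glues XATTR_SECURITY_PREFIX onto each label name and stores the result with XATTR_CREATE so an existing attribute is never silently replaced. The same name assembly and create-only flag can be exercised from user space; the file name and the unprivileged "user." prefix in this sketch are placeholders, since setting "security.*" attributes normally requires privilege.

/* Build a fully qualified xattr name and set it create-only. */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/tmp/somefile";
	const char *prefix = "user.";	/* stand-in for XATTR_SECURITY_PREFIX */
	const char *name = "demo";
	const char *value = "example";
	char full_name[64];

	if (snprintf(full_name, sizeof(full_name), "%s%s", prefix, name) >=
	    (int)sizeof(full_name))
		return 1;	/* name would be truncated */

	/* XATTR_CREATE fails with EEXIST if the attribute already exists;
	 * XATTR_REPLACE would do the opposite. */
	if (setxattr(path, full_name, value, strlen(value), XATTR_CREATE)) {
		perror("setxattr");
		return 1;
	}
	return 0;
}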
- * - * \retval 0 success - * \retval -ENOMEM if no memory could be allocated for xattr name - * \retval < 0 failure to set xattr - */ -static int -ll_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *fs_info) -{ - struct dentry *dentry = fs_info; - const struct xattr *xattr; - int err = 0; - - for (xattr = xattr_array; xattr->name; xattr++) { - char *full_name; - - full_name = kasprintf(GFP_KERNEL, "%s%s", - XATTR_SECURITY_PREFIX, xattr->name); - if (!full_name) { - err = -ENOMEM; - break; - } - - err = __vfs_setxattr(dentry, inode, full_name, xattr->value, - xattr->value_len, XATTR_CREATE); - kfree(full_name); - if (err < 0) - break; - } - return err; -} - -/** - * Initializes security context - * - * Get security context of @inode in @dir, - * and put it in 'security.xxx' xattr of @dentry. - * - * \retval 0 success, or SELinux is disabled - * \retval -ENOMEM if no memory could be allocated for xattr name - * \retval < 0 failure to get security context or set xattr - */ -int -ll_init_security(struct dentry *dentry, struct inode *inode, struct inode *dir) -{ - if (!selinux_is_enabled()) - return 0; - - return security_inode_init_security(inode, dir, NULL, - &ll_initxattrs, dentry); -} diff --git a/drivers/staging/lustre/lustre/lmv/Makefile b/drivers/staging/lustre/lustre/lmv/Makefile deleted file mode 100644 index 91c99114aa13f1eabac7c8ef86779b5d39a84870..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lmv/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += lmv.o -lmv-y := lmv_obd.o lmv_intent.o lmv_fld.o lproc_lmv.o diff --git a/drivers/staging/lustre/lustre/lmv/lmv_fld.c b/drivers/staging/lustre/lustre/lmv/lmv_fld.c deleted file mode 100644 index 00dc858c10c973509ba826c134e974ed0dc98fe8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lmv/lmv_fld.c +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2013, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#define DEBUG_SUBSYSTEM S_LMV -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include "lmv_internal.h" - -int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds) -{ - struct obd_device *obd = lmv2obd_dev(lmv); - int rc; - - /* - * FIXME: Currently ZFS still use local seq for ROOT unfortunately, and - * this fid_is_local check should be removed once LU-2240 is fixed - */ - if (!fid_is_sane(fid) || !(fid_seq_in_fldb(fid_seq(fid)) || - fid_seq_is_local_file(fid_seq(fid)))) { - CERROR("%s: invalid FID " DFID "\n", obd->obd_name, PFID(fid)); - return -EINVAL; - } - - rc = fld_client_lookup(&lmv->lmv_fld, fid_seq(fid), mds, - LU_SEQ_RANGE_MDT, NULL); - if (rc) { - CERROR("Error while looking for mds number. Seq %#llx, err = %d\n", - fid_seq(fid), rc); - return rc; - } - - CDEBUG(D_INODE, "FLD lookup got mds #%x for fid=" DFID "\n", - *mds, PFID(fid)); - - if (*mds >= lmv->desc.ld_tgt_count) { - CERROR("FLD lookup got invalid mds #%x (max: %x) for fid=" DFID "\n", *mds, lmv->desc.ld_tgt_count, - PFID(fid)); - rc = -EINVAL; - } - return rc; -} diff --git a/drivers/staging/lustre/lustre/lmv/lmv_intent.c b/drivers/staging/lustre/lustre/lmv/lmv_intent.c deleted file mode 100644 index 1e850fdbc623e7556eac8bd694ba2fdd3d8bd936..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lmv/lmv_intent.c +++ /dev/null @@ -1,521 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */
-
-#define DEBUG_SUBSYSTEM S_LMV
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "lmv_internal.h"
-
-static int lmv_intent_remote(struct obd_export *exp, struct lookup_intent *it,
-			     const struct lu_fid *parent_fid,
-			     struct ptlrpc_request **reqp,
-			     ldlm_blocking_callback cb_blocking,
-			     __u64 extra_lock_flags)
-{
-	struct obd_device *obd = exp->exp_obd;
-	struct lmv_obd *lmv = &obd->u.lmv;
-	struct ptlrpc_request *req = NULL;
-	struct lustre_handle plock;
-	struct md_op_data *op_data;
-	struct lmv_tgt_desc *tgt;
-	struct mdt_body *body;
-	int pmode;
-	int rc = 0;
-
-	body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY);
-	if (!body)
-		return -EPROTO;
-
-	LASSERT((body->mbo_valid & OBD_MD_MDS));
-
-	/*
-	 * Unfortunately, we have to lie to MDC/MDS to retrieve
-	 * attributes llite needs and provide proper locking.
-	 */
-	if (it->it_op & IT_LOOKUP)
-		it->it_op = IT_GETATTR;
-
-	/*
-	 * We got LOOKUP lock, but we really need attrs.
-	 */
-	pmode = it->it_lock_mode;
-	if (pmode) {
-		plock.cookie = it->it_lock_handle;
-		it->it_lock_mode = 0;
-		it->it_request = NULL;
-	}
-
-	LASSERT(fid_is_sane(&body->mbo_fid1));
-
-	tgt = lmv_find_target(lmv, &body->mbo_fid1);
-	if (IS_ERR(tgt)) {
-		rc = PTR_ERR(tgt);
-		goto out;
-	}
-
-	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
-	if (!op_data) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	op_data->op_fid1 = body->mbo_fid1;
-	/* Send the parent FID to the remote MDT */
-	if (parent_fid) {
-		/* The parent fid is only for remote open to
-		 * check whether the open is from OBF,
-		 * see mdt_cross_open
-		 */
-		LASSERT(it->it_op & IT_OPEN);
-		op_data->op_fid2 = *parent_fid;
-	}
-
-	op_data->op_bias = MDS_CROSS_REF;
-	CDEBUG(D_INODE, "REMOTE_INTENT with fid=" DFID " -> mds #%u\n",
-	       PFID(&body->mbo_fid1), tgt->ltd_idx);
-
-	rc = md_intent_lock(tgt->ltd_exp, op_data, it, &req, cb_blocking,
-			    extra_lock_flags);
-	if (rc)
-		goto out_free_op_data;
-
-	/*
-	 * LLite needs LOOKUP lock to track dentry revocation in order to
-	 * maintain dcache consistency. Thus drop UPDATE|PERM lock here
-	 * and put LOOKUP in request.
-	 */
-	if (it->it_lock_mode != 0) {
-		it->it_remote_lock_handle =
-					it->it_lock_handle;
-		it->it_remote_lock_mode = it->it_lock_mode;
-	}
-
-	if (pmode) {
-		it->it_lock_handle = plock.cookie;
-		it->it_lock_mode = pmode;
-	}
-
-out_free_op_data:
-	kfree(op_data);
-out:
-	if (rc && pmode)
-		ldlm_lock_decref(&plock, pmode);
-
-	ptlrpc_req_finished(*reqp);
-	*reqp = req;
-	return rc;
-}
-
-int lmv_revalidate_slaves(struct obd_export *exp,
-			  const struct lmv_stripe_md *lsm,
-			  ldlm_blocking_callback cb_blocking,
-			  int extra_lock_flags)
-{
-	struct obd_device *obd = exp->exp_obd;
-	struct lmv_obd *lmv = &obd->u.lmv;
-	struct ptlrpc_request *req = NULL;
-	struct mdt_body *body;
-	struct md_op_data *op_data;
-	int rc = 0, i;
-
-	/**
-	 * revalidate slaves has some problems, temporarily return,
-	 * we may not need that
-	 */
-	op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
-	if (!op_data)
-		return -ENOMEM;
-
-	/**
-	 * Loop over the stripe information, check validity and update them
-	 * from MDS if needed.
-	 */
-	for (i = 0; i < lsm->lsm_md_stripe_count; i++) {
-		struct lookup_intent it = { .it_op = IT_GETATTR };
-		struct lustre_handle *lockh = NULL;
-		struct lmv_tgt_desc *tgt = NULL;
-		struct inode *inode;
-		struct lu_fid fid;
-
-		fid = lsm->lsm_md_oinfo[i].lmo_fid;
-		inode = lsm->lsm_md_oinfo[i].lmo_root;
-
-		/*
-		 * Prepare op_data for revalidating. Note that @fid2 should be
-		 * defined, otherwise it will go to the server and take a new
-		 * lock, which is not needed here.
-		 */
-		memset(op_data, 0, sizeof(*op_data));
-		op_data->op_fid1 = fid;
-		op_data->op_fid2 = fid;
-
-		tgt = lmv_locate_mds(lmv, op_data, &fid);
-		if (IS_ERR(tgt)) {
-			rc = PTR_ERR(tgt);
-			goto cleanup;
-		}
-
-		CDEBUG(D_INODE, "Revalidate slave " DFID " -> mds #%u\n",
-		       PFID(&fid), tgt->ltd_idx);
-
-		if (req) {
-			ptlrpc_req_finished(req);
-			req = NULL;
-		}
-
-		rc = md_intent_lock(tgt->ltd_exp, op_data, &it, &req,
-				    cb_blocking, extra_lock_flags);
-		if (rc < 0)
-			goto cleanup;
-
-		lockh = (struct lustre_handle *)&it.it_lock_handle;
-		if (rc > 0 && !req) {
-			/* slave inode is still valid */
-			CDEBUG(D_INODE, "slave " DFID " is still valid.\n",
-			       PFID(&fid));
-			rc = 0;
-		} else {
-			/* refresh slave from server */
-			body = req_capsule_server_get(&req->rq_pill,
-						      &RMF_MDT_BODY);
-			if (!body) {
-				if (it.it_lock_mode && lockh) {
-					ldlm_lock_decref(lockh, it.it_lock_mode);
-					it.it_lock_mode = 0;
-				}
-
-				rc = -ENOENT;
-				goto cleanup;
-			}
-
-			i_size_write(inode, body->mbo_size);
-			inode->i_blocks = body->mbo_blocks;
-			set_nlink(inode, body->mbo_nlink);
-			LTIME_S(inode->i_atime) = body->mbo_atime;
-			LTIME_S(inode->i_ctime) = body->mbo_ctime;
-			LTIME_S(inode->i_mtime) = body->mbo_mtime;
-		}
-
-		md_set_lock_data(tgt->ltd_exp, lockh, inode, NULL);
-
-		if (it.it_lock_mode && lockh) {
-			ldlm_lock_decref(lockh, it.it_lock_mode);
-			it.it_lock_mode = 0;
-		}
-	}
-
-cleanup:
-	if (req)
-		ptlrpc_req_finished(req);
-
-	kfree(op_data);
-	return rc;
-}
-
-/*
- * IT_OPEN is intended to open (and possibly create) an object. Parent (pid)
- * may be a split dir.
- */
-static int lmv_intent_open(struct obd_export *exp, struct md_op_data *op_data,
-			   struct lookup_intent *it,
-			   struct ptlrpc_request **reqp,
-			   ldlm_blocking_callback cb_blocking,
-			   __u64 extra_lock_flags)
-{
-	struct obd_device *obd = exp->exp_obd;
-	struct lmv_obd *lmv = &obd->u.lmv;
-	struct lmv_tgt_desc *tgt;
-	struct mdt_body *body;
-	int rc;
-
-	if (it->it_flags & MDS_OPEN_BY_FID) {
-		LASSERT(fid_is_sane(&op_data->op_fid2));
-
-		/*
-		 * for striped directory, we can't know parent stripe fid
-		 * without name, but we can set it to child fid, and MDT
-		 * will obtain it from linkea in open in such case.
-		 */
-		if (op_data->op_mea1)
-			op_data->op_fid1 = op_data->op_fid2;
-
-		tgt = lmv_find_target(lmv, &op_data->op_fid2);
-		if (IS_ERR(tgt))
-			return PTR_ERR(tgt);
-
-		op_data->op_mds = tgt->ltd_idx;
-	} else {
-		LASSERT(fid_is_sane(&op_data->op_fid1));
-		LASSERT(fid_is_zero(&op_data->op_fid2));
-		LASSERT(op_data->op_name);
-
-		tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1);
-		if (IS_ERR(tgt))
-			return PTR_ERR(tgt);
-	}
-
-	/* If it is ready to open the file by FID, there is no need to
-	 * allocate a FID at all, otherwise it will confuse the MDT
-	 */
-	if ((it->it_op & IT_CREAT) && !(it->it_flags & MDS_OPEN_BY_FID)) {
-		/*
-		 * For lookup(IT_CREATE) cases allocate new fid and setup FLD
-		 * for it.
-		 */
-		rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data);
-		if (rc != 0)
-			return rc;
-	}
-
-	CDEBUG(D_INODE, "OPEN_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%u\n",
-	       PFID(&op_data->op_fid1),
-	       PFID(&op_data->op_fid2), op_data->op_name, tgt->ltd_idx);
-
-	rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking,
-			    extra_lock_flags);
-	if (rc != 0)
-		return rc;
-	/*
-	 * Nothing is found, do not access body->mbo_fid1 as it is zero and thus
-	 * pointless.
- */ - if ((it->it_disposition & DISP_LOOKUP_NEG) && - !(it->it_disposition & DISP_OPEN_CREATE) && - !(it->it_disposition & DISP_OPEN_OPEN)) - return rc; - - body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - - /* Not cross-ref case, just get out of here. */ - if (unlikely((body->mbo_valid & OBD_MD_MDS))) { - rc = lmv_intent_remote(exp, it, &op_data->op_fid1, reqp, - cb_blocking, extra_lock_flags); - if (rc != 0) - return rc; - - body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - } - - return rc; -} - -/* - * Handler for: getattr, lookup and revalidate cases. - */ -static int lmv_intent_lookup(struct obd_export *exp, - struct md_op_data *op_data, - struct lookup_intent *it, - struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags) -{ - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - struct mdt_body *body; - int rc = 0; - - /* - * If it returns ERR_PTR(-EBADFD) then it is an unknown hash type - * it will try all stripes to locate the object - */ - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt) && (PTR_ERR(tgt) != -EBADFD)) - return PTR_ERR(tgt); - - /* - * Both migrating dir and unknown hash dir need to try - * all of sub-stripes - */ - if (lsm && !lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { - struct lmv_oinfo *oinfo = &lsm->lsm_md_oinfo[0]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - } - - if (!fid_is_sane(&op_data->op_fid2)) - fid_zero(&op_data->op_fid2); - - CDEBUG(D_INODE, "LOOKUP_INTENT with fid1=" DFID ", fid2=" DFID ", name='%s' -> mds #%u lsm=%p lsm_magic=%x\n", - PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), - op_data->op_name ? op_data->op_name : "", - tgt->ltd_idx, lsm, !lsm ? -1 : lsm->lsm_md_magic); - - op_data->op_bias &= ~MDS_CROSS_REF; - - rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, cb_blocking, - extra_lock_flags); - if (rc < 0) - return rc; - - if (!*reqp) { - /* - * If RPC happens, lsm information will be revalidated - * during update_inode process (see ll_update_lsm_md) - */ - if (op_data->op_mea2) { - rc = lmv_revalidate_slaves(exp, op_data->op_mea2, - cb_blocking, - extra_lock_flags); - if (rc != 0) - return rc; - } - return rc; - } else if (it_disposition(it, DISP_LOOKUP_NEG) && lsm && - lmv_need_try_all_stripes(lsm)) { - /* - * For migrating and unknown hash type directory, it will - * try to target the entry on other stripes - */ - int stripe_index; - - for (stripe_index = 1; - stripe_index < lsm->lsm_md_stripe_count && - it_disposition(it, DISP_LOOKUP_NEG); stripe_index++) { - struct lmv_oinfo *oinfo; - - /* release the previous request */ - ptlrpc_req_finished(*reqp); - it->it_request = NULL; - *reqp = NULL; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - tgt = lmv_find_target(lmv, &oinfo->lmo_fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - CDEBUG(D_INODE, "Try other stripes " DFID "\n", - PFID(&oinfo->lmo_fid)); - - op_data->op_fid1 = oinfo->lmo_fid; - it->it_disposition &= ~DISP_ENQ_COMPLETE; - rc = md_intent_lock(tgt->ltd_exp, op_data, it, reqp, - cb_blocking, extra_lock_flags); - if (rc) - return rc; - } - } - - if (!it_has_reply_body(it)) - return 0; - - /* - * MDS has returned success. Probably name has been resolved in - * remote inode. 
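For a directory whose hash type is unknown or mid-migration, the loop above keeps retrying the lookup on successive stripes for as long as the answer stays negative. Reduced to its control flow, the pattern looks like the sketch below; all names here are invented for illustration.

/* Probe each stripe in turn until one resolves the name. */
#include <stdio.h>

#define NEG_LOOKUP	0	/* stripe does not know the name */
#define POS_LOOKUP	1	/* stripe resolved the name */

static int lookup_on_stripe(int stripe, const char *name)
{
	/* Stand-in for the per-stripe intent RPC: pretend only
	 * stripe 2 knows this name. */
	return stripe == 2 ? POS_LOOKUP : NEG_LOOKUP;
}

static int lookup_all_stripes(int stripe_count, const char *name,
			      int *found_stripe)
{
	int stripe, rc = NEG_LOOKUP;

	for (stripe = 0; stripe < stripe_count && rc == NEG_LOOKUP; stripe++)
		rc = lookup_on_stripe(stripe, name);

	if (rc == POS_LOOKUP)
		*found_stripe = stripe - 1;
	return rc;
}

int main(void)
{
	int stripe;

	if (lookup_all_stripes(4, "file.txt", &stripe) == POS_LOOKUP)
		printf("found on stripe %d\n", stripe);
	return 0;
}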
Let's check this. - */ - body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - - /* Not cross-ref case, just get out of here. */ - if (unlikely((body->mbo_valid & OBD_MD_MDS))) { - rc = lmv_intent_remote(exp, it, NULL, reqp, cb_blocking, - extra_lock_flags); - if (rc != 0) - return rc; - body = req_capsule_server_get(&(*reqp)->rq_pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - } - - return rc; -} - -int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, - struct lookup_intent *it, struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags) -{ - int rc; - - LASSERT(fid_is_sane(&op_data->op_fid1)); - - CDEBUG(D_INODE, "INTENT LOCK '%s' for " DFID " '%*s' on " DFID "\n", - LL_IT2STR(it), PFID(&op_data->op_fid2), - (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1)); - - if (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_LAYOUT | IT_GETXATTR)) - rc = lmv_intent_lookup(exp, op_data, it, reqp, cb_blocking, - extra_lock_flags); - else if (it->it_op & IT_OPEN) - rc = lmv_intent_open(exp, op_data, it, reqp, cb_blocking, - extra_lock_flags); - else - LBUG(); - - if (rc < 0) { - struct lustre_handle lock_handle; - - if (it->it_lock_mode) { - lock_handle.cookie = it->it_lock_handle; - ldlm_lock_decref_and_cancel(&lock_handle, - it->it_lock_mode); - } - - it->it_lock_handle = 0; - it->it_lock_mode = 0; - - if (it->it_remote_lock_mode) { - lock_handle.cookie = it->it_remote_lock_handle; - ldlm_lock_decref_and_cancel(&lock_handle, - it->it_remote_lock_mode); - } - - it->it_remote_lock_handle = 0; - it->it_remote_lock_mode = 0; - } - - return rc; -} diff --git a/drivers/staging/lustre/lustre/lmv/lmv_internal.h b/drivers/staging/lustre/lustre/lmv/lmv_internal.h deleted file mode 100644 index 68a99170c42438f9a5527192c228584cc9c8082e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lmv/lmv_internal.h +++ /dev/null @@ -1,164 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _LMV_INTERNAL_H_ -#define _LMV_INTERNAL_H_ - -#include -#include -#include - -#define LMV_MAX_TGT_COUNT 128 - -#define LL_IT2STR(it) \ - ((it) ? 
ldlm_it2str((it)->it_op) : "0") - -int lmv_intent_lock(struct obd_export *exp, struct md_op_data *op_data, - struct lookup_intent *it, struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags); - -int lmv_fld_lookup(struct lmv_obd *lmv, const struct lu_fid *fid, u32 *mds); -int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds); -int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, - struct lu_fid *fid, struct md_op_data *op_data); - -int lmv_revalidate_slaves(struct obd_export *exp, - const struct lmv_stripe_md *lsm, - ldlm_blocking_callback cb_blocking, - int extra_lock_flags); - -static inline struct obd_device *lmv2obd_dev(struct lmv_obd *lmv) -{ - return container_of_safe(lmv, struct obd_device, u.lmv); -} - -static inline struct lmv_tgt_desc * -lmv_get_target(struct lmv_obd *lmv, u32 mdt_idx, int *index) -{ - int i; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (!lmv->tgts[i]) - continue; - - if (lmv->tgts[i]->ltd_idx == mdt_idx) { - if (index) - *index = i; - return lmv->tgts[i]; - } - } - - return ERR_PTR(-ENODEV); -} - -static inline int -lmv_find_target_index(struct lmv_obd *lmv, const struct lu_fid *fid) -{ - struct lmv_tgt_desc *ltd; - u32 mdt_idx = 0; - int index = 0; - - if (lmv->desc.ld_tgt_count > 1) { - int rc; - - rc = lmv_fld_lookup(lmv, fid, &mdt_idx); - if (rc < 0) - return rc; - } - - ltd = lmv_get_target(lmv, mdt_idx, &index); - if (IS_ERR(ltd)) - return PTR_ERR(ltd); - - return index; -} - -static inline struct lmv_tgt_desc * -lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) -{ - int index; - - index = lmv_find_target_index(lmv, fid); - if (index < 0) - return ERR_PTR(index); - - return lmv->tgts[index]; -} - -static inline int lmv_stripe_md_size(int stripe_count) -{ - struct lmv_stripe_md *lsm; - - return sizeof(*lsm) + stripe_count * sizeof(lsm->lsm_md_oinfo[0]); -} - -int lmv_name_to_stripe_index(enum lmv_hash_type hashtype, - unsigned int max_mdt_index, - const char *name, int namelen); - -static inline const struct lmv_oinfo * -lsm_name_to_stripe_info(const struct lmv_stripe_md *lsm, const char *name, - int namelen) -{ - int stripe_index; - - stripe_index = lmv_name_to_stripe_index(lsm->lsm_md_hash_type, - lsm->lsm_md_stripe_count, - name, namelen); - if (stripe_index < 0) - return ERR_PTR(stripe_index); - - LASSERTF(stripe_index < lsm->lsm_md_stripe_count, - "stripe_index = %d, stripe_count = %d hash_type = %x name = %.*s\n", - stripe_index, lsm->lsm_md_stripe_count, - lsm->lsm_md_hash_type, namelen, name); - - return &lsm->lsm_md_oinfo[stripe_index]; -} - -static inline bool lmv_need_try_all_stripes(const struct lmv_stripe_md *lsm) -{ - return !lmv_is_known_hash_type(lsm->lsm_md_hash_type) || - lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION; -} - -struct lmv_tgt_desc -*lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid); -/* lproc_lmv.c */ -void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars); - -extern const struct file_operations lmv_proc_target_fops; - -#endif diff --git a/drivers/staging/lustre/lustre/lmv/lmv_obd.c b/drivers/staging/lustre/lustre/lmv/lmv_obd.c deleted file mode 100644 index 65f94e6ecaadf0c7c16e015827cde64cc3e4f7c4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lmv/lmv_obd.c +++ /dev/null @@ -1,3131 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LMV -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "lmv_internal.h" - -static int lmv_check_connect(struct obd_device *obd); - -static void lmv_activate_target(struct lmv_obd *lmv, - struct lmv_tgt_desc *tgt, - int activate) -{ - if (tgt->ltd_active == activate) - return; - - tgt->ltd_active = activate; - lmv->desc.ld_active_tgt_count += (activate ? 1 : -1); - tgt->ltd_exp->exp_obd->obd_inactive = !activate; -} - -/** - * Error codes: - * - * -EINVAL : UUID can't be found in the LMV's target list - * -ENOTCONN: The UUID is found, but the target connection is bad (!) - * -EBADF : The UUID is found, but the OBD of the wrong type (!) - */ -static int lmv_set_mdc_active(struct lmv_obd *lmv, const struct obd_uuid *uuid, - int activate) -{ - struct lmv_tgt_desc *tgt = NULL; - struct obd_device *obd; - u32 i; - int rc = 0; - - CDEBUG(D_INFO, "Searching in lmv %p for uuid %s (activate=%d)\n", - lmv, uuid->uuid, activate); - - spin_lock(&lmv->lmv_lock); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv->tgts[i]; - if (!tgt || !tgt->ltd_exp) - continue; - - CDEBUG(D_INFO, "Target idx %d is %s conn %#llx\n", i, - tgt->ltd_uuid.uuid, tgt->ltd_exp->exp_handle.h_cookie); - - if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) - break; - } - - if (i == lmv->desc.ld_tgt_count) { - rc = -EINVAL; - goto out_lmv_lock; - } - - obd = class_exp2obd(tgt->ltd_exp); - if (!obd) { - rc = -ENOTCONN; - goto out_lmv_lock; - } - - CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LMV idx %d\n", - obd->obd_name, obd->obd_uuid.uuid, obd->obd_minor, obd, - obd->obd_type->typ_name, i); - LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0); - - if (tgt->ltd_active == activate) { - CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, - activate ? "" : "in"); - goto out_lmv_lock; - } - - CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, - activate ? "" : "in"); - lmv_activate_target(lmv, tgt, activate); - - out_lmv_lock: - spin_unlock(&lmv->lmv_lock); - return rc; -} - -static struct obd_uuid *lmv_get_uuid(struct obd_export *exp) -{ - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - - return tgt ? 
obd_get_uuid(tgt->ltd_exp) : NULL; -} - -static int lmv_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev, void *data) -{ - struct obd_connect_data *conn_data; - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_uuid *uuid; - int rc = 0; - - if (strcmp(watched->obd_type->typ_name, LUSTRE_MDC_NAME)) { - CERROR("unexpected notification of %s %s!\n", - watched->obd_type->typ_name, - watched->obd_name); - return -EINVAL; - } - - uuid = &watched->u.cli.cl_target_uuid; - if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) { - /* - * Set MDC as active before notifying the observer, so the - * observer can use the MDC normally. - */ - rc = lmv_set_mdc_active(lmv, uuid, - ev == OBD_NOTIFY_ACTIVE); - if (rc) { - CERROR("%sactivation of %s failed: %d\n", - ev == OBD_NOTIFY_ACTIVE ? "" : "de", - uuid->uuid, rc); - return rc; - } - } else if (ev == OBD_NOTIFY_OCD) { - conn_data = &watched->u.cli.cl_import->imp_connect_data; - /* - * XXX: Make sure that ocd_connect_flags from all targets are - * the same. Otherwise one of MDTs runs wrong version or - * something like this. --umka - */ - obd->obd_self_export->exp_connect_data = *conn_data; - } - - /* - * Pass the notification up the chain. - */ - if (obd->obd_observer) - rc = obd_notify(obd->obd_observer, watched, ev, data); - - return rc; -} - -static int lmv_connect(const struct lu_env *env, - struct obd_export **pexp, struct obd_device *obd, - struct obd_uuid *cluuid, struct obd_connect_data *data, - void *localdata) -{ - struct lmv_obd *lmv = &obd->u.lmv; - struct lustre_handle conn = { 0 }; - struct obd_export *exp; - int rc = 0; - - rc = class_connect(&conn, obd, cluuid); - if (rc) { - CERROR("class_connection() returned %d\n", rc); - return rc; - } - - exp = class_conn2export(&conn); - - lmv->connected = 0; - lmv->cluuid = *cluuid; - lmv->conn_data = *data; - - lmv->lmv_tgts_kobj = kobject_create_and_add("target_obds", - &obd->obd_kobj); - rc = lmv_check_connect(obd); - if (rc) - goto out_sysfs; - - *pexp = exp; - - return rc; - -out_sysfs: - if (lmv->lmv_tgts_kobj) - kobject_put(lmv->lmv_tgts_kobj); - - class_disconnect(exp); - - return rc; -} - -static int lmv_init_ea_size(struct obd_export *exp, u32 easize, u32 def_easize) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - u32 i; - int rc = 0; - int change = 0; - - if (lmv->max_easize < easize) { - lmv->max_easize = easize; - change = 1; - } - if (lmv->max_def_easize < def_easize) { - lmv->max_def_easize = def_easize; - change = 1; - } - - if (change == 0) - return 0; - - if (lmv->connected == 0) - return 0; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) { - CWARN("%s: NULL export for %d\n", obd->obd_name, i); - continue; - } - - rc = md_init_ea_size(tgt->ltd_exp, easize, def_easize); - if (rc) { - CERROR("%s: obd_init_ea_size() failed on MDT target %d: rc = %d\n", - obd->obd_name, i, rc); - break; - } - } - return rc; -} - -#define MAX_STRING_SIZE 128 - -static int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) -{ - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_uuid *cluuid = &lmv->cluuid; - struct obd_uuid lmv_mdc_uuid = { "LMV_MDC_UUID" }; - struct obd_device *mdc_obd; - struct obd_export *mdc_exp; - struct lu_fld_target target; - int rc; - - mdc_obd = class_find_client_obd(&tgt->ltd_uuid, LUSTRE_MDC_NAME, - &obd->obd_uuid); - if (!mdc_obd) { - CERROR("target %s not attached\n", tgt->ltd_uuid.uuid); - 
return -EINVAL; - } - - CDEBUG(D_CONFIG, "connect to %s(%s) - %s, %s FOR %s\n", - mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, - tgt->ltd_uuid.uuid, obd->obd_uuid.uuid, cluuid->uuid); - - if (!mdc_obd->obd_set_up) { - CERROR("target %s is not set up\n", tgt->ltd_uuid.uuid); - return -EINVAL; - } - - rc = obd_connect(NULL, &mdc_exp, mdc_obd, &lmv_mdc_uuid, - &lmv->conn_data, NULL); - if (rc) { - CERROR("target %s connect error %d\n", tgt->ltd_uuid.uuid, rc); - return rc; - } - - /* - * Init fid sequence client for this mdc and add new fld target. - */ - rc = obd_fid_init(mdc_obd, mdc_exp, LUSTRE_SEQ_METADATA); - if (rc) - return rc; - - target.ft_srv = NULL; - target.ft_exp = mdc_exp; - target.ft_idx = tgt->ltd_idx; - - fld_client_add_target(&lmv->lmv_fld, &target); - - rc = obd_register_observer(mdc_obd, obd); - if (rc) { - obd_disconnect(mdc_exp); - CERROR("target %s register_observer error %d\n", - tgt->ltd_uuid.uuid, rc); - return rc; - } - - if (obd->obd_observer) { - /* - * Tell the observer about the new target. - */ - rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, - OBD_NOTIFY_ACTIVE, - (void *)(tgt - lmv->tgts[0])); - if (rc) { - obd_disconnect(mdc_exp); - return rc; - } - } - - tgt->ltd_active = 1; - tgt->ltd_exp = mdc_exp; - lmv->desc.ld_active_tgt_count++; - - md_init_ea_size(tgt->ltd_exp, lmv->max_easize, lmv->max_def_easize); - - CDEBUG(D_CONFIG, "Connected to %s(%s) successfully (%d)\n", - mdc_obd->obd_name, mdc_obd->obd_uuid.uuid, - atomic_read(&obd->obd_refcount)); - - if (lmv->lmv_tgts_kobj) - /* Even if we failed to create the link, that's fine */ - rc = sysfs_create_link(lmv->lmv_tgts_kobj, &mdc_obd->obd_kobj, - mdc_obd->obd_name); - return 0; -} - -static void lmv_del_target(struct lmv_obd *lmv, int index) -{ - if (!lmv->tgts[index]) - return; - - kfree(lmv->tgts[index]); - lmv->tgts[index] = NULL; -} - -static int lmv_add_target(struct obd_device *obd, struct obd_uuid *uuidp, - __u32 index, int gen) -{ - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_device *mdc_obd; - struct lmv_tgt_desc *tgt; - int orig_tgt_count = 0; - int rc = 0; - - CDEBUG(D_CONFIG, "Target uuid: %s. index %d\n", uuidp->uuid, index); - - mdc_obd = class_find_client_obd(uuidp, LUSTRE_MDC_NAME, - &obd->obd_uuid); - if (!mdc_obd) { - CERROR("%s: Target %s not attached: rc = %d\n", - obd->obd_name, uuidp->uuid, -EINVAL); - return -EINVAL; - } - - mutex_lock(&lmv->lmv_init_mutex); - - if ((index < lmv->tgts_size) && lmv->tgts[index]) { - tgt = lmv->tgts[index]; - CERROR("%s: UUID %s already assigned at LOV target index %d: rc = %d\n", - obd->obd_name, - obd_uuid2str(&tgt->ltd_uuid), index, -EEXIST); - mutex_unlock(&lmv->lmv_init_mutex); - return -EEXIST; - } - - if (index >= lmv->tgts_size) { - /* We need to reallocate the lmv target array. 
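The reallocation the next lines perform grows lmv->tgts to the next power of two that covers the requested index, copies the old slots across, publishes the new array, and only then frees the old one. A user-space rendering of that growth policy, illustrative only:

/* Grow a pointer array to the next power of two covering `index`. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct tgt;	/* opaque per-target descriptor */

static struct tgt **tgts;
static unsigned int tgts_size;

static int tgts_grow(unsigned int index)
{
	struct tgt **newtgts, **old = tgts;
	unsigned int newsize = 1;

	if (index < tgts_size)
		return 0;	/* already large enough */

	while (newsize < index + 1)
		newsize <<= 1;

	newtgts = calloc(newsize, sizeof(*newtgts));
	if (!newtgts)
		return -1;

	if (tgts_size)
		memcpy(newtgts, old, sizeof(*newtgts) * tgts_size);

	/* Publish the new array, then release the old one. */
	tgts = newtgts;
	tgts_size = newsize;
	free(old);
	return 0;
}

int main(void)
{
	if (tgts_grow(37))
		return 1;
	printf("array grown to %u slots\n", tgts_size);	/* prints 64 */
	return 0;
}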
*/ - struct lmv_tgt_desc **newtgts, **old = NULL; - __u32 newsize = 1; - __u32 oldsize = 0; - - while (newsize < index + 1) - newsize <<= 1; - newtgts = kcalloc(newsize, sizeof(*newtgts), GFP_NOFS); - if (!newtgts) { - mutex_unlock(&lmv->lmv_init_mutex); - return -ENOMEM; - } - - if (lmv->tgts_size) { - memcpy(newtgts, lmv->tgts, - sizeof(*newtgts) * lmv->tgts_size); - old = lmv->tgts; - oldsize = lmv->tgts_size; - } - - lmv->tgts = newtgts; - lmv->tgts_size = newsize; - smp_rmb(); - kfree(old); - - CDEBUG(D_CONFIG, "tgts: %p size: %d\n", lmv->tgts, - lmv->tgts_size); - } - - tgt = kzalloc(sizeof(*tgt), GFP_NOFS); - if (!tgt) { - mutex_unlock(&lmv->lmv_init_mutex); - return -ENOMEM; - } - - mutex_init(&tgt->ltd_fid_mutex); - tgt->ltd_idx = index; - tgt->ltd_uuid = *uuidp; - tgt->ltd_active = 0; - lmv->tgts[index] = tgt; - if (index >= lmv->desc.ld_tgt_count) { - orig_tgt_count = lmv->desc.ld_tgt_count; - lmv->desc.ld_tgt_count = index + 1; - } - - if (!lmv->connected) { - /* lmv_check_connect() will connect this target. */ - mutex_unlock(&lmv->lmv_init_mutex); - return rc; - } - - /* Otherwise let's connect it ourselves */ - mutex_unlock(&lmv->lmv_init_mutex); - rc = lmv_connect_mdc(obd, tgt); - if (rc) { - spin_lock(&lmv->lmv_lock); - if (lmv->desc.ld_tgt_count == index + 1) - lmv->desc.ld_tgt_count = orig_tgt_count; - memset(tgt, 0, sizeof(*tgt)); - spin_unlock(&lmv->lmv_lock); - } else { - int easize = sizeof(struct lmv_stripe_md) + - lmv->desc.ld_tgt_count * sizeof(struct lu_fid); - lmv_init_ea_size(obd->obd_self_export, easize, 0); - } - - return rc; -} - -static int lmv_check_connect(struct obd_device *obd) -{ - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - u32 i; - int rc; - int easize; - - if (lmv->connected) - return 0; - - mutex_lock(&lmv->lmv_init_mutex); - if (lmv->connected) { - mutex_unlock(&lmv->lmv_init_mutex); - return 0; - } - - if (lmv->desc.ld_tgt_count == 0) { - mutex_unlock(&lmv->lmv_init_mutex); - CERROR("%s: no targets configured.\n", obd->obd_name); - return -EINVAL; - } - - LASSERT(lmv->tgts); - - if (!lmv->tgts[0]) { - mutex_unlock(&lmv->lmv_init_mutex); - CERROR("%s: no target configured for index 0.\n", - obd->obd_name); - return -EINVAL; - } - - CDEBUG(D_CONFIG, "Time to connect %s to %s\n", - lmv->cluuid.uuid, obd->obd_name); - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv->tgts[i]; - if (!tgt) - continue; - rc = lmv_connect_mdc(obd, tgt); - if (rc) - goto out_disc; - } - - lmv->connected = 1; - easize = lmv_mds_md_size(lmv->desc.ld_tgt_count, LMV_MAGIC); - lmv_init_ea_size(obd->obd_self_export, easize, 0); - mutex_unlock(&lmv->lmv_init_mutex); - return 0; - - out_disc: - while (i-- > 0) { - int rc2; - - tgt = lmv->tgts[i]; - if (!tgt) - continue; - tgt->ltd_active = 0; - if (tgt->ltd_exp) { - --lmv->desc.ld_active_tgt_count; - rc2 = obd_disconnect(tgt->ltd_exp); - if (rc2) { - CERROR("LMV target %s disconnect on MDC idx %d: error %d\n", - tgt->ltd_uuid.uuid, i, rc2); - } - } - } - - mutex_unlock(&lmv->lmv_init_mutex); - return rc; -} - -static int lmv_disconnect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) -{ - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_device *mdc_obd; - int rc; - - mdc_obd = class_exp2obd(tgt->ltd_exp); - - if (mdc_obd) { - mdc_obd->obd_force = obd->obd_force; - mdc_obd->obd_fail = obd->obd_fail; - mdc_obd->obd_no_recov = obd->obd_no_recov; - - if (lmv->lmv_tgts_kobj) - sysfs_remove_link(lmv->lmv_tgts_kobj, - mdc_obd->obd_name); - } - - rc = obd_fid_fini(tgt->ltd_exp->exp_obd); - if (rc) - 
CERROR("Can't finalize fids factory\n"); - - CDEBUG(D_INFO, "Disconnected from %s(%s) successfully\n", - tgt->ltd_exp->exp_obd->obd_name, - tgt->ltd_exp->exp_obd->obd_uuid.uuid); - - obd_register_observer(tgt->ltd_exp->exp_obd, NULL); - rc = obd_disconnect(tgt->ltd_exp); - if (rc) { - if (tgt->ltd_active) { - CERROR("Target %s disconnect error %d\n", - tgt->ltd_uuid.uuid, rc); - } - } - - lmv_activate_target(lmv, tgt, 0); - tgt->ltd_exp = NULL; - return 0; -} - -static int lmv_disconnect(struct obd_export *exp) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - int rc; - u32 i; - - if (!lmv->tgts) - goto out_local; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp) - continue; - - lmv_disconnect_mdc(obd, lmv->tgts[i]); - } - - if (lmv->lmv_tgts_kobj) - kobject_put(lmv->lmv_tgts_kobj); - -out_local: - /* - * This is the case when no real connection is established by - * lmv_check_connect(). - */ - if (!lmv->connected) - class_export_put(exp); - rc = class_disconnect(exp); - lmv->connected = 0; - return rc; -} - -static int lmv_fid2path(struct obd_export *exp, int len, void *karg, - void __user *uarg) -{ - struct obd_device *obddev = class_exp2obd(exp); - struct lmv_obd *lmv = &obddev->u.lmv; - struct getinfo_fid2path *gf; - struct lmv_tgt_desc *tgt; - struct getinfo_fid2path *remote_gf = NULL; - int remote_gf_size = 0; - int rc; - - gf = karg; - tgt = lmv_find_target(lmv, &gf->gf_fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - -repeat_fid2path: - rc = obd_iocontrol(OBD_IOC_FID2PATH, tgt->ltd_exp, len, gf, uarg); - if (rc != 0 && rc != -EREMOTE) - goto out_fid2path; - - /* If remote_gf != NULL, it means just building the - * path on the remote MDT, copy this path segment to gf - */ - if (remote_gf) { - struct getinfo_fid2path *ori_gf; - char *ptr; - - ori_gf = karg; - if (strlen(ori_gf->gf_path) + 1 + - strlen(gf->gf_path) + 1 > ori_gf->gf_pathlen) { - rc = -EOVERFLOW; - goto out_fid2path; - } - - ptr = ori_gf->gf_path; - - memmove(ptr + strlen(gf->gf_path) + 1, ptr, - strlen(ori_gf->gf_path)); - - strncpy(ptr, gf->gf_path, strlen(gf->gf_path)); - ptr += strlen(gf->gf_path); - *ptr = '/'; - } - - CDEBUG(D_INFO, "%s: get path %s " DFID " rec: %llu ln: %u\n", - tgt->ltd_exp->exp_obd->obd_name, - gf->gf_path, PFID(&gf->gf_fid), gf->gf_recno, - gf->gf_linkno); - - if (rc == 0) - goto out_fid2path; - - /* sigh, has to go to another MDT to do path building further */ - if (!remote_gf) { - remote_gf_size = sizeof(*remote_gf) + PATH_MAX; - remote_gf = kzalloc(remote_gf_size, GFP_NOFS); - if (!remote_gf) { - rc = -ENOMEM; - goto out_fid2path; - } - remote_gf->gf_pathlen = PATH_MAX; - } - - if (!fid_is_sane(&gf->gf_fid)) { - CERROR("%s: invalid FID " DFID ": rc = %d\n", - tgt->ltd_exp->exp_obd->obd_name, - PFID(&gf->gf_fid), -EINVAL); - rc = -EINVAL; - goto out_fid2path; - } - - tgt = lmv_find_target(lmv, &gf->gf_fid); - if (IS_ERR(tgt)) { - rc = -EINVAL; - goto out_fid2path; - } - - remote_gf->gf_fid = gf->gf_fid; - remote_gf->gf_recno = -1; - remote_gf->gf_linkno = -1; - memset(remote_gf->gf_path, 0, remote_gf->gf_pathlen); - gf = remote_gf; - goto repeat_fid2path; - -out_fid2path: - kfree(remote_gf); - return rc; -} - -static int lmv_hsm_req_count(struct lmv_obd *lmv, - const struct hsm_user_request *hur, - const struct lmv_tgt_desc *tgt_mds) -{ - u32 i, nr = 0; - struct lmv_tgt_desc *curr_tgt; - - /* count how many requests must be sent to the given target */ - for (i = 0; i < hur->hur_request.hr_itemcount; i++) 
{
-		curr_tgt = lmv_find_target(lmv, &hur->hur_user_item[i].hui_fid);
-		if (IS_ERR(curr_tgt))
-			return PTR_ERR(curr_tgt);
-		if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid))
-			nr++;
-	}
-	return nr;
-}
-
-static int lmv_hsm_req_build(struct lmv_obd *lmv,
-			     struct hsm_user_request *hur_in,
-			     const struct lmv_tgt_desc *tgt_mds,
-			     struct hsm_user_request *hur_out)
-{
-	int i, nr_out;
-	struct lmv_tgt_desc *curr_tgt;
-
-	/* build the hsm_user_request for the given target */
-	hur_out->hur_request = hur_in->hur_request;
-	nr_out = 0;
-	for (i = 0; i < hur_in->hur_request.hr_itemcount; i++) {
-		curr_tgt = lmv_find_target(lmv,
-					   &hur_in->hur_user_item[i].hui_fid);
-		if (IS_ERR(curr_tgt))
-			return PTR_ERR(curr_tgt);
-		if (obd_uuid_equals(&curr_tgt->ltd_uuid, &tgt_mds->ltd_uuid)) {
-			hur_out->hur_user_item[nr_out] =
-				hur_in->hur_user_item[i];
-			nr_out++;
-		}
-	}
-	hur_out->hur_request.hr_itemcount = nr_out;
-	memcpy(hur_data(hur_out), hur_data(hur_in),
-	       hur_in->hur_request.hr_data_len);
-
-	return 0;
-}
-
-static int lmv_hsm_ct_unregister(struct lmv_obd *lmv, unsigned int cmd, int len,
-				 struct lustre_kernelcomm *lk,
-				 void __user *uarg)
-{
-	__u32 i;
-
-	/* unregister request (call from llapi_hsm_copytool_fini) */
-	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
-		struct lmv_tgt_desc *tgt = lmv->tgts[i];
-
-		if (!tgt || !tgt->ltd_exp)
-			continue;
-
-		/* best effort: try to clean as much as possible
-		 * (continue on error)
-		 */
-		obd_iocontrol(cmd, lmv->tgts[i]->ltd_exp, len, lk, uarg);
-	}
-
-	/* Whatever the result, remove copytool from kuc groups.
-	 * Unreached coordinators will get EPIPE on next requests
-	 * and will unregister automatically.
-	 */
-	return libcfs_kkuc_group_rem(lk->lk_uid, lk->lk_group);
-}
-
-static int lmv_hsm_ct_register(struct lmv_obd *lmv, unsigned int cmd, int len,
-			       struct lustre_kernelcomm *lk, void __user *uarg)
-{
-	struct file *filp;
-	__u32 i, j;
-	int err, rc = 0;
-	bool any_set = false;
-	struct kkuc_ct_data kcd = { 0 };
-
-	/* All or nothing: try to register to all MDS.
-	 * In case of failure, unregister from previous MDS,
-	 * except if it is because of an inactive target.
-	 */
-	for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
-		struct lmv_tgt_desc *tgt = lmv->tgts[i];
-
-		if (!tgt || !tgt->ltd_exp)
-			continue;
-
-		err = obd_iocontrol(cmd, tgt->ltd_exp, len, lk, uarg);
-		if (err) {
-			if (tgt->ltd_active) {
-				/* permanent error */
-				CERROR("error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n",
-				       tgt->ltd_uuid.uuid, i, cmd, err);
-				rc = err;
-				lk->lk_flags |= LK_FLG_STOP;
-				/* unregister from previous MDS */
-				for (j = 0; j < i; j++) {
-					tgt = lmv->tgts[j];
-
-					if (!tgt || !tgt->ltd_exp)
-						continue;
-					obd_iocontrol(cmd, tgt->ltd_exp, len,
-						      lk, uarg);
-				}
-				return rc;
-			}
-			/* else: transient error.
- * kuc will register to the missing MDT when it is back - */ - } else { - any_set = true; - } - } - - if (!any_set) - /* no registration done: return error */ - return -ENOTCONN; - - /* at least one registration done, with no failure */ - filp = fget(lk->lk_wfd); - if (!filp) - return -EBADF; - - kcd.kcd_magic = KKUC_CT_DATA_MAGIC; - kcd.kcd_uuid = lmv->cluuid; - kcd.kcd_archive = lk->lk_data; - - rc = libcfs_kkuc_group_add(filp, lk->lk_uid, lk->lk_group, - &kcd, sizeof(kcd)); - if (rc) - fput(filp); - - return rc; -} - -static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, - int len, void *karg, void __user *uarg) -{ - struct obd_device *obddev = class_exp2obd(exp); - struct lmv_obd *lmv = &obddev->u.lmv; - struct lmv_tgt_desc *tgt = NULL; - u32 i = 0; - int rc = 0; - int set = 0; - u32 count = lmv->desc.ld_tgt_count; - - if (count == 0) - return -ENOTTY; - - switch (cmd) { - case IOC_OBD_STATFS: { - struct obd_ioctl_data *data = karg; - struct obd_device *mdc_obd; - struct obd_statfs stat_buf = {0}; - __u32 index; - - memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); - if (index >= count) - return -ENODEV; - - tgt = lmv->tgts[index]; - if (!tgt || !tgt->ltd_active) - return -ENODATA; - - mdc_obd = class_exp2obd(tgt->ltd_exp); - if (!mdc_obd) - return -EINVAL; - - /* copy UUID */ - if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(mdc_obd), - min((int)data->ioc_plen2, - (int)sizeof(struct obd_uuid)))) - return -EFAULT; - - rc = obd_statfs(NULL, tgt->ltd_exp, &stat_buf, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - 0); - if (rc) - return rc; - if (copy_to_user(data->ioc_pbuf1, &stat_buf, - min((int)data->ioc_plen1, - (int)sizeof(stat_buf)))) - return -EFAULT; - break; - } - case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl = karg; - struct obd_quotactl *oqctl; - - if (qctl->qc_valid == QC_MDTIDX) { - if (count <= qctl->qc_idx) - return -EINVAL; - - tgt = lmv->tgts[qctl->qc_idx]; - if (!tgt || !tgt->ltd_exp) - return -EINVAL; - } else if (qctl->qc_valid == QC_UUID) { - for (i = 0; i < count; i++) { - tgt = lmv->tgts[i]; - if (!tgt) - continue; - if (!obd_uuid_equals(&tgt->ltd_uuid, - &qctl->obd_uuid)) - continue; - - if (!tgt->ltd_exp) - return -EINVAL; - - break; - } - } else { - return -EINVAL; - } - - if (i >= count) - return -EAGAIN; - - LASSERT(tgt && tgt->ltd_exp); - oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); - if (!oqctl) - return -ENOMEM; - - QCTL_COPY(oqctl, qctl); - rc = obd_quotactl(tgt->ltd_exp, oqctl); - if (rc == 0) { - QCTL_COPY(qctl, oqctl); - qctl->qc_valid = QC_MDTIDX; - qctl->obd_uuid = tgt->ltd_uuid; - } - kfree(oqctl); - break; - } - case OBD_IOC_CHANGELOG_SEND: - case OBD_IOC_CHANGELOG_CLEAR: { - struct ioc_changelog *icc = karg; - - if (icc->icc_mdtindex >= count) - return -ENODEV; - - tgt = lmv->tgts[icc->icc_mdtindex]; - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) - return -ENODEV; - rc = obd_iocontrol(cmd, tgt->ltd_exp, sizeof(*icc), icc, NULL); - break; - } - case LL_IOC_GET_CONNECT_FLAGS: { - tgt = lmv->tgts[0]; - - if (!tgt || !tgt->ltd_exp) - return -ENODATA; - rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); - break; - } - case LL_IOC_FID2MDTIDX: { - struct lu_fid *fid = karg; - int mdt_index; - - rc = lmv_fld_lookup(lmv, fid, &mdt_index); - if (rc) - return rc; - - /* - * Note: this is from llite(see ll_dir_ioctl()), @uarg does not - * point to user space memory for FID2MDTIDX. 
- */ - *(__u32 *)uarg = mdt_index; - break; - } - case OBD_IOC_FID2PATH: { - rc = lmv_fid2path(exp, len, karg, uarg); - break; - } - case LL_IOC_HSM_STATE_GET: - case LL_IOC_HSM_STATE_SET: - case LL_IOC_HSM_ACTION: { - struct md_op_data *op_data = karg; - - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - if (!tgt->ltd_exp) - return -EINVAL; - - rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); - break; - } - case LL_IOC_HSM_PROGRESS: { - const struct hsm_progress_kernel *hpk = karg; - - tgt = lmv_find_target(lmv, &hpk->hpk_fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); - break; - } - case LL_IOC_HSM_REQUEST: { - struct hsm_user_request *hur = karg; - unsigned int reqcount = hur->hur_request.hr_itemcount; - - if (reqcount == 0) - return 0; - - /* if the request is about a single fid - * or if there is a single MDS, no need to split - * the request. - */ - if (reqcount == 1 || count == 1) { - tgt = lmv_find_target(lmv, - &hur->hur_user_item[0].hui_fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - rc = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); - } else { - /* split fid list to their respective MDS */ - for (i = 0; i < count; i++) { - struct hsm_user_request *req; - size_t reqlen; - int nr, rc1; - - tgt = lmv->tgts[i]; - if (!tgt || !tgt->ltd_exp) - continue; - - nr = lmv_hsm_req_count(lmv, hur, tgt); - if (nr < 0) - return nr; - if (nr == 0) /* nothing for this MDS */ - continue; - - /* build a request with fids for this MDS */ - reqlen = offsetof(typeof(*hur), - hur_user_item[nr]) - + hur->hur_request.hr_data_len; - req = kvzalloc(reqlen, GFP_NOFS); - if (!req) - return -ENOMEM; - - rc1 = lmv_hsm_req_build(lmv, hur, tgt, req); - if (rc1 < 0) - goto hsm_req_err; - - rc1 = obd_iocontrol(cmd, tgt->ltd_exp, reqlen, - req, uarg); -hsm_req_err: - if (rc1 != 0 && rc == 0) - rc = rc1; - kvfree(req); - } - } - break; - } - case LL_IOC_LOV_SWAP_LAYOUTS: { - struct md_op_data *op_data = karg; - struct lmv_tgt_desc *tgt1, *tgt2; - - tgt1 = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt1)) - return PTR_ERR(tgt1); - - tgt2 = lmv_find_target(lmv, &op_data->op_fid2); - if (IS_ERR(tgt2)) - return PTR_ERR(tgt2); - - if (!tgt1->ltd_exp || !tgt2->ltd_exp) - return -EINVAL; - - /* only files on same MDT can have their layouts swapped */ - if (tgt1->ltd_idx != tgt2->ltd_idx) - return -EPERM; - - rc = obd_iocontrol(cmd, tgt1->ltd_exp, len, karg, uarg); - break; - } - case LL_IOC_HSM_CT_START: { - struct lustre_kernelcomm *lk = karg; - - if (lk->lk_flags & LK_FLG_STOP) - rc = lmv_hsm_ct_unregister(lmv, cmd, len, lk, uarg); - else - rc = lmv_hsm_ct_register(lmv, cmd, len, lk, uarg); - break; - } - default: - for (i = 0; i < count; i++) { - struct obd_device *mdc_obd; - int err; - - tgt = lmv->tgts[i]; - if (!tgt || !tgt->ltd_exp) - continue; - /* ll_umount_begin() sets force flag but for lmv, not - * mdc. Let's pass it through - */ - mdc_obd = class_exp2obd(tgt->ltd_exp); - mdc_obd->obd_force = obddev->obd_force; - err = obd_iocontrol(cmd, tgt->ltd_exp, len, karg, uarg); - if (err) { - if (tgt->ltd_active) { - CERROR("%s: error: iocontrol MDC %s on MDTidx %d cmd %x: err = %d\n", - lmv2obd_dev(lmv)->obd_name, - tgt->ltd_uuid.uuid, i, cmd, err); - if (!rc) - rc = err; - } - } else { - set = 1; - } - } - if (!set && !rc) - rc = -EIO; - } - return rc; -} - -/** - * This is _inode_ placement policy function (not name). 
- */ -static int lmv_placement_policy(struct obd_device *obd, - struct md_op_data *op_data, u32 *mds) -{ - struct lmv_obd *lmv = &obd->u.lmv; - - LASSERT(mds); - - if (lmv->desc.ld_tgt_count == 1) { - *mds = 0; - return 0; - } - - if (op_data->op_default_stripe_offset != -1) { - *mds = op_data->op_default_stripe_offset; - return 0; - } - - /** - * If stripe_offset is provided during setdirstripe - * (setdirstripe -i xx), xx MDS will be chosen. - */ - if (op_data->op_cli_flags & CLI_SET_MEA && op_data->op_data) { - struct lmv_user_md *lum; - - lum = op_data->op_data; - if (le32_to_cpu(lum->lum_stripe_offset) != (__u32)-1) { - *mds = le32_to_cpu(lum->lum_stripe_offset); - } else { - /* - * -1 means default, which will be in the same MDT with - * the stripe - */ - *mds = op_data->op_mds; - lum->lum_stripe_offset = cpu_to_le32(op_data->op_mds); - } - } else { - /* - * Allocate new fid on target according to operation type and - * parent home mds. - */ - *mds = op_data->op_mds; - } - - return 0; -} - -int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, u32 mds) -{ - struct lmv_tgt_desc *tgt; - int rc; - - tgt = lmv_get_target(lmv, mds, NULL); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - /* - * New seq alloc and FLD setup should be atomic. Otherwise we may find - * on server that seq in new allocated fid is not yet known. - */ - mutex_lock(&tgt->ltd_fid_mutex); - - if (tgt->ltd_active == 0 || !tgt->ltd_exp) { - rc = -ENODEV; - goto out; - } - - /* - * Asking underlaying tgt layer to allocate new fid. - */ - rc = obd_fid_alloc(NULL, tgt->ltd_exp, fid, NULL); - if (rc > 0) { - LASSERT(fid_is_sane(fid)); - rc = 0; - } - -out: - mutex_unlock(&tgt->ltd_fid_mutex); - return rc; -} - -int lmv_fid_alloc(const struct lu_env *env, struct obd_export *exp, - struct lu_fid *fid, struct md_op_data *op_data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - u32 mds = 0; - int rc; - - LASSERT(op_data); - LASSERT(fid); - - rc = lmv_placement_policy(obd, op_data, &mds); - if (rc) { - CERROR("Can't get target for allocating fid, rc %d\n", - rc); - return rc; - } - - rc = __lmv_fid_alloc(lmv, fid, mds); - if (rc) { - CERROR("Can't alloc new fid, rc %d\n", rc); - return rc; - } - - return rc; -} - -static int lmv_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lmv_obd *lmv = &obd->u.lmv; - struct lprocfs_static_vars lvars = { NULL }; - struct lmv_desc *desc; - int rc; - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { - CERROR("LMV setup requires a descriptor\n"); - return -EINVAL; - } - - desc = (struct lmv_desc *)lustre_cfg_buf(lcfg, 1); - if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { - CERROR("Lmv descriptor size wrong: %d > %d\n", - (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); - return -EINVAL; - } - - lmv->tgts_size = 32U; - lmv->tgts = kcalloc(lmv->tgts_size, sizeof(*lmv->tgts), GFP_NOFS); - if (!lmv->tgts) - return -ENOMEM; - - obd_str2uuid(&lmv->desc.ld_uuid, desc->ld_uuid.uuid); - lmv->desc.ld_tgt_count = 0; - lmv->desc.ld_active_tgt_count = 0; - lmv->max_def_easize = 0; - lmv->max_easize = 0; - - spin_lock_init(&lmv->lmv_lock); - mutex_init(&lmv->lmv_init_mutex); - - lprocfs_lmv_init_vars(&lvars); - - lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars); - debugfs_create_file("target_obd", 0444, obd->obd_debugfs_entry, obd, - &lmv_proc_target_fops); - rc = fld_client_init(&lmv->lmv_fld, obd->obd_name, - LUSTRE_CLI_FLD_HASH_DHT); - if (rc) { - CERROR("Can't init FLD, err %d\n", rc); - goto out; - } - - return 0; - -out: - return rc; -} - 
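A compact model of the decision order in lmv_placement_policy() above may help: a single target short-circuits to index 0, an explicit default stripe offset wins next, then an explicit lum_stripe_offset supplied with CLI_SET_MEA, and finally the parent's home MDT. This is a hedged sketch with stand-in types, not the kernel structures.

#include <stdint.h>

/* Stand-in for the few md_op_data fields the policy consults. */
struct placement_in {
	uint32_t tgt_count;		/* lmv->desc.ld_tgt_count */
	int64_t  default_stripe_offset;	/* -1 if unset */
	int      set_mea;		/* CLI_SET_MEA with op_data set */
	int64_t  lum_stripe_offset;	/* -1 means "default" */
	uint32_t op_mds;		/* parent's MDT index */
};

static uint32_t pick_mds(const struct placement_in *in)
{
	if (in->tgt_count == 1)
		return 0;
	if (in->default_stripe_offset != -1)
		return (uint32_t)in->default_stripe_offset;
	if (in->set_mea && in->lum_stripe_offset != -1)
		return (uint32_t)in->lum_stripe_offset;
	return in->op_mds;	/* fall back to the parent's home MDT */
}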
-static int lmv_cleanup(struct obd_device *obd) -{ - struct lmv_obd *lmv = &obd->u.lmv; - - fld_client_fini(&lmv->lmv_fld); - if (lmv->tgts) { - int i; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (!lmv->tgts[i]) - continue; - lmv_del_target(lmv, i); - } - kfree(lmv->tgts); - lmv->tgts_size = 0; - } - return 0; -} - -static int lmv_process_config(struct obd_device *obd, u32 len, void *buf) -{ - struct lustre_cfg *lcfg = buf; - struct obd_uuid obd_uuid; - int gen; - __u32 index; - int rc; - - switch (lcfg->lcfg_command) { - case LCFG_ADD_MDC: - /* modify_mdc_tgts add 0:lustre-clilmv 1:lustre-MDT0000_UUID - * 2:0 3:1 4:lustre-MDT0000-mdc_UUID - */ - if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) { - rc = -EINVAL; - goto out; - } - - obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); - - if (sscanf(lustre_cfg_buf(lcfg, 2), "%u", &index) != 1) { - rc = -EINVAL; - goto out; - } - if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1) { - rc = -EINVAL; - goto out; - } - rc = lmv_add_target(obd, &obd_uuid, index, gen); - goto out; - default: - CERROR("Unknown command: %d\n", lcfg->lcfg_command); - rc = -EINVAL; - goto out; - } -out: - return rc; -} - -static int lmv_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_statfs *temp; - int rc = 0; - u32 i; - - temp = kzalloc(sizeof(*temp), GFP_NOFS); - if (!temp) - return -ENOMEM; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp) - continue; - - rc = obd_statfs(env, lmv->tgts[i]->ltd_exp, temp, - max_age, flags); - if (rc) { - CERROR("can't stat MDS #%d (%s), error %d\n", i, - lmv->tgts[i]->ltd_exp->exp_obd->obd_name, - rc); - goto out_free_temp; - } - - if (i == 0) { - *osfs = *temp; - /* If the statfs is from mount, it will needs - * retrieve necessary information from MDT0. - * i.e. mount does not need the merged osfs - * from all of MDT. 
- * And also clients can be mounted as long as - * MDT0 is in service - */ - if (flags & OBD_STATFS_FOR_MDT0) - goto out_free_temp; - } else { - osfs->os_bavail += temp->os_bavail; - osfs->os_blocks += temp->os_blocks; - osfs->os_ffree += temp->os_ffree; - osfs->os_files += temp->os_files; - } - } - -out_free_temp: - kfree(temp); - return rc; -} - -static int lmv_getstatus(struct obd_export *exp, - struct lu_fid *fid) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - - return md_getstatus(lmv->tgts[0]->ltd_exp, fid); -} - -static int lmv_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, size_t buf_size, - struct ptlrpc_request **req) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - return md_getxattr(tgt->ltd_exp, fid, obd_md_valid, name, buf_size, - req); -} - -static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - const void *value, size_t value_size, - unsigned int xattr_flags, u32 suppgid, - struct ptlrpc_request **req) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - return md_setxattr(tgt->ltd_exp, fid, obd_md_valid, name, - value, value_size, xattr_flags, suppgid, req); -} - -static int lmv_getattr(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - if (op_data->op_flags & MF_GET_MDT_IDX) { - op_data->op_mds = tgt->ltd_idx; - return 0; - } - - return md_getattr(tgt->ltd_exp, op_data, request); -} - -static int lmv_null_inode(struct obd_export *exp, const struct lu_fid *fid) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - u32 i; - - CDEBUG(D_INODE, "CBDATA for " DFID "\n", PFID(fid)); - - /* - * With DNE every object can have two locks in different namespaces: - * lookup lock in space of MDT storing direntry and update/open lock in - * space of MDT storing inode. - */ - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - if (!lmv->tgts[i] || !lmv->tgts[i]->ltd_exp) - continue; - md_null_inode(lmv->tgts[i]->ltd_exp, fid); - } - - return 0; -} - -static int lmv_close(struct obd_export *exp, struct md_op_data *op_data, - struct md_open_data *mod, struct ptlrpc_request **request) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - CDEBUG(D_INODE, "CLOSE " DFID "\n", PFID(&op_data->op_fid1)); - return md_close(tgt->ltd_exp, op_data, mod, request); -} - -/** - * Choosing the MDT by name or FID in @op_data. - * For non-striped directory, it will locate MDT by fid. - * For striped-directory, it will locate MDT by name. And also - * it will reset op_fid1 with the FID of the chosen stripe. 
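The statfs loop above seeds the result from MDT0 (which is also all that a mount-time statfs needs) and accumulates space and object counts from the remaining targets. A rough standalone sketch of the merge, with invented struct names:

#include <stdint.h>

struct statfs_lite {
	uint64_t os_blocks, os_bavail, os_files, os_ffree;
};

/* Merge per-MDT statfs results the way the loop above does:
 * MDT0 seeds the result, later targets are accumulated.
 */
static void statfs_merge(struct statfs_lite *osfs,
			 const struct statfs_lite *temp, int idx)
{
	if (idx == 0) {
		*osfs = *temp;
		return;
	}
	osfs->os_bavail += temp->os_bavail;
	osfs->os_blocks += temp->os_blocks;
	osfs->os_ffree  += temp->os_ffree;
	osfs->os_files  += temp->os_files;
}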
- **/ -static struct lmv_tgt_desc * -lmv_locate_target_for_name(struct lmv_obd *lmv, struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid, - u32 *mds) -{ - const struct lmv_oinfo *oinfo; - struct lmv_tgt_desc *tgt; - - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_NAME_HASH)) { - if (cfs_fail_val >= lsm->lsm_md_stripe_count) - return ERR_PTR(-EBADF); - oinfo = &lsm->lsm_md_oinfo[cfs_fail_val]; - } else { - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); - if (IS_ERR(oinfo)) - return ERR_CAST(oinfo); - } - - if (fid) - *fid = oinfo->lmo_fid; - if (mds) - *mds = oinfo->lmo_mds; - - tgt = lmv_get_target(lmv, oinfo->lmo_mds, NULL); - - CDEBUG(D_INFO, "locate on mds %u " DFID "\n", oinfo->lmo_mds, - PFID(&oinfo->lmo_fid)); - return tgt; -} - -/** - * Locate mds by fid or name - * - * For striped directory (lsm != NULL), it will locate the stripe - * by name hash (see lsm_name_to_stripe_info()). Note: if the hash_type - * is unknown, it will return -EBADFD, and lmv_intent_lookup might need - * walk through all of stripes to locate the entry. - * - * For normal direcotry, it will locate MDS by FID directly. - * \param[in] lmv LMV device - * \param[in] op_data client MD stack parameters, name, namelen - * mds_num etc. - * \param[in] fid object FID used to locate MDS. - * - * retval pointer to the lmv_tgt_desc if succeed. - * ERR_PTR(errno) if failed. - */ -struct lmv_tgt_desc* -lmv_locate_mds(struct lmv_obd *lmv, struct md_op_data *op_data, - struct lu_fid *fid) -{ - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tgt; - - /* - * During creating VOLATILE file, it should honor the mdt - * index if the file under striped dir is being restored, see - * ct_restore(). - */ - if (op_data->op_bias & MDS_CREATE_VOLATILE && - (int)op_data->op_mds != -1) { - int i; - - tgt = lmv_get_target(lmv, op_data->op_mds, NULL); - if (IS_ERR(tgt)) - return tgt; - - if (lsm) { - /* refill the right parent fid */ - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - struct lmv_oinfo *oinfo; - - oinfo = &lsm->lsm_md_oinfo[i]; - if (oinfo->lmo_mds == op_data->op_mds) { - *fid = oinfo->lmo_fid; - break; - } - } - - if (i == lsm->lsm_md_stripe_count) - *fid = lsm->lsm_md_oinfo[0].lmo_fid; - } - - return tgt; - } - - if (!lsm || !op_data->op_namelen) { - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return tgt; - - op_data->op_mds = tgt->ltd_idx; - - return tgt; - } - - return lmv_locate_target_for_name(lmv, lsm, op_data->op_name, - op_data->op_namelen, fid, - &op_data->op_mds); -} - -static int lmv_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, size_t datalen, umode_t mode, - uid_t uid, gid_t gid, kernel_cap_t cap_effective, - __u64 rdev, struct ptlrpc_request **request) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - - if (!lmv->desc.ld_active_tgt_count) - return -EIO; - - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - CDEBUG(D_INODE, "CREATE name '%.*s' on " DFID " -> mds #%x\n", - (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), op_data->op_mds); - - rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc) - return rc; - - if (exp_connect_flags(exp) & OBD_CONNECT_DIR_STRIPE) { - /* - * Send the create request to the MDT where the object - * will be located - */ - tgt = lmv_find_target(lmv, &op_data->op_fid2); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - op_data->op_mds = 
tgt->ltd_idx; - } else { - CDEBUG(D_CONFIG, "Server doesn't support striped dirs\n"); - } - - CDEBUG(D_INODE, "CREATE obj " DFID " -> mds #%x\n", - PFID(&op_data->op_fid1), op_data->op_mds); - - op_data->op_flags |= MF_MDC_CANCEL_FID1; - rc = md_create(tgt->ltd_exp, op_data, data, datalen, mode, uid, gid, - cap_effective, rdev, request); - - if (rc == 0) { - if (!*request) - return rc; - CDEBUG(D_INODE, "Created - " DFID "\n", PFID(&op_data->op_fid2)); - } - return rc; -} - -static int -lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, - const union ldlm_policy_data *policy, struct md_op_data *op_data, - struct lustre_handle *lockh, __u64 extra_lock_flags) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - CDEBUG(D_INODE, "ENQUEUE on " DFID "\n", PFID(&op_data->op_fid1)); - - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - CDEBUG(D_INODE, "ENQUEUE on " DFID " -> mds #%u\n", - PFID(&op_data->op_fid1), tgt->ltd_idx); - - return md_enqueue(tgt->ltd_exp, einfo, policy, op_data, lockh, - extra_lock_flags); -} - -static int -lmv_getattr_name(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **preq) -{ - struct ptlrpc_request *req = NULL; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - struct mdt_body *body; - int rc; - - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - CDEBUG(D_INODE, "GETATTR_NAME for %*s on " DFID " -> mds #%u\n", - (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), tgt->ltd_idx); - - rc = md_getattr_name(tgt->ltd_exp, op_data, preq); - if (rc != 0) - return rc; - - body = req_capsule_server_get(&(*preq)->rq_pill, &RMF_MDT_BODY); - if (body->mbo_valid & OBD_MD_MDS) { - struct lu_fid rid = body->mbo_fid1; - - CDEBUG(D_INODE, "Request attrs for " DFID "\n", - PFID(&rid)); - - tgt = lmv_find_target(lmv, &rid); - if (IS_ERR(tgt)) { - ptlrpc_req_finished(*preq); - *preq = NULL; - return PTR_ERR(tgt); - } - - op_data->op_fid1 = rid; - op_data->op_valid |= OBD_MD_FLCROSSREF; - op_data->op_namelen = 0; - op_data->op_name = NULL; - rc = md_getattr_name(tgt->ltd_exp, op_data, &req); - ptlrpc_req_finished(*preq); - *preq = req; - } - - return rc; -} - -#define md_op_data_fid(op_data, fl) \ - (fl == MF_MDC_CANCEL_FID1 ? &op_data->op_fid1 : \ - fl == MF_MDC_CANCEL_FID2 ? &op_data->op_fid2 : \ - fl == MF_MDC_CANCEL_FID3 ? &op_data->op_fid3 : \ - fl == MF_MDC_CANCEL_FID4 ? 
&op_data->op_fid4 : \ - NULL) - -static int lmv_early_cancel(struct obd_export *exp, struct lmv_tgt_desc *tgt, - struct md_op_data *op_data, int op_tgt, - enum ldlm_mode mode, int bits, int flag) -{ - struct lu_fid *fid = md_op_data_fid(op_data, flag); - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - union ldlm_policy_data policy = { { 0 } }; - int rc = 0; - - if (!fid_is_sane(fid)) - return 0; - - if (!tgt) { - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - } - - if (tgt->ltd_idx != op_tgt) { - CDEBUG(D_INODE, "EARLY_CANCEL on " DFID "\n", PFID(fid)); - policy.l_inodebits.bits = bits; - rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, - mode, LCF_ASYNC, NULL); - } else { - CDEBUG(D_INODE, - "EARLY_CANCEL skip operation target %d on " DFID "\n", - op_tgt, PFID(fid)); - op_data->op_flags |= flag; - rc = 0; - } - - return rc; -} - -/* - * llite passes fid of an target inode in op_data->op_fid1 and id of directory in - * op_data->op_fid2 - */ -static int lmv_link(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - int rc; - - LASSERT(op_data->op_namelen != 0); - - CDEBUG(D_INODE, "LINK " DFID ":%*s to " DFID "\n", - PFID(&op_data->op_fid2), (int)op_data->op_namelen, - op_data->op_name, PFID(&op_data->op_fid1)); - - op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); - op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); - op_data->op_cap = current_cap(); - if (op_data->op_mea2) { - struct lmv_stripe_md *lsm = op_data->op_mea2; - const struct lmv_oinfo *oinfo; - - oinfo = lsm_name_to_stripe_info(lsm, op_data->op_name, - op_data->op_namelen); - if (IS_ERR(oinfo)) - return PTR_ERR(oinfo); - - op_data->op_fid2 = oinfo->lmo_fid; - } - - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - /* - * Cancel UPDATE lock on child (fid1). - */ - op_data->op_flags |= MF_MDC_CANCEL_FID2; - rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID1); - if (rc != 0) - return rc; - - return md_link(tgt->ltd_exp, op_data, request); -} - -static int lmv_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen, - struct ptlrpc_request **request) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct obd_export *target_exp; - struct lmv_tgt_desc *src_tgt; - struct lmv_tgt_desc *tgt_tgt; - struct mdt_body *body; - int rc; - - LASSERT(oldlen != 0); - - CDEBUG(D_INODE, "RENAME %.*s in " DFID ":%d to %.*s in " DFID ":%d\n", - (int)oldlen, old, PFID(&op_data->op_fid1), - op_data->op_mea1 ? op_data->op_mea1->lsm_md_stripe_count : 0, - (int)newlen, new, PFID(&op_data->op_fid2), - op_data->op_mea2 ? 
op_data->op_mea2->lsm_md_stripe_count : 0); - - op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); - op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); - op_data->op_cap = current_cap(); - - if (op_data->op_cli_flags & CLI_MIGRATE) { - LASSERTF(fid_is_sane(&op_data->op_fid3), "invalid FID " DFID "\n", - PFID(&op_data->op_fid3)); - - if (op_data->op_mea1) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct lmv_tgt_desc *tmp; - - /* Fix the parent fid for striped dir */ - tmp = lmv_locate_target_for_name(lmv, lsm, old, - oldlen, - &op_data->op_fid1, - NULL); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - } - - rc = lmv_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc) - return rc; - src_tgt = lmv_find_target(lmv, &op_data->op_fid3); - if (IS_ERR(src_tgt)) - return PTR_ERR(src_tgt); - - target_exp = src_tgt->ltd_exp; - } else { - if (op_data->op_mea1) { - struct lmv_stripe_md *lsm = op_data->op_mea1; - - src_tgt = lmv_locate_target_for_name(lmv, lsm, old, - oldlen, - &op_data->op_fid1, - &op_data->op_mds); - } else { - src_tgt = lmv_find_target(lmv, &op_data->op_fid1); - } - if (IS_ERR(src_tgt)) - return PTR_ERR(src_tgt); - - if (op_data->op_mea2) { - struct lmv_stripe_md *lsm = op_data->op_mea2; - - tgt_tgt = lmv_locate_target_for_name(lmv, lsm, new, - newlen, - &op_data->op_fid2, - &op_data->op_mds); - } else { - tgt_tgt = lmv_find_target(lmv, &op_data->op_fid2); - } - if (IS_ERR(tgt_tgt)) - return PTR_ERR(tgt_tgt); - - target_exp = tgt_tgt->ltd_exp; - } - - /* - * LOOKUP lock on src child (fid3) should also be cancelled for - * src_tgt in mdc_rename. - */ - op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - - /* - * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its - * own target. - */ - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_UPDATE, - MF_MDC_CANCEL_FID2); - if (rc) - return rc; - /* - * Cancel LOOKUP locks on source child (fid3) for parent tgt_tgt. - */ - if (fid_is_sane(&op_data->op_fid3)) { - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - /* Cancel LOOKUP lock on its parent */ - rc = lmv_early_cancel(exp, tgt, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID3); - if (rc) - return rc; - - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_FULL, - MF_MDC_CANCEL_FID3); - if (rc) - return rc; - } - -retry_rename: - /* - * Cancel all the locks on tgt child (fid4). - */ - if (fid_is_sane(&op_data->op_fid4)) { - struct lmv_tgt_desc *tgt; - - rc = lmv_early_cancel(exp, NULL, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_FULL, - MF_MDC_CANCEL_FID4); - if (rc) - return rc; - - tgt = lmv_find_target(lmv, &op_data->op_fid4); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - /* - * Since the target child might be destroyed, and it might - * become orphan, and we can only check orphan on the local - * MDT right now, so we send rename request to the MDT where - * target child is located. If target child does not exist, - * then it will send the request to the target parent - */ - target_exp = tgt->ltd_exp; - } - - rc = md_rename(target_exp, op_data, old, oldlen, new, newlen, request); - if (rc && rc != -EREMOTE) - return rc; - - body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - - /* Not cross-ref case, just get out of here. 
*/
-	if (likely(!(body->mbo_valid & OBD_MD_MDS)))
-		return rc;
-
-	CDEBUG(D_INODE, "%s: try rename to another MDT for " DFID "\n",
-	       exp->exp_obd->obd_name, PFID(&body->mbo_fid1));
-
-	op_data->op_fid4 = body->mbo_fid1;
-	ptlrpc_req_finished(*request);
-	*request = NULL;
-	goto retry_rename;
-}
-
-static int lmv_setattr(struct obd_export *exp, struct md_op_data *op_data,
-		       void *ea, size_t ealen, struct ptlrpc_request **request)
-{
-	struct obd_device *obd = exp->exp_obd;
-	struct lmv_obd *lmv = &obd->u.lmv;
-	struct lmv_tgt_desc *tgt;
-
-	CDEBUG(D_INODE, "SETATTR for " DFID ", valid 0x%x\n",
-	       PFID(&op_data->op_fid1), op_data->op_attr.ia_valid);
-
-	op_data->op_flags |= MF_MDC_CANCEL_FID1;
-	tgt = lmv_find_target(lmv, &op_data->op_fid1);
-	if (IS_ERR(tgt))
-		return PTR_ERR(tgt);
-
-	return md_setattr(tgt->ltd_exp, op_data, ea, ealen, request);
-}
-
-static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
-		    struct ptlrpc_request **request)
-{
-	struct obd_device *obd = exp->exp_obd;
-	struct lmv_obd *lmv = &obd->u.lmv;
-	struct lmv_tgt_desc *tgt;
-
-	tgt = lmv_find_target(lmv, fid);
-	if (IS_ERR(tgt))
-		return PTR_ERR(tgt);
-
-	return md_sync(tgt->ltd_exp, fid, request);
-}
-
-/**
- * Get the current minimum entry from a striped directory
- *
- * This function searches for the dir entry whose hash value is the
- * closest (>=) to @hash_offset, across all sub-stripes. It is only
- * called for striped directories.
- *
- * \param[in] exp		export of LMV
- * \param[in] op_data		parameters transferred between client MD
- *				stacks; stripe information will be included
- *				in this parameter
- * \param[in] cb_op		ldlm callback used in enqueue in
- *				mdc_read_page
- * \param[in] hash_offset	the hash value used to locate the
- *				minimum (closest) dir entry
- * \param[in|out] stripe_offset	the caller uses this to indicate the stripe
- *				index of the last entry, so as to avoid hash
- *				conflicts between stripes. It is also used to
- *				return the stripe index of the current dir
- *				entry.
- * \param[in|out] entp		the minimum entry; it is also used to pass
- *				in the last dir entry, to resolve hash
- *				conflicts
- *
- * \param[out] ppage		the page which holds the minimum entry
- *
- * \retval	= 0 got the entry successfully
- *		negative errno (< 0) failed to get the entry
- */
-static int lmv_get_min_striped_entry(struct obd_export *exp,
-				     struct md_op_data *op_data,
-				     struct md_callback *cb_op,
-				     __u64 hash_offset, int *stripe_offset,
-				     struct lu_dirent **entp,
-				     struct page **ppage)
-{
-	struct lmv_stripe_md *lsm = op_data->op_mea1;
-	struct obd_device *obd = exp->exp_obd;
-	struct lmv_obd *lmv = &obd->u.lmv;
-	struct lu_dirent *min_ent = NULL;
-	struct page *min_page = NULL;
-	struct lmv_tgt_desc *tgt;
-	int stripe_count;
-	int min_idx = 0;
-	int rc = 0;
-	int i;
-
-	stripe_count = lsm->lsm_md_stripe_count;
-	for (i = 0; i < stripe_count; i++) {
-		__u64 stripe_hash = hash_offset;
-		struct lu_dirent *ent = NULL;
-		struct page *page = NULL;
-		struct lu_dirpage *dp;
-
-		tgt = lmv_get_target(lmv, lsm->lsm_md_oinfo[i].lmo_mds, NULL);
-		if (IS_ERR(tgt)) {
-			rc = PTR_ERR(tgt);
-			goto out;
-		}
-
-		/*
-		 * op_data is shared by each stripe, so we need to reset
-		 * these values for each stripe.
-		 */
-		op_data->op_fid1 = lsm->lsm_md_oinfo[i].lmo_fid;
-		op_data->op_fid2 = lsm->lsm_md_oinfo[i].lmo_fid;
-		op_data->op_data = lsm->lsm_md_oinfo[i].lmo_root;
-next:
-		rc = md_read_page(tgt->ltd_exp, op_data, cb_op, stripe_hash,
-				  &page);
-		if (rc)
-			goto out;
-
-		dp = page_address(page);
-		for (ent = lu_dirent_start(dp); ent;
-		     ent = lu_dirent_next(ent)) {
-			/* Skip dummy entries */
-			if (!le16_to_cpu(ent->lde_namelen))
-				continue;
-
-			if (le64_to_cpu(ent->lde_hash) < hash_offset)
-				continue;
-
-			if (le64_to_cpu(ent->lde_hash) == hash_offset &&
-			    (*entp == ent || i < *stripe_offset))
-				continue;
-
-			/* skip . and .. for other stripes */
-			if (i && (!strncmp(ent->lde_name, ".",
-					   le16_to_cpu(ent->lde_namelen)) ||
-				  !strncmp(ent->lde_name, "..",
-					   le16_to_cpu(ent->lde_namelen))))
-				continue;
-			break;
-		}
-
-		if (!ent) {
-			stripe_hash = le64_to_cpu(dp->ldp_hash_end);
-
-			kunmap(page);
-			put_page(page);
-			page = NULL;
-
-			/*
-			 * Reached the end of the current stripe:
-			 * go to the next stripe.
-			 */
-			if (stripe_hash == MDS_DIR_END_OFF)
-				continue;
-			else
-				goto next;
-		}
-
-		if (min_ent) {
-			if (le64_to_cpu(min_ent->lde_hash) >
-			    le64_to_cpu(ent->lde_hash)) {
-				min_ent = ent;
-				kunmap(min_page);
-				put_page(min_page);
-				min_idx = i;
-				min_page = page;
-			} else {
-				kunmap(page);
-				put_page(page);
-				page = NULL;
-			}
-		} else {
-			min_ent = ent;
-			min_page = page;
-			min_idx = i;
-		}
-	}
-
-out:
-	if (*ppage) {
-		kunmap(*ppage);
-		put_page(*ppage);
-	}
-	*stripe_offset = min_idx;
-	*entp = min_ent;
-	*ppage = min_page;
-	return rc;
-}
-
-/**
- * Build a dir entry page from a striped directory
- *
- * This function gets one entry by @offset from a striped directory. It
- * reads entries from all stripes and chooses the one closest to the
- * required offset (@offset). A few notes:
- * 1. skip . and .. for non-zero stripes, because there can be only one .
- * and .. in a directory.
- * 2. op_data is shared by all stripes instead of allocating a new one,
- * so it needs to be restored before reuse.
- * 3. release the entry page if it is not chosen.
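lmv_get_min_striped_entry() above is one step of a k-way merge ordered by name hash. Stripped of paging and endianness details, the selection step looks roughly like the sketch below; the types are hypothetical, not the Lustre ones.

#include <stdint.h>

struct dirent_lite {
	uint64_t hash;
	/* name, fid, ... elided */
};

/* One merge step: among the current head entry of each stripe,
 * pick the smallest hash (ties go to the lower stripe index).
 */
static int pick_min_stripe(struct dirent_lite *const heads[], int nr_stripes)
{
	int i, min_idx = -1;

	for (i = 0; i < nr_stripes; i++) {
		if (!heads[i])	/* stripe exhausted */
			continue;
		if (min_idx < 0 || heads[i]->hash < heads[min_idx]->hash)
			min_idx = i;
	}
	return min_idx;	/* -1 : all stripes exhausted */
}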
- *
- * \param[in] exp	obd export referring to LMV
- * \param[in] op_data	holds the MD parameters of read_entry
- * \param[in] cb_op	ldlm callback used in enqueue in mdc_read_entry
- * \param[out] ldp	the entry being read
- * \param[out] ppage	the page holding the entry. Note: because the entry
- *			will be accessed by an upper layer, we need to hold
- *			the page until use of the entry is finished; see
- *			ll_dir_entry_next.
- *
- * \retval	= 0 if the entry was read successfully
- *		< 0 if the entry could not be read
- */
-static int lmv_read_striped_page(struct obd_export *exp,
-				 struct md_op_data *op_data,
-				 struct md_callback *cb_op,
-				 __u64 offset, struct page **ppage)
-{
-	struct inode *master_inode = op_data->op_data;
-	struct lu_fid master_fid = op_data->op_fid1;
-	__u64 hash_offset = offset;
-	__u32 ldp_flags;
-	struct page *min_ent_page = NULL;
-	struct page *ent_page = NULL;
-	struct lu_dirent *min_ent = NULL;
-	struct lu_dirent *last_ent;
-	struct lu_dirent *ent;
-	struct lu_dirpage *dp;
-	size_t left_bytes;
-	int ent_idx = 0;
-	void *area;
-	int rc;
-
-	/*
-	 * Allocate a page, read entries from all stripes, and fill the
-	 * page in hash order.
-	 */
-	ent_page = alloc_page(GFP_KERNEL);
-	if (!ent_page)
-		return -ENOMEM;
-
-	/* Initialize the entry page */
-	dp = kmap(ent_page);
-	memset(dp, 0, sizeof(*dp));
-	dp->ldp_hash_start = cpu_to_le64(offset);
-	ldp_flags = LDF_COLLIDE;
-
-	area = dp + 1;
-	left_bytes = PAGE_SIZE - sizeof(*dp);
-	ent = area;
-	last_ent = ent;
-	do {
-		__u16 ent_size;
-
-		/* Find the minimum entry from all sub-stripes */
-		rc = lmv_get_min_striped_entry(exp, op_data, cb_op, hash_offset,
-					       &ent_idx, &min_ent,
-					       &min_ent_page);
-		if (rc)
-			goto out;
-
-		/*
-		 * If no minimum entry could be found, we have already
-		 * reached the end of this directory.
-		 */
-		if (!min_ent) {
-			last_ent->lde_reclen = 0;
-			hash_offset = MDS_DIR_END_OFF;
-			goto out;
-		}
-
-		ent_size = le16_to_cpu(min_ent->lde_reclen);
-
-		/*
-		 * The last entry's lde_reclen is 0, but that might not be
-		 * the end of this temporary entry, so recalculate its size.
-		 */
-		if (!ent_size)
-			ent_size = lu_dirent_calc_size(
-					le16_to_cpu(min_ent->lde_namelen),
-					le32_to_cpu(min_ent->lde_attrs));
-		if (ent_size > left_bytes) {
-			last_ent->lde_reclen = cpu_to_le16(0);
-			hash_offset = le64_to_cpu(min_ent->lde_hash);
-			goto out;
-		}
-
-		memcpy(ent, min_ent, ent_size);
-
-		/*
-		 * Replace . with the master FID and replace .. 
with the parent FID - * of master object - */ - if (!strncmp(ent->lde_name, ".", - le16_to_cpu(ent->lde_namelen)) && - le16_to_cpu(ent->lde_namelen) == 1) - fid_cpu_to_le(&ent->lde_fid, &master_fid); - else if (!strncmp(ent->lde_name, "..", - le16_to_cpu(ent->lde_namelen)) && - le16_to_cpu(ent->lde_namelen) == 2) - fid_cpu_to_le(&ent->lde_fid, &op_data->op_fid3); - - left_bytes -= ent_size; - ent->lde_reclen = cpu_to_le16(ent_size); - last_ent = ent; - ent = (void *)ent + ent_size; - hash_offset = le64_to_cpu(min_ent->lde_hash); - if (hash_offset == MDS_DIR_END_OFF) { - last_ent->lde_reclen = 0; - break; - } - } while (1); -out: - if (min_ent_page) { - kunmap(min_ent_page); - put_page(min_ent_page); - } - - if (unlikely(rc)) { - __free_page(ent_page); - ent_page = NULL; - } else { - if (ent == area) - ldp_flags |= LDF_EMPTY; - dp->ldp_flags |= cpu_to_le32(ldp_flags); - dp->ldp_hash_end = cpu_to_le64(hash_offset); - } - - /* - * We do not want to allocate md_op_data during each - * dir entry reading, so op_data will be shared by every stripe, - * then we need to restore it back to original value before - * return to the upper layer - */ - op_data->op_fid1 = master_fid; - op_data->op_fid2 = master_fid; - op_data->op_data = master_inode; - - *ppage = ent_page; - - return rc; -} - -static int lmv_read_page(struct obd_export *exp, struct md_op_data *op_data, - struct md_callback *cb_op, __u64 offset, - struct page **ppage) -{ - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - if (unlikely(lsm)) - return lmv_read_striped_page(exp, op_data, cb_op, offset, ppage); - - tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - return md_read_page(tgt->ltd_exp, op_data, cb_op, offset, ppage); -} - -/** - * Unlink a file/directory - * - * Unlink a file or directory under the parent dir. The unlink request - * usually will be sent to the MDT where the child is located, but if - * the client does not have the child FID then request will be sent to the - * MDT where the parent is located. - * - * If the parent is a striped directory then it also needs to locate which - * stripe the name of the child is located, and replace the parent FID - * (@op->op_fid1) with the stripe FID. Note: if the stripe is unknown, - * it will walk through all of sub-stripes until the child is being - * unlinked finally. - * - * \param[in] exp export refer to LMV - * \param[in] op_data different parameters transferred beween client - * MD stacks, name, namelen, FIDs etc. - * op_fid1 is the parent FID, op_fid2 is the child - * FID. - * \param[out] request point to the request of unlink. - * - * retval 0 if succeed - * negative errno if failed. 
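As described above, when the stripe cannot be derived from the name hash, the unlink path simply walks the stripes until the entry is found. A schematic of that loop follows; the per-stripe helper below is hypothetical.

#include <errno.h>

/* Hypothetical per-stripe unlink: 0 on success, -ENOENT if the name
 * is not in this stripe.
 */
extern int unlink_in_stripe(int stripe_idx, const char *name);

static int unlink_walk_stripes(const char *name, int stripe_count)
{
	int i, rc = -ENOENT;

	for (i = 0; i < stripe_count; i++) {
		rc = unlink_in_stripe(i, name);
		if (rc != -ENOENT)	/* found it, or hit a real error */
			break;
	}
	return rc;
}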
- */ -static int lmv_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - struct lmv_stripe_md *lsm = op_data->op_mea1; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *parent_tgt = NULL; - struct lmv_tgt_desc *tgt = NULL; - struct mdt_body *body; - int stripe_index = 0; - int rc; - -retry_unlink: - /* For striped dir, we need to locate the parent as well */ - if (lsm) { - struct lmv_tgt_desc *tmp; - - LASSERT(op_data->op_name && op_data->op_namelen); - - tmp = lmv_locate_target_for_name(lmv, lsm, - op_data->op_name, - op_data->op_namelen, - &op_data->op_fid1, - &op_data->op_mds); - - /* - * return -EBADFD means unknown hash type, might - * need try all sub-stripe here - */ - if (IS_ERR(tmp) && PTR_ERR(tmp) != -EBADFD) - return PTR_ERR(tmp); - - /* - * Note: both migrating dir and unknown hash dir need to - * try all of sub-stripes, so we need start search the - * name from stripe 0, but migrating dir is already handled - * inside lmv_locate_target_for_name(), so we only check - * unknown hash type directory here - */ - if (!lmv_is_known_hash_type(lsm->lsm_md_hash_type)) { - struct lmv_oinfo *oinfo; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - } - } - -try_next_stripe: - /* Send unlink requests to the MDT where the child is located */ - if (likely(!fid_is_zero(&op_data->op_fid2))) - tgt = lmv_find_target(lmv, &op_data->op_fid2); - else if (lsm) - tgt = lmv_get_target(lmv, op_data->op_mds, NULL); - else - tgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - op_data->op_fsuid = from_kuid(&init_user_ns, current_fsuid()); - op_data->op_fsgid = from_kgid(&init_user_ns, current_fsgid()); - op_data->op_cap = current_cap(); - - /* - * If child's fid is given, cancel unused locks for it if it is from - * another export than parent. - * - * LOOKUP lock for child (fid3) should also be cancelled on parent - * tgt_tgt in mdc_unlink(). - */ - op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - - /* - * Cancel FULL locks on child (fid3). - */ - parent_tgt = lmv_find_target(lmv, &op_data->op_fid1); - if (IS_ERR(parent_tgt)) - return PTR_ERR(parent_tgt); - - if (parent_tgt != tgt) { - rc = lmv_early_cancel(exp, parent_tgt, op_data, tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_LOOKUP, - MF_MDC_CANCEL_FID3); - } - - rc = lmv_early_cancel(exp, NULL, op_data, tgt->ltd_idx, LCK_EX, - MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); - if (rc != 0) - return rc; - - CDEBUG(D_INODE, "unlink with fid=" DFID "/" DFID " -> mds #%u\n", - PFID(&op_data->op_fid1), PFID(&op_data->op_fid2), tgt->ltd_idx); - - rc = md_unlink(tgt->ltd_exp, op_data, request); - if (rc != 0 && rc != -EREMOTE && rc != -ENOENT) - return rc; - - /* Try next stripe if it is needed. */ - if (rc == -ENOENT && lsm && lmv_need_try_all_stripes(lsm)) { - struct lmv_oinfo *oinfo; - - stripe_index++; - if (stripe_index >= lsm->lsm_md_stripe_count) - return rc; - - oinfo = &lsm->lsm_md_oinfo[stripe_index]; - - op_data->op_fid1 = oinfo->lmo_fid; - op_data->op_mds = oinfo->lmo_mds; - - ptlrpc_req_finished(*request); - *request = NULL; - - goto try_next_stripe; - } - - body = req_capsule_server_get(&(*request)->rq_pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - - /* Not cross-ref case, just get out of here. 
*/ - if (likely(!(body->mbo_valid & OBD_MD_MDS))) - return rc; - - CDEBUG(D_INODE, "%s: try unlink to another MDT for " DFID "\n", - exp->exp_obd->obd_name, PFID(&body->mbo_fid1)); - - /* This is a remote object, try remote MDT, Note: it may - * try more than 1 time here, Considering following case - * /mnt/lustre is root on MDT0, remote1 is on MDT1 - * 1. Initially A does not know where remote1 is, it send - * unlink RPC to MDT0, MDT0 return -EREMOTE, it will - * resend unlink RPC to MDT1 (retry 1st time). - * - * 2. During the unlink RPC in flight, - * client B mv /mnt/lustre/remote1 /mnt/lustre/remote2 - * and create new remote1, but on MDT0 - * - * 3. MDT1 get unlink RPC(from A), then do remote lock on - * /mnt/lustre, then lookup get fid of remote1, and find - * it is remote dir again, and replay -EREMOTE again. - * - * 4. Then A will resend unlink RPC to MDT0. (retry 2nd times). - * - * In theory, it might try unlimited time here, but it should - * be very rare case. - */ - op_data->op_fid2 = body->mbo_fid1; - ptlrpc_req_finished(*request); - *request = NULL; - - goto retry_unlink; -} - -static int lmv_precleanup(struct obd_device *obd) -{ - fld_client_debugfs_fini(&obd->u.lmv.lmv_fld); - lprocfs_obd_cleanup(obd); - return 0; -} - -/** - * Get by key a value associated with a LMV device. - * - * Dispatch request to lower-layer devices as needed. - * - * \param[in] env execution environment for this thread - * \param[in] exp export for the LMV device - * \param[in] keylen length of key identifier - * \param[in] key identifier of key to get value for - * \param[in] vallen size of \a val - * \param[out] val pointer to storage location for value - * - * \retval 0 on success - * \retval negative negated errno on failure - */ -static int lmv_get_info(const struct lu_env *env, struct obd_export *exp, - __u32 keylen, void *key, __u32 *vallen, void *val) -{ - struct obd_device *obd; - struct lmv_obd *lmv; - int rc = 0; - - obd = class_exp2obd(exp); - if (!obd) { - CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - return -EINVAL; - } - - lmv = &obd->u.lmv; - if (keylen >= strlen("remote_flag") && !strcmp(key, "remote_flag")) { - int i; - - LASSERT(*vallen == sizeof(__u32)); - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - /* - * All tgts should be connected when this gets called. - */ - if (!tgt || !tgt->ltd_exp) - continue; - - if (!obd_get_info(env, tgt->ltd_exp, keylen, key, - vallen, val)) - return 0; - } - return -EINVAL; - } else if (KEY_IS(KEY_MAX_EASIZE) || - KEY_IS(KEY_DEFAULT_EASIZE) || - KEY_IS(KEY_CONN_DATA)) { - /* - * Forwarding this request to first MDS, it should know LOV - * desc. - */ - rc = obd_get_info(env, lmv->tgts[0]->ltd_exp, keylen, key, - vallen, val); - if (!rc && KEY_IS(KEY_CONN_DATA)) - exp->exp_connect_data = *(struct obd_connect_data *)val; - return rc; - } else if (KEY_IS(KEY_TGT_COUNT)) { - *((int *)val) = lmv->desc.ld_tgt_count; - return 0; - } - - CDEBUG(D_IOCTL, "Invalid key\n"); - return -EINVAL; -} - -/** - * Asynchronously set by key a value associated with a LMV device. - * - * Dispatch request to lower-layer devices as needed. 
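The retry logic above (and the analogous rename retry earlier) boils down to chasing the FID returned with -EREMOTE until the operation lands on the MDT that actually holds the object. A sketch under invented names:

#include <errno.h>
#include <stdint.h>

typedef uint64_t fid_t;	/* stand-in for struct lu_fid */

/* Hypothetical server call: returns 0, or -EREMOTE plus the FID of
 * the object on the MDT that actually holds the target.
 */
extern int server_op(fid_t target, fid_t *remote_fid);

static int chase_remote(fid_t target)
{
	fid_t remote;
	int rc;

	/* May loop more than once if the namespace changes underneath
	 * us, as the comment above explains; in practice this is rare.
	 */
	while ((rc = server_op(target, &remote)) == -EREMOTE)
		target = remote;	/* retry on the MDT named in the reply */

	return rc;
}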
- * - * \param[in] env execution environment for this thread - * \param[in] exp export for the LMV device - * \param[in] keylen length of key identifier - * \param[in] key identifier of key to store value for - * \param[in] vallen size of value to store - * \param[in] val pointer to data to be stored - * \param[in] set optional list of related ptlrpc requests - * - * \retval 0 on success - * \retval negative negated errno on failure - */ -static int lmv_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, - void *val, struct ptlrpc_request_set *set) -{ - struct lmv_tgt_desc *tgt; - struct obd_device *obd; - struct lmv_obd *lmv; - int rc = 0; - - obd = class_exp2obd(exp); - if (!obd) { - CDEBUG(D_IOCTL, "Invalid client cookie %#llx\n", - exp->exp_handle.h_cookie); - return -EINVAL; - } - lmv = &obd->u.lmv; - - if (KEY_IS(KEY_READ_ONLY) || KEY_IS(KEY_FLUSH_CTX) || - KEY_IS(KEY_DEFAULT_EASIZE)) { - int i, err = 0; - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - tgt = lmv->tgts[i]; - - if (!tgt || !tgt->ltd_exp) - continue; - - err = obd_set_info_async(env, tgt->ltd_exp, - keylen, key, vallen, val, set); - if (err && rc == 0) - rc = err; - } - - return rc; - } - - return -EINVAL; -} - -static int lmv_unpack_md_v1(struct obd_export *exp, struct lmv_stripe_md *lsm, - const struct lmv_mds_md_v1 *lmm1) -{ - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - int stripe_count; - int rc = 0; - int cplen; - int i; - - lsm->lsm_md_magic = le32_to_cpu(lmm1->lmv_magic); - lsm->lsm_md_stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); - lsm->lsm_md_master_mdt_index = le32_to_cpu(lmm1->lmv_master_mdt_index); - if (OBD_FAIL_CHECK(OBD_FAIL_UNKNOWN_LMV_STRIPE)) - lsm->lsm_md_hash_type = LMV_HASH_TYPE_UNKNOWN; - else - lsm->lsm_md_hash_type = le32_to_cpu(lmm1->lmv_hash_type); - lsm->lsm_md_layout_version = le32_to_cpu(lmm1->lmv_layout_version); - cplen = strlcpy(lsm->lsm_md_pool_name, lmm1->lmv_pool_name, - sizeof(lsm->lsm_md_pool_name)); - - if (cplen >= sizeof(lsm->lsm_md_pool_name)) - return -E2BIG; - - CDEBUG(D_INFO, "unpack lsm count %d, master %d hash_type %d layout_version %d\n", - lsm->lsm_md_stripe_count, lsm->lsm_md_master_mdt_index, - lsm->lsm_md_hash_type, lsm->lsm_md_layout_version); - - stripe_count = le32_to_cpu(lmm1->lmv_stripe_count); - for (i = 0; i < stripe_count; i++) { - fid_le_to_cpu(&lsm->lsm_md_oinfo[i].lmo_fid, - &lmm1->lmv_stripe_fids[i]); - rc = lmv_fld_lookup(lmv, &lsm->lsm_md_oinfo[i].lmo_fid, - &lsm->lsm_md_oinfo[i].lmo_mds); - if (rc) - return rc; - CDEBUG(D_INFO, "unpack fid #%d " DFID "\n", i, - PFID(&lsm->lsm_md_oinfo[i].lmo_fid)); - } - - return rc; -} - -static int lmv_unpackmd(struct obd_export *exp, struct lmv_stripe_md **lsmp, - const union lmv_mds_md *lmm, size_t lmm_size) -{ - struct lmv_stripe_md *lsm; - bool allocated = false; - int lsm_size, rc; - - LASSERT(lsmp); - - lsm = *lsmp; - /* Free memmd */ - if (lsm && !lmm) { - int i; - - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - /* - * For migrating inode, the master stripe and master - * object will be the same, so do not need iput, see - * ll_update_lsm_md - */ - if (!(lsm->lsm_md_hash_type & LMV_HASH_FLAG_MIGRATION && - !i) && lsm->lsm_md_oinfo[i].lmo_root) - iput(lsm->lsm_md_oinfo[i].lmo_root); - } - - kvfree(lsm); - *lsmp = NULL; - return 0; - } - - if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_STRIPE) - return -EPERM; - - /* Unpack memmd */ - if (le32_to_cpu(lmm->lmv_magic) != LMV_MAGIC_V1 && - le32_to_cpu(lmm->lmv_magic) != LMV_USER_MAGIC) { - CERROR("%s: 
invalid lmv magic %x: rc = %d\n", - exp->exp_obd->obd_name, le32_to_cpu(lmm->lmv_magic), - -EIO); - return -EIO; - } - - if (le32_to_cpu(lmm->lmv_magic) == LMV_MAGIC_V1) - lsm_size = lmv_stripe_md_size(lmv_mds_md_stripe_count_get(lmm)); - else - /** - * Unpack default dirstripe(lmv_user_md) to lmv_stripe_md, - * stripecount should be 0 then. - */ - lsm_size = lmv_stripe_md_size(0); - - if (!lsm) { - lsm = kvzalloc(lsm_size, GFP_NOFS); - if (!lsm) - return -ENOMEM; - allocated = true; - *lsmp = lsm; - } - - switch (le32_to_cpu(lmm->lmv_magic)) { - case LMV_MAGIC_V1: - rc = lmv_unpack_md_v1(exp, lsm, &lmm->lmv_md_v1); - break; - default: - CERROR("%s: unrecognized magic %x\n", exp->exp_obd->obd_name, - le32_to_cpu(lmm->lmv_magic)); - rc = -EINVAL; - break; - } - - if (rc && allocated) { - kvfree(lsm); - *lsmp = NULL; - lsm_size = rc; - } - return lsm_size; -} - -void lmv_free_memmd(struct lmv_stripe_md *lsm) -{ - lmv_unpackmd(NULL, &lsm, NULL, 0); -} -EXPORT_SYMBOL(lmv_free_memmd); - -static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, - union ldlm_policy_data *policy, - enum ldlm_mode mode, enum ldlm_cancel_flags flags, - void *opaque) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - int rc = 0; - int err; - u32 i; - - LASSERT(fid); - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - struct lmv_tgt_desc *tgt = lmv->tgts[i]; - - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) - continue; - - err = md_cancel_unused(tgt->ltd_exp, fid, policy, mode, flags, - opaque); - if (!rc) - rc = err; - } - return rc; -} - -static int lmv_set_lock_data(struct obd_export *exp, - const struct lustre_handle *lockh, - void *data, __u64 *bits) -{ - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - - if (!tgt || !tgt->ltd_exp) - return -EINVAL; - - return md_set_lock_data(tgt->ltd_exp, lockh, data, bits); -} - -static enum ldlm_mode lmv_lock_match(struct obd_export *exp, __u64 flags, - const struct lu_fid *fid, - enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - enum ldlm_mode rc; - int tgt; - u32 i; - - CDEBUG(D_INODE, "Lock match for " DFID "\n", PFID(fid)); - - /* - * With DNE every object can have two locks in different namespaces: - * lookup lock in space of MDT storing direntry and update/open lock in - * space of MDT storing inode. Try the MDT that the FID maps to first, - * since this can be easily found, and only try others if that fails. 
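The comment above describes the strategy that the loop below implements: start at the MDT the FID maps to and wrap around the remaining targets. Reduced to its iteration order, with a hypothetical callback:

/* Visit all tgt_count targets starting at start_tgt, wrapping around,
 * mirroring the (tgt + 1) % ld_tgt_count walk in lmv_lock_match().
 */
static int match_round_robin(int start_tgt, int tgt_count,
			     int (*try_match)(int tgt))
{
	int i, tgt = start_tgt;

	if (tgt < 0)	/* FID location unknown: just start at 0 */
		tgt = 0;

	for (i = 0; i < tgt_count; i++, tgt = (tgt + 1) % tgt_count) {
		int mode = try_match(tgt);

		if (mode)	/* a non-zero LDLM mode means a match */
			return mode;
	}
	return 0;
}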
- */ - for (i = 0, tgt = lmv_find_target_index(lmv, fid); - i < lmv->desc.ld_tgt_count; - i++, tgt = (tgt + 1) % lmv->desc.ld_tgt_count) { - if (tgt < 0) { - CDEBUG(D_HA, "%s: " DFID " is inaccessible: rc = %d\n", - obd->obd_name, PFID(fid), tgt); - tgt = 0; - } - - if (!lmv->tgts[tgt] || !lmv->tgts[tgt]->ltd_exp || - !lmv->tgts[tgt]->ltd_active) - continue; - - rc = md_lock_match(lmv->tgts[tgt]->ltd_exp, flags, fid, - type, policy, mode, lockh); - if (rc) - return rc; - } - - return 0; -} - -static int lmv_get_lustre_md(struct obd_export *exp, - struct ptlrpc_request *req, - struct obd_export *dt_exp, - struct obd_export *md_exp, - struct lustre_md *md) -{ - struct lmv_obd *lmv = &exp->exp_obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - - if (!tgt || !tgt->ltd_exp) - return -EINVAL; - return md_get_lustre_md(tgt->ltd_exp, req, dt_exp, md_exp, md); -} - -static int lmv_free_lustre_md(struct obd_export *exp, struct lustre_md *md) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - - if (md->lmv) { - lmv_free_memmd(md->lmv); - md->lmv = NULL; - } - if (!tgt || !tgt->ltd_exp) - return -EINVAL; - return md_free_lustre_md(tgt->ltd_exp, md); -} - -static int lmv_set_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och, - struct lookup_intent *it) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &och->och_fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - return md_set_open_replay_data(tgt->ltd_exp, och, it); -} - -static int lmv_clear_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, &och->och_fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - return md_clear_open_replay_data(tgt->ltd_exp, och); -} - -static int lmv_intent_getattr_async(struct obd_export *exp, - struct md_enqueue_info *minfo) -{ - struct md_op_data *op_data = &minfo->mi_data; - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *ptgt = NULL; - struct lmv_tgt_desc *ctgt = NULL; - - if (!fid_is_sane(&op_data->op_fid2)) - return -EINVAL; - - ptgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid1); - if (IS_ERR(ptgt)) - return PTR_ERR(ptgt); - - ctgt = lmv_locate_mds(lmv, op_data, &op_data->op_fid2); - if (IS_ERR(ctgt)) - return PTR_ERR(ctgt); - - /* - * if child is on remote MDT, we need 2 async RPCs to fetch both LOOKUP - * lock on parent, and UPDATE lock on child MDT, which makes all - * complicated. Considering remote dir is rare case, and not supporting - * it in statahead won't cause any issue, drop its support for now. 
- */ - if (ptgt != ctgt) - return -ENOTSUPP; - - return md_intent_getattr_async(ptgt->ltd_exp, minfo); -} - -static int lmv_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, - struct lu_fid *fid, __u64 *bits) -{ - struct obd_device *obd = exp->exp_obd; - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt; - - tgt = lmv_find_target(lmv, fid); - if (IS_ERR(tgt)) - return PTR_ERR(tgt); - - return md_revalidate_lock(tgt->ltd_exp, it, fid, bits); -} - -static int -lmv_get_fid_from_lsm(struct obd_export *exp, - const struct lmv_stripe_md *lsm, - const char *name, int namelen, struct lu_fid *fid) -{ - const struct lmv_oinfo *oinfo; - - LASSERT(lsm); - oinfo = lsm_name_to_stripe_info(lsm, name, namelen); - if (IS_ERR(oinfo)) - return PTR_ERR(oinfo); - - *fid = oinfo->lmo_fid; - - return 0; -} - -/** - * For lmv, only need to send request to master MDT, and the master MDT will - * process with other slave MDTs. The only exception is Q_GETOQUOTA for which - * we directly fetch data from the slave MDTs. - */ -static int lmv_quotactl(struct obd_device *unused, struct obd_export *exp, - struct obd_quotactl *oqctl) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lmv_obd *lmv = &obd->u.lmv; - struct lmv_tgt_desc *tgt = lmv->tgts[0]; - int rc = 0; - __u64 curspace = 0, curinodes = 0; - u32 i; - - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active || - !lmv->desc.ld_tgt_count) { - CERROR("master lmv inactive\n"); - return -EIO; - } - - if (oqctl->qc_cmd != Q_GETOQUOTA) - return obd_quotactl(tgt->ltd_exp, oqctl); - - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { - int err; - - tgt = lmv->tgts[i]; - - if (!tgt || !tgt->ltd_exp || !tgt->ltd_active) - continue; - - err = obd_quotactl(tgt->ltd_exp, oqctl); - if (err) { - CERROR("getquota on mdt %d failed. %d\n", i, err); - if (!rc) - rc = err; - } else { - curspace += oqctl->qc_dqblk.dqb_curspace; - curinodes += oqctl->qc_dqblk.dqb_curinodes; - } - } - oqctl->qc_dqblk.dqb_curspace = curspace; - oqctl->qc_dqblk.dqb_curinodes = curinodes; - - return rc; -} - -static int lmv_merge_attr(struct obd_export *exp, - const struct lmv_stripe_md *lsm, - struct cl_attr *attr, - ldlm_blocking_callback cb_blocking) -{ - int rc, i; - - rc = lmv_revalidate_slaves(exp, lsm, cb_blocking, 0); - if (rc < 0) - return rc; - - for (i = 0; i < lsm->lsm_md_stripe_count; i++) { - struct inode *inode = lsm->lsm_md_oinfo[i].lmo_root; - - CDEBUG(D_INFO, "" DFID " size %llu, blocks %llu nlink %u, atime %lu ctime %lu, mtime %lu.\n", - PFID(&lsm->lsm_md_oinfo[i].lmo_fid), - i_size_read(inode), (unsigned long long)inode->i_blocks, - inode->i_nlink, LTIME_S(inode->i_atime), - LTIME_S(inode->i_ctime), LTIME_S(inode->i_mtime)); - - /* for slave stripe, it needs to subtract nlink for . and .. 
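For reference, the attribute merge in lmv_merge_attr() around this point sums sizes and block counts, takes the maximum of the timestamps, and, as the comment here notes and the code just below does, discounts each slave stripe's own . and .. links. A stand-alone sketch with invented types:

#include <stdint.h>

struct attr_lite {
	uint64_t size, blocks;
	uint32_t nlink;
	int64_t  atime, ctime, mtime;
};

#define MAX64(a, b) ((a) > (b) ? (a) : (b))

static void merge_stripe_attr(struct attr_lite *sum,
			      const struct attr_lite *stripe, int idx)
{
	if (idx == 0)
		sum->nlink = stripe->nlink;
	else	/* drop each slave stripe's own . and .. */
		sum->nlink += stripe->nlink - 2;

	sum->size   += stripe->size;
	sum->blocks += stripe->blocks;
	sum->atime = MAX64(sum->atime, stripe->atime);
	sum->ctime = MAX64(sum->ctime, stripe->ctime);
	sum->mtime = MAX64(sum->mtime, stripe->mtime);
}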
*/ - if (i) - attr->cat_nlink += inode->i_nlink - 2; - else - attr->cat_nlink = inode->i_nlink; - - attr->cat_size += i_size_read(inode); - attr->cat_blocks += inode->i_blocks; - - if (attr->cat_atime < LTIME_S(inode->i_atime)) - attr->cat_atime = LTIME_S(inode->i_atime); - - if (attr->cat_ctime < LTIME_S(inode->i_ctime)) - attr->cat_ctime = LTIME_S(inode->i_ctime); - - if (attr->cat_mtime < LTIME_S(inode->i_mtime)) - attr->cat_mtime = LTIME_S(inode->i_mtime); - } - return 0; -} - -static struct obd_ops lmv_obd_ops = { - .owner = THIS_MODULE, - .setup = lmv_setup, - .cleanup = lmv_cleanup, - .precleanup = lmv_precleanup, - .process_config = lmv_process_config, - .connect = lmv_connect, - .disconnect = lmv_disconnect, - .statfs = lmv_statfs, - .get_info = lmv_get_info, - .set_info_async = lmv_set_info_async, - .notify = lmv_notify, - .get_uuid = lmv_get_uuid, - .iocontrol = lmv_iocontrol, - .quotactl = lmv_quotactl -}; - -static struct md_ops lmv_md_ops = { - .getstatus = lmv_getstatus, - .null_inode = lmv_null_inode, - .close = lmv_close, - .create = lmv_create, - .enqueue = lmv_enqueue, - .getattr = lmv_getattr, - .getxattr = lmv_getxattr, - .getattr_name = lmv_getattr_name, - .intent_lock = lmv_intent_lock, - .link = lmv_link, - .rename = lmv_rename, - .setattr = lmv_setattr, - .setxattr = lmv_setxattr, - .sync = lmv_sync, - .read_page = lmv_read_page, - .unlink = lmv_unlink, - .init_ea_size = lmv_init_ea_size, - .cancel_unused = lmv_cancel_unused, - .set_lock_data = lmv_set_lock_data, - .lock_match = lmv_lock_match, - .get_lustre_md = lmv_get_lustre_md, - .free_lustre_md = lmv_free_lustre_md, - .merge_attr = lmv_merge_attr, - .set_open_replay_data = lmv_set_open_replay_data, - .clear_open_replay_data = lmv_clear_open_replay_data, - .intent_getattr_async = lmv_intent_getattr_async, - .revalidate_lock = lmv_revalidate_lock, - .get_fid_from_lsm = lmv_get_fid_from_lsm, - .unpackmd = lmv_unpackmd, -}; - -static int __init lmv_init(void) -{ - struct lprocfs_static_vars lvars; - int rc; - - lprocfs_lmv_init_vars(&lvars); - - rc = libcfs_setup(); - if (rc) - return rc; - - return class_register_type(&lmv_obd_ops, &lmv_md_ops, - LUSTRE_LMV_NAME, NULL); -} - -static void lmv_exit(void) -{ - class_unregister_type(LUSTRE_LMV_NAME); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Logical Metadata Volume"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(lmv_init); -module_exit(lmv_exit); diff --git a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c b/drivers/staging/lustre/lustre/lmv/lproc_lmv.c deleted file mode 100644 index 30727b7accccb821d47a2a26aee4a8df58f8520a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lmv/lproc_lmv.c +++ /dev/null @@ -1,173 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include "lmv_internal.h" - -static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct lmv_desc *desc; - - desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_tgt_count); -} -LUSTRE_RO_ATTR(numobd); - -static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct lmv_desc *desc; - - desc = &dev->u.lmv.desc; - return sprintf(buf, "%u\n", desc->ld_active_tgt_count); -} -LUSTRE_RO_ATTR(activeobd); - -static int lmv_desc_uuid_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = (struct obd_device *)m->private; - struct lmv_obd *lmv; - - LASSERT(dev); - lmv = &dev->u.lmv; - seq_printf(m, "%s\n", lmv->desc.ld_uuid.uuid); - return 0; -} - -LPROC_SEQ_FOPS_RO(lmv_desc_uuid); - -static void *lmv_tgt_seq_start(struct seq_file *p, loff_t *pos) -{ - struct obd_device *dev = p->private; - struct lmv_obd *lmv = &dev->u.lmv; - - while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos]) - return lmv->tgts[*pos]; - ++*pos; - } - - return NULL; -} - -static void lmv_tgt_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *lmv_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - struct obd_device *dev = p->private; - struct lmv_obd *lmv = &dev->u.lmv; - - ++*pos; - while (*pos < lmv->tgts_size) { - if (lmv->tgts[*pos]) - return lmv->tgts[*pos]; - ++*pos; - } - - return NULL; -} - -static int lmv_tgt_seq_show(struct seq_file *p, void *v) -{ - struct lmv_tgt_desc *tgt = v; - - if (!tgt) - return 0; - seq_printf(p, "%u: %s %sACTIVE\n", - tgt->ltd_idx, tgt->ltd_uuid.uuid, - tgt->ltd_active ? 
"" : "IN"); - return 0; -} - -static const struct seq_operations lmv_tgt_sops = { - .start = lmv_tgt_seq_start, - .stop = lmv_tgt_seq_stop, - .next = lmv_tgt_seq_next, - .show = lmv_tgt_seq_show, -}; - -static int lmv_target_seq_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int rc; - - rc = seq_open(file, &lmv_tgt_sops); - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - - return 0; -} - -static struct lprocfs_vars lprocfs_lmv_obd_vars[] = { - { "desc_uuid", &lmv_desc_uuid_fops, NULL, 0 }, - { NULL } -}; - -const struct file_operations lmv_proc_target_fops = { - .owner = THIS_MODULE, - .open = lmv_target_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static struct attribute *lmv_attrs[] = { - &lustre_attr_activeobd.attr, - &lustre_attr_numobd.attr, - NULL, -}; - -static const struct attribute_group lmv_attr_group = { - .attrs = lmv_attrs, -}; - -void lprocfs_lmv_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->sysfs_vars = &lmv_attr_group; - lvars->obd_vars = lprocfs_lmv_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/lov/Makefile b/drivers/staging/lustre/lustre/lov/Makefile deleted file mode 100644 index 1ebf0193f61a63631d6e9819ea9e0d79a5027822..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += lov.o -lov-y := lov_obd.o lov_pack.o lov_offset.o lov_merge.o \ - lov_request.o lov_ea.o lov_dev.o lov_object.o lov_page.o \ - lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o \ - lovsub_lock.o lov_pool.o lproc_lov.o diff --git a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h b/drivers/staging/lustre/lustre/lov/lov_cl_internal.h deleted file mode 100644 index e4f762137a4abf4d51d11c3482cc698a6b9bd815..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_cl_internal.h +++ /dev/null @@ -1,639 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Internal interfaces of LOV layer. 
- * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#ifndef LOV_CL_INTERNAL_H -#define LOV_CL_INTERNAL_H - -#include -#include -#include "lov_internal.h" - -/** \defgroup lov lov - * Logical object volume layer. This layer implements data striping (raid0). - * - * At the lov layer top-entity (object, page, lock, io) is connected to one or - * more sub-entities: top-object, representing a file, is connected to a set of - * sub-objects, each representing a stripe, file-level top-lock is connected - * to a set of per-stripe sub-locks, top-page is connected to a (single) - * sub-page, and a top-level IO is connected to a set of (potentially - * concurrent) sub-IO's. - * - * Sub-object, sub-page, and sub-io have well-defined top-object, top-page and - * top-io respectively, while a single sub-lock can be part of multiple top-locks. - * - * Reference counting models are different for different types of entities: - * - * - top-object keeps a reference to its sub-objects, and destroys them - * when it is destroyed. - * - * - top-page keeps a reference to its sub-page, and destroys it when it - * is destroyed. - * - * - IO's are not reference counted. - * - * To implement a connection between top and sub entities, lov layer is split - * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both - * implementing full set of cl-interfaces. For example, top-object has vvp and - * lov layers, and its sub-object has lovsub and osc layers. lovsub layer is - * used to track child-parent relationship. - * - * @{ - */ - -struct lovsub_device; -struct lovsub_object; -struct lovsub_lock; - -enum lov_device_flags { - LOV_DEV_INITIALIZED = 1 << 0 -}; - -/* - * Upper half. - */ - -struct lov_device { - /* - * XXX Locking of lov-private data is missing. - */ - struct cl_device ld_cl; - struct lov_obd *ld_lov; - /** size of lov_device::ld_target[] array */ - __u32 ld_target_nr; - struct lovsub_device **ld_target; - __u32 ld_flags; -}; - -/** - * Layout type. - */ -enum lov_layout_type { - LLT_EMPTY, /** empty file without body (mknod + truncate) */ - LLT_RAID0, /** striped file */ - LLT_RELEASED, /** file with no objects (data in HSM) */ - LLT_NR -}; - -static inline char *llt2str(enum lov_layout_type llt) -{ - switch (llt) { - case LLT_EMPTY: - return "EMPTY"; - case LLT_RAID0: - return "RAID0"; - case LLT_RELEASED: - return "RELEASED"; - case LLT_NR: - LBUG(); - } - LBUG(); - return ""; -} - -/** - * lov-specific file state. - * - * lov object has particular layout type, determining how top-object is built - * on top of sub-objects. Layout type can change dynamically. When this - * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, - * all state pertaining to the old layout type is destroyed, and new state is - * constructed. All object methods take said semaphore in the shared mode, - * providing serialization against transition between layout types. - * - * To avoid multiple `if' or `switch' statements, selecting behavior for the - * current layout type, object methods perform double-dispatch, invoking - * function corresponding to the current layout type. - */ -struct lov_object { - struct cl_object lo_cl; - /** - * Serializes object operations with transitions between layout types. - * - * This semaphore is taken in shared mode by all object methods, and - * is taken in exclusive mode when object type is changed. - * - * \see lov_object::lo_type - */ - struct rw_semaphore lo_type_guard; - /** - * Type of an object. Protected by lov_object::lo_type_guard.
- */ - enum lov_layout_type lo_type; - /** - * True if layout is invalid. This bit is cleared when layout lock - * is lost. - */ - bool lo_layout_invalid; - /** - * How many IOs are ongoing on this object. Layout can be changed - * only if there is no active IO. - */ - atomic_t lo_active_ios; - /** - * Waitq - wait until no one else is using lo_lsm - */ - wait_queue_head_t lo_waitq; - /** - * Layout metadata. NULL if empty layout. - */ - struct lov_stripe_md *lo_lsm; - - union lov_layout_state { - struct lov_layout_raid0 { - unsigned int lo_nr; - /** - * When this is true, lov_object::lo_attr contains - * valid, up-to-date attributes for a top-level - * object. This field is reset to 0 when attributes of - * any sub-object change. - */ - int lo_attr_valid; - /** - * Array of sub-objects. Allocated when top-object is - * created (lov_init_raid0()). - * - * Top-object is a strict master of its sub-objects: - * it is created before them, and outlives its - * children (this latter is necessary so that basic - * functions like cl_object_top() always - * work). Top-object keeps a reference on every - * sub-object. - * - * When top-object is destroyed (lov_delete_raid0()) - * it releases its reference to a sub-object and waits - * until the latter is finally destroyed. - */ - struct lovsub_object **lo_sub; - /** - * protect lo_sub - */ - spinlock_t lo_sub_lock; - /** - * Cached object attribute, built from sub-object - * attributes. - */ - struct cl_attr lo_attr; - } raid0; - struct lov_layout_state_empty { - } empty; - struct lov_layout_state_released { - } released; - } u; - /** - * Thread that acquired lov_object::lo_type_guard in an exclusive - * mode. - */ - struct task_struct *lo_owner; -}; - -/** - * State lov_lock keeps for each sub-lock. - */ -struct lov_lock_sub { - /** sub-lock itself */ - struct cl_lock sub_lock; - /** Set if the sublock has ever been enqueued, meaning it may - * hold resources of underlying layers - */ - unsigned int sub_is_enqueued:1, - sub_initialized:1; - int sub_stripe; -}; - -/** - * lov-specific lock state. - */ -struct lov_lock { - struct cl_lock_slice lls_cl; - /** Number of sub-locks in this lock */ - int lls_nr; - /** sublock array */ - struct lov_lock_sub lls_sub[0]; -}; - -struct lov_page { - struct cl_page_slice lps_cl; - unsigned int lps_stripe; /* stripe index */ -}; - -/* - * Bottom half. - */ - -struct lovsub_device { - struct cl_device acid_cl; - struct cl_device *acid_next; -}; - -struct lovsub_object { - struct cl_object_header lso_header; - struct cl_object lso_cl; - struct lov_object *lso_super; - int lso_index; -}; - -/** - * Lock state at lovsub layer. - */ -struct lovsub_lock { - struct cl_lock_slice lss_cl; -}; - -/** - * Describe the environment settings for sublocks. - */ -struct lov_sublock_env { - const struct lu_env *lse_env; - struct cl_io *lse_io; -}; - -struct lovsub_page { - struct cl_page_slice lsb_cl; -}; - -struct lov_thread_info { - struct cl_object_conf lti_stripe_conf; - struct lu_fid lti_fid; - struct ost_lvb lti_lvb; - struct cl_2queue lti_cl2q; - struct cl_page_list lti_plist; - wait_queue_entry_t lti_waiter; -}; - -/** - * State that lov_io maintains for every sub-io. - */ -struct lov_io_sub { - u16 sub_stripe; - /** - * environment's refcheck. - * - * \see cl_env_get() - */ - u16 sub_refcheck; - /** - * true, iff cl_io_init() was successfully executed against - * lov_io_sub::sub_io.
- */ - u16 sub_io_initialized:1, - /** - * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't - * allocated, but borrowed from a per-device emergency pool. - */ - sub_borrowed:1; - /** - * Linkage into a list (hanging off lov_io::lis_active) of all - * sub-io's active for the current IO iteration. - */ - struct list_head sub_linkage; - /** - * sub-io for a stripe. Ideally sub-io's can be stopped and resumed - * independently, with lov acting as a scheduler to maximize overall - * throughput. - */ - struct cl_io *sub_io; - /** - * environment, in which sub-io executes. - */ - struct lu_env *sub_env; -}; - -/** - * IO state private for LOV. - */ -struct lov_io { - /** super-class */ - struct cl_io_slice lis_cl; - /** - * Pointer to the object slice. This is a duplicate of - * lov_io::lis_cl::cis_object. - */ - struct lov_object *lis_object; - /** - * Original end-of-io position for this IO, set by the upper layer as - * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, - * changes pos and count to fit IO into a single stripe and uses saved - * value to determine when IO iterations have to stop. - * - * This is used only for CIT_READ and CIT_WRITE io's. - */ - loff_t lis_io_endpos; - - /** - * starting position within a file, for the current io loop iteration - * (stripe), used by ci_io_loop(). - */ - u64 lis_pos; - /** - * end position within a file, for the current stripe io. This is - * exclusive (i.e., next offset after last byte affected by io). - */ - u64 lis_endpos; - - int lis_stripe_count; - int lis_active_subios; - - /** - * the index of ls_single_subio in ls_subios array - */ - int lis_single_subio_index; - struct cl_io lis_single_subio; - - /** - * size of ls_subios array, actually the highest stripe # - */ - int lis_nr_subios; - struct lov_io_sub *lis_subs; - /** - * List of active sub-io's.
- */ - struct list_head lis_active; -}; - -struct lov_session { - struct lov_io ls_io; - struct lov_sublock_env ls_subenv; -}; - -extern struct lu_device_type lov_device_type; -extern struct lu_device_type lovsub_device_type; - -extern struct lu_context_key lov_key; -extern struct lu_context_key lov_session_key; - -extern struct kmem_cache *lov_lock_kmem; -extern struct kmem_cache *lov_object_kmem; -extern struct kmem_cache *lov_thread_kmem; -extern struct kmem_cache *lov_session_kmem; - -extern struct kmem_cache *lovsub_lock_kmem; -extern struct kmem_cache *lovsub_object_kmem; - -int lov_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf); -int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf); -int lov_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); -int lov_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); -int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); - -int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); -int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io); -int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); -int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); -int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io); - -struct lov_io_sub *lov_sub_get(const struct lu_env *env, struct lov_io *lio, - int stripe); - -int lov_page_init(const struct lu_env *env, struct cl_object *ob, - struct cl_page *page, pgoff_t index); -int lovsub_page_init(const struct lu_env *env, struct cl_object *ob, - struct cl_page *page, pgoff_t index); -int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); -int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); -struct lu_object *lov_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); -struct lu_object *lovsub_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); - -struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov); -int lov_page_stripe(const struct cl_page *page); - -#define lov_foreach_target(lov, var) \ - for (var = 0; var < lov_targets_nr(lov); ++var) - -/***************************************************************************** - * - * Type conversions. - * - * Accessors. 
- * - */ - -static inline struct lov_session *lov_env_session(const struct lu_env *env) -{ - struct lov_session *ses; - - ses = lu_context_key_get(env->le_ses, &lov_session_key); - LASSERT(ses); - return ses; -} - -static inline struct lov_io *lov_env_io(const struct lu_env *env) -{ - return &lov_env_session(env)->ls_io; -} - -static inline int lov_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &lov_device_type; -} - -static inline int lovsub_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &lovsub_device_type; -} - -static inline struct lu_device *lov2lu_dev(struct lov_device *lov) -{ - return &lov->ld_cl.cd_lu_dev; -} - -static inline struct lov_device *lu2lov_dev(const struct lu_device *d) -{ - LINVRNT(d->ld_type == &lov_device_type); - return container_of(d, struct lov_device, ld_cl.cd_lu_dev); -} - -static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) -{ - return &lovsub->acid_cl; -} - -static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) -{ - return &lovsub2cl_dev(lovsub)->cd_lu_dev; -} - -static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) -{ - LINVRNT(d->ld_type == &lovsub_device_type); - return container_of(d, struct lovsub_device, acid_cl.cd_lu_dev); -} - -static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) -{ - LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); - return container_of(d, struct lovsub_device, acid_cl); -} - -static inline struct lu_object *lov2lu(struct lov_object *lov) -{ - return &lov->lo_cl.co_lu; -} - -static inline struct cl_object *lov2cl(struct lov_object *lov) -{ - return &lov->lo_cl; -} - -static inline struct lov_object *lu2lov(const struct lu_object *obj) -{ - LINVRNT(lov_is_object(obj)); - return container_of(obj, struct lov_object, lo_cl.co_lu); -} - -static inline struct lov_object *cl2lov(const struct cl_object *obj) -{ - LINVRNT(lov_is_object(&obj->co_lu)); - return container_of(obj, struct lov_object, lo_cl); -} - -static inline struct lu_object *lovsub2lu(struct lovsub_object *los) -{ - return &los->lso_cl.co_lu; -} - -static inline struct cl_object *lovsub2cl(struct lovsub_object *los) -{ - return &los->lso_cl; -} - -static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) -{ - LINVRNT(lovsub_is_object(&obj->co_lu)); - return container_of(obj, struct lovsub_object, lso_cl); -} - -static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) -{ - LINVRNT(lovsub_is_object(obj)); - return container_of(obj, struct lovsub_object, lso_cl.co_lu); -} - -static inline struct lovsub_lock * -cl2lovsub_lock(const struct cl_lock_slice *slice) -{ - LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); - return container_of(slice, struct lovsub_lock, lss_cl); -} - -static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) -{ - const struct cl_lock_slice *slice; - - slice = cl_lock_at(lock, &lovsub_device_type); - LASSERT(slice); - return cl2lovsub_lock(slice); -} - -static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) -{ - LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); - return container_of(slice, struct lov_lock, lls_cl); -} - -static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) -{ - LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); - return container_of(slice, struct lov_page, lps_cl); -} - -static inline struct lovsub_page * -cl2lovsub_page(const struct cl_page_slice *slice) -{ - 
LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); - return container_of(slice, struct lovsub_page, lsb_cl); -} - -static inline struct lov_io *cl2lov_io(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct lov_io *lio; - - lio = container_of(ios, struct lov_io, lis_cl); - LASSERT(lio == lov_env_io(env)); - return lio; -} - -static inline int lov_targets_nr(const struct lov_device *lov) -{ - return lov->ld_lov->desc.ld_tgt_count; -} - -static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) -{ - struct lov_thread_info *info; - - info = lu_context_key_get(&env->le_ctx, &lov_key); - LASSERT(info); - return info; -} - -static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov) -{ - LASSERT(lov->lo_type == LLT_RAID0); - LASSERT(lov->lo_lsm->lsm_magic == LOV_MAGIC || - lov->lo_lsm->lsm_magic == LOV_MAGIC_V3); - return &lov->u.raid0; -} - -/* lov_pack.c */ -int lov_getstripe(struct lov_object *obj, struct lov_stripe_md *lsm, - struct lov_user_md __user *lump); - -/** @} lov */ - -#endif diff --git a/drivers/staging/lustre/lustre/lov/lov_dev.c b/drivers/staging/lustre/lustre/lov/lov_dev.c deleted file mode 100644 index c7db23472346c596d45e46bcbc6a3508328cdcdf..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_dev.c +++ /dev/null @@ -1,384 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_device and cl_device_type for LOV layer. 
- * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -/* class_name2obd() */ -#include - -#include "lov_cl_internal.h" -#include "lov_internal.h" - -struct kmem_cache *lov_lock_kmem; -struct kmem_cache *lov_object_kmem; -struct kmem_cache *lov_thread_kmem; -struct kmem_cache *lov_session_kmem; - -struct kmem_cache *lovsub_lock_kmem; -struct kmem_cache *lovsub_object_kmem; - -struct lu_kmem_descr lov_caches[] = { - { - .ckd_cache = &lov_lock_kmem, - .ckd_name = "lov_lock_kmem", - .ckd_size = sizeof(struct lov_lock) - }, - { - .ckd_cache = &lov_object_kmem, - .ckd_name = "lov_object_kmem", - .ckd_size = sizeof(struct lov_object) - }, - { - .ckd_cache = &lov_thread_kmem, - .ckd_name = "lov_thread_kmem", - .ckd_size = sizeof(struct lov_thread_info) - }, - { - .ckd_cache = &lov_session_kmem, - .ckd_name = "lov_session_kmem", - .ckd_size = sizeof(struct lov_session) - }, - { - .ckd_cache = &lovsub_lock_kmem, - .ckd_name = "lovsub_lock_kmem", - .ckd_size = sizeof(struct lovsub_lock) - }, - { - .ckd_cache = &lovsub_object_kmem, - .ckd_name = "lovsub_object_kmem", - .ckd_size = sizeof(struct lovsub_object) - }, - { - .ckd_cache = NULL - } -}; - -/***************************************************************************** - * - * Lov device and device type functions. - * - */ - -static void *lov_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct lov_thread_info *info; - - info = kmem_cache_zalloc(lov_thread_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void lov_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct lov_thread_info *info = data; - - kmem_cache_free(lov_thread_kmem, info); -} - -struct lu_context_key lov_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = lov_key_init, - .lct_fini = lov_key_fini -}; - -static void *lov_session_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct lov_session *info; - - info = kmem_cache_zalloc(lov_session_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void lov_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct lov_session *info = data; - - kmem_cache_free(lov_session_kmem, info); -} - -struct lu_context_key lov_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = lov_session_key_init, - .lct_fini = lov_session_key_fini -}; - -/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ -LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); - -static struct lu_device *lov_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - int i; - struct lov_device *ld = lu2lov_dev(d); - - LASSERT(ld->ld_lov); - if (!ld->ld_target) - return NULL; - - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; - - lsd = ld->ld_target[i]; - if (lsd) { - cl_stack_fini(env, lovsub2cl_dev(lsd)); - ld->ld_target[i] = NULL; - } - } - return NULL; -} - -static int lov_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) -{ - struct lov_device *ld = lu2lov_dev(d); - int i; - int rc = 0; - - LASSERT(d->ld_site); - if (!ld->ld_target) - return rc; - - lov_foreach_target(ld, i) { - struct lovsub_device *lsd; - struct cl_device *cl; - struct lov_tgt_desc *desc; - - desc = ld->ld_lov->lov_tgts[i]; - if (!desc) - continue; - - cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, - desc->ltd_obd->obd_lu_dev); - if (IS_ERR(cl)) { - rc = PTR_ERR(cl); - break; - } - lsd = 
cl2lovsub_dev(cl); - ld->ld_target[i] = lsd; - } - - if (rc) - lov_device_fini(env, d); - else - ld->ld_flags |= LOV_DEV_INITIALIZED; - - return rc; -} - -static struct lu_device *lov_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct lov_device *ld = lu2lov_dev(d); - - cl_device_fini(lu2cl_dev(d)); - kfree(ld->ld_target); - kfree(ld); - return NULL; -} - -static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) -{ - struct lov_device *ld = lu2lov_dev(dev); - - if (ld->ld_target[index]) { - cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); - ld->ld_target[index] = NULL; - } -} - -static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) -{ - int result; - __u32 tgt_size; - __u32 sub_size; - - result = 0; - tgt_size = dev->ld_lov->lov_tgt_size; - sub_size = dev->ld_target_nr; - if (sub_size < tgt_size) { - struct lovsub_device **newd; - const size_t sz = sizeof(newd[0]); - - newd = kcalloc(tgt_size, sz, GFP_NOFS); - if (newd) { - if (sub_size > 0) { - memcpy(newd, dev->ld_target, sub_size * sz); - kfree(dev->ld_target); - } - dev->ld_target = newd; - dev->ld_target_nr = tgt_size; - } else { - result = -ENOMEM; - } - } - return result; -} - -static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, - __u32 index) -{ - struct obd_device *obd = dev->ld_obd; - struct lov_device *ld = lu2lov_dev(dev); - struct lov_tgt_desc *tgt; - struct lovsub_device *lsd; - struct cl_device *cl; - int rc; - - obd_getref(obd); - - tgt = obd->u.lov.lov_tgts[index]; - - if (!tgt->ltd_obd->obd_set_up) { - CERROR("Target %s not set up\n", obd_uuid2str(&tgt->ltd_uuid)); - return -EINVAL; - } - - rc = lov_expand_targets(env, ld); - if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { - LASSERT(dev->ld_site); - - cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, - tgt->ltd_obd->obd_lu_dev); - if (!IS_ERR(cl)) { - lsd = cl2lovsub_dev(cl); - ld->ld_target[index] = lsd; - } else { - CERROR("add failed (%d), deleting %s\n", rc, - obd_uuid2str(&tgt->ltd_uuid)); - lov_cl_del_target(env, dev, index); - rc = PTR_ERR(cl); - } - } - obd_putref(obd); - return rc; -} - -static int lov_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) -{ - struct obd_device *obd = d->ld_obd; - int cmd; - int rc; - int gen; - __u32 index; - - obd_getref(obd); - - cmd = cfg->lcfg_command; - rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); - if (rc == 0) { - switch (cmd) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - rc = lov_cl_add_target(env, d, index); - if (rc != 0) - lov_del_target(d->ld_obd, index, NULL, 0); - break; - case LCFG_LOV_DEL_OBD: - lov_cl_del_target(env, d, index); - break; - } - } - obd_putref(obd); - return rc; -} - -static const struct lu_device_operations lov_lu_ops = { - .ldo_object_alloc = lov_object_alloc, - .ldo_process_config = lov_process_config, -}; - -static struct lu_device *lov_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct lu_device *d; - struct lov_device *ld; - struct obd_device *obd; - int rc; - - ld = kzalloc(sizeof(*ld), GFP_NOFS); - if (!ld) - return ERR_PTR(-ENOMEM); - - cl_device_init(&ld->ld_cl, t); - d = lov2lu_dev(ld); - d->ld_ops = &lov_lu_ops; - - /* setup the LOV OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd); - rc = lov_setup(obd, cfg); - if (rc) { - lov_device_free(env, d); - return ERR_PTR(rc); - } - - ld->ld_lov = &obd->u.lov; - return d; -} - 
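
The lu2lov_dev(), cl2lovsub_dev() and related helpers used throughout the file above are container_of()-style downcasts: each layer embeds the generic device structure, and the accessor recovers the enclosing layer-private structure from a pointer to the embedded member. A minimal stand-alone sketch of the same pattern (user-space C; container_of() is redefined locally, and the struct and function names are illustrative rather than taken from the Lustre tree):

#include <stddef.h>
#include <stdio.h>

/* Same idea as the kernel's container_of(): recover the address of the
 * enclosing structure from a pointer to one of its members. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Generic "base" device, standing in for struct lu_device / struct cl_device. */
struct base_dev {
	const char *name;
};

/* Layer-private device embedding the base, the way struct lov_device embeds
 * ld_cl.cd_lu_dev. */
struct layer_dev {
	int flags;
	struct base_dev base;
};

/* Downcast from the embedded base, as lu2lov_dev() does. */
static struct layer_dev *base2layer(struct base_dev *d)
{
	return container_of(d, struct layer_dev, base);
}

int main(void)
{
	struct layer_dev ld = { .flags = 1, .base = { .name = "lov" } };
	struct base_dev *d = &ld.base;	/* what generic code passes around */
	struct layer_dev *back = base2layer(d);

	printf("%s flags=%d round-trip=%d\n",
	       back->base.name, back->flags, back == &ld);
	return 0;
}

The LINVRNT(d->ld_type == &lov_device_type) checks that accompany these conversions in the real code guard against applying the downcast to a device of the wrong layer, which container_of() by itself cannot detect.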
-static const struct lu_device_type_operations lov_device_type_ops = { - .ldto_init = lov_type_init, - .ldto_fini = lov_type_fini, - - .ldto_start = lov_type_start, - .ldto_stop = lov_type_stop, - - .ldto_device_alloc = lov_device_alloc, - .ldto_device_free = lov_device_free, - - .ldto_device_init = lov_device_init, - .ldto_device_fini = lov_device_fini -}; - -struct lu_device_type lov_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOV_NAME, - .ldt_ops = &lov_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD -}; - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_ea.c b/drivers/staging/lustre/lustre/lov/lov_ea.c deleted file mode 100644 index c80320ab0858c7878b1f38f4418db1d4d7043cfb..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_ea.c +++ /dev/null @@ -1,331 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/lov/lov_ea.c - * - * Author: Wang Di - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include - -#include -#include - -#include "lov_internal.h" - -static int lsm_lmm_verify_common(struct lov_mds_md *lmm, int lmm_bytes, - __u16 stripe_count) -{ - if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { - CERROR("bad stripe count %d\n", stripe_count); - lov_dump_lmm_common(D_WARNING, lmm); - return -EINVAL; - } - - if (lmm_oi_id(&lmm->lmm_oi) == 0) { - CERROR("zero object id\n"); - lov_dump_lmm_common(D_WARNING, lmm); - return -EINVAL; - } - - if (lov_pattern(le32_to_cpu(lmm->lmm_pattern)) != LOV_PATTERN_RAID0) { - CERROR("bad striping pattern\n"); - lov_dump_lmm_common(D_WARNING, lmm); - return -EINVAL; - } - - if (lmm->lmm_stripe_size == 0 || - (le32_to_cpu(lmm->lmm_stripe_size) & - (LOV_MIN_STRIPE_SIZE - 1)) != 0) { - CERROR("bad stripe size %u\n", - le32_to_cpu(lmm->lmm_stripe_size)); - lov_dump_lmm_common(D_WARNING, lmm); - return -EINVAL; - } - return 0; -} - -struct lov_stripe_md *lsm_alloc_plain(u16 stripe_count) -{ - size_t oinfo_ptrs_size, lsm_size; - struct lov_stripe_md *lsm; - struct lov_oinfo *loi; - int i; - - LASSERT(stripe_count <= LOV_MAX_STRIPE_COUNT); - - oinfo_ptrs_size = sizeof(struct lov_oinfo *) * stripe_count; - lsm_size = sizeof(*lsm) + oinfo_ptrs_size; - - lsm = kvzalloc(lsm_size, GFP_NOFS); - if (!lsm) - return NULL; - - for (i = 0; i < stripe_count; i++) { - loi = kmem_cache_zalloc(lov_oinfo_slab, GFP_NOFS); - if (!loi) - goto err; - lsm->lsm_oinfo[i] = loi; - } - lsm->lsm_stripe_count = stripe_count; - return lsm; - -err: - while (--i >= 0) - kmem_cache_free(lov_oinfo_slab, lsm->lsm_oinfo[i]); - kvfree(lsm); - return NULL; -} - -void lsm_free_plain(struct lov_stripe_md *lsm) -{ - __u16 stripe_count = lsm->lsm_stripe_count; - int i; - - for (i = 0; i < stripe_count; i++) - kmem_cache_free(lov_oinfo_slab, lsm->lsm_oinfo[i]); - kvfree(lsm); -} - -/* - * Find minimum stripe maxbytes value. For inactive or - * reconnecting targets use LUSTRE_EXT3_STRIPE_MAXBYTES. - */ -static loff_t lov_tgt_maxbytes(struct lov_tgt_desc *tgt) -{ - loff_t maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; - struct obd_import *imp; - - if (!tgt->ltd_active) - return maxbytes; - - imp = tgt->ltd_obd->u.cli.cl_import; - if (!imp) - return maxbytes; - - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL && - (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_MAXBYTES) && - imp->imp_connect_data.ocd_maxbytes > 0) - maxbytes = imp->imp_connect_data.ocd_maxbytes; - - spin_unlock(&imp->imp_lock); - - return maxbytes; -} - -static int lsm_unpackmd_common(struct lov_obd *lov, - struct lov_stripe_md *lsm, - struct lov_mds_md *lmm, - struct lov_ost_data_v1 *objects) -{ - loff_t min_stripe_maxbytes = 0; - unsigned int stripe_count; - struct lov_oinfo *loi; - loff_t lov_bytes; - unsigned int i; - - /* - * This supposes lov_mds_md_v1/v3 first fields - * are the same - */ - lmm_oi_le_to_cpu(&lsm->lsm_oi, &lmm->lmm_oi); - lsm->lsm_stripe_size = le32_to_cpu(lmm->lmm_stripe_size); - lsm->lsm_pattern = le32_to_cpu(lmm->lmm_pattern); - lsm->lsm_layout_gen = le16_to_cpu(lmm->lmm_layout_gen); - lsm->lsm_pool_name[0] = '\0'; - - stripe_count = lsm_is_released(lsm) ?
0 : lsm->lsm_stripe_count; - - for (i = 0; i < stripe_count; i++) { - loi = lsm->lsm_oinfo[i]; - ostid_le_to_cpu(&objects[i].l_ost_oi, &loi->loi_oi); - loi->loi_ost_idx = le32_to_cpu(objects[i].l_ost_idx); - loi->loi_ost_gen = le32_to_cpu(objects[i].l_ost_gen); - if (lov_oinfo_is_dummy(loi)) - continue; - - if (loi->loi_ost_idx >= lov->desc.ld_tgt_count && - !lov2obd(lov)->obd_process_conf) { - CERROR("%s: OST index %d more than OST count %d\n", - (char *)lov->desc.ld_uuid.uuid, - loi->loi_ost_idx, lov->desc.ld_tgt_count); - lov_dump_lmm_v1(D_WARNING, lmm); - return -EINVAL; - } - - if (!lov->lov_tgts[loi->loi_ost_idx]) { - CERROR("%s: OST index %d missing\n", - (char *)lov->desc.ld_uuid.uuid, - loi->loi_ost_idx); - lov_dump_lmm_v1(D_WARNING, lmm); - continue; - } - - lov_bytes = lov_tgt_maxbytes(lov->lov_tgts[loi->loi_ost_idx]); - if (min_stripe_maxbytes == 0 || lov_bytes < min_stripe_maxbytes) - min_stripe_maxbytes = lov_bytes; - } - - if (min_stripe_maxbytes == 0) - min_stripe_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES; - - stripe_count = lsm->lsm_stripe_count ?: lov->desc.ld_tgt_count; - lov_bytes = min_stripe_maxbytes * stripe_count; - - if (lov_bytes < min_stripe_maxbytes) /* handle overflow */ - lsm->lsm_maxbytes = MAX_LFS_FILESIZE; - else - lsm->lsm_maxbytes = lov_bytes; - - return 0; -} - -static void -lsm_stripe_by_index_plain(struct lov_stripe_md *lsm, int *stripeno, - loff_t *lov_off, loff_t *swidth) -{ - if (swidth) - *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; -} - -static void -lsm_stripe_by_offset_plain(struct lov_stripe_md *lsm, int *stripeno, - loff_t *lov_off, loff_t *swidth) -{ - if (swidth) - *swidth = (u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count; -} - -static int lsm_lmm_verify_v1(struct lov_mds_md_v1 *lmm, int lmm_bytes, - __u16 *stripe_count) -{ - if (lmm_bytes < sizeof(*lmm)) { - CERROR("lov_mds_md_v1 too small: %d, need at least %d\n", - lmm_bytes, (int)sizeof(*lmm)); - return -EINVAL; - } - - *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); - if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) - *stripe_count = 0; - - if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)) { - CERROR("LOV EA V1 too small: %d, need %d\n", - lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V1)); - lov_dump_lmm_common(D_WARNING, lmm); - return -EINVAL; - } - - return lsm_lmm_verify_common(lmm, lmm_bytes, *stripe_count); -} - -static int lsm_unpackmd_v1(struct lov_obd *lov, struct lov_stripe_md *lsm, - struct lov_mds_md_v1 *lmm) -{ - return lsm_unpackmd_common(lov, lsm, lmm, lmm->lmm_objects); -} - -const struct lsm_operations lsm_v1_ops = { - .lsm_free = lsm_free_plain, - .lsm_stripe_by_index = lsm_stripe_by_index_plain, - .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, - .lsm_lmm_verify = lsm_lmm_verify_v1, - .lsm_unpackmd = lsm_unpackmd_v1, -}; - -static int lsm_lmm_verify_v3(struct lov_mds_md *lmmv1, int lmm_bytes, - __u16 *stripe_count) -{ - struct lov_mds_md_v3 *lmm; - - lmm = (struct lov_mds_md_v3 *)lmmv1; - - if (lmm_bytes < sizeof(*lmm)) { - CERROR("lov_mds_md_v3 too small: %d, need at least %d\n", - lmm_bytes, (int)sizeof(*lmm)); - return -EINVAL; - } - - *stripe_count = le16_to_cpu(lmm->lmm_stripe_count); - if (le32_to_cpu(lmm->lmm_pattern) & LOV_PATTERN_F_RELEASED) - *stripe_count = 0; - - if (lmm_bytes < lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)) { - CERROR("LOV EA V3 too small: %d, need %d\n", - lmm_bytes, lov_mds_md_size(*stripe_count, LOV_MAGIC_V3)); - lov_dump_lmm_common(D_WARNING, lmm); - return -EINVAL; - } - - return 
lsm_lmm_verify_common((struct lov_mds_md_v1 *)lmm, lmm_bytes, - *stripe_count); -} - -static int lsm_unpackmd_v3(struct lov_obd *lov, struct lov_stripe_md *lsm, - struct lov_mds_md *lmm) -{ - struct lov_mds_md_v3 *lmm_v3 = (struct lov_mds_md_v3 *)lmm; - size_t cplen = 0; - int rc; - - rc = lsm_unpackmd_common(lov, lsm, lmm, lmm_v3->lmm_objects); - if (rc) - return rc; - - cplen = strlcpy(lsm->lsm_pool_name, lmm_v3->lmm_pool_name, - sizeof(lsm->lsm_pool_name)); - if (cplen >= sizeof(lsm->lsm_pool_name)) - return -E2BIG; - - return 0; -} - -const struct lsm_operations lsm_v3_ops = { - .lsm_free = lsm_free_plain, - .lsm_stripe_by_index = lsm_stripe_by_index_plain, - .lsm_stripe_by_offset = lsm_stripe_by_offset_plain, - .lsm_lmm_verify = lsm_lmm_verify_v3, - .lsm_unpackmd = lsm_unpackmd_v3, -}; - -void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm) -{ - CDEBUG(level, "lsm %p, objid " DOSTID ", maxbytes %#llx, magic 0x%08X, stripe_size %u, stripe_count %u, refc: %d, layout_gen %u, pool [" LOV_POOLNAMEF "]\n", - lsm, - POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, - lsm->lsm_stripe_size, lsm->lsm_stripe_count, - atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen, - lsm->lsm_pool_name); -} diff --git a/drivers/staging/lustre/lustre/lov/lov_internal.h b/drivers/staging/lustre/lustre/lov/lov_internal.h deleted file mode 100644 index 47042f27ca902ebb8366a369188ca88104a7c816..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_internal.h +++ /dev/null @@ -1,286 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef LOV_INTERNAL_H -#define LOV_INTERNAL_H - -#include -#include - -/* - * If we are unable to get the maximum object size from the OST in - * ocd_maxbytes using OBD_CONNECT_MAXBYTES, then we fall back to using - * the old maximum object size from ext3. - */ -#define LUSTRE_EXT3_STRIPE_MAXBYTES 0x1fffffff000ULL - -struct lov_stripe_md { - atomic_t lsm_refc; - spinlock_t lsm_lock; - pid_t lsm_lock_owner; /* debugging */ - - /* - * maximum possible file size, might change as OSTs status changes, - * e.g. disconnected, deactivated - */ - loff_t lsm_maxbytes; - struct ost_id lsm_oi; - u32 lsm_magic; - u32 lsm_stripe_size; - u32 lsm_pattern; /* RAID0, RAID1, released, ... 
*/ - u16 lsm_stripe_count; - u16 lsm_layout_gen; - char lsm_pool_name[LOV_MAXPOOLNAME + 1]; - struct lov_oinfo *lsm_oinfo[0]; -}; - -static inline bool lsm_is_released(struct lov_stripe_md *lsm) -{ - return !!(lsm->lsm_pattern & LOV_PATTERN_F_RELEASED); -} - -static inline bool lsm_has_objects(struct lov_stripe_md *lsm) -{ - if (!lsm) - return false; - - if (lsm_is_released(lsm)) - return false; - - return true; -} - -struct lsm_operations { - void (*lsm_free)(struct lov_stripe_md *); - void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, loff_t *, - loff_t *); - void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, loff_t *, - loff_t *); - int (*lsm_lmm_verify)(struct lov_mds_md *lmm, int lmm_bytes, - u16 *stripe_count); - int (*lsm_unpackmd)(struct lov_obd *lov, struct lov_stripe_md *lsm, - struct lov_mds_md *lmm); -}; - -extern const struct lsm_operations lsm_v1_ops; -extern const struct lsm_operations lsm_v3_ops; - -static inline const struct lsm_operations *lsm_op_find(int magic) -{ - switch (magic) { - case LOV_MAGIC_V1: - return &lsm_v1_ops; - case LOV_MAGIC_V3: - return &lsm_v3_ops; - default: - CERROR("unrecognized lsm_magic %08x\n", magic); - return NULL; - } -} - -/* lov_do_div64(a, b) returns a % b, and a = a / b. - * The 32-bit code is LOV-specific due to knowing about stripe limits in - * order to reduce the divisor to a 32-bit number. If the divisor is - * already a 32-bit value the compiler handles this directly. - */ -#if BITS_PER_LONG == 64 -# define lov_do_div64(n, base) ({ \ - u64 __base = (base); \ - u64 __rem; \ - __rem = ((u64)(n)) % __base; \ - (n) = ((u64)(n)) / __base; \ - __rem; \ -}) -#elif BITS_PER_LONG == 32 -# define lov_do_div64(n, base) ({ \ - u64 __rem; \ - if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ - int __remainder; \ - LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ - "division %llu / %llu\n", (n), (u64)(base)); \ - __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ - (n) >>= LOV_MIN_STRIPE_BITS; \ - __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ - __rem <<= LOV_MIN_STRIPE_BITS; \ - __rem += __remainder; \ - } else { \ - __rem = do_div(n, base); \ - } \ - __rem; \ -}) -#endif - -#define pool_tgt_size(p) ((p)->pool_obds.op_size) -#define pool_tgt_count(p) ((p)->pool_obds.op_count) -#define pool_tgt_array(p) ((p)->pool_obds.op_array) -#define pool_tgt_rw_sem(p) ((p)->pool_obds.op_rw_sem) - -struct pool_desc { - char pool_name[LOV_MAXPOOLNAME + 1]; - struct ost_pool pool_obds; - atomic_t pool_refcount; - struct rhash_head pool_hash; /* access by poolname */ - union { - struct list_head pool_list; /* serial access */ - struct rcu_head rcu; /* delayed free */ - }; - struct dentry *pool_debugfs_entry; /* file in debugfs */ - struct obd_device *pool_lobd; /* owner */ -}; -int lov_pool_hash_init(struct rhashtable *tbl); -void lov_pool_hash_destroy(struct rhashtable *tbl); - -struct lov_request { - struct obd_info rq_oi; - struct lov_request_set *rq_rqset; - - struct list_head rq_link; - - int rq_idx; /* index in lov->tgts array */ -}; - -struct lov_request_set { - struct obd_info *set_oi; - struct obd_device *set_obd; - int set_count; - atomic_t set_completes; - atomic_t set_success; - struct list_head set_list; -}; - -extern struct kmem_cache *lov_oinfo_slab; - -extern struct lu_kmem_descr lov_caches[]; - -#define lov_uuid2str(lv, index) \ - (char *)((lv)->lov_tgts[index]->ltd_uuid.uuid) - -/* lov_merge.c */ -int lov_merge_lvb_kms(struct lov_stripe_md *lsm, - struct ost_lvb *lvb, __u64 *kms_place); - 
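
The lov_do_div64() macro defined earlier in this header returns n % base while replacing n with n / base. On 32-bit kernels do_div() only accepts a 32-bit divisor, so when base exceeds 32 bits the macro pre-shifts both operands by LOV_MIN_STRIPE_BITS; that is valid because the divisors used here (stripe sizes and stripe widths) are multiples of LOV_MIN_STRIPE_SIZE, a power of two. A rough user-space rendering of the same arithmetic follows; the 16-bit shift (a 64 KiB minimum stripe size) is an assumption for illustration, since the constant's value is not shown in this hunk:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MIN_STRIPE_BITS 16	/* assumed stand-in for LOV_MIN_STRIPE_BITS */
#define MIN_STRIPE_SIZE (1ULL << MIN_STRIPE_BITS)

/* Returns *n % base and replaces *n with *n / base, like lov_do_div64().
 * For divisors wider than 32 bits, both operands are pre-shifted so the
 * division only ever needs a 32-bit divisor. */
static uint64_t div64_by_stripe(uint64_t *n, uint64_t base)
{
	uint64_t rem;

	if (base > 0xffffffffULL) {
		/* Bits that the pre-shift would discard from *n. */
		uint64_t low = *n & (MIN_STRIPE_SIZE - 1);

		/* Only legal for stripe-aligned divisors. */
		assert((base & (MIN_STRIPE_SIZE - 1)) == 0);
		*n >>= MIN_STRIPE_BITS;
		rem = *n % (base >> MIN_STRIPE_BITS);
		*n /= base >> MIN_STRIPE_BITS;
		rem = (rem << MIN_STRIPE_BITS) + low;
	} else {
		rem = *n % base;
		*n /= base;
	}
	return rem;
}

int main(void)
{
	uint64_t n = 123456789012345ULL, saved = n;
	uint64_t base = 6ULL << 30;	/* example stripe width wider than 32 bits */
	uint64_t rem = div64_by_stripe(&n, base);

	/* Cross-check against direct 64-bit division. */
	printf("q=%llu r=%llu ok=%d\n",
	       (unsigned long long)n, (unsigned long long)rem,
	       n == saved / base && rem == saved % base);
	return 0;
}

On 64-bit builds the macro degenerates to a plain divide and modulo, which is what the BITS_PER_LONG == 64 branch above spells out.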
-/* lov_offset.c */ -u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size, int stripeno); -int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off, - int stripeno, u64 *u64); -u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size, int stripeno); -int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno, - u64 start, u64 end, - u64 *obd_start, u64 *obd_end); -int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off); -pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, pgoff_t stripe_index, - int stripe); - -/* lov_request.c */ -int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, - struct lov_request_set **reqset); -int lov_fini_statfs_set(struct lov_request_set *set); - -/* lov_obd.c */ -void lov_stripe_lock(struct lov_stripe_md *md); -void lov_stripe_unlock(struct lov_stripe_md *md); -void lov_fix_desc(struct lov_desc *desc); -void lov_fix_desc_stripe_size(__u64 *val); -void lov_fix_desc_stripe_count(__u32 *val); -void lov_fix_desc_pattern(__u32 *val); -void lov_fix_desc_qos_maxage(__u32 *val); -__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count); -int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data); -int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); -int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - __u32 *indexp, int *genp); -int lov_del_target(struct obd_device *obd, __u32 index, - struct obd_uuid *uuidp, int gen); - -/* lov_pack.c */ -ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, - size_t buf_size); -struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, struct lov_mds_md *lmm, - size_t lmm_size); -int lov_free_memmd(struct lov_stripe_md **lsmp); - -void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm); -void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm); -void lov_dump_lmm_common(int level, void *lmmp); - -/* lov_ea.c */ -struct lov_stripe_md *lsm_alloc_plain(u16 stripe_count); -void lsm_free_plain(struct lov_stripe_md *lsm); -void dump_lsm(unsigned int level, const struct lov_stripe_md *lsm); - -/* lproc_lov.c */ -extern const struct file_operations lov_proc_target_fops; -void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars); - -/* lov_cl.c */ -extern struct lu_device_type lov_device_type; - -/* ost_pool methods */ -int lov_ost_pool_init(struct ost_pool *op, unsigned int count); -int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count); -int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count); -int lov_ost_pool_remove(struct ost_pool *op, __u32 idx); -int lov_ost_pool_free(struct ost_pool *op); - -/* high level pool methods */ -int lov_pool_new(struct obd_device *obd, char *poolname); -int lov_pool_del(struct obd_device *obd, char *poolname); -int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname); -int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname); -void lov_pool_putref(struct pool_desc *pool); - -static inline struct lov_stripe_md *lsm_addref(struct lov_stripe_md *lsm) -{ - LASSERT(atomic_read(&lsm->lsm_refc) > 0); - atomic_inc(&lsm->lsm_refc); - return lsm; -} - -static inline bool lov_oinfo_is_dummy(const struct lov_oinfo *loi) -{ - if (unlikely(loi->loi_oi.oi.oi_id == 0 && - loi->loi_oi.oi.oi_seq == 0 && - loi->loi_ost_idx == 0 && - loi->loi_ost_gen == 0)) - return true; - - return false; -} - -static inline struct obd_device *lov2obd(const struct lov_obd *lov) -{ - return 
container_of_safe(lov, struct obd_device, u.lov); -} - -#endif diff --git a/drivers/staging/lustre/lustre/lov/lov_io.c b/drivers/staging/lustre/lustre/lov/lov_io.c deleted file mode 100644 index b823f8a21856356fd3fd9e337da9b53ad36f586f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_io.c +++ /dev/null @@ -1,1023 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_io for LOV layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, - struct lov_io_sub *sub) -{ - if (sub->sub_io) { - if (sub->sub_io_initialized) { - cl_io_fini(sub->sub_env, sub->sub_io); - sub->sub_io_initialized = 0; - lio->lis_active_subios--; - } - if (sub->sub_stripe == lio->lis_single_subio_index) - lio->lis_single_subio_index = -1; - else if (!sub->sub_borrowed) - kfree(sub->sub_io); - sub->sub_io = NULL; - } - if (!IS_ERR_OR_NULL(sub->sub_env)) { - if (!sub->sub_borrowed) - cl_env_put(sub->sub_env, &sub->sub_refcheck); - sub->sub_env = NULL; - } -} - -static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio, - int stripe, loff_t start, loff_t end) -{ - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - struct cl_io *parent = lio->lis_cl.cis_io; - - switch (io->ci_type) { - case CIT_SETATTR: { - io->u.ci_setattr.sa_attr = parent->u.ci_setattr.sa_attr; - io->u.ci_setattr.sa_attr_flags = - parent->u.ci_setattr.sa_attr_flags; - io->u.ci_setattr.sa_valid = parent->u.ci_setattr.sa_valid; - io->u.ci_setattr.sa_stripe_index = stripe; - io->u.ci_setattr.sa_parent_fid = - parent->u.ci_setattr.sa_parent_fid; - if (cl_io_is_trunc(io)) { - loff_t new_size = parent->u.ci_setattr.sa_attr.lvb_size; - - new_size = lov_size_to_stripe(lsm, new_size, stripe); - io->u.ci_setattr.sa_attr.lvb_size = new_size; - } - break; - } - case CIT_DATA_VERSION: { - io->u.ci_data_version.dv_data_version = 0; - io->u.ci_data_version.dv_flags = - parent->u.ci_data_version.dv_flags; - break; - } - case CIT_FAULT: { - struct cl_object *obj = parent->ci_obj; - loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); - - io->u.ci_fault = parent->u.ci_fault; - off = lov_size_to_stripe(lsm, off, stripe); - io->u.ci_fault.ft_index = cl_index(obj, off); - break; - } - case CIT_FSYNC: { - 
io->u.ci_fsync.fi_start = start; - io->u.ci_fsync.fi_end = end; - io->u.ci_fsync.fi_fid = parent->u.ci_fsync.fi_fid; - io->u.ci_fsync.fi_mode = parent->u.ci_fsync.fi_mode; - break; - } - case CIT_READ: - case CIT_WRITE: { - io->u.ci_wr.wr_sync = cl_io_is_sync_write(parent); - if (cl_io_is_append(parent)) { - io->u.ci_wr.wr_append = 1; - } else { - io->u.ci_rw.crw_pos = start; - io->u.ci_rw.crw_count = end - start; - } - break; - } - default: - break; - } -} - -static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, - struct lov_io_sub *sub) -{ - struct lov_object *lov = lio->lis_object; - struct cl_io *sub_io; - struct cl_object *sub_obj; - struct cl_io *io = lio->lis_cl.cis_io; - int stripe = sub->sub_stripe; - int rc; - - LASSERT(!sub->sub_io); - LASSERT(!sub->sub_env); - LASSERT(sub->sub_stripe < lio->lis_stripe_count); - - if (unlikely(!lov_r0(lov)->lo_sub[stripe])) - return -EIO; - - sub->sub_io_initialized = 0; - sub->sub_borrowed = 0; - - /* obtain new environment */ - sub->sub_env = cl_env_get(&sub->sub_refcheck); - if (IS_ERR(sub->sub_env)) { - rc = PTR_ERR(sub->sub_env); - goto fini_lov_io; - } - - /* - * First sub-io. Use ->lis_single_subio to - * avoid dynamic allocation. - */ - if (lio->lis_active_subios == 0) { - sub->sub_io = &lio->lis_single_subio; - lio->lis_single_subio_index = stripe; - } else { - sub->sub_io = kzalloc(sizeof(*sub->sub_io), - GFP_NOFS); - if (!sub->sub_io) { - rc = -ENOMEM; - goto fini_lov_io; - } - } - - sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]); - sub_io = sub->sub_io; - - sub_io->ci_obj = sub_obj; - sub_io->ci_result = 0; - sub_io->ci_parent = io; - sub_io->ci_lockreq = io->ci_lockreq; - sub_io->ci_type = io->ci_type; - sub_io->ci_no_srvlock = io->ci_no_srvlock; - sub_io->ci_noatime = io->ci_noatime; - - rc = cl_io_sub_init(sub->sub_env, sub_io, io->ci_type, sub_obj); - if (rc >= 0) { - lio->lis_active_subios++; - sub->sub_io_initialized = 1; - rc = 0; - } -fini_lov_io: - if (rc) - lov_io_sub_fini(env, lio, sub); - return rc; -} - -struct lov_io_sub *lov_sub_get(const struct lu_env *env, - struct lov_io *lio, int stripe) -{ - int rc; - struct lov_io_sub *sub = &lio->lis_subs[stripe]; - - LASSERT(stripe < lio->lis_stripe_count); - - if (!sub->sub_io_initialized) { - sub->sub_stripe = stripe; - rc = lov_io_sub_init(env, lio, sub); - } else { - rc = 0; - } - if (rc < 0) - sub = ERR_PTR(rc); - - return sub; -} - -/***************************************************************************** - * - * Lov io operations. - * - */ - -int lov_page_stripe(const struct cl_page *page) -{ - const struct cl_page_slice *slice; - - slice = cl_page_at(page, &lov_device_type); - LASSERT(slice->cpl_obj); - - return cl2lov_page(slice)->lps_stripe; -} - -static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, - struct cl_io *io) -{ - struct lov_stripe_md *lsm; - int result; - - LASSERT(lio->lis_object); - lsm = lio->lis_object->lo_lsm; - - /* - * Need to be optimized, we can't afford to allocate a piece of memory - * when writing a page. 
-jay - */ - lio->lis_subs = - kvzalloc(lsm->lsm_stripe_count * - sizeof(lio->lis_subs[0]), - GFP_NOFS); - if (lio->lis_subs) { - lio->lis_nr_subios = lio->lis_stripe_count; - lio->lis_single_subio_index = -1; - lio->lis_active_subios = 0; - result = 0; - } else { - result = -ENOMEM; - } - return result; -} - -static int lov_io_slice_init(struct lov_io *lio, struct lov_object *obj, - struct cl_io *io) -{ - io->ci_result = 0; - lio->lis_object = obj; - - lio->lis_stripe_count = obj->lo_lsm->lsm_stripe_count; - - switch (io->ci_type) { - case CIT_READ: - case CIT_WRITE: - lio->lis_pos = io->u.ci_rw.crw_pos; - lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; - lio->lis_io_endpos = lio->lis_endpos; - if (cl_io_is_append(io)) { - LASSERT(io->ci_type == CIT_WRITE); - - /* - * If there is a LOV EA hole, then we may not be able - * to locate the current file-tail exactly. - */ - if (unlikely(obj->lo_lsm->lsm_pattern & - LOV_PATTERN_F_HOLE)) - return -EIO; - - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - } - break; - - case CIT_SETATTR: - if (cl_io_is_trunc(io)) - lio->lis_pos = io->u.ci_setattr.sa_attr.lvb_size; - else - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; - - case CIT_DATA_VERSION: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; - - case CIT_FAULT: { - pgoff_t index = io->u.ci_fault.ft_index; - - lio->lis_pos = cl_offset(io->ci_obj, index); - lio->lis_endpos = cl_offset(io->ci_obj, index + 1); - break; - } - - case CIT_FSYNC: { - lio->lis_pos = io->u.ci_fsync.fi_start; - lio->lis_endpos = io->u.ci_fsync.fi_end; - break; - } - - case CIT_MISC: - lio->lis_pos = 0; - lio->lis_endpos = OBD_OBJECT_EOF; - break; - - default: - LBUG(); - } - return 0; -} - -static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) -{ - struct lov_io *lio = cl2lov_io(env, ios); - struct lov_object *lov = cl2lov(ios->cis_obj); - int i; - - if (lio->lis_subs) { - for (i = 0; i < lio->lis_nr_subios; i++) - lov_io_sub_fini(env, lio, &lio->lis_subs[i]); - kvfree(lio->lis_subs); - lio->lis_nr_subios = 0; - } - - LASSERT(atomic_read(&lov->lo_active_ios) > 0); - if (atomic_dec_and_test(&lov->lo_active_ios)) - wake_up_all(&lov->lo_waitq); -} - -static u64 lov_offset_mod(u64 val, int delta) -{ - if (val != OBD_OBJECT_EOF) - val += delta; - return val; -} - -static int lov_io_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct lov_io *lio = cl2lov_io(env, ios); - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - struct lov_io_sub *sub; - u64 endpos; - u64 start; - u64 end; - int stripe; - int rc = 0; - - endpos = lov_offset_mod(lio->lis_endpos, -1); - for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) { - if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos, - endpos, &start, &end)) - continue; - - if (unlikely(!lov_r0(lio->lis_object)->lo_sub[stripe])) { - if (ios->cis_io->ci_type == CIT_READ || - ios->cis_io->ci_type == CIT_WRITE || - ios->cis_io->ci_type == CIT_FAULT) - return -EIO; - - continue; - } - - end = lov_offset_mod(end, 1); - sub = lov_sub_get(env, lio, stripe); - if (IS_ERR(sub)) { - rc = PTR_ERR(sub); - break; - } - - lov_io_sub_inherit(sub->sub_io, lio, stripe, start, end); - rc = cl_io_iter_init(sub->sub_env, sub->sub_io); - if (rc) { - cl_io_iter_fini(sub->sub_env, sub->sub_io); - break; - } - CDEBUG(D_VFSTRACE, "shrink: %d [%llu, %llu)\n", - stripe, start, end); - - list_add_tail(&sub->sub_linkage, &lio->lis_active); - } - return rc; -} - -static int lov_io_rw_iter_init(const struct lu_env
*env, - const struct cl_io_slice *ios) -{ - struct lov_io *lio = cl2lov_io(env, ios); - struct cl_io *io = ios->cis_io; - struct lov_stripe_md *lsm = lio->lis_object->lo_lsm; - __u64 start = io->u.ci_rw.crw_pos; - loff_t next; - unsigned long ssize = lsm->lsm_stripe_size; - - LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); - - /* fast path for common case. */ - if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) { - lov_do_div64(start, ssize); - next = (start + 1) * ssize; - if (next <= start * ssize) - next = ~0ull; - - io->ci_continue = next < lio->lis_io_endpos; - io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos, - next) - io->u.ci_rw.crw_pos; - lio->lis_pos = io->u.ci_rw.crw_pos; - lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; - CDEBUG(D_VFSTRACE, "stripe: %llu chunk: [%llu, %llu) %llu\n", - (__u64)start, lio->lis_pos, lio->lis_endpos, - (__u64)lio->lis_io_endpos); - } - /* - * XXX The following call should be optimized: we know that - * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. - */ - return lov_io_iter_init(env, ios); -} - -static int lov_io_call(const struct lu_env *env, struct lov_io *lio, - int (*iofunc)(const struct lu_env *, struct cl_io *)) -{ - struct cl_io *parent = lio->lis_cl.cis_io; - struct lov_io_sub *sub; - int rc = 0; - - list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - rc = iofunc(sub->sub_env, sub->sub_io); - if (rc) - break; - - if (parent->ci_result == 0) - parent->ci_result = sub->sub_io->ci_result; - } - return rc; -} - -static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) -{ - return lov_io_call(env, cl2lov_io(env, ios), cl_io_lock); -} - -static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) -{ - return lov_io_call(env, cl2lov_io(env, ios), cl_io_start); -} - -static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) -{ - /* - * It's possible that lov_io_start() wasn't called against this - * sub-io, either because a previous sub-io failed, or the upper - * layer completed the IO.
- */ - if (io->ci_state == CIS_IO_GOING) - cl_io_end(env, io); - else - io->ci_state = CIS_IO_FINISHED; - return 0; -} - -static void -lov_io_data_version_end(const struct lu_env *env, const struct cl_io_slice *ios) -{ - struct lov_io *lio = cl2lov_io(env, ios); - struct cl_io *parent = lio->lis_cl.cis_io; - struct lov_io_sub *sub; - - list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - lov_io_end_wrapper(sub->sub_env, sub->sub_io); - - parent->u.ci_data_version.dv_data_version += - sub->sub_io->u.ci_data_version.dv_data_version; - - if (!parent->ci_result) - parent->ci_result = sub->sub_io->ci_result; - } -} - -static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) -{ - cl_io_iter_fini(env, io); - return 0; -} - -static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) -{ - cl_io_unlock(env, io); - return 0; -} - -static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) -{ - int rc; - - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); - LASSERT(rc == 0); -} - -static void lov_io_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct lov_io *lio = cl2lov_io(env, ios); - int rc; - - rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); - LASSERT(rc == 0); - while (!list_empty(&lio->lis_active)) - list_del_init(lio->lis_active.next); -} - -static void lov_io_unlock(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - int rc; - - rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); - LASSERT(rc == 0); -} - -static int lov_io_read_ahead(const struct lu_env *env, - const struct cl_io_slice *ios, - pgoff_t start, struct cl_read_ahead *ra) -{ - struct lov_io *lio = cl2lov_io(env, ios); - struct lov_object *loo = lio->lis_object; - struct cl_object *obj = lov2cl(loo); - struct lov_layout_raid0 *r0 = lov_r0(loo); - unsigned int pps; /* pages per stripe */ - struct lov_io_sub *sub; - pgoff_t ra_end; - loff_t suboff; - int stripe; - int rc; - - stripe = lov_stripe_number(loo->lo_lsm, cl_offset(obj, start)); - if (unlikely(!r0->lo_sub[stripe])) - return -EIO; - - sub = lov_sub_get(env, lio, stripe); - if (IS_ERR(sub)) - return PTR_ERR(sub); - - lov_stripe_offset(loo->lo_lsm, cl_offset(obj, start), stripe, &suboff); - rc = cl_io_read_ahead(sub->sub_env, sub->sub_io, - cl_index(lovsub2cl(r0->lo_sub[stripe]), suboff), - ra); - - CDEBUG(D_READA, DFID " cra_end = %lu, stripes = %d, rc = %d\n", - PFID(lu_object_fid(lov2lu(loo))), ra->cra_end, r0->lo_nr, rc); - if (rc) - return rc; - - /** - * Adjust the stripe index by layout of raid0. ra->cra_end is - * the maximum page index covered by an underlying DLM lock. - * This function converts cra_end from stripe level to file - * level, and makes sure it's not beyond the stripe boundary. - */ - if (r0->lo_nr == 1) /* single stripe file */ - return 0; - - /* cra_end is stripe level, convert it into file level */ - ra_end = ra->cra_end; - if (ra_end != CL_PAGE_EOF) - ra_end = lov_stripe_pgoff(loo->lo_lsm, ra_end, stripe); - - pps = loo->lo_lsm->lsm_stripe_size >> PAGE_SHIFT; - - CDEBUG(D_READA, DFID " max_index = %lu, pps = %u, stripe_size = %u, stripe no = %u, start index = %lu\n", - PFID(lu_object_fid(lov2lu(loo))), ra_end, pps, - loo->lo_lsm->lsm_stripe_size, stripe, start); - - /* never exceed the end of the stripe */ - ra->cra_end = min_t(pgoff_t, ra_end, start + pps - start % pps - 1); - return 0; -} - -/** - * lov implementation of cl_operations::cio_submit() method.
It takes a list - * of pages in \a queue, splits it into per-stripe sub-lists, invokes - * cl_io_submit() on underlying devices to submit sub-lists, and then splices - * everything back. - * - * Major complication of this function is a need to handle memory cleansing: - * cl_io_submit() is called to write out pages as a part of VM memory - * reclamation, and hence it may not fail due to memory shortages (system - * dead-locks otherwise). To deal with this, some resources (sub-lists, - * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a - * not-memory cleansing context), and in case of memory shortage, these - * pre-allocated resources are used by lov_io_submit() under - * lov_device::ld_mutex mutex. - */ -static int lov_io_submit(const struct lu_env *env, - const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) -{ - struct cl_page_list *qin = &queue->c2_qin; - struct lov_io *lio = cl2lov_io(env, ios); - struct lov_io_sub *sub; - struct cl_page_list *plist = &lov_env_info(env)->lti_plist; - struct cl_page *page; - int stripe; - - int rc = 0; - - if (lio->lis_active_subios == 1) { - int idx = lio->lis_single_subio_index; - - LASSERT(idx < lio->lis_nr_subios); - sub = lov_sub_get(env, lio, idx); - LASSERT(!IS_ERR(sub)); - LASSERT(sub->sub_io == &lio->lis_single_subio); - rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, - crt, queue); - return rc; - } - - LASSERT(lio->lis_subs); - - cl_page_list_init(plist); - while (qin->pl_nr > 0) { - struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; - - cl_2queue_init(cl2q); - - page = cl_page_list_first(qin); - cl_page_list_move(&cl2q->c2_qin, qin, page); - - stripe = lov_page_stripe(page); - while (qin->pl_nr > 0) { - page = cl_page_list_first(qin); - if (stripe != lov_page_stripe(page)) - break; - - cl_page_list_move(&cl2q->c2_qin, qin, page); - } - - sub = lov_sub_get(env, lio, stripe); - if (!IS_ERR(sub)) { - rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, - crt, cl2q); - } else { - rc = PTR_ERR(sub); - } - - cl_page_list_splice(&cl2q->c2_qin, plist); - cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); - cl_2queue_fini(env, cl2q); - - if (rc != 0) - break; - } - - cl_page_list_splice(plist, qin); - cl_page_list_fini(env, plist); - - return rc; -} - -static int lov_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *queue, int from, int to, - cl_commit_cbt cb) -{ - struct cl_page_list *plist = &lov_env_info(env)->lti_plist; - struct lov_io *lio = cl2lov_io(env, ios); - struct lov_io_sub *sub; - struct cl_page *page; - int rc = 0; - - if (lio->lis_active_subios == 1) { - int idx = lio->lis_single_subio_index; - - LASSERT(idx < lio->lis_nr_subios); - sub = lov_sub_get(env, lio, idx); - LASSERT(!IS_ERR(sub)); - LASSERT(sub->sub_io == &lio->lis_single_subio); - rc = cl_io_commit_async(sub->sub_env, sub->sub_io, queue, - from, to, cb); - return rc; - } - - LASSERT(lio->lis_subs); - - cl_page_list_init(plist); - while (queue->pl_nr > 0) { - int stripe_to = to; - int stripe; - - LASSERT(plist->pl_nr == 0); - page = cl_page_list_first(queue); - cl_page_list_move(plist, queue, page); - - stripe = lov_page_stripe(page); - while (queue->pl_nr > 0) { - page = cl_page_list_first(queue); - if (stripe != lov_page_stripe(page)) - break; - - cl_page_list_move(plist, queue, page); - } - - if (queue->pl_nr > 0) /* still has more pages */ - stripe_to = PAGE_SIZE; - - sub = lov_sub_get(env, lio, stripe); - if (!IS_ERR(sub)) { - rc = cl_io_commit_async(sub->sub_env, 
sub->sub_io, - plist, from, stripe_to, cb); - } else { - rc = PTR_ERR(sub); - break; - } - - if (plist->pl_nr > 0) /* short write */ - break; - - from = 0; - } - - /* for error case, add the page back into the qin list */ - LASSERT(ergo(rc == 0, plist->pl_nr == 0)); - while (plist->pl_nr > 0) { - /* error occurred, add the uncommitted pages back into queue */ - page = cl_page_list_last(plist); - cl_page_list_move_head(queue, plist, page); - } - - return rc; -} - -static int lov_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_fault_io *fio; - struct lov_io *lio; - struct lov_io_sub *sub; - - fio = &ios->cis_io->u.ci_fault; - lio = cl2lov_io(env, ios); - sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page)); - if (IS_ERR(sub)) - return PTR_ERR(sub); - sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob; - return lov_io_start(env, ios); -} - -static void lov_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct lov_io *lio = cl2lov_io(env, ios); - struct lov_io_sub *sub; - unsigned int *written = &ios->cis_io->u.ci_fsync.fi_nr_written; - - *written = 0; - list_for_each_entry(sub, &lio->lis_active, sub_linkage) { - struct cl_io *subio = sub->sub_io; - - lov_io_end_wrapper(sub->sub_env, subio); - - if (subio->ci_result == 0) - *written += subio->u.ci_fsync.fi_nr_written; - } -} - -static const struct cl_io_operations lov_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_WRITE] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_rw_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_SETATTR] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_end - }, - [CIT_DATA_VERSION] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_data_version_end, - }, - [CIT_FAULT] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_fault_start, - .cio_end = lov_io_end - }, - [CIT_FSYNC] = { - .cio_fini = lov_io_fini, - .cio_iter_init = lov_io_iter_init, - .cio_iter_fini = lov_io_iter_fini, - .cio_lock = lov_io_lock, - .cio_unlock = lov_io_unlock, - .cio_start = lov_io_start, - .cio_end = lov_io_fsync_end - }, - [CIT_MISC] = { - .cio_fini = lov_io_fini - } - }, - .cio_read_ahead = lov_io_read_ahead, - .cio_submit = lov_io_submit, - .cio_commit_async = lov_io_commit_async, -}; - -/***************************************************************************** - * - * Empty lov io operations. 
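- *
- * (Editorial aside before this section: both lov_io_submit() and
- * lov_io_commit_async() above peel runs of consecutive same-stripe
- * pages off the input queue and hand each run to the matching sub-io.
- * A self-contained sketch of that loop, with hypothetical stand-ins
- * for struct cl_page and cl_io_submit_rw():)
- *
- *	#include <linux/list.h>
- *
- *	struct page_rec {                 // stands in for struct cl_page
- *		struct list_head link;
- *		int stripe;               // lov_page_stripe() result
- *	};
- *
- *	static int submit_by_stripe(struct list_head *qin,
- *				    int (*submit)(int, struct list_head *))
- *	{
- *		int rc = 0;
- *
- *		while (!list_empty(qin) && rc == 0) {
- *			struct page_rec *p, *n;
- *			LIST_HEAD(run);
- *			int stripe = list_first_entry(qin, struct page_rec,
- *						      link)->stripe;
- *
- *			// move the leading same-stripe run onto "run"
- *			list_for_each_entry_safe(p, n, qin, link) {
- *				if (p->stripe != stripe)
- *					break;
- *				list_move_tail(&p->link, &run);
- *			}
- *			rc = submit(stripe, &run);  // cl_io_submit_rw()
- *		}
- *		return rc;
- *	}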
- * - */ - -static void lov_empty_io_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct lov_object *lov = cl2lov(ios->cis_obj); - - if (atomic_dec_and_test(&lov->lo_active_ios)) - wake_up_all(&lov->lo_waitq); -} - -static int lov_empty_io_submit(const struct lu_env *env, - const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) -{ - return -EBADF; -} - -static void lov_empty_impossible(const struct lu_env *env, - struct cl_io_slice *ios) -{ - LBUG(); -} - -#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) - -/** - * An io operation vector for files without stripes. - */ -static const struct cl_io_operations lov_empty_io_ops = { - .op = { - [CIT_READ] = { - .cio_fini = lov_empty_io_fini, - }, - [CIT_WRITE] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_SETATTR] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_FAULT] = { - .cio_fini = lov_empty_io_fini, - .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, - .cio_lock = LOV_EMPTY_IMPOSSIBLE, - .cio_start = LOV_EMPTY_IMPOSSIBLE, - .cio_end = LOV_EMPTY_IMPOSSIBLE - }, - [CIT_FSYNC] = { - .cio_fini = lov_empty_io_fini - }, - [CIT_MISC] = { - .cio_fini = lov_empty_io_fini - } - }, - .cio_submit = lov_empty_io_submit, - .cio_commit_async = LOV_EMPTY_IMPOSSIBLE -}; - -int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - struct lov_io *lio = lov_env_io(env); - struct lov_object *lov = cl2lov(obj); - - INIT_LIST_HEAD(&lio->lis_active); - io->ci_result = lov_io_slice_init(lio, lov, io); - if (io->ci_result == 0) { - io->ci_result = lov_io_subio_init(env, lio, io); - if (io->ci_result == 0) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); - atomic_inc(&lov->lo_active_ios); - } - } - return io->ci_result; -} - -int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_io *lio = lov_env_io(env); - int result; - - lio->lis_object = lov; - switch (io->ci_type) { - default: - LBUG(); - case CIT_MISC: - case CIT_READ: - result = 0; - break; - case CIT_FSYNC: - case CIT_SETATTR: - case CIT_DATA_VERSION: - result = 1; - break; - case CIT_WRITE: - result = -EBADF; - break; - case CIT_FAULT: - result = -EFAULT; - CERROR("Page fault on a file without stripes: " DFID "\n", - PFID(lu_object_fid(&obj->co_lu))); - break; - } - if (result == 0) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); - atomic_inc(&lov->lo_active_ios); - } - - io->ci_result = result < 0 ? 
result : 0; - return result; -} - -int lov_io_init_released(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_io *lio = lov_env_io(env); - int result; - - LASSERT(lov->lo_lsm); - lio->lis_object = lov; - - switch (io->ci_type) { - default: - LASSERTF(0, "invalid type %d\n", io->ci_type); - result = -EOPNOTSUPP; - break; - case CIT_MISC: - case CIT_FSYNC: - case CIT_DATA_VERSION: - result = 1; - break; - case CIT_SETATTR: - /* the truncate to 0 is managed by MDT: - * - in open, for open O_TRUNC - * - in setattr, for truncate - */ - /* the truncate is for size > 0 so triggers a restore */ - if (cl_io_is_trunc(io)) { - io->ci_restore_needed = 1; - result = -ENODATA; - } else { - result = 1; - } - break; - case CIT_READ: - case CIT_WRITE: - case CIT_FAULT: - io->ci_restore_needed = 1; - result = -ENODATA; - break; - } - if (result == 0) { - cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); - atomic_inc(&lov->lo_active_ios); - } - - io->ci_result = result < 0 ? result : 0; - return result; -} - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_lock.c b/drivers/staging/lustre/lustre/lov/lov_lock.c deleted file mode 100644 index b0292100bf2637e30e1b341d47e1db93b793e3a1..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_lock.c +++ /dev/null @@ -1,348 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for LOV layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lov lock operations. - * - */ - -static struct lov_sublock_env *lov_sublock_env_get(const struct lu_env *env, - const struct cl_lock *parent, - struct lov_lock_sub *lls) -{ - struct lov_sublock_env *subenv; - struct lov_io *lio = lov_env_io(env); - struct cl_io *io = lio->lis_cl.cis_io; - struct lov_io_sub *sub; - - subenv = &lov_env_session(env)->ls_subenv; - - /* - * FIXME: We tend to use the subio's env & io to call the sublock - * lock operations because osc lock sometimes stores some control - * variables in thread's IO information(Now only lockless information). 
- * However, if the lock's host(object) is different from the object - * for current IO, we have no way to get the subenv and subio because - * they are not initialized at all. As a temp fix, in this case, - * we still borrow the parent's env to call sublock operations. - */ - if (!io || !cl_object_same(io->ci_obj, parent->cll_descr.cld_obj)) { - subenv->lse_env = env; - subenv->lse_io = io; - } else { - sub = lov_sub_get(env, lio, lls->sub_stripe); - if (!IS_ERR(sub)) { - subenv->lse_env = sub->sub_env; - subenv->lse_io = sub->sub_io; - } else { - subenv = (void *)sub; - } - } - return subenv; -} - -static int lov_sublock_init(const struct lu_env *env, - const struct cl_lock *parent, - struct lov_lock_sub *lls) -{ - struct lov_sublock_env *subenv; - int result; - - subenv = lov_sublock_env_get(env, parent, lls); - if (!IS_ERR(subenv)) { - result = cl_lock_init(subenv->lse_env, &lls->sub_lock, - subenv->lse_io); - } else { - /* error occurs. */ - result = PTR_ERR(subenv); - } - return result; -} - -/** - * Creates sub-locks for a given lov_lock for the first time. - * - * Goes through all sub-objects of top-object, and creates sub-locks on every - * sub-object intersecting with top-lock extent. This is complicated by the - * fact that top-lock (that is being created) can be accessed concurrently - * through already created sub-locks (possibly shared with other top-locks). - */ -static struct lov_lock *lov_lock_sub_init(const struct lu_env *env, - const struct cl_object *obj, - struct cl_lock *lock) -{ - int result = 0; - int i; - int nr; - u64 start; - u64 end; - u64 file_start; - u64 file_end; - - struct lov_object *loo = cl2lov(obj); - struct lov_layout_raid0 *r0 = lov_r0(loo); - struct lov_lock *lovlck; - - CDEBUG(D_INODE, "%p: lock/io FID " DFID "/" DFID ", lock/io clobj %p/%p\n", - loo, PFID(lu_object_fid(lov2lu(loo))), - PFID(lu_object_fid(&obj->co_lu)), - lov2cl(loo), obj); - - file_start = cl_offset(lov2cl(loo), lock->cll_descr.cld_start); - file_end = cl_offset(lov2cl(loo), lock->cll_descr.cld_end + 1) - 1; - - for (i = 0, nr = 0; i < r0->lo_nr; i++) { - /* - * XXX for wide striping smarter algorithm is desirable, - * breaking out of the loop, early. 
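- *
- * (Editorial aside, hypothetical names: the surrounding function uses
- * the common two-pass idiom sketched below -- count the stripes whose
- * extent intersects the lock, allocate a flexible-array struct sized
- * for exactly that many sub-locks, then fill it in a second pass.)
- *
- *	struct demo_lock {
- *		int nr;
- *		struct { int stripe; } sub[];
- *	};
- *
- *	static struct demo_lock *demo_lock_alloc(int stripes,
- *						 bool (*hits)(int))
- *	{
- *		struct demo_lock *lck;
- *		int i, nr = 0;
- *
- *		for (i = 0; i < stripes; i++)   // pass 1: count
- *			nr += hits(i);
- *		lck = kvzalloc(offsetof(struct demo_lock, sub[nr]),
- *			       GFP_NOFS);
- *		if (!lck)
- *			return NULL;
- *		for (i = 0; i < stripes; i++)   // pass 2: fill
- *			if (hits(i))
- *				lck->sub[lck->nr++].stripe = i;
- *		return lck;
- *	}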
- */ - if (likely(r0->lo_sub[i]) && /* spare layout */ - lov_stripe_intersects(loo->lo_lsm, i, - file_start, file_end, &start, &end)) - nr++; - } - LASSERT(nr > 0); - lovlck = kvzalloc(offsetof(struct lov_lock, lls_sub[nr]), - GFP_NOFS); - if (!lovlck) - return ERR_PTR(-ENOMEM); - - lovlck->lls_nr = nr; - for (i = 0, nr = 0; i < r0->lo_nr; ++i) { - if (likely(r0->lo_sub[i]) && - lov_stripe_intersects(loo->lo_lsm, i, - file_start, file_end, &start, &end)) { - struct lov_lock_sub *lls = &lovlck->lls_sub[nr]; - struct cl_lock_descr *descr; - - descr = &lls->sub_lock.cll_descr; - - LASSERT(!descr->cld_obj); - descr->cld_obj = lovsub2cl(r0->lo_sub[i]); - descr->cld_start = cl_index(descr->cld_obj, start); - descr->cld_end = cl_index(descr->cld_obj, end); - descr->cld_mode = lock->cll_descr.cld_mode; - descr->cld_gid = lock->cll_descr.cld_gid; - descr->cld_enq_flags = lock->cll_descr.cld_enq_flags; - lls->sub_stripe = i; - - /* initialize sub lock */ - result = lov_sublock_init(env, lock, lls); - if (result < 0) - break; - - lls->sub_initialized = 1; - nr++; - } - } - LASSERT(ergo(result == 0, nr == lovlck->lls_nr)); - - if (result != 0) { - for (i = 0; i < nr; ++i) { - if (!lovlck->lls_sub[i].sub_initialized) - break; - - cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); - } - kvfree(lovlck); - lovlck = ERR_PTR(result); - } - - return lovlck; -} - -static void lov_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct lov_lock *lovlck; - int i; - - lovlck = cl2lov_lock(slice); - for (i = 0; i < lovlck->lls_nr; ++i) { - LASSERT(!lovlck->lls_sub[i].sub_is_enqueued); - if (lovlck->lls_sub[i].sub_initialized) - cl_lock_fini(env, &lovlck->lls_sub[i].sub_lock); - } - kvfree(lovlck); -} - -/** - * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This - * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock - * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock - * state machines in the face of sub-locks sharing (by multiple top-locks), - * and concurrent sub-lock cancellations. 
- */ -static int lov_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *io, struct cl_sync_io *anchor) -{ - struct cl_lock *lock = slice->cls_lock; - struct lov_lock *lovlck = cl2lov_lock(slice); - int i; - int rc = 0; - - for (i = 0; i < lovlck->lls_nr; ++i) { - struct lov_lock_sub *lls = &lovlck->lls_sub[i]; - struct lov_sublock_env *subenv; - - subenv = lov_sublock_env_get(env, lock, lls); - if (IS_ERR(subenv)) { - rc = PTR_ERR(subenv); - break; - } - rc = cl_lock_enqueue(subenv->lse_env, subenv->lse_io, - &lls->sub_lock, anchor); - if (rc != 0) - break; - - lls->sub_is_enqueued = 1; - } - return rc; -} - -static void lov_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) -{ - struct cl_lock *lock = slice->cls_lock; - struct lov_lock *lovlck = cl2lov_lock(slice); - int i; - - for (i = 0; i < lovlck->lls_nr; ++i) { - struct lov_lock_sub *lls = &lovlck->lls_sub[i]; - struct cl_lock *sublock = &lls->sub_lock; - struct lov_sublock_env *subenv; - - if (!lls->sub_is_enqueued) - continue; - - lls->sub_is_enqueued = 0; - subenv = lov_sublock_env_get(env, lock, lls); - if (!IS_ERR(subenv)) { - cl_lock_cancel(subenv->lse_env, sublock); - } else { - CL_LOCK_DEBUG(D_ERROR, env, slice->cls_lock, - "%s fails with %ld.\n", - __func__, PTR_ERR(subenv)); - } - } -} - -static int lov_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) -{ - struct lov_lock *lck = cl2lov_lock(slice); - int i; - - (*p)(env, cookie, "%d\n", lck->lls_nr); - for (i = 0; i < lck->lls_nr; ++i) { - struct lov_lock_sub *sub; - - sub = &lck->lls_sub[i]; - (*p)(env, cookie, " %d %x: ", i, sub->sub_is_enqueued); - cl_lock_print(env, cookie, p, &sub->sub_lock); - } - return 0; -} - -static const struct cl_lock_operations lov_lock_ops = { - .clo_fini = lov_lock_fini, - .clo_enqueue = lov_lock_enqueue, - .clo_cancel = lov_lock_cancel, - .clo_print = lov_lock_print -}; - -int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io) -{ - struct lov_lock *lck; - int result = 0; - - lck = lov_lock_sub_init(env, obj, lock); - if (!IS_ERR(lck)) - cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); - else - result = PTR_ERR(lck); - return result; -} - -static void lov_empty_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct lov_lock *lck = cl2lov_lock(slice); - - kmem_cache_free(lov_lock_kmem, lck); -} - -static int lov_empty_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, - const struct cl_lock_slice *slice) -{ - (*p)(env, cookie, "empty\n"); - return 0; -} - -/* XXX: more methods will be added later. 
*/ -static const struct cl_lock_operations lov_empty_lock_ops = { - .clo_fini = lov_empty_lock_fini, - .clo_print = lov_empty_lock_print -}; - -int lov_lock_init_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io) -{ - struct lov_lock *lck; - int result = -ENOMEM; - - lck = kmem_cache_zalloc(lov_lock_kmem, GFP_NOFS); - if (lck) { - cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_empty_lock_ops); - result = 0; - } - return result; -} - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_merge.c b/drivers/staging/lustre/lustre/lov/lov_merge.c deleted file mode 100644 index 006717cf7a41b431d031100ab097322bdc6fbfab..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_merge.c +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include -#include "lov_internal.h" - -/** Merge the lock value block(&lvb) attributes and KMS from each of the - * stripes in a file into a single lvb. It is expected that the caller - * initializes the current atime, mtime, ctime to avoid regressing a more - * uptodate time on the local client. 
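- *
- * Worked example (editorial, not part of the original comment): for a
- * RAID0 file with stripe_size = 1 MiB over two OSTs, suppose stripe 0
- * reports kms = 1 MiB and stripe 1 reports kms = 512 KiB. The last
- * known byte of stripe 1 sits at stripe offset 512K - 1, i.e. file
- * offset (0 * 2 + 1) * 1M + 512K - 1, so lov_stripe_size() maps the
- * two values to file sizes of 1 MiB and 1.5 MiB respectively, and the
- * merged kms is max(1 MiB, 1.5 MiB) = 1.5 MiB. Blocks are summed
- * across stripes, while each timestamp is the maximum over stripes
- * (never older than the caller-supplied current value).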
- */ -int lov_merge_lvb_kms(struct lov_stripe_md *lsm, - struct ost_lvb *lvb, __u64 *kms_place) -{ - __u64 size = 0; - __u64 kms = 0; - __u64 blocks = 0; - s64 current_mtime = lvb->lvb_mtime; - s64 current_atime = lvb->lvb_atime; - s64 current_ctime = lvb->lvb_ctime; - int i; - int rc = 0; - - assert_spin_locked(&lsm->lsm_lock); - LASSERT(lsm->lsm_lock_owner == current->pid); - - CDEBUG(D_INODE, "MDT ID " DOSTID " initial value: s=%llu m=%llu a=%llu c=%llu b=%llu\n", - POSTID(&lsm->lsm_oi), lvb->lvb_size, lvb->lvb_mtime, - lvb->lvb_atime, lvb->lvb_ctime, lvb->lvb_blocks); - for (i = 0; i < lsm->lsm_stripe_count; i++) { - struct lov_oinfo *loi = lsm->lsm_oinfo[i]; - u64 lov_size, tmpsize; - - if (OST_LVB_IS_ERR(loi->loi_lvb.lvb_blocks)) { - rc = OST_LVB_GET_ERR(loi->loi_lvb.lvb_blocks); - continue; - } - - tmpsize = loi->loi_kms; - lov_size = lov_stripe_size(lsm, tmpsize, i); - if (lov_size > kms) - kms = lov_size; - - if (loi->loi_lvb.lvb_size > tmpsize) - tmpsize = loi->loi_lvb.lvb_size; - - lov_size = lov_stripe_size(lsm, tmpsize, i); - if (lov_size > size) - size = lov_size; - /* merge blocks, mtime, atime */ - blocks += loi->loi_lvb.lvb_blocks; - if (loi->loi_lvb.lvb_mtime > current_mtime) - current_mtime = loi->loi_lvb.lvb_mtime; - if (loi->loi_lvb.lvb_atime > current_atime) - current_atime = loi->loi_lvb.lvb_atime; - if (loi->loi_lvb.lvb_ctime > current_ctime) - current_ctime = loi->loi_lvb.lvb_ctime; - - CDEBUG(D_INODE, "MDT ID " DOSTID " on OST[%u]: s=%llu m=%llu a=%llu c=%llu b=%llu\n", - POSTID(&lsm->lsm_oi), loi->loi_ost_idx, - loi->loi_lvb.lvb_size, loi->loi_lvb.lvb_mtime, - loi->loi_lvb.lvb_atime, loi->loi_lvb.lvb_ctime, - loi->loi_lvb.lvb_blocks); - } - - *kms_place = kms; - lvb->lvb_size = size; - lvb->lvb_blocks = blocks; - lvb->lvb_mtime = current_mtime; - lvb->lvb_atime = current_atime; - lvb->lvb_ctime = current_ctime; - return rc; -} diff --git a/drivers/staging/lustre/lustre/lov/lov_obd.c b/drivers/staging/lustre/lustre/lov/lov_obd.c deleted file mode 100644 index 344ff4b201685670a2fd8c3d1b951ffefb1b5679..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_obd.c +++ /dev/null @@ -1,1444 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/lov/lov_obd.c - * - * Author: Phil Schwan - * Author: Peter Braam - * Author: Mike Shaver - * Author: Nathan Rutman - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "lov_internal.h" - -/* Keep a refcount of lov->tgt usage to prevent racing with addition/deletion. - * Any function that expects lov_tgts to remain stationary must take a ref. - */ -static void lov_getref(struct obd_device *obd) -{ - struct lov_obd *lov = &obd->u.lov; - - /* nobody gets through here until lov_putref is done */ - mutex_lock(&lov->lov_lock); - atomic_inc(&lov->lov_refcount); - mutex_unlock(&lov->lov_lock); -} - -static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt); - -static void lov_putref(struct obd_device *obd) -{ - struct lov_obd *lov = &obd->u.lov; - - mutex_lock(&lov->lov_lock); - /* ok to dec to 0 more than once -- ltd_exp's will be null */ - if (atomic_dec_and_test(&lov->lov_refcount) && lov->lov_death_row) { - LIST_HEAD(kill); - int i; - struct lov_tgt_desc *tgt, *n; - - CDEBUG(D_CONFIG, "destroying %d lov targets\n", - lov->lov_death_row); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - tgt = lov->lov_tgts[i]; - - if (!tgt || !tgt->ltd_reap) - continue; - list_add(&tgt->ltd_kill, &kill); - /* XXX - right now there is a dependency on ld_tgt_count - * being the maximum tgt index for computing the - * mds_max_easize. So we can't shrink it. - */ - lov_ost_pool_remove(&lov->lov_packed, i); - lov->lov_tgts[i] = NULL; - lov->lov_death_row--; - } - mutex_unlock(&lov->lov_lock); - - list_for_each_entry_safe(tgt, n, &kill, ltd_kill) { - list_del(&tgt->ltd_kill); - /* Disconnect */ - __lov_del_obd(obd, tgt); - } - - if (lov->lov_tgts_kobj) - kobject_put(lov->lov_tgts_kobj); - - } else { - mutex_unlock(&lov->lov_lock); - } -} - -static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, - enum obd_notify_event ev); -static int lov_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev, void *data); - -int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data) -{ - struct lov_obd *lov = &obd->u.lov; - struct obd_uuid *tgt_uuid; - struct obd_device *tgt_obd; - static struct obd_uuid lov_osc_uuid = { "LOV_OSC_UUID" }; - struct obd_import *imp; - int rc; - - if (!lov->lov_tgts[index]) - return -EINVAL; - - tgt_uuid = &lov->lov_tgts[index]->ltd_uuid; - tgt_obd = lov->lov_tgts[index]->ltd_obd; - - if (!tgt_obd->obd_set_up) { - CERROR("Target %s not set up\n", obd_uuid2str(tgt_uuid)); - return -EINVAL; - } - - /* override the sp_me from lov */ - tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me; - - if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX)) - data->ocd_index = index; - - /* - * Divine LOV knows that OBDs under it are OSCs. - */ - imp = tgt_obd->u.cli.cl_import; - - if (activate) { - tgt_obd->obd_no_recov = 0; - /* FIXME this is probably supposed to be - * ptlrpc_set_import_active. Horrible naming. 
- */ - ptlrpc_activate_import(imp); - } - - rc = obd_register_observer(tgt_obd, obd); - if (rc) { - CERROR("Target %s register_observer error %d\n", - obd_uuid2str(tgt_uuid), rc); - return rc; - } - - if (imp->imp_invalid) { - CDEBUG(D_CONFIG, "not connecting OSC %s; administratively disabled\n", - obd_uuid2str(tgt_uuid)); - return 0; - } - - rc = obd_connect(NULL, &lov->lov_tgts[index]->ltd_exp, tgt_obd, - &lov_osc_uuid, data, NULL); - if (rc || !lov->lov_tgts[index]->ltd_exp) { - CERROR("Target %s connect error %d\n", - obd_uuid2str(tgt_uuid), rc); - return -ENODEV; - } - - lov->lov_tgts[index]->ltd_reap = 0; - - CDEBUG(D_CONFIG, "Connected tgt idx %d %s (%s) %sactive\n", index, - obd_uuid2str(tgt_uuid), tgt_obd->obd_name, activate ? "":"in"); - - if (lov->lov_tgts_kobj) - /* Even if we failed, that's ok */ - rc = sysfs_create_link(lov->lov_tgts_kobj, &tgt_obd->obd_kobj, - tgt_obd->obd_name); - - return 0; -} - -static int lov_connect(const struct lu_env *env, - struct obd_export **exp, struct obd_device *obd, - struct obd_uuid *cluuid, struct obd_connect_data *data, - void *localdata) -{ - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - struct lustre_handle conn; - int i, rc; - - CDEBUG(D_CONFIG, "connect #%d\n", lov->lov_connects); - - rc = class_connect(&conn, obd, cluuid); - if (rc) - return rc; - - *exp = class_conn2export(&conn); - - /* Why should there ever be more than 1 connect? */ - lov->lov_connects++; - LASSERT(lov->lov_connects == 1); - - memset(&lov->lov_ocd, 0, sizeof(lov->lov_ocd)); - if (data) - lov->lov_ocd = *data; - - obd_getref(obd); - - lov->lov_tgts_kobj = kobject_create_and_add("target_obds", - &obd->obd_kobj); - - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - tgt = lov->lov_tgts[i]; - if (!tgt || obd_uuid_empty(&tgt->ltd_uuid)) - continue; - /* Flags will be lowest common denominator */ - rc = lov_connect_obd(obd, i, tgt->ltd_activate, &lov->lov_ocd); - if (rc) { - CERROR("%s: lov connect tgt %d failed: %d\n", - obd->obd_name, i, rc); - continue; - } - /* connect to administrative disabled ost */ - if (!lov->lov_tgts[i]->ltd_exp) - continue; - - rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd, - OBD_NOTIFY_CONNECT, (void *)&i); - if (rc) { - CERROR("%s error sending notify %d\n", - obd->obd_name, rc); - } - } - obd_putref(obd); - - return 0; -} - -static int lov_disconnect_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) -{ - struct lov_obd *lov = &obd->u.lov; - struct obd_device *osc_obd; - int rc; - - osc_obd = class_exp2obd(tgt->ltd_exp); - CDEBUG(D_CONFIG, "%s: disconnecting target %s\n", - obd->obd_name, osc_obd ? osc_obd->obd_name : "NULL"); - - if (tgt->ltd_active) { - tgt->ltd_active = 0; - lov->desc.ld_active_tgt_count--; - tgt->ltd_exp->exp_obd->obd_inactive = 1; - } - - if (osc_obd) { - if (lov->lov_tgts_kobj) - sysfs_remove_link(lov->lov_tgts_kobj, - osc_obd->obd_name); - - /* Pass it on to our clients. - * XXX This should be an argument to disconnect, - * XXX not a back-door flag on the OBD. Ah well. 
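- *
- * (Editorial sketch of the alternative this XXX suggests; the names
- * below are hypothetical, not an API that exists in the tree. Since
- * obd_disconnect() itself cannot carry a mode, one way to narrow the
- * back door is to derive the flags from an explicit argument at a
- * single choke point:)
- *
- *	enum disc_mode { DISC_NORMAL, DISC_FORCE, DISC_FAIL };
- *
- *	static int osc_disconnect_mode(struct obd_export *exp,
- *				       enum disc_mode mode)
- *	{
- *		struct obd_device *osc = class_exp2obd(exp);
- *
- *		// the mode travels with the call, so concurrent
- *		// teardowns at least funnel through one place
- *		if (osc) {
- *			osc->obd_force = (mode == DISC_FORCE);
- *			osc->obd_fail = (mode == DISC_FAIL);
- *		}
- *		return obd_disconnect(exp);
- *	}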
- */ - osc_obd->obd_force = obd->obd_force; - osc_obd->obd_fail = obd->obd_fail; - osc_obd->obd_no_recov = obd->obd_no_recov; - } - - obd_register_observer(osc_obd, NULL); - - rc = obd_disconnect(tgt->ltd_exp); - if (rc) { - CERROR("Target %s disconnect error %d\n", - tgt->ltd_uuid.uuid, rc); - rc = 0; - } - - tgt->ltd_exp = NULL; - return 0; -} - -static int lov_disconnect(struct obd_export *exp) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lov_obd *lov = &obd->u.lov; - int i, rc; - - if (!lov->lov_tgts) - goto out; - - /* Only disconnect the underlying layers on the final disconnect. */ - lov->lov_connects--; - if (lov->lov_connects != 0) { - /* why should there be more than 1 connect? */ - CERROR("disconnect #%d\n", lov->lov_connects); - goto out; - } - - /* Let's hold another reference so lov_del_obd doesn't spin through - * putref every time - */ - obd_getref(obd); - - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (lov->lov_tgts[i] && lov->lov_tgts[i]->ltd_exp) { - /* Disconnection is the last we know about an obd */ - lov_del_target(obd, i, NULL, lov->lov_tgts[i]->ltd_gen); - } - } - - obd_putref(obd); - -out: - rc = class_disconnect(exp); /* bz 9811 */ - return rc; -} - -/* Error codes: - * - * -EINVAL : UUID can't be found in the LOV's target list - * -ENOTCONN: The UUID is found, but the target connection is bad (!) - * -EBADF : The UUID is found, but the OBD is the wrong type (!) - * any >= 0 : is log target index - */ -static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, - enum obd_notify_event ev) -{ - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - int index, activate, active; - - CDEBUG(D_INFO, "Searching in lov %p for uuid %s event(%d)\n", - lov, uuid->uuid, ev); - - obd_getref(obd); - for (index = 0; index < lov->desc.ld_tgt_count; index++) { - tgt = lov->lov_tgts[index]; - if (!tgt) - continue; - /* - * LU-642, initially inactive OSC could miss the obd_connect, - * we make up for it here. - */ - if (ev == OBD_NOTIFY_ACTIVATE && !tgt->ltd_exp && - obd_uuid_equals(uuid, &tgt->ltd_uuid)) { - struct obd_uuid lov_osc_uuid = {"LOV_OSC_UUID"}; - - obd_connect(NULL, &tgt->ltd_exp, tgt->ltd_obd, - &lov_osc_uuid, &lov->lov_ocd, NULL); - } - if (!tgt->ltd_exp) - continue; - - CDEBUG(D_INFO, "lov idx %d is %s conn %#llx\n", - index, obd_uuid2str(&tgt->ltd_uuid), - tgt->ltd_exp->exp_handle.h_cookie); - if (obd_uuid_equals(uuid, &tgt->ltd_uuid)) - break; - } - - if (index == lov->desc.ld_tgt_count) { - index = -EINVAL; - goto out; - } - - if (ev == OBD_NOTIFY_DEACTIVATE || ev == OBD_NOTIFY_ACTIVATE) { - activate = (ev == OBD_NOTIFY_ACTIVATE) ? 1 : 0; - - if (lov->lov_tgts[index]->ltd_activate == activate) { - CDEBUG(D_INFO, "OSC %s already %sactivate!\n", - uuid->uuid, activate ? "" : "de"); - } else { - lov->lov_tgts[index]->ltd_activate = activate; - CDEBUG(D_CONFIG, "%sactivate OSC %s\n", - activate ? "" : "de", obd_uuid2str(uuid)); - } - - } else if (ev == OBD_NOTIFY_INACTIVE || ev == OBD_NOTIFY_ACTIVE) { - active = (ev == OBD_NOTIFY_ACTIVE) ? 1 : 0; - - if (lov->lov_tgts[index]->ltd_active == active) { - CDEBUG(D_INFO, "OSC %s already %sactive!\n", - uuid->uuid, active ? "" : "in"); - goto out; - } - CDEBUG(D_CONFIG, "Marking OSC %s %sactive\n", - obd_uuid2str(uuid), active ? 
"" : "in"); - - lov->lov_tgts[index]->ltd_active = active; - if (active) { - lov->desc.ld_active_tgt_count++; - lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 0; - } else { - lov->desc.ld_active_tgt_count--; - lov->lov_tgts[index]->ltd_exp->exp_obd->obd_inactive = 1; - } - } else { - CERROR("Unknown event(%d) for uuid %s", ev, uuid->uuid); - } - - out: - obd_putref(obd); - return index; -} - -static int lov_notify(struct obd_device *obd, struct obd_device *watched, - enum obd_notify_event ev, void *data) -{ - int rc = 0; - struct lov_obd *lov = &obd->u.lov; - - down_read(&lov->lov_notify_lock); - if (!lov->lov_connects) { - up_read(&lov->lov_notify_lock); - return rc; - } - - if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE || - ev == OBD_NOTIFY_ACTIVATE || ev == OBD_NOTIFY_DEACTIVATE) { - struct obd_uuid *uuid; - - LASSERT(watched); - - if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME)) { - up_read(&lov->lov_notify_lock); - CERROR("unexpected notification of %s %s!\n", - watched->obd_type->typ_name, - watched->obd_name); - return -EINVAL; - } - uuid = &watched->u.cli.cl_target_uuid; - - /* Set OSC as active before notifying the observer, so the - * observer can use the OSC normally. - */ - rc = lov_set_osc_active(obd, uuid, ev); - if (rc < 0) { - up_read(&lov->lov_notify_lock); - CERROR("event(%d) of %s failed: %d\n", ev, - obd_uuid2str(uuid), rc); - return rc; - } - /* active event should be pass lov target index as data */ - data = &rc; - } - - /* Pass the notification up the chain. */ - if (watched) { - rc = obd_notify_observer(obd, watched, ev, data); - } else { - /* NULL watched means all osc's in the lov (only for syncs) */ - /* sync event should be send lov idx as data */ - struct lov_obd *lov = &obd->u.lov; - int i, is_sync; - - data = &i; - is_sync = (ev == OBD_NOTIFY_SYNC) || - (ev == OBD_NOTIFY_SYNC_NONBLOCK); - - obd_getref(obd); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i]) - continue; - - /* don't send sync event if target not - * connected/activated - */ - if (is_sync && !lov->lov_tgts[i]->ltd_active) - continue; - - rc = obd_notify_observer(obd, lov->lov_tgts[i]->ltd_obd, - ev, data); - if (rc) { - CERROR("%s: notify %s of %s failed %d\n", - obd->obd_name, - obd->obd_observer->obd_name, - lov->lov_tgts[i]->ltd_obd->obd_name, - rc); - } - } - obd_putref(obd); - } - - up_read(&lov->lov_notify_lock); - return rc; -} - -static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp, - __u32 index, int gen, int active) -{ - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - struct obd_device *tgt_obd; - int rc; - - CDEBUG(D_CONFIG, "uuid:%s idx:%d gen:%d active:%d\n", - uuidp->uuid, index, gen, active); - - if (gen <= 0) { - CERROR("request to add OBD %s with invalid generation: %d\n", - uuidp->uuid, gen); - return -EINVAL; - } - - tgt_obd = class_find_client_obd(uuidp, LUSTRE_OSC_NAME, - &obd->obd_uuid); - if (!tgt_obd) - return -EINVAL; - - mutex_lock(&lov->lov_lock); - - if ((index < lov->lov_tgt_size) && lov->lov_tgts[index]) { - tgt = lov->lov_tgts[index]; - CERROR("UUID %s already assigned at LOV target index %d\n", - obd_uuid2str(&tgt->ltd_uuid), index); - mutex_unlock(&lov->lov_lock); - return -EEXIST; - } - - if (index >= lov->lov_tgt_size) { - /* We need to reallocate the lov target array. 
*/ - struct lov_tgt_desc **newtgts, **old = NULL; - __u32 newsize, oldsize = 0; - - newsize = max_t(__u32, lov->lov_tgt_size, 2); - while (newsize < index + 1) - newsize <<= 1; - newtgts = kcalloc(newsize, sizeof(*newtgts), GFP_NOFS); - if (!newtgts) { - mutex_unlock(&lov->lov_lock); - return -ENOMEM; - } - - if (lov->lov_tgt_size) { - memcpy(newtgts, lov->lov_tgts, sizeof(*newtgts) * - lov->lov_tgt_size); - old = lov->lov_tgts; - oldsize = lov->lov_tgt_size; - } - - lov->lov_tgts = newtgts; - lov->lov_tgt_size = newsize; - smp_rmb(); - kfree(old); - - CDEBUG(D_CONFIG, "tgts: %p size: %d\n", - lov->lov_tgts, lov->lov_tgt_size); - } - - tgt = kzalloc(sizeof(*tgt), GFP_NOFS); - if (!tgt) { - mutex_unlock(&lov->lov_lock); - return -ENOMEM; - } - - rc = lov_ost_pool_add(&lov->lov_packed, index, lov->lov_tgt_size); - if (rc) { - mutex_unlock(&lov->lov_lock); - kfree(tgt); - return rc; - } - - tgt->ltd_uuid = *uuidp; - tgt->ltd_obd = tgt_obd; - /* XXX - add a sanity check on the generation number. */ - tgt->ltd_gen = gen; - tgt->ltd_index = index; - tgt->ltd_activate = active; - lov->lov_tgts[index] = tgt; - if (index >= lov->desc.ld_tgt_count) - lov->desc.ld_tgt_count = index + 1; - - mutex_unlock(&lov->lov_lock); - - CDEBUG(D_CONFIG, "idx=%d ltd_gen=%d ld_tgt_count=%d\n", - index, tgt->ltd_gen, lov->desc.ld_tgt_count); - - if (lov->lov_connects == 0) { - /* lov_connect hasn't been called yet. We'll do the - * lov_connect_obd on this target when that fn first runs, - * because we don't know the connect flags yet. - */ - return 0; - } - - obd_getref(obd); - - rc = lov_connect_obd(obd, index, active, &lov->lov_ocd); - if (rc) - goto out; - - /* connect to administrative disabled ost */ - if (!tgt->ltd_exp) { - rc = 0; - goto out; - } - - if (lov->lov_cache) { - rc = obd_set_info_async(NULL, tgt->ltd_exp, - sizeof(KEY_CACHE_SET), KEY_CACHE_SET, - sizeof(struct cl_client_cache), - lov->lov_cache, NULL); - if (rc < 0) - goto out; - } - - rc = lov_notify(obd, tgt->ltd_exp->exp_obd, - active ? 
OBD_NOTIFY_CONNECT : OBD_NOTIFY_INACTIVE, - (void *)&index); - -out: - if (rc) { - CERROR("add failed (%d), deleting %s\n", rc, - obd_uuid2str(&tgt->ltd_uuid)); - lov_del_target(obd, index, NULL, 0); - } - obd_putref(obd); - return rc; -} - -/* Schedule a target for deletion */ -int lov_del_target(struct obd_device *obd, __u32 index, - struct obd_uuid *uuidp, int gen) -{ - struct lov_obd *lov = &obd->u.lov; - int count = lov->desc.ld_tgt_count; - int rc = 0; - - if (index >= count) { - CERROR("LOV target index %d >= number of LOV OBDs %d.\n", - index, count); - return -EINVAL; - } - - /* to make sure there's no ongoing lov_notify() now */ - down_write(&lov->lov_notify_lock); - obd_getref(obd); - - if (!lov->lov_tgts[index]) { - CERROR("LOV target at index %d is not setup.\n", index); - rc = -EINVAL; - goto out; - } - - if (uuidp && !obd_uuid_equals(uuidp, &lov->lov_tgts[index]->ltd_uuid)) { - CERROR("LOV target UUID %s at index %d doesn't match %s.\n", - lov_uuid2str(lov, index), index, - obd_uuid2str(uuidp)); - rc = -EINVAL; - goto out; - } - - CDEBUG(D_CONFIG, "uuid: %s idx: %d gen: %d exp: %p active: %d\n", - lov_uuid2str(lov, index), index, - lov->lov_tgts[index]->ltd_gen, lov->lov_tgts[index]->ltd_exp, - lov->lov_tgts[index]->ltd_active); - - lov->lov_tgts[index]->ltd_reap = 1; - lov->lov_death_row++; - /* we really delete it from obd_putref */ -out: - obd_putref(obd); - up_write(&lov->lov_notify_lock); - - return rc; -} - -static void __lov_del_obd(struct obd_device *obd, struct lov_tgt_desc *tgt) -{ - struct obd_device *osc_obd; - - LASSERT(tgt); - LASSERT(tgt->ltd_reap); - - osc_obd = class_exp2obd(tgt->ltd_exp); - - CDEBUG(D_CONFIG, "Removing tgt %s : %s\n", - tgt->ltd_uuid.uuid, - osc_obd ? osc_obd->obd_name : ""); - - if (tgt->ltd_exp) - lov_disconnect_obd(obd, tgt); - - kfree(tgt); - - /* Manual cleanup - no cleanup logs to clean up the osc's. We must - * do it ourselves. And we can't do it from lov_cleanup, - * because we just lost our only reference to it. 
- */ - if (osc_obd) - class_manual_cleanup(osc_obd); -} - -void lov_fix_desc_stripe_size(__u64 *val) -{ - if (*val < LOV_MIN_STRIPE_SIZE) { - if (*val != 0) - LCONSOLE_INFO("Increasing default stripe size to minimum %u\n", - LOV_DESC_STRIPE_SIZE_DEFAULT); - *val = LOV_DESC_STRIPE_SIZE_DEFAULT; - } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { - *val &= ~(LOV_MIN_STRIPE_SIZE - 1); - LCONSOLE_WARN("Changing default stripe size to %llu (a multiple of %u)\n", - *val, LOV_MIN_STRIPE_SIZE); - } -} - -void lov_fix_desc_stripe_count(__u32 *val) -{ - if (*val == 0) - *val = 1; -} - -void lov_fix_desc_pattern(__u32 *val) -{ - /* from lov_setstripe */ - if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { - LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); - *val = 0; - } -} - -void lov_fix_desc_qos_maxage(__u32 *val) -{ - if (*val == 0) - *val = LOV_DESC_QOS_MAXAGE_DEFAULT; -} - -void lov_fix_desc(struct lov_desc *desc) -{ - lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); - lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); - lov_fix_desc_pattern(&desc->ld_pattern); - lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); -} - -int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lprocfs_static_vars lvars = { NULL }; - struct lov_desc *desc; - struct lov_obd *lov = &obd->u.lov; - int rc; - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { - CERROR("LOV setup requires a descriptor\n"); - return -EINVAL; - } - - desc = (struct lov_desc *)lustre_cfg_buf(lcfg, 1); - - if (sizeof(*desc) > LUSTRE_CFG_BUFLEN(lcfg, 1)) { - CERROR("descriptor size wrong: %d > %d\n", - (int)sizeof(*desc), LUSTRE_CFG_BUFLEN(lcfg, 1)); - return -EINVAL; - } - - if (desc->ld_magic != LOV_DESC_MAGIC) { - if (desc->ld_magic == __swab32(LOV_DESC_MAGIC)) { - CDEBUG(D_OTHER, "%s: Swabbing lov desc %p\n", - obd->obd_name, desc); - lustre_swab_lov_desc(desc); - } else { - CERROR("%s: Bad lov desc magic: %#x\n", - obd->obd_name, desc->ld_magic); - return -EINVAL; - } - } - - lov_fix_desc(desc); - - desc->ld_active_tgt_count = 0; - lov->desc = *desc; - lov->lov_tgt_size = 0; - - mutex_init(&lov->lov_lock); - atomic_set(&lov->lov_refcount, 0); - lov->lov_sp_me = LUSTRE_SP_CLI; - - init_rwsem(&lov->lov_notify_lock); - - INIT_LIST_HEAD(&lov->lov_pool_list); - lov->lov_pool_count = 0; - rc = lov_pool_hash_init(&lov->lov_pools_hash_body); - if (rc) - goto out; - rc = lov_ost_pool_init(&lov->lov_packed, 0); - if (rc) - goto out; - - lprocfs_lov_init_vars(&lvars); - lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars); - - debugfs_create_file("target_obd", 0444, obd->obd_debugfs_entry, obd, - &lov_proc_target_fops); - - lov->lov_pool_debugfs_entry = debugfs_create_dir("pools", - obd->obd_debugfs_entry); - return 0; - -out: - return rc; -} - -static int lov_cleanup(struct obd_device *obd) -{ - struct lov_obd *lov = &obd->u.lov; - struct pool_desc *pool, *tmp; - - list_for_each_entry_safe(pool, tmp, &lov->lov_pool_list, pool_list) { - /* free pool structs */ - CDEBUG(D_INFO, "delete pool %p\n", pool); - /* In the function below, .hs_keycmp resolves to - * pool_hashkey_keycmp() - */ - /* coverity[overrun-buffer-val] */ - lov_pool_del(obd, pool->pool_name); - } - lov_pool_hash_destroy(&lov->lov_pools_hash_body); - lov_ost_pool_free(&lov->lov_packed); - - lprocfs_obd_cleanup(obd); - if (lov->lov_tgts) { - int i; - - obd_getref(obd); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i]) - continue; - - /* Inactive targets may never have connected */ - if (lov->lov_tgts[i]->ltd_active || - 
atomic_read(&lov->lov_refcount)) - /* We should never get here - these - * should have been removed in the - * disconnect. - */ - CERROR("lov tgt %d not cleaned! deathrow=%d, lovrc=%d\n", - i, lov->lov_death_row, - atomic_read(&lov->lov_refcount)); - lov_del_target(obd, i, NULL, 0); - } - obd_putref(obd); - kfree(lov->lov_tgts); - lov->lov_tgt_size = 0; - } - - if (lov->lov_cache) { - cl_cache_decref(lov->lov_cache); - lov->lov_cache = NULL; - } - - return 0; -} - -int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, - __u32 *indexp, int *genp) -{ - struct obd_uuid obd_uuid; - int cmd; - int rc = 0; - - switch (cmd = lcfg->lcfg_command) { - case LCFG_LOV_ADD_OBD: - case LCFG_LOV_ADD_INA: - case LCFG_LOV_DEL_OBD: { - __u32 index; - int gen; - /* lov_modify_tgts add 0:lov_mdsA 1:ost1_UUID 2:0 3:1 */ - if (LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(obd_uuid.uuid)) { - rc = -EINVAL; - goto out; - } - - obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1)); - - rc = kstrtoint(lustre_cfg_buf(lcfg, 2), 10, indexp); - if (rc < 0) - goto out; - rc = kstrtoint(lustre_cfg_buf(lcfg, 3), 10, genp); - if (rc < 0) - goto out; - index = *indexp; - gen = *genp; - if (cmd == LCFG_LOV_ADD_OBD) - rc = lov_add_target(obd, &obd_uuid, index, gen, 1); - else if (cmd == LCFG_LOV_ADD_INA) - rc = lov_add_target(obd, &obd_uuid, index, gen, 0); - else - rc = lov_del_target(obd, index, &obd_uuid, gen); - goto out; - } - case LCFG_PARAM: { - struct lprocfs_static_vars lvars = { NULL }; - struct lov_desc *desc = &obd->u.lov.desc; - - if (!desc) { - rc = -EINVAL; - goto out; - } - - lprocfs_lov_init_vars(&lvars); - - rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; - goto out; - } - case LCFG_POOL_NEW: - case LCFG_POOL_ADD: - case LCFG_POOL_DEL: - case LCFG_POOL_REM: - goto out; - - default: { - CERROR("Unknown command: %d\n", lcfg->lcfg_command); - rc = -EINVAL; - goto out; - } - } -out: - return rc; -} - -static int -lov_statfs_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) -{ - struct lov_request_set *lovset = (struct lov_request_set *)data; - int err; - - if (rc) - atomic_set(&lovset->set_completes, 0); - - err = lov_fini_statfs_set(lovset); - return rc ? rc : err; -} - -static int lov_statfs_async(struct obd_export *exp, struct obd_info *oinfo, - __u64 max_age, struct ptlrpc_request_set *rqset) -{ - struct obd_device *obd = class_exp2obd(exp); - struct lov_request_set *set; - struct lov_request *req; - struct lov_obd *lov; - int rc = 0; - - LASSERT(oinfo->oi_osfs); - - lov = &obd->u.lov; - rc = lov_prep_statfs_set(obd, oinfo, &set); - if (rc) - return rc; - - list_for_each_entry(req, &set->set_list, rq_link) { - rc = obd_statfs_async(lov->lov_tgts[req->rq_idx]->ltd_exp, - &req->rq_oi, max_age, rqset); - if (rc) - break; - } - - if (rc || list_empty(&rqset->set_requests)) { - int err; - - if (rc) - atomic_set(&set->set_completes, 0); - err = lov_fini_statfs_set(set); - return rc ? 
rc : err; - } - - LASSERT(!rqset->set_interpret); - rqset->set_interpret = lov_statfs_interpret; - rqset->set_arg = (void *)set; - return 0; -} - -static int lov_statfs(const struct lu_env *env, struct obd_export *exp, - struct obd_statfs *osfs, __u64 max_age, __u32 flags) -{ - struct ptlrpc_request_set *set = NULL; - struct obd_info oinfo = { - .oi_osfs = osfs, - .oi_flags = flags, - }; - int rc = 0; - - /* for obdclass we forbid using obd_statfs_rqset, but prefer using async - * statfs requests - */ - set = ptlrpc_prep_set(); - if (!set) - return -ENOMEM; - - rc = lov_statfs_async(exp, &oinfo, max_age, set); - if (rc == 0) - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); - - return rc; -} - -static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, - void *karg, void __user *uarg) -{ - struct obd_device *obddev = class_exp2obd(exp); - struct lov_obd *lov = &obddev->u.lov; - int i = 0, rc = 0, count = lov->desc.ld_tgt_count; - struct obd_uuid *uuidp; - - switch (cmd) { - case IOC_OBD_STATFS: { - struct obd_ioctl_data *data = karg; - struct obd_device *osc_obd; - struct obd_statfs stat_buf = {0}; - __u32 index; - __u32 flags; - - memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); - if (index >= count) - return -ENODEV; - - if (!lov->lov_tgts[index]) - /* Try again with the next index */ - return -EAGAIN; - if (!lov->lov_tgts[index]->ltd_active) - return -ENODATA; - - osc_obd = class_exp2obd(lov->lov_tgts[index]->ltd_exp); - if (!osc_obd) - return -EINVAL; - - /* copy UUID */ - if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(osc_obd), - min_t(unsigned long, data->ioc_plen2, - sizeof(struct obd_uuid)))) - return -EFAULT; - - memcpy(&flags, data->ioc_inlbuf1, sizeof(__u32)); - flags = flags & LL_STATFS_NODELAY ? OBD_STATFS_NODELAY : 0; - - /* got statfs data */ - rc = obd_statfs(NULL, lov->lov_tgts[index]->ltd_exp, &stat_buf, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - flags); - if (rc) - return rc; - if (copy_to_user(data->ioc_pbuf1, &stat_buf, - min_t(unsigned long, data->ioc_plen1, - sizeof(stat_buf)))) - return -EFAULT; - break; - } - case OBD_IOC_LOV_GET_CONFIG: { - struct obd_ioctl_data *data; - struct lov_desc *desc; - char *buf = NULL; - __u32 *genp; - - len = 0; - if (obd_ioctl_getdata(&buf, &len, uarg)) - return -EINVAL; - - data = (struct obd_ioctl_data *)buf; - - if (sizeof(*desc) > data->ioc_inllen1) { - kvfree(buf); - return -EINVAL; - } - - if (sizeof(uuidp->uuid) * count > data->ioc_inllen2) { - kvfree(buf); - return -EINVAL; - } - - if (sizeof(__u32) * count > data->ioc_inllen3) { - kvfree(buf); - return -EINVAL; - } - - desc = (struct lov_desc *)data->ioc_inlbuf1; - memcpy(desc, &lov->desc, sizeof(*desc)); - - uuidp = (struct obd_uuid *)data->ioc_inlbuf2; - genp = (__u32 *)data->ioc_inlbuf3; - /* the uuid will be empty for deleted OSTs */ - for (i = 0; i < count; i++, uuidp++, genp++) { - if (!lov->lov_tgts[i]) - continue; - *uuidp = lov->lov_tgts[i]->ltd_uuid; - *genp = lov->lov_tgts[i]->ltd_gen; - } - - if (copy_to_user(uarg, buf, len)) - rc = -EFAULT; - kvfree(buf); - break; - } - case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl = karg; - struct lov_tgt_desc *tgt = NULL; - struct obd_quotactl *oqctl; - - if (qctl->qc_valid == QC_OSTIDX) { - if (count <= qctl->qc_idx) - return -EINVAL; - - tgt = lov->lov_tgts[qctl->qc_idx]; - if (!tgt || !tgt->ltd_exp) - return -EINVAL; - } else if (qctl->qc_valid == QC_UUID) { - for (i = 0; i < count; i++) { - tgt = lov->lov_tgts[i]; - if (!tgt || - !obd_uuid_equals(&tgt->ltd_uuid, - &qctl->obd_uuid)) - 
continue; - - if (!tgt->ltd_exp) - return -EINVAL; - - break; - } - } else { - return -EINVAL; - } - - if (i >= count) - return -EAGAIN; - - LASSERT(tgt && tgt->ltd_exp); - oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); - if (!oqctl) - return -ENOMEM; - - QCTL_COPY(oqctl, qctl); - rc = obd_quotactl(tgt->ltd_exp, oqctl); - if (rc == 0) { - QCTL_COPY(qctl, oqctl); - qctl->qc_valid = QC_OSTIDX; - qctl->obd_uuid = tgt->ltd_uuid; - } - kfree(oqctl); - break; - } - default: { - int set = 0; - - if (count == 0) - return -ENOTTY; - - for (i = 0; i < count; i++) { - int err; - struct obd_device *osc_obd; - - /* OST was disconnected */ - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) - continue; - - /* ll_umount_begin() sets force flag but for lov, not - * osc. Let's pass it through - */ - osc_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp); - osc_obd->obd_force = obddev->obd_force; - err = obd_iocontrol(cmd, lov->lov_tgts[i]->ltd_exp, - len, karg, uarg); - if (err) { - if (lov->lov_tgts[i]->ltd_active) { - CDEBUG(err == -ENOTTY ? - D_IOCTL : D_WARNING, - "iocontrol OSC %s on OST idx %d cmd %x: err = %d\n", - lov_uuid2str(lov, i), - i, cmd, err); - if (!rc) - rc = err; - } - } else { - set = 1; - } - } - if (!set && !rc) - rc = -EIO; - } - } - - return rc; -} - -static int lov_get_info(const struct lu_env *env, struct obd_export *exp, - __u32 keylen, void *key, __u32 *vallen, void *val) -{ - struct obd_device *obddev = class_exp2obd(exp); - struct lov_obd *lov = &obddev->u.lov; - struct lov_desc *ld = &lov->desc; - int rc = 0; - - if (!vallen || !val) - return -EFAULT; - - obd_getref(obddev); - - if (KEY_IS(KEY_MAX_EASIZE)) { - u32 max_stripe_count = min_t(u32, ld->ld_active_tgt_count, - LOV_MAX_STRIPE_COUNT); - - *((u32 *)val) = lov_mds_md_size(max_stripe_count, LOV_MAGIC_V3); - } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { - u32 def_stripe_count = min_t(u32, ld->ld_default_stripe_count, - LOV_MAX_STRIPE_COUNT); - - *((u32 *)val) = lov_mds_md_size(def_stripe_count, LOV_MAGIC_V3); - } else if (KEY_IS(KEY_TGT_COUNT)) { - *((int *)val) = lov->desc.ld_tgt_count; - } else { - rc = -EINVAL; - } - - obd_putref(obddev); - return rc; -} - -static int lov_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, - void *val, struct ptlrpc_request_set *set) -{ - struct obd_device *obddev = class_exp2obd(exp); - struct lov_obd *lov = &obddev->u.lov; - u32 count; - int i, rc = 0, err; - struct lov_tgt_desc *tgt; - int do_inactive = 0, no_set = 0; - - if (!set) { - no_set = 1; - set = ptlrpc_prep_set(); - if (!set) - return -ENOMEM; - } - - obd_getref(obddev); - count = lov->desc.ld_tgt_count; - - if (KEY_IS(KEY_CHECKSUM)) { - do_inactive = 1; - } else if (KEY_IS(KEY_CACHE_SET)) { - LASSERT(!lov->lov_cache); - lov->lov_cache = val; - do_inactive = 1; - cl_cache_incref(lov->lov_cache); - } - - for (i = 0; i < count; i++) { - tgt = lov->lov_tgts[i]; - - /* OST was disconnected */ - if (!tgt || !tgt->ltd_exp) - continue; - - /* OST is inactive and we don't want inactive OSCs */ - if (!tgt->ltd_active && !do_inactive) - continue; - - err = obd_set_info_async(env, tgt->ltd_exp, keylen, key, - vallen, val, set); - if (!rc) - rc = err; - } - - obd_putref(obddev); - if (no_set) { - err = ptlrpc_set_wait(set); - if (!rc) - rc = err; - ptlrpc_set_destroy(set); - } - return rc; -} - -void lov_stripe_lock(struct lov_stripe_md *md) - __acquires(&md->lsm_lock) -{ - LASSERT(md->lsm_lock_owner != current->pid); - spin_lock(&md->lsm_lock); - LASSERT(md->lsm_lock_owner == 0); - 
md->lsm_lock_owner = current->pid; -} - -void lov_stripe_unlock(struct lov_stripe_md *md) - __releases(&md->lsm_lock) -{ - LASSERT(md->lsm_lock_owner == current->pid); - md->lsm_lock_owner = 0; - spin_unlock(&md->lsm_lock); -} - -static int lov_quotactl(struct obd_device *obd, struct obd_export *exp, - struct obd_quotactl *oqctl) -{ - struct lov_obd *lov = &obd->u.lov; - struct lov_tgt_desc *tgt; - __u64 curspace = 0; - __u64 bhardlimit = 0; - int i, rc = 0; - - if (oqctl->qc_cmd != Q_GETOQUOTA && - oqctl->qc_cmd != LUSTRE_Q_SETQUOTA) { - CERROR("bad quota opc %x for lov obd\n", oqctl->qc_cmd); - return -EFAULT; - } - - /* for lov tgt */ - obd_getref(obd); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - int err; - - tgt = lov->lov_tgts[i]; - - if (!tgt) - continue; - - if (!tgt->ltd_active || tgt->ltd_reap) { - if (oqctl->qc_cmd == Q_GETOQUOTA && - lov->lov_tgts[i]->ltd_activate) { - rc = -EREMOTEIO; - CERROR("ost %d is inactive\n", i); - } else { - CDEBUG(D_HA, "ost %d is inactive\n", i); - } - continue; - } - - err = obd_quotactl(tgt->ltd_exp, oqctl); - if (err) { - if (tgt->ltd_active && !rc) - rc = err; - continue; - } - - if (oqctl->qc_cmd == Q_GETOQUOTA) { - curspace += oqctl->qc_dqblk.dqb_curspace; - bhardlimit += oqctl->qc_dqblk.dqb_bhardlimit; - } - } - obd_putref(obd); - - if (oqctl->qc_cmd == Q_GETOQUOTA) { - oqctl->qc_dqblk.dqb_curspace = curspace; - oqctl->qc_dqblk.dqb_bhardlimit = bhardlimit; - } - return rc; -} - -static struct obd_ops lov_obd_ops = { - .owner = THIS_MODULE, - .setup = lov_setup, - .cleanup = lov_cleanup, - /*.process_config = lov_process_config,*/ - .connect = lov_connect, - .disconnect = lov_disconnect, - .statfs = lov_statfs, - .statfs_async = lov_statfs_async, - .iocontrol = lov_iocontrol, - .get_info = lov_get_info, - .set_info_async = lov_set_info_async, - .notify = lov_notify, - .pool_new = lov_pool_new, - .pool_rem = lov_pool_remove, - .pool_add = lov_pool_add, - .pool_del = lov_pool_del, - .getref = lov_getref, - .putref = lov_putref, - .quotactl = lov_quotactl, -}; - -struct kmem_cache *lov_oinfo_slab; - -static int __init lov_init(void) -{ - struct lprocfs_static_vars lvars = { NULL }; - int rc; - - /* print an address of _any_ initialized kernel symbol from this - * module, to allow debugging with gdb that doesn't support data - * symbols from modules. - */ - CDEBUG(D_INFO, "Lustre LOV module (%p).\n", &lov_caches); - - rc = libcfs_setup(); - if (rc) - return rc; - - rc = lu_kmem_init(lov_caches); - if (rc) - return rc; - - lov_oinfo_slab = kmem_cache_create("lov_oinfo", - sizeof(struct lov_oinfo), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!lov_oinfo_slab) { - lu_kmem_fini(lov_caches); - return -ENOMEM; - } - lprocfs_lov_init_vars(&lvars); - - rc = class_register_type(&lov_obd_ops, NULL, - LUSTRE_LOV_NAME, &lov_device_type); - - if (rc) { - kmem_cache_destroy(lov_oinfo_slab); - lu_kmem_fini(lov_caches); - } - - return rc; -} - -static void /*__exit*/ lov_exit(void) -{ - class_unregister_type(LUSTRE_LOV_NAME); - kmem_cache_destroy(lov_oinfo_slab); - - lu_kmem_fini(lov_caches); -} - -MODULE_AUTHOR("OpenSFS, Inc. 
"); -MODULE_DESCRIPTION("Lustre Logical Object Volume"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(LUSTRE_VERSION_STRING); - -module_init(lov_init); -module_exit(lov_exit); diff --git a/drivers/staging/lustre/lustre/lov/lov_object.c b/drivers/staging/lustre/lustre/lov/lov_object.c deleted file mode 100644 index adc90f310fd7a9a2d99be6ba209c594f1a1981f2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_object.c +++ /dev/null @@ -1,1625 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_object for LOV layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -static inline struct lov_device *lov_object_dev(struct lov_object *obj) -{ - return lu2lov_dev(obj->lo_cl.co_lu.lo_dev); -} - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Layout operations. - * - */ - -struct lov_layout_operations { - int (*llo_init)(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, struct lov_stripe_md *lsm, - const struct cl_object_conf *conf, - union lov_layout_state *state); - int (*llo_delete)(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state); - void (*llo_fini)(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state); - void (*llo_install)(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state); - int (*llo_print)(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o); - int (*llo_page_init)(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index); - int (*llo_lock_init)(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io); - int (*llo_io_init)(const struct lu_env *env, - struct cl_object *obj, struct cl_io *io); - int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr); -}; - -static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov); - -static void lov_lsm_put(struct lov_stripe_md *lsm) -{ - if (lsm) - lov_free_memmd(&lsm); -} - -/***************************************************************************** - * - * Lov object layout operations. 
- * - */ - -static void lov_install_empty(const struct lu_env *env, - struct lov_object *lov, - union lov_layout_state *state) -{ - /* - * File without objects. - */ -} - -static int lov_init_empty(const struct lu_env *env, struct lov_device *dev, - struct lov_object *lov, struct lov_stripe_md *lsm, - const struct cl_object_conf *conf, - union lov_layout_state *state) -{ - return 0; -} - -static void lov_install_raid0(const struct lu_env *env, - struct lov_object *lov, - union lov_layout_state *state) -{ -} - -static struct cl_object *lov_sub_find(const struct lu_env *env, - struct cl_device *dev, - const struct lu_fid *fid, - const struct cl_object_conf *conf) -{ - struct lu_object *o; - - o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); - LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); - return lu2cl(o); -} - -static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, - struct cl_object *stripe, struct lov_layout_raid0 *r0, - int idx) -{ - struct cl_object_header *hdr; - struct cl_object_header *subhdr; - struct cl_object_header *parent; - struct lov_oinfo *oinfo; - int result; - - if (OBD_FAIL_CHECK(OBD_FAIL_LOV_INIT)) { - /* For sanity:test_206. - * Do not leave the object in cache to avoid accessing - * freed memory. This is because osc_object is referring to - * lov_oinfo of lsm_stripe_data which will be freed due to - * this failure. - */ - cl_object_kill(env, stripe); - cl_object_put(env, stripe); - return -EIO; - } - - hdr = cl_object_header(lov2cl(lov)); - subhdr = cl_object_header(stripe); - - oinfo = lov->lo_lsm->lsm_oinfo[idx]; - CDEBUG(D_INODE, DFID "@%p[%d] -> " DFID "@%p: ostid: " DOSTID " idx: %d gen: %d\n", - PFID(&subhdr->coh_lu.loh_fid), subhdr, idx, - PFID(&hdr->coh_lu.loh_fid), hdr, POSTID(&oinfo->loi_oi), - oinfo->loi_ost_idx, oinfo->loi_ost_gen); - - /* reuse ->coh_attr_guard to protect coh_parent change */ - spin_lock(&subhdr->coh_attr_guard); - parent = subhdr->coh_parent; - if (!parent) { - subhdr->coh_parent = hdr; - spin_unlock(&subhdr->coh_attr_guard); - subhdr->coh_nesting = hdr->coh_nesting + 1; - lu_object_ref_add(&stripe->co_lu, "lov-parent", lov); - r0->lo_sub[idx] = cl2lovsub(stripe); - r0->lo_sub[idx]->lso_super = lov; - r0->lo_sub[idx]->lso_index = idx; - result = 0; - } else { - struct lu_object *old_obj; - struct lov_object *old_lov; - unsigned int mask = D_INODE; - - spin_unlock(&subhdr->coh_attr_guard); - old_obj = lu_object_locate(&parent->coh_lu, &lov_device_type); - LASSERT(old_obj); - old_lov = cl2lov(lu2cl(old_obj)); - if (old_lov->lo_layout_invalid) { - /* the object's layout has already changed but isn't - * refreshed - */ - lu_object_unhash(env, &stripe->co_lu); - result = -EAGAIN; - } else { - mask = D_ERROR; - result = -EIO; - } - - LU_OBJECT_DEBUG(mask, env, &stripe->co_lu, - "stripe %d is already owned.", idx); - LU_OBJECT_DEBUG(mask, env, old_obj, "owned."); - LU_OBJECT_HEADER(mask, env, lov2lu(lov), "try to own.\n"); - cl_object_put(env, stripe); - } - return result; -} - -static int lov_page_slice_fixup(struct lov_object *lov, - struct cl_object *stripe) -{ - struct cl_object_header *hdr = cl_object_header(&lov->lo_cl); - struct cl_object *o; - - if (!stripe) - return hdr->coh_page_bufsize - lov->lo_cl.co_slice_off - - cfs_size_round(sizeof(struct lov_page)); - - cl_object_for_each(o, stripe) - o->co_slice_off += hdr->coh_page_bufsize; - - return cl_object_header(stripe)->coh_page_bufsize; -} - -static int lov_init_raid0(const struct lu_env *env, struct lov_device *dev, - struct 
lov_object *lov, struct lov_stripe_md *lsm,
-			  const struct cl_object_conf *conf,
-			  union lov_layout_state *state)
-{
-	int result;
-	int i;
-
-	struct cl_object *stripe;
-	struct lov_thread_info *lti = lov_env_info(env);
-	struct cl_object_conf *subconf = &lti->lti_stripe_conf;
-	struct lu_fid *ofid = &lti->lti_fid;
-	struct lov_layout_raid0 *r0 = &state->raid0;
-
-	if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) {
-		dump_lsm(D_ERROR, lsm);
-		LASSERTF(0, "magic mismatch, expected %d/%d, actual %d.\n",
-			 LOV_MAGIC_V1, LOV_MAGIC_V3, lsm->lsm_magic);
-	}
-
-	LASSERT(!lov->lo_lsm);
-	lov->lo_lsm = lsm_addref(lsm);
-	lov->lo_layout_invalid = true;
-	r0->lo_nr = lsm->lsm_stripe_count;
-	LASSERT(r0->lo_nr <= lov_targets_nr(dev));
-
-	r0->lo_sub = kvzalloc(r0->lo_nr * sizeof(r0->lo_sub[0]),
-			      GFP_NOFS);
-	if (r0->lo_sub) {
-		int psz = 0;
-
-		result = 0;
-		subconf->coc_inode = conf->coc_inode;
-		spin_lock_init(&r0->lo_sub_lock);
-		/*
-		 * Create stripe cl_objects.
-		 */
-		for (i = 0; i < r0->lo_nr && result == 0; ++i) {
-			struct cl_device *subdev;
-			struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
-			int ost_idx = oinfo->loi_ost_idx;
-
-			if (lov_oinfo_is_dummy(oinfo))
-				continue;
-
-			result = ostid_to_fid(ofid, &oinfo->loi_oi,
-					      oinfo->loi_ost_idx);
-			if (result != 0)
-				goto out;
-
-			if (!dev->ld_target[ost_idx]) {
-				CERROR("%s: OST %04x is not initialized\n",
-				       lov2obd(dev->ld_lov)->obd_name, ost_idx);
-				result = -EIO;
-				goto out;
-			}
-
-			subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
-			subconf->u.coc_oinfo = oinfo;
-			LASSERTF(subdev, "not init ost %d\n", ost_idx);
-			/* In the function below, .hs_keycmp resolves to
-			 * lu_obj_hop_keycmp()
-			 */
-			/* coverity[overrun-buffer-val] */
-			stripe = lov_sub_find(env, subdev, ofid, subconf);
-			if (!IS_ERR(stripe)) {
-				result = lov_init_sub(env, lov, stripe, r0, i);
-				if (result == -EAGAIN) { /* try again */
-					--i;
-					result = 0;
-					continue;
-				}
-			} else {
-				result = PTR_ERR(stripe);
-			}
-
-			if (result == 0) {
-				int sz = lov_page_slice_fixup(lov, stripe);
-
-				LASSERT(ergo(psz > 0, psz == sz));
-				psz = sz;
-			}
-		}
-		if (result == 0)
-			cl_object_header(&lov->lo_cl)->coh_page_bufsize += psz;
-	} else {
-		result = -ENOMEM;
-	}
-out:
-	return result;
-}
-
-static int lov_init_released(const struct lu_env *env, struct lov_device *dev,
-			     struct lov_object *lov, struct lov_stripe_md *lsm,
-			     const struct cl_object_conf *conf,
-			     union lov_layout_state *state)
-{
-	LASSERT(lsm);
-	LASSERT(lsm_is_released(lsm));
-	LASSERT(!lov->lo_lsm);
-
-	lov->lo_lsm = lsm_addref(lsm);
-	return 0;
-}
-
-static struct cl_object *lov_find_subobj(const struct lu_env *env,
-					 struct lov_object *lov,
-					 struct lov_stripe_md *lsm,
-					 int stripe_idx)
-{
-	struct lov_device *dev = lu2lov_dev(lov2lu(lov)->lo_dev);
-	struct lov_oinfo *oinfo = lsm->lsm_oinfo[stripe_idx];
-	struct lov_thread_info *lti = lov_env_info(env);
-	struct lu_fid *ofid = &lti->lti_fid;
-	struct cl_device *subdev;
-	struct cl_object *result;
-	int ost_idx;
-	int rc;
-
-	if (lov->lo_type != LLT_RAID0) {
-		result = NULL;
-		goto out;
-	}
-
-	ost_idx = oinfo->loi_ost_idx;
-	rc = ostid_to_fid(ofid, &oinfo->loi_oi, ost_idx);
-	if (rc) {
-		result = NULL;
-		goto out;
-	}
-
-	subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
-	result = lov_sub_find(env, subdev, ofid, NULL);
-out:
-	if (!result)
-		result = ERR_PTR(-EINVAL);
-	return result;
-}
-
-static int lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
-			    union lov_layout_state *state)
-{
-	LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED);
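-	/* an empty or released layout has no sub-objects to tear down;
-	 * just drain any IO still in flight
-	 */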
- - lov_layout_wait(env, lov); - return 0; -} - -static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov, - struct lovsub_object *los, int idx) -{ - struct cl_object *sub; - struct lov_layout_raid0 *r0; - struct lu_site *site; - wait_queue_head_t *wq; - wait_queue_entry_t *waiter; - - r0 = &lov->u.raid0; - LASSERT(r0->lo_sub[idx] == los); - - sub = lovsub2cl(los); - site = sub->co_lu.lo_dev->ld_site; - wq = lu_site_wq_from_fid(site, &sub->co_lu.lo_header->loh_fid); - - cl_object_kill(env, sub); - /* release a reference to the sub-object and ... */ - lu_object_ref_del(&sub->co_lu, "lov-parent", lov); - cl_object_put(env, sub); - - /* ... wait until it is actually destroyed---sub-object clears its - * ->lo_sub[] slot in lovsub_object_fini() - */ - if (r0->lo_sub[idx] == los) { - waiter = &lov_env_info(env)->lti_waiter; - init_waitqueue_entry(waiter, current); - add_wait_queue(wq, waiter); - set_current_state(TASK_UNINTERRUPTIBLE); - while (1) { - /* this wait-queue is signaled at the end of - * lu_object_free(). - */ - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock(&r0->lo_sub_lock); - if (r0->lo_sub[idx] == los) { - spin_unlock(&r0->lo_sub_lock); - schedule(); - } else { - spin_unlock(&r0->lo_sub_lock); - set_current_state(TASK_RUNNING); - break; - } - } - remove_wait_queue(wq, waiter); - } - LASSERT(!r0->lo_sub[idx]); -} - -static int lov_delete_raid0(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state) -{ - struct lov_layout_raid0 *r0 = &state->raid0; - struct lov_stripe_md *lsm = lov->lo_lsm; - int i; - - dump_lsm(D_INODE, lsm); - - lov_layout_wait(env, lov); - if (r0->lo_sub) { - for (i = 0; i < r0->lo_nr; ++i) { - struct lovsub_object *los = r0->lo_sub[i]; - - if (los) { - cl_object_prune(env, &los->lso_cl); - /* - * If top-level object is to be evicted from - * the cache, so are its sub-objects. - */ - lov_subobject_kill(env, lov, los, i); - } - } - } - return 0; -} - -static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state) -{ - LASSERT(lov->lo_type == LLT_EMPTY || lov->lo_type == LLT_RELEASED); -} - -static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state) -{ - struct lov_layout_raid0 *r0 = &state->raid0; - - if (r0->lo_sub) { - kvfree(r0->lo_sub); - r0->lo_sub = NULL; - } - - dump_lsm(D_INODE, lov->lo_lsm); - lov_free_memmd(&lov->lo_lsm); -} - -static void lov_fini_released(const struct lu_env *env, struct lov_object *lov, - union lov_layout_state *state) -{ - dump_lsm(D_INODE, lov->lo_lsm); - lov_free_memmd(&lov->lo_lsm); -} - -static int lov_print_empty(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - (*p)(env, cookie, "empty %d\n", lu2lov(o)->lo_layout_invalid); - return 0; -} - -static int lov_print_raid0(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - struct lov_object *lov = lu2lov(o); - struct lov_layout_raid0 *r0 = lov_r0(lov); - struct lov_stripe_md *lsm = lov->lo_lsm; - int i; - - (*p)(env, cookie, "stripes: %d, %s, lsm{%p 0x%08X %d %u %u}:\n", - r0->lo_nr, lov->lo_layout_invalid ? 
"invalid" : "valid", lsm, - lsm->lsm_magic, atomic_read(&lsm->lsm_refc), - lsm->lsm_stripe_count, lsm->lsm_layout_gen); - for (i = 0; i < r0->lo_nr; ++i) { - struct lu_object *sub; - - if (r0->lo_sub[i]) { - sub = lovsub2lu(r0->lo_sub[i]); - lu_object_print(env, cookie, p, sub); - } else { - (*p)(env, cookie, "sub %d absent\n", i); - } - } - return 0; -} - -static int lov_print_released(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - struct lov_object *lov = lu2lov(o); - struct lov_stripe_md *lsm = lov->lo_lsm; - - (*p)(env, cookie, - "released: %s, lsm{%p 0x%08X %d %u %u}:\n", - lov->lo_layout_invalid ? "invalid" : "valid", lsm, - lsm->lsm_magic, atomic_read(&lsm->lsm_refc), - lsm->lsm_stripe_count, lsm->lsm_layout_gen); - return 0; -} - -/** - * Implements cl_object_operations::coo_attr_get() method for an object - * without stripes (LLT_EMPTY layout type). - * - * The only attributes this layer is authoritative in this case is - * cl_attr::cat_blocks---it's 0. - */ -static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - attr->cat_blocks = 0; - return 0; -} - -static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_layout_raid0 *r0 = lov_r0(lov); - struct cl_attr *lov_attr = &r0->lo_attr; - int result = 0; - - /* this is called w/o holding type guard mutex, so it must be inside - * an on going IO otherwise lsm may be replaced. - * LU-2117: it turns out there exists one exception. For mmaped files, - * the lock of those files may be requested in the other file's IO - * context, and this function is called in ccc_lock_state(), it will - * hit this assertion. - * Anyway, it's still okay to call attr_get w/o type guard as layout - * can't go if locks exist. - */ - /* LASSERT(atomic_read(&lsm->lsm_refc) > 1); */ - - if (!r0->lo_attr_valid) { - struct lov_stripe_md *lsm = lov->lo_lsm; - struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; - __u64 kms = 0; - - memset(lvb, 0, sizeof(*lvb)); - /* XXX: timestamps can be negative by sanity:test_39m, - * how can it be? - */ - lvb->lvb_atime = LLONG_MIN; - lvb->lvb_ctime = LLONG_MIN; - lvb->lvb_mtime = LLONG_MIN; - - /* - * XXX that should be replaced with a loop over sub-objects, - * doing cl_object_attr_get() on them. But for now, let's - * reuse old lov code. - */ - - /* - * XXX take lsm spin-lock to keep lov_merge_lvb_kms() - * happy. It's not needed, because new code uses - * ->coh_attr_guard spin-lock to protect consistency of - * sub-object attributes. 
- */ - lov_stripe_lock(lsm); - result = lov_merge_lvb_kms(lsm, lvb, &kms); - lov_stripe_unlock(lsm); - if (result == 0) { - cl_lvb2attr(lov_attr, lvb); - lov_attr->cat_kms = kms; - r0->lo_attr_valid = 1; - } - } - if (result == 0) { /* merge results */ - attr->cat_blocks = lov_attr->cat_blocks; - attr->cat_size = lov_attr->cat_size; - attr->cat_kms = lov_attr->cat_kms; - if (attr->cat_atime < lov_attr->cat_atime) - attr->cat_atime = lov_attr->cat_atime; - if (attr->cat_ctime < lov_attr->cat_ctime) - attr->cat_ctime = lov_attr->cat_ctime; - if (attr->cat_mtime < lov_attr->cat_mtime) - attr->cat_mtime = lov_attr->cat_mtime; - } - return result; -} - -static const struct lov_layout_operations lov_dispatch[] = { - [LLT_EMPTY] = { - .llo_init = lov_init_empty, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_empty, - .llo_install = lov_install_empty, - .llo_print = lov_print_empty, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_empty, - .llo_getattr = lov_attr_get_empty - }, - [LLT_RAID0] = { - .llo_init = lov_init_raid0, - .llo_delete = lov_delete_raid0, - .llo_fini = lov_fini_raid0, - .llo_install = lov_install_raid0, - .llo_print = lov_print_raid0, - .llo_page_init = lov_page_init_raid0, - .llo_lock_init = lov_lock_init_raid0, - .llo_io_init = lov_io_init_raid0, - .llo_getattr = lov_attr_get_raid0 - }, - [LLT_RELEASED] = { - .llo_init = lov_init_released, - .llo_delete = lov_delete_empty, - .llo_fini = lov_fini_released, - .llo_install = lov_install_empty, - .llo_print = lov_print_released, - .llo_page_init = lov_page_init_empty, - .llo_lock_init = lov_lock_init_empty, - .llo_io_init = lov_io_init_released, - .llo_getattr = lov_attr_get_empty - } -}; - -/** - * Performs a double-dispatch based on the layout type of an object. - */ -#define LOV_2DISPATCH_NOLOCK(obj, op, ...) \ -({ \ - struct lov_object *__obj = (obj); \ - enum lov_layout_type __llt; \ - \ - __llt = __obj->lo_type; \ - LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ - lov_dispatch[__llt].op(__VA_ARGS__); \ -}) - -/** - * Return lov_layout_type associated with a given lsm - */ -static enum lov_layout_type lov_type(struct lov_stripe_md *lsm) -{ - if (!lsm) - return LLT_EMPTY; - if (lsm_is_released(lsm)) - return LLT_RELEASED; - return LLT_RAID0; -} - -static inline void lov_conf_freeze(struct lov_object *lov) -{ - CDEBUG(D_INODE, "To take share lov(%p) owner %p/%p\n", - lov, lov->lo_owner, current); - if (lov->lo_owner != current) - down_read(&lov->lo_type_guard); -} - -static inline void lov_conf_thaw(struct lov_object *lov) -{ - CDEBUG(D_INODE, "To release share lov(%p) owner %p/%p\n", - lov, lov->lo_owner, current); - if (lov->lo_owner != current) - up_read(&lov->lo_type_guard); -} - -#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ -({ \ - struct lov_object *__obj = (obj); \ - int __lock = !!(lock); \ - typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ - \ - if (__lock) \ - lov_conf_freeze(__obj); \ - __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ - if (__lock) \ - lov_conf_thaw(__obj); \ - __result; \ -}) - -/** - * Performs a locked double-dispatch based on the layout type of an object. - */ -#define LOV_2DISPATCH(obj, op, ...) \ - LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) - -#define LOV_2DISPATCH_VOID(obj, op, ...) 
\ -do { \ - struct lov_object *__obj = (obj); \ - enum lov_layout_type __llt; \ - \ - lov_conf_freeze(__obj); \ - __llt = __obj->lo_type; \ - LASSERT(__llt < ARRAY_SIZE(lov_dispatch)); \ - lov_dispatch[__llt].op(__VA_ARGS__); \ - lov_conf_thaw(__obj); \ -} while (0) - -static void lov_conf_lock(struct lov_object *lov) -{ - LASSERT(lov->lo_owner != current); - down_write(&lov->lo_type_guard); - LASSERT(!lov->lo_owner); - lov->lo_owner = current; - CDEBUG(D_INODE, "Took exclusive lov(%p) owner %p\n", - lov, lov->lo_owner); -} - -static void lov_conf_unlock(struct lov_object *lov) -{ - CDEBUG(D_INODE, "To release exclusive lov(%p) owner %p\n", - lov, lov->lo_owner); - lov->lo_owner = NULL; - up_write(&lov->lo_type_guard); -} - -static int lov_layout_wait(const struct lu_env *env, struct lov_object *lov) -{ - while (atomic_read(&lov->lo_active_ios) > 0) { - CDEBUG(D_INODE, "file:" DFID " wait for active IO, now: %d.\n", - PFID(lu_object_fid(lov2lu(lov))), - atomic_read(&lov->lo_active_ios)); - - wait_event_idle(lov->lo_waitq, - atomic_read(&lov->lo_active_ios) == 0); - } - return 0; -} - -static int lov_layout_change(const struct lu_env *unused, - struct lov_object *lov, struct lov_stripe_md *lsm, - const struct cl_object_conf *conf) -{ - struct lov_device *lov_dev = lov_object_dev(lov); - enum lov_layout_type llt = lov_type(lsm); - union lov_layout_state *state = &lov->u; - const struct lov_layout_operations *old_ops; - const struct lov_layout_operations *new_ops; - struct lu_env *env; - u16 refcheck; - int rc; - - LASSERT(lov->lo_type < ARRAY_SIZE(lov_dispatch)); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - LASSERT(llt < ARRAY_SIZE(lov_dispatch)); - - CDEBUG(D_INODE, DFID " from %s to %s\n", - PFID(lu_object_fid(lov2lu(lov))), - llt2str(lov->lo_type), llt2str(llt)); - - old_ops = &lov_dispatch[lov->lo_type]; - new_ops = &lov_dispatch[llt]; - - rc = cl_object_prune(env, &lov->lo_cl); - if (rc) - goto out; - - rc = old_ops->llo_delete(env, lov, &lov->u); - if (rc) - goto out; - - old_ops->llo_fini(env, lov, &lov->u); - - LASSERT(!atomic_read(&lov->lo_active_ios)); - - CDEBUG(D_INODE, DFID "Apply new layout lov %p, type %d\n", - PFID(lu_object_fid(lov2lu(lov))), lov, llt); - - lov->lo_type = LLT_EMPTY; - - /* page bufsize fixup */ - cl_object_header(&lov->lo_cl)->coh_page_bufsize -= - lov_page_slice_fixup(lov, NULL); - - rc = new_ops->llo_init(env, lov_dev, lov, lsm, conf, state); - if (rc) { - struct obd_device *obd = lov2obd(lov_dev->ld_lov); - - CERROR("%s: cannot apply new layout on " DFID " : rc = %d\n", - obd->obd_name, PFID(lu_object_fid(lov2lu(lov))), rc); - new_ops->llo_delete(env, lov, state); - new_ops->llo_fini(env, lov, state); - /* this file becomes an EMPTY file. */ - goto out; - } - - new_ops->llo_install(env, lov, state); - lov->lo_type = llt; -out: - cl_env_put(env, &refcheck); - return rc; -} - -/***************************************************************************** - * - * Lov object operations. 
- * - */ -int lov_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct lov_object *lov = lu2lov(obj); - struct lov_device *dev = lov_object_dev(lov); - const struct cl_object_conf *cconf = lu2cl_conf(conf); - union lov_layout_state *set = &lov->u; - const struct lov_layout_operations *ops; - struct lov_stripe_md *lsm = NULL; - int rc; - - init_rwsem(&lov->lo_type_guard); - atomic_set(&lov->lo_active_ios, 0); - init_waitqueue_head(&lov->lo_waitq); - cl_object_page_init(lu2cl(obj), sizeof(struct lov_page)); - - lov->lo_type = LLT_EMPTY; - if (cconf->u.coc_layout.lb_buf) { - lsm = lov_unpackmd(dev->ld_lov, - cconf->u.coc_layout.lb_buf, - cconf->u.coc_layout.lb_len); - if (IS_ERR(lsm)) - return PTR_ERR(lsm); - } - - /* no locking is necessary, as object is being created */ - lov->lo_type = lov_type(lsm); - ops = &lov_dispatch[lov->lo_type]; - rc = ops->llo_init(env, dev, lov, lsm, cconf, set); - if (!rc) - ops->llo_install(env, lov, set); - - lov_lsm_put(lsm); - - return rc; -} - -static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf) -{ - struct lov_stripe_md *lsm = NULL; - struct lov_object *lov = cl2lov(obj); - int result = 0; - - if (conf->coc_opc == OBJECT_CONF_SET && - conf->u.coc_layout.lb_buf) { - lsm = lov_unpackmd(lov_object_dev(lov)->ld_lov, - conf->u.coc_layout.lb_buf, - conf->u.coc_layout.lb_len); - if (IS_ERR(lsm)) - return PTR_ERR(lsm); - } - - lov_conf_lock(lov); - if (conf->coc_opc == OBJECT_CONF_INVALIDATE) { - lov->lo_layout_invalid = true; - result = 0; - goto out; - } - - if (conf->coc_opc == OBJECT_CONF_WAIT) { - if (lov->lo_layout_invalid && - atomic_read(&lov->lo_active_ios) > 0) { - lov_conf_unlock(lov); - result = lov_layout_wait(env, lov); - lov_conf_lock(lov); - } - goto out; - } - - LASSERT(conf->coc_opc == OBJECT_CONF_SET); - - if ((!lsm && !lov->lo_lsm) || - ((lsm && lov->lo_lsm) && - (lov->lo_lsm->lsm_layout_gen == lsm->lsm_layout_gen) && - (lov->lo_lsm->lsm_pattern == lsm->lsm_pattern))) { - /* same version of layout */ - lov->lo_layout_invalid = false; - result = 0; - goto out; - } - - /* will change layout - check if there still exists active IO. */ - if (atomic_read(&lov->lo_active_ios) > 0) { - lov->lo_layout_invalid = true; - result = -EBUSY; - goto out; - } - - result = lov_layout_change(env, lov, lsm, conf); - lov->lo_layout_invalid = result != 0; - -out: - lov_conf_unlock(lov); - lov_lsm_put(lsm); - CDEBUG(D_INODE, DFID " lo_layout_invalid=%d\n", - PFID(lu_object_fid(lov2lu(lov))), lov->lo_layout_invalid); - return result; -} - -static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) -{ - struct lov_object *lov = lu2lov(obj); - - LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); -} - -static void lov_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct lov_object *lov = lu2lov(obj); - - LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); - lu_object_fini(obj); - kmem_cache_free(lov_object_kmem, lov); -} - -static int lov_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - return LOV_2DISPATCH_NOLOCK(lu2lov(o), llo_print, env, cookie, p, o); -} - -int lov_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_page_init, env, obj, page, - index); -} - -/** - * Implements cl_object_operations::clo_io_init() method for lov - * layer. 
Dispatches to the appropriate layout io initialization method.
- */
-int lov_io_init(const struct lu_env *env, struct cl_object *obj,
-		struct cl_io *io)
-{
-	CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
-
-	CDEBUG(D_INODE, DFID "io %p type %d ignore/verify layout %d/%d\n",
-	       PFID(lu_object_fid(&obj->co_lu)), io, io->ci_type,
-	       io->ci_ignore_layout, io->ci_verify_layout);
-
-	return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
-				     !io->ci_ignore_layout, env, obj, io);
-}
-
-/**
- * An implementation of cl_object_operations::clo_attr_get() method for lov
- * layer. For raid0 layout this collects and merges attributes of all
- * sub-objects.
- */
-static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
-			struct cl_attr *attr)
-{
-	/* do not take lock, as this function is called under a
-	 * spin-lock. Layout is protected from changing by ongoing IO.
-	 */
-	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
-}
-
-static int lov_attr_update(const struct lu_env *env, struct cl_object *obj,
-			   const struct cl_attr *attr, unsigned int valid)
-{
-	/*
-	 * No dispatch is required here, as no layout implements this.
-	 */
-	return 0;
-}
-
-int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
-		  struct cl_lock *lock, const struct cl_io *io)
-{
-	/* No need to lock because we've taken one refcount of layout. */
-	return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_lock_init, env, obj, lock,
-				    io);
-}
-
-/**
- * We calculate on which OST the mapping will end. If the length of mapping
- * is greater than (stripe_size * stripe_count) then the last_stripe will
- * be one just before start_stripe. Else we check if the mapping
- * intersects each OST and find last_stripe.
- * This function returns the last_stripe and also sets the stripe_count
- * over which the mapping is spread.
- *
- * \param lsm [in] striping information for the file
- * \param fm_start [in] logical start of mapping
- * \param fm_end [in] logical end of mapping
- * \param start_stripe [in] starting stripe of the mapping
- * \param stripe_count [out] the number of stripes across which the
- *			     mapping is spread
- *
- * \retval last_stripe return the last stripe of the mapping
- */
-static int fiemap_calc_last_stripe(struct lov_stripe_md *lsm,
-				   u64 fm_start, u64 fm_end,
-				   int start_stripe, int *stripe_count)
-{
-	int last_stripe;
-	u64 obd_start;
-	u64 obd_end;
-	int i, j;
-
-	if (fm_end - fm_start > lsm->lsm_stripe_size * lsm->lsm_stripe_count) {
-		last_stripe = (start_stripe < 1 ? lsm->lsm_stripe_count - 1 :
-						  start_stripe - 1);
-		*stripe_count = lsm->lsm_stripe_count;
-	} else {
-		for (j = 0, i = start_stripe; j < lsm->lsm_stripe_count;
-		     i = (i + 1) % lsm->lsm_stripe_count, j++) {
-			if (!(lov_stripe_intersects(lsm, i, fm_start, fm_end,
-						    &obd_start, &obd_end)))
-				break;
-		}
-		*stripe_count = j;
-		last_stripe = (start_stripe + j - 1) % lsm->lsm_stripe_count;
-	}
-
-	return last_stripe;
-}
-
-/**
- * Set fe_device and copy extents from local buffer into main return buffer.
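- * Each extent is tagged with the OST index in fe_device and has the
- * FIEMAP_EXTENT_NET flag set before being appended to the return buffer
- * at position current_extent.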
- *
- * \param fiemap [out] fiemap to hold all extents
- * \param lcl_fm_ext [in] array of fiemap extents obtained from the OSC layer
- * \param ost_index [in] OST index to be written into the fm_device
- *		    field for each extent
- * \param ext_count [in] number of extents to be copied
- * \param current_extent [in] where to start copying in the extent array
- */
-static void fiemap_prepare_and_copy_exts(struct fiemap *fiemap,
-					 struct fiemap_extent *lcl_fm_ext,
-					 int ost_index, unsigned int ext_count,
-					 int current_extent)
-{
-	unsigned int ext;
-	char *to;
-
-	for (ext = 0; ext < ext_count; ext++) {
-		lcl_fm_ext[ext].fe_device = ost_index;
-		lcl_fm_ext[ext].fe_flags |= FIEMAP_EXTENT_NET;
-	}
-
-	/* Copy fm_extent's from fm_local to return buffer */
-	to = (char *)fiemap + fiemap_count_to_size(current_extent);
-	memcpy(to, lcl_fm_ext, ext_count * sizeof(struct fiemap_extent));
-}
-
-#define FIEMAP_BUFFER_SIZE 4096
-
-/**
- * Non-zero fe_logical indicates that this is a continuation FIEMAP
- * call. The local end offset and the device are sent in the first
- * fm_extent. This function calculates the stripe number from the index
- * and returns a stripe_no on which mapping is to be restarted.
- *
- * It also returns fm_end_offset, which is the in-OST offset at which
- * mapping should be restarted. If fm_end_offset=0 is returned then the
- * caller will re-calculate the proper offset in the next stripe.
- * Note that the first extent is passed to lov_get_info via the value field.
- *
- * \param fiemap [in] fiemap request header
- * \param lsm [in] striping information for the file
- * \param fm_start [in] logical start of mapping
- * \param fm_end [in] logical end of mapping
- * \param start_stripe [out] starting stripe will be returned in this
- */
-static u64 fiemap_calc_fm_end_offset(struct fiemap *fiemap,
-				     struct lov_stripe_md *lsm,
-				     u64 fm_start, u64 fm_end,
-				     int *start_stripe)
-{
-	u64 local_end = fiemap->fm_extents[0].fe_logical;
-	u64 lun_start, lun_end;
-	u64 fm_end_offset;
-	int stripe_no = -1;
-	int i;
-
-	if (!fiemap->fm_extent_count || !fiemap->fm_extents[0].fe_logical)
-		return 0;
-
-	/* Find out stripe_no from ost_index saved in the fe_device */
-	for (i = 0; i < lsm->lsm_stripe_count; i++) {
-		struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
-
-		if (lov_oinfo_is_dummy(oinfo))
-			continue;
-
-		if (oinfo->loi_ost_idx == fiemap->fm_extents[0].fe_device) {
-			stripe_no = i;
-			break;
-		}
-	}
-
-	if (stripe_no == -1)
-		return -EINVAL;
-
-	/*
-	 * If we have finished mapping on previous device, shift logical
-	 * offset to start of next device
-	 */
-	if (lov_stripe_intersects(lsm, stripe_no, fm_start, fm_end,
-				  &lun_start, &lun_end) &&
-	    local_end < lun_end) {
-		fm_end_offset = local_end;
-		*start_stripe = stripe_no;
-	} else {
-		/* This is a special value to indicate that caller should
-		 * calculate offset in next stripe.
-		 */
-		fm_end_offset = 0;
-		*start_stripe = (stripe_no + 1) % lsm->lsm_stripe_count;
-	}
-
-	return fm_end_offset;
-}
-
-struct fiemap_state {
-	struct fiemap	*fs_fm;
-	u64		fs_start;
-	u64		fs_length;
-	u64		fs_end;
-	u64		fs_end_offset;
-	int		fs_cur_extent;
-	int		fs_cnt_need;
-	int		fs_start_stripe;
-	int		fs_last_stripe;
-	bool		fs_device_done;
-	bool		fs_finish;
-	bool		fs_enough;
-};
-
-static int fiemap_for_stripe(const struct lu_env *env, struct cl_object *obj,
-			     struct lov_stripe_md *lsm,
-			     struct fiemap *fiemap, size_t *buflen,
-			     struct ll_fiemap_info_key *fmkey, int stripeno,
-			     struct fiemap_state *fs)
-{
-	struct cl_object *subobj;
-	struct lov_obd *lov = lu2lov_dev(obj->co_lu.lo_dev)->ld_lov;
-	struct fiemap_extent *fm_ext = &fs->fs_fm->fm_extents[0];
-	u64 req_fm_len; /* Stores length of required mapping */
-	u64 len_mapped_single_call;
-	u64 lun_start;
-	u64 lun_end;
-	u64 obd_object_end;
-	unsigned int ext_count;
-	/* EOF for object */
-	bool ost_eof = false;
-	/* done with required mapping for this OST? */
-	bool ost_done = false;
-	int ost_index;
-	int rc = 0;
-
-	fs->fs_device_done = false;
-	/* Find out range of mapping on this stripe */
-	if ((lov_stripe_intersects(lsm, stripeno, fs->fs_start, fs->fs_end,
-				   &lun_start, &obd_object_end)) == 0)
-		return 0;
-
-	if (lov_oinfo_is_dummy(lsm->lsm_oinfo[stripeno]))
-		return -EIO;
-
-	/* If this is a continuation FIEMAP call and we are on the
-	 * starting stripe, then lun_start needs to be set to
-	 * end_offset
-	 */
-	if (fs->fs_end_offset != 0 && stripeno == fs->fs_start_stripe)
-		lun_start = fs->fs_end_offset;
-
-	lun_end = fs->fs_length;
-	if (lun_end != ~0ULL) {
-		/* Handle fs->fs_start + fs->fs_length overflow */
-		if (fs->fs_start + fs->fs_length < fs->fs_start)
-			fs->fs_length = ~0ULL - fs->fs_start;
-		lun_end = lov_size_to_stripe(lsm, fs->fs_start + fs->fs_length,
-					     stripeno);
-	}
-
-	if (lun_start == lun_end)
-		return 0;
-
-	req_fm_len = obd_object_end - lun_start;
-	fs->fs_fm->fm_length = 0;
-	len_mapped_single_call = 0;
-
-	/* find lovsub object */
-	subobj = lov_find_subobj(env, cl2lov(obj), lsm, stripeno);
-	if (IS_ERR(subobj))
-		return PTR_ERR(subobj);
-	/* If the output buffer is very large and the objects have many
-	 * extents we may need to loop on a single OST repeatedly
-	 */
-	do {
-		if (fiemap->fm_extent_count > 0) {
-			/* Don't get too many extents. */
-			if (fs->fs_cur_extent + fs->fs_cnt_need >
-			    fiemap->fm_extent_count)
-				fs->fs_cnt_need = fiemap->fm_extent_count -
-						  fs->fs_cur_extent;
-		}
-
-		lun_start += len_mapped_single_call;
-		fs->fs_fm->fm_length = req_fm_len - len_mapped_single_call;
-		req_fm_len = fs->fs_fm->fm_length;
-		fs->fs_fm->fm_extent_count = fs->fs_enough ?
-					     1 : fs->fs_cnt_need;
-		fs->fs_fm->fm_mapped_extents = 0;
-		fs->fs_fm->fm_flags = fiemap->fm_flags;
-
-		ost_index = lsm->lsm_oinfo[stripeno]->loi_ost_idx;
-
-		if (ost_index < 0 || ost_index >= lov->desc.ld_tgt_count) {
-			rc = -EINVAL;
-			goto obj_put;
-		}
-		/* If OST is inactive, return extent with UNKNOWN flag.
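-		 * A single synthetic extent covering the rest of this
-		 * stripe's range is reported instead of querying the
-		 * unreachable OST.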
-		 */
-		if (!lov->lov_tgts[ost_index]->ltd_active) {
-			fs->fs_fm->fm_flags |= FIEMAP_EXTENT_LAST;
-			fs->fs_fm->fm_mapped_extents = 1;
-
-			fm_ext[0].fe_logical = lun_start;
-			fm_ext[0].fe_length = obd_object_end - lun_start;
-			fm_ext[0].fe_flags |= FIEMAP_EXTENT_UNKNOWN;
-
-			goto inactive_tgt;
-		}
-
-		fs->fs_fm->fm_start = lun_start;
-		fs->fs_fm->fm_flags &= ~FIEMAP_FLAG_DEVICE_ORDER;
-		memcpy(&fmkey->lfik_fiemap, fs->fs_fm, sizeof(*fs->fs_fm));
-		*buflen = fiemap_count_to_size(fs->fs_fm->fm_extent_count);
-
-		rc = cl_object_fiemap(env, subobj, fmkey, fs->fs_fm, buflen);
-		if (rc)
-			goto obj_put;
-inactive_tgt:
-		ext_count = fs->fs_fm->fm_mapped_extents;
-		if (ext_count == 0) {
-			ost_done = true;
-			fs->fs_device_done = true;
-			/* If last stripe has hole at the end,
-			 * we need to return
-			 */
-			if (stripeno == fs->fs_last_stripe) {
-				fiemap->fm_mapped_extents = 0;
-				fs->fs_finish = true;
-				goto obj_put;
-			}
-			break;
-		} else if (fs->fs_enough) {
-			/*
-			 * We've collected enough extents and there are
-			 * more extents after it.
-			 */
-			fs->fs_finish = true;
-			goto obj_put;
-		}
-
-		/* If we just need the number of extents, go to next device */
-		if (fiemap->fm_extent_count == 0) {
-			fs->fs_cur_extent += ext_count;
-			break;
-		}
-
-		/* prepare to copy retrieved map extents */
-		len_mapped_single_call = fm_ext[ext_count - 1].fe_logical +
-					 fm_ext[ext_count - 1].fe_length -
-					 lun_start;
-
-		/* Have we finished mapping on this device? */
-		if (req_fm_len <= len_mapped_single_call) {
-			ost_done = true;
-			fs->fs_device_done = true;
-		}
-
-		/* Clear the EXTENT_LAST flag which can be present on
-		 * the last extent
-		 */
-		if (fm_ext[ext_count - 1].fe_flags & FIEMAP_EXTENT_LAST)
-			fm_ext[ext_count - 1].fe_flags &= ~FIEMAP_EXTENT_LAST;
-		if (lov_stripe_size(lsm, fm_ext[ext_count - 1].fe_logical +
-					 fm_ext[ext_count - 1].fe_length,
-				    stripeno) >= fmkey->lfik_oa.o_size) {
-			ost_eof = true;
-			fs->fs_device_done = true;
-		}
-
-		fiemap_prepare_and_copy_exts(fiemap, fm_ext, ost_index,
-					     ext_count, fs->fs_cur_extent);
-		fs->fs_cur_extent += ext_count;
-
-		/* Ran out of available extents? */
-		if (fs->fs_cur_extent >= fiemap->fm_extent_count)
-			fs->fs_enough = true;
-	} while (!ost_done && !ost_eof);
-
-	if (stripeno == fs->fs_last_stripe)
-		fs->fs_finish = true;
-obj_put:
-	cl_object_put(env, subobj);
-
-	return rc;
-}
-
-/**
- * Break down the FIEMAP request and send appropriate calls to individual OSTs.
- * This also handles the restarting of FIEMAP calls in case mapping overflows
- * the available number of extents in a single call.
- *
- * \param env [in] lustre environment
- * \param obj [in] file object
- * \param fmkey [in] fiemap request header and other info
- * \param fiemap [out] fiemap buffer holding retrieved map extents
- * \param buflen [in/out] max buffer length of @fiemap; when iterating
- *		 over each OST, it is used to limit the max map needed
- * \retval 0 success
- * \retval < 0 error
- */
-static int lov_object_fiemap(const struct lu_env *env, struct cl_object *obj,
-			     struct ll_fiemap_info_key *fmkey,
-			     struct fiemap *fiemap, size_t *buflen)
-{
-	unsigned int buffer_size = FIEMAP_BUFFER_SIZE;
-	struct fiemap *fm_local = NULL;
-	struct lov_stripe_md *lsm;
-	int rc = 0;
-	int cur_stripe;
-	int stripe_count;
-	struct fiemap_state fs = { NULL };
-
-	lsm = lov_lsm_addref(cl2lov(obj));
-	if (!lsm)
-		return -ENODATA;
-
-	/**
-	 * If the stripe_count > 1 and the application does not understand
-	 * DEVICE_ORDER flag, it cannot interpret the extents correctly.
-	 */
-	if (lsm->lsm_stripe_count > 1 &&
-	    !(fiemap->fm_flags & FIEMAP_FLAG_DEVICE_ORDER)) {
-		rc = -ENOTSUPP;
-		goto out;
-	}
-
-	if (lsm_is_released(lsm)) {
-		if (fiemap->fm_start < fmkey->lfik_oa.o_size) {
-			/**
-			 * released file, return a minimal FIEMAP if
-			 * request fits in file-size.
-			 */
-			fiemap->fm_mapped_extents = 1;
-			fiemap->fm_extents[0].fe_logical = fiemap->fm_start;
-			if (fiemap->fm_start + fiemap->fm_length <
-			    fmkey->lfik_oa.o_size)
-				fiemap->fm_extents[0].fe_length =
-					fiemap->fm_length;
-			else
-				fiemap->fm_extents[0].fe_length =
-					fmkey->lfik_oa.o_size -
-					fiemap->fm_start;
-			fiemap->fm_extents[0].fe_flags |=
-				FIEMAP_EXTENT_UNKNOWN | FIEMAP_EXTENT_LAST;
-		}
-		rc = 0;
-		goto out;
-	}
-
-	if (fiemap_count_to_size(fiemap->fm_extent_count) < buffer_size)
-		buffer_size = fiemap_count_to_size(fiemap->fm_extent_count);
-
-	fm_local = kvzalloc(buffer_size, GFP_NOFS);
-	if (!fm_local) {
-		rc = -ENOMEM;
-		goto out;
-	}
-	fs.fs_fm = fm_local;
-	fs.fs_cnt_need = fiemap_size_to_count(buffer_size);
-
-	fs.fs_start = fiemap->fm_start;
-	/* fs_start is beyond the end of the file */
-	if (fs.fs_start > fmkey->lfik_oa.o_size) {
-		rc = -EINVAL;
-		goto out;
-	}
-	fs.fs_length = fiemap->fm_length;
-	/* Calculate start stripe, last stripe and length of mapping */
-	fs.fs_start_stripe = lov_stripe_number(lsm, fs.fs_start);
-	fs.fs_end = (fs.fs_length == ~0ULL) ? fmkey->lfik_oa.o_size :
-		    fs.fs_start + fs.fs_length - 1;
-	/* If fs_length != ~0ULL but fs_start+fs_length-1 exceeds file size */
-	if (fs.fs_end > fmkey->lfik_oa.o_size) {
-		fs.fs_end = fmkey->lfik_oa.o_size;
-		fs.fs_length = fs.fs_end - fs.fs_start;
-	}
-
-	fs.fs_last_stripe = fiemap_calc_last_stripe(lsm, fs.fs_start, fs.fs_end,
-						    fs.fs_start_stripe,
-						    &stripe_count);
-	fs.fs_end_offset = fiemap_calc_fm_end_offset(fiemap, lsm, fs.fs_start,
-						     fs.fs_end,
-						     &fs.fs_start_stripe);
-	if (fs.fs_end_offset == -EINVAL) {
-		rc = -EINVAL;
-		goto out;
-	}
-
-	/**
-	 * Requested extent count exceeds the fiemap buffer size, shrink our
-	 * ambition.
-	 */
-	if (fiemap_count_to_size(fiemap->fm_extent_count) > *buflen)
-		fiemap->fm_extent_count = fiemap_size_to_count(*buflen);
-	if (!fiemap->fm_extent_count)
-		fs.fs_cnt_need = 0;
-
-	fs.fs_finish = false;
-	fs.fs_enough = false;
-	fs.fs_cur_extent = 0;
-
-	/* Check each stripe */
-	for (cur_stripe = fs.fs_start_stripe; stripe_count > 0;
-	     --stripe_count,
-	     cur_stripe = (cur_stripe + 1) % lsm->lsm_stripe_count) {
-		rc = fiemap_for_stripe(env, obj, lsm, fiemap, buflen, fmkey,
-				       cur_stripe, &fs);
-		if (rc < 0)
-			goto out;
-		if (fs.fs_finish)
-			break;
-	} /* for each stripe */
-	/*
-	 * Indicate that we are returning device offsets unless the file just
-	 * has a single stripe
-	 */
-	if (lsm->lsm_stripe_count > 1)
-		fiemap->fm_flags |= FIEMAP_FLAG_DEVICE_ORDER;
-
-	if (!fiemap->fm_extent_count)
-		goto skip_last_device_calc;
-
-	/*
-	 * Check if we have reached the last stripe and whether mapping for that
-	 * stripe is done.
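-	 * If so, the last extent handed back to the caller gets the
-	 * FIEMAP_EXTENT_LAST flag.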
- */ - if ((cur_stripe == fs.fs_last_stripe) && fs.fs_device_done) - fiemap->fm_extents[fs.fs_cur_extent - 1].fe_flags |= - FIEMAP_EXTENT_LAST; -skip_last_device_calc: - fiemap->fm_mapped_extents = fs.fs_cur_extent; -out: - kvfree(fm_local); - lov_lsm_put(lsm); - return rc; -} - -static int lov_object_getstripe(const struct lu_env *env, struct cl_object *obj, - struct lov_user_md __user *lum) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_stripe_md *lsm; - int rc = 0; - - lsm = lov_lsm_addref(lov); - if (!lsm) - return -ENODATA; - - rc = lov_getstripe(cl2lov(obj), lsm, lum); - lov_lsm_put(lsm); - return rc; -} - -static int lov_object_layout_get(const struct lu_env *env, - struct cl_object *obj, - struct cl_layout *cl) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_stripe_md *lsm = lov_lsm_addref(lov); - struct lu_buf *buf = &cl->cl_buf; - ssize_t rc; - - if (!lsm) { - cl->cl_size = 0; - cl->cl_layout_gen = CL_LAYOUT_GEN_EMPTY; - return 0; - } - - cl->cl_size = lov_mds_md_size(lsm->lsm_stripe_count, lsm->lsm_magic); - cl->cl_layout_gen = lsm->lsm_layout_gen; - - rc = lov_lsm_pack(lsm, buf->lb_buf, buf->lb_len); - lov_lsm_put(lsm); - - return rc < 0 ? rc : 0; -} - -static loff_t lov_object_maxbytes(struct cl_object *obj) -{ - struct lov_object *lov = cl2lov(obj); - struct lov_stripe_md *lsm = lov_lsm_addref(lov); - loff_t maxbytes; - - if (!lsm) - return LLONG_MAX; - - maxbytes = lsm->lsm_maxbytes; - - lov_lsm_put(lsm); - - return maxbytes; -} - -static const struct cl_object_operations lov_ops = { - .coo_page_init = lov_page_init, - .coo_lock_init = lov_lock_init, - .coo_io_init = lov_io_init, - .coo_attr_get = lov_attr_get, - .coo_attr_update = lov_attr_update, - .coo_conf_set = lov_conf_set, - .coo_getstripe = lov_object_getstripe, - .coo_layout_get = lov_object_layout_get, - .coo_maxbytes = lov_object_maxbytes, - .coo_fiemap = lov_object_fiemap, -}; - -static const struct lu_object_operations lov_lu_obj_ops = { - .loo_object_init = lov_object_init, - .loo_object_delete = lov_object_delete, - .loo_object_release = NULL, - .loo_object_free = lov_object_free, - .loo_object_print = lov_object_print, - .loo_object_invariant = NULL -}; - -struct lu_object *lov_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct lov_object *lov; - struct lu_object *obj; - - lov = kmem_cache_zalloc(lov_object_kmem, GFP_NOFS); - if (lov) { - obj = lov2lu(lov); - lu_object_init(obj, NULL, dev); - lov->lo_cl.co_ops = &lov_ops; - lov->lo_type = -1; /* invalid, to catch uninitialized type */ - /* - * object io operation vector (cl_object::co_iop) is installed - * later in lov_object_init(), as different vectors are used - * for object with different layouts. 
- */ - obj->lo_ops = &lov_lu_obj_ops; - } else { - obj = NULL; - } - return obj; -} - -struct lov_stripe_md *lov_lsm_addref(struct lov_object *lov) -{ - struct lov_stripe_md *lsm = NULL; - - lov_conf_freeze(lov); - if (lov->lo_lsm) { - lsm = lsm_addref(lov->lo_lsm); - CDEBUG(D_INODE, "lsm %p addref %d/%d by %p.\n", - lsm, atomic_read(&lsm->lsm_refc), - lov->lo_layout_invalid, current); - } - lov_conf_thaw(lov); - return lsm; -} - -int lov_read_and_clear_async_rc(struct cl_object *clob) -{ - struct lu_object *luobj; - int rc = 0; - - luobj = lu_object_locate(&cl_object_header(clob)->coh_lu, - &lov_device_type); - if (luobj) { - struct lov_object *lov = lu2lov(luobj); - - lov_conf_freeze(lov); - switch (lov->lo_type) { - case LLT_RAID0: { - struct lov_stripe_md *lsm; - int i; - - lsm = lov->lo_lsm; - for (i = 0; i < lsm->lsm_stripe_count; i++) { - struct lov_oinfo *loi = lsm->lsm_oinfo[i]; - - if (lov_oinfo_is_dummy(loi)) - continue; - - if (loi->loi_ar.ar_rc && !rc) - rc = loi->loi_ar.ar_rc; - loi->loi_ar.ar_rc = 0; - } - } - case LLT_RELEASED: - case LLT_EMPTY: - break; - default: - LBUG(); - } - lov_conf_thaw(lov); - } - return rc; -} -EXPORT_SYMBOL(lov_read_and_clear_async_rc); - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_offset.c b/drivers/staging/lustre/lustre/lov/lov_offset.c deleted file mode 100644 index a5f00f6ec347f430d2d564f35927853760216a94..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_offset.c +++ /dev/null @@ -1,269 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include - -#include "lov_internal.h" - -/* compute object size given "stripeno" and the ost size */ -u64 lov_stripe_size(struct lov_stripe_md *lsm, u64 ost_size, int stripeno) -{ - unsigned long ssize = lsm->lsm_stripe_size; - unsigned long stripe_size; - u64 swidth; - u64 lov_size; - int magic = lsm->lsm_magic; - - if (ost_size == 0) - return 0; - - lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, NULL, &swidth); - - /* lov_do_div64(a, b) returns a % b, and a = a / b */ - stripe_size = lov_do_div64(ost_size, ssize); - if (stripe_size) - lov_size = ost_size * swidth + stripeno * ssize + stripe_size; - else - lov_size = (ost_size - 1) * swidth + (stripeno + 1) * ssize; - - return lov_size; -} - -/** - * Compute file level page index by stripe level page offset - */ -pgoff_t lov_stripe_pgoff(struct lov_stripe_md *lsm, pgoff_t stripe_index, - int stripe) -{ - loff_t offset; - - offset = lov_stripe_size(lsm, (stripe_index << PAGE_SHIFT) + 1, stripe); - return offset >> PAGE_SHIFT; -} - -/* we have an offset in file backed by an lov and want to find out where - * that offset lands in our given stripe of the file. for the easy - * case where the offset is within the stripe, we just have to scale the - * offset down to make it relative to the stripe instead of the lov. - * - * the harder case is what to do when the offset doesn't intersect the - * stripe. callers will want start offsets clamped ahead to the start - * of the nearest stripe in the file. end offsets similarly clamped to the - * nearest ending byte of a stripe in the file: - * - * all this function does is move offsets to the nearest region of the - * stripe, and it does its work "mod" the full length of all the stripes. - * consider a file with 3 stripes: - * - * S E - * --------------------------------------------------------------------- - * | 0 | 1 | 2 | 0 | 1 | 2 | - * --------------------------------------------------------------------- - * - * to find stripe 1's offsets for S and E, it divides by the full stripe - * width and does its math in the context of a single set of stripes: - * - * S E - * ----------------------------------- - * | 0 | 1 | 2 | - * ----------------------------------- - * - * it'll notice that E is outside stripe 1 and clamp it to the end of the - * stripe, then multiply it back out by lov_off to give the real offsets in - * the stripe: - * - * S E - * --------------------------------------------------------------------- - * | 1 | 1 | 1 | 1 | 1 | 1 | - * --------------------------------------------------------------------- - * - * it would have done similarly and pulled S forward to the start of a 1 - * stripe if, say, S had landed in a 0 stripe. - * - * this rounding isn't always correct. consider an E lov offset that lands - * on a 0 stripe, the "mod stripe width" math will pull it forward to the - * start of a 1 stripe, when in fact it wanted to be rounded back to the end - * of a previous 1 stripe. this logic is handled by callers and this is why: - * - * this function returns < 0 when the offset was "before" the stripe and - * was moved forward to the start of the stripe in question; 0 when it - * falls in the stripe and no shifting was done; > 0 when the offset - * was outside the stripe and was pulled back to its final byte. 
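 *
 * as a concrete, purely illustrative example, take again 3 stripes with a
 * 1MB stripe size, so swidth = 3MB, and ask where lov_off = 5MB lands in
 * stripeno = 1:
 *
 *   lov_do_div64(5MB, 3MB) leaves stripe_off = 2MB and lov_off = 1
 *   this_stripe = 1 * 1MB = 1MB, so stripe_off - this_stripe = 1MB
 *   1MB >= ssize, so stripe_off is clamped to ssize and 1 is returned
 *   *obdoff = 1 * 1MB + 1MB = 2MB, i.e. the offset is clamped to the
 *   end of stripe 1's second chunk in the object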
- */
-int lov_stripe_offset(struct lov_stripe_md *lsm, u64 lov_off,
-		      int stripeno, u64 *obdoff)
-{
-	unsigned long ssize = lsm->lsm_stripe_size;
-	u64 stripe_off, this_stripe, swidth;
-	int magic = lsm->lsm_magic;
-	int ret = 0;
-
-	if (lov_off == OBD_OBJECT_EOF) {
-		*obdoff = OBD_OBJECT_EOF;
-		return 0;
-	}
-
-	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &lov_off,
-						&swidth);
-
-	/* lov_do_div64(a, b) returns a % b, and a = a / b */
-	stripe_off = lov_do_div64(lov_off, swidth);
-
-	this_stripe = (u64)stripeno * ssize;
-	if (stripe_off < this_stripe) {
-		stripe_off = 0;
-		ret = -1;
-	} else {
-		stripe_off -= this_stripe;
-
-		if (stripe_off >= ssize) {
-			stripe_off = ssize;
-			ret = 1;
-		}
-	}
-
-	*obdoff = lov_off * ssize + stripe_off;
-	return ret;
-}
-
-/* Given a whole-file size and a stripe number, give the file size which
- * corresponds to the individual object of that stripe.
- *
- * This behaves basically in the same way as lov_stripe_offset, except that
- * file sizes falling before the beginning of a stripe are clamped to the end
- * of the previous stripe, not the beginning of the next:
- *
- *                                               S
- * ---------------------------------------------------------------------
- * |    0    |     1     |     2     |    0    |     1     |     2     |
- * ---------------------------------------------------------------------
- *
- * if clamped to stripe 2 becomes:
- *
- *                                  S
- * ---------------------------------------------------------------------
- * |    0    |     1     |     2     |    0    |     1     |     2     |
- * ---------------------------------------------------------------------
- */
-u64 lov_size_to_stripe(struct lov_stripe_md *lsm, u64 file_size,
-		       int stripeno)
-{
-	unsigned long ssize = lsm->lsm_stripe_size;
-	u64 stripe_off, this_stripe, swidth;
-	int magic = lsm->lsm_magic;
-
-	if (file_size == OBD_OBJECT_EOF)
-		return OBD_OBJECT_EOF;
-
-	lsm_op_find(magic)->lsm_stripe_by_index(lsm, &stripeno, &file_size,
-						&swidth);
-
-	/* lov_do_div64(a, b) returns a % b, and a = a / b */
-	stripe_off = lov_do_div64(file_size, swidth);
-
-	this_stripe = (u64)stripeno * ssize;
-	if (stripe_off < this_stripe) {
-		/* Move to end of previous stripe, or zero */
-		if (file_size > 0) {
-			file_size--;
-			stripe_off = ssize;
-		} else {
-			stripe_off = 0;
-		}
-	} else {
-		stripe_off -= this_stripe;
-
-		if (stripe_off >= ssize) {
-			/* Clamp to end of this stripe */
-			stripe_off = ssize;
-		}
-	}
-
-	return (file_size * ssize + stripe_off);
-}
-
-/* given an extent in an lov and a stripe, calculate the extent of the stripe
- * that is contained within the lov extent. this returns true if the given
- * stripe does intersect with the lov extent.
- */
-int lov_stripe_intersects(struct lov_stripe_md *lsm, int stripeno,
-			  u64 start, u64 end, u64 *obd_start, u64 *obd_end)
-{
-	int start_side, end_side;
-
-	start_side = lov_stripe_offset(lsm, start, stripeno, obd_start);
-	end_side = lov_stripe_offset(lsm, end, stripeno, obd_end);
-
-	CDEBUG(D_INODE, "[%llu->%llu] -> [(%d) %llu->%llu (%d)]\n",
-	       start, end, start_side, *obd_start, *obd_end, end_side);
-
-	/* this stripe doesn't intersect the file extent when neither
-	 * start nor the end intersected the stripe and obd_start and
-	 * obd_end got rounded up to the same value.
-	 */
-	if (start_side != 0 && end_side != 0 && *obd_start == *obd_end)
-		return 0;
-
-	/* as mentioned in the lov_stripe_offset commentary, end
-	 * might have been shifted in the wrong direction. This
-	 * happens when an end offset is before the stripe when viewed
-	 * through the "mod stripe size" math.
we detect it being shifted - * in the wrong direction and touch it up. - * interestingly, this can't underflow since end must be > start - * if we passed through the previous check. - * (should we assert for that somewhere?) - */ - if (end_side != 0) - (*obd_end)--; - - return 1; -} - -/* compute which stripe number "lov_off" will be written into */ -int lov_stripe_number(struct lov_stripe_md *lsm, u64 lov_off) -{ - unsigned long ssize = lsm->lsm_stripe_size; - u64 stripe_off, swidth; - int magic = lsm->lsm_magic; - - lsm_op_find(magic)->lsm_stripe_by_offset(lsm, NULL, &lov_off, &swidth); - - stripe_off = lov_do_div64(lov_off, swidth); - - /* Puts stripe_off/ssize result into stripe_off */ - lov_do_div64(stripe_off, ssize); - - return stripe_off; -} diff --git a/drivers/staging/lustre/lustre/lov/lov_pack.c b/drivers/staging/lustre/lustre/lov/lov_pack.c deleted file mode 100644 index b1060d02a164a83405f9a22207433a224246f38f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_pack.c +++ /dev/null @@ -1,400 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/lov/lov_pack.c - * - * (Un)packing of OST/MDS requests - * - * Author: Andreas Dilger - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include -#include -#include -#include -#include - -#include "lov_cl_internal.h" -#include "lov_internal.h" - -void lov_dump_lmm_common(int level, void *lmmp) -{ - struct lov_mds_md *lmm = lmmp; - struct ost_id oi; - - lmm_oi_le_to_cpu(&oi, &lmm->lmm_oi); - CDEBUG(level, "objid " DOSTID ", magic 0x%08x, pattern %#x\n", - POSTID(&oi), le32_to_cpu(lmm->lmm_magic), - le32_to_cpu(lmm->lmm_pattern)); - CDEBUG(level, "stripe_size %u, stripe_count %u, layout_gen %u\n", - le32_to_cpu(lmm->lmm_stripe_size), - le16_to_cpu(lmm->lmm_stripe_count), - le16_to_cpu(lmm->lmm_layout_gen)); -} - -static void lov_dump_lmm_objects(int level, struct lov_ost_data *lod, - int stripe_count) -{ - int i; - - if (stripe_count > LOV_V1_INSANE_STRIPE_COUNT) { - CDEBUG(level, "bad stripe_count %u > max_stripe_count %u\n", - stripe_count, LOV_V1_INSANE_STRIPE_COUNT); - return; - } - - for (i = 0; i < stripe_count; ++i, ++lod) { - struct ost_id oi; - - ostid_le_to_cpu(&lod->l_ost_oi, &oi); - CDEBUG(level, "stripe %u idx %u subobj " DOSTID "\n", i, - le32_to_cpu(lod->l_ost_idx), POSTID(&oi)); - } -} - -void lov_dump_lmm_v1(int level, struct lov_mds_md_v1 *lmm) -{ - lov_dump_lmm_common(level, lmm); - lov_dump_lmm_objects(level, lmm->lmm_objects, - le16_to_cpu(lmm->lmm_stripe_count)); -} - -void lov_dump_lmm_v3(int level, struct lov_mds_md_v3 *lmm) -{ - lov_dump_lmm_common(level, lmm); - CDEBUG(level, "pool_name " LOV_POOLNAMEF "\n", lmm->lmm_pool_name); - lov_dump_lmm_objects(level, lmm->lmm_objects, - le16_to_cpu(lmm->lmm_stripe_count)); -} - -/** - * Pack LOV striping metadata for disk storage format (in little - * endian byte order). - * - * This follows the getxattr() conventions. If \a buf_size is zero - * then return the size needed. If \a buf_size is too small then - * return -ERANGE. Otherwise return the size of the result. 
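 *
 * e.g. a caller can use the usual getxattr-style two-step (sketch only,
 * allocation error handling omitted):
 *
 *	lmm_size = lov_lsm_pack(lsm, NULL, 0);
 *	lmmk = kvzalloc(lmm_size, GFP_NOFS);
 *	lov_lsm_pack(lsm, lmmk, lmm_size);
 *
 * lov_getstripe() below achieves the same by sizing the buffer with
 * lov_mds_md_size() up front.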
- */ -ssize_t lov_lsm_pack(const struct lov_stripe_md *lsm, void *buf, - size_t buf_size) -{ - struct lov_ost_data_v1 *lmm_objects; - struct lov_mds_md_v1 *lmmv1 = buf; - struct lov_mds_md_v3 *lmmv3 = buf; - size_t lmm_size; - unsigned int i; - - lmm_size = lov_mds_md_size(lsm->lsm_stripe_count, lsm->lsm_magic); - if (!buf_size) - return lmm_size; - - if (buf_size < lmm_size) - return -ERANGE; - - /* - * lmmv1 and lmmv3 point to the same struct and have the - * same first fields - */ - lmmv1->lmm_magic = cpu_to_le32(lsm->lsm_magic); - lmm_oi_cpu_to_le(&lmmv1->lmm_oi, &lsm->lsm_oi); - lmmv1->lmm_stripe_size = cpu_to_le32(lsm->lsm_stripe_size); - lmmv1->lmm_stripe_count = cpu_to_le16(lsm->lsm_stripe_count); - lmmv1->lmm_pattern = cpu_to_le32(lsm->lsm_pattern); - lmmv1->lmm_layout_gen = cpu_to_le16(lsm->lsm_layout_gen); - - if (lsm->lsm_magic == LOV_MAGIC_V3) { - BUILD_BUG_ON(sizeof(lsm->lsm_pool_name) != - sizeof(lmmv3->lmm_pool_name)); - strlcpy(lmmv3->lmm_pool_name, lsm->lsm_pool_name, - sizeof(lmmv3->lmm_pool_name)); - lmm_objects = lmmv3->lmm_objects; - } else { - lmm_objects = lmmv1->lmm_objects; - } - - for (i = 0; i < lsm->lsm_stripe_count; i++) { - struct lov_oinfo *loi = lsm->lsm_oinfo[i]; - - ostid_cpu_to_le(&loi->loi_oi, &lmm_objects[i].l_ost_oi); - lmm_objects[i].l_ost_gen = cpu_to_le32(loi->loi_ost_gen); - lmm_objects[i].l_ost_idx = cpu_to_le32(loi->loi_ost_idx); - } - - return lmm_size; -} - -/* Find the max stripecount we should use */ -__u16 lov_get_stripecnt(struct lov_obd *lov, __u32 magic, __u16 stripe_count) -{ - __u32 max_stripes = LOV_MAX_STRIPE_COUNT_OLD; - - if (!stripe_count) - stripe_count = lov->desc.ld_default_stripe_count; - if (stripe_count > lov->desc.ld_active_tgt_count) - stripe_count = lov->desc.ld_active_tgt_count; - if (!stripe_count) - stripe_count = 1; - - /* stripe count is based on whether ldiskfs can handle - * larger EA sizes - */ - if (lov->lov_ocd.ocd_connect_flags & OBD_CONNECT_MAX_EASIZE && - lov->lov_ocd.ocd_max_easize) - max_stripes = lov_mds_md_max_stripe_count( - lov->lov_ocd.ocd_max_easize, magic); - - if (stripe_count > max_stripes) - stripe_count = max_stripes; - - return stripe_count; -} - -static int lov_verify_lmm(void *lmm, int lmm_bytes, __u16 *stripe_count) -{ - int rc; - - if (!lsm_op_find(le32_to_cpu(*(__u32 *)lmm))) { - CERROR("bad disk LOV MAGIC: 0x%08X; dumping LMM (size=%d):\n", - le32_to_cpu(*(__u32 *)lmm), lmm_bytes); - CERROR("%*phN\n", lmm_bytes, lmm); - return -EINVAL; - } - rc = lsm_op_find(le32_to_cpu(*(__u32 *)lmm))->lsm_lmm_verify(lmm, - lmm_bytes, - stripe_count); - return rc; -} - -static struct lov_stripe_md *lov_lsm_alloc(u16 stripe_count, u32 pattern, - u32 magic) -{ - struct lov_stripe_md *lsm; - unsigned int i; - - CDEBUG(D_INFO, "alloc lsm, stripe_count %u\n", stripe_count); - - lsm = lsm_alloc_plain(stripe_count); - if (!lsm) { - CERROR("cannot allocate LSM stripe_count %u\n", stripe_count); - return ERR_PTR(-ENOMEM); - } - - atomic_set(&lsm->lsm_refc, 1); - spin_lock_init(&lsm->lsm_lock); - lsm->lsm_magic = magic; - lsm->lsm_stripe_count = stripe_count; - lsm->lsm_maxbytes = LUSTRE_EXT3_STRIPE_MAXBYTES * stripe_count; - lsm->lsm_pattern = pattern; - lsm->lsm_pool_name[0] = '\0'; - lsm->lsm_layout_gen = 0; - if (stripe_count > 0) - lsm->lsm_oinfo[0]->loi_ost_idx = ~0; - - for (i = 0; i < stripe_count; i++) - loi_init(lsm->lsm_oinfo[i]); - - return lsm; -} - -int lov_free_memmd(struct lov_stripe_md **lsmp) -{ - struct lov_stripe_md *lsm = *lsmp; - int refc; - - *lsmp = NULL; - 
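	/*
	 * (Aside: lov_free_memmd() returns the post-decrement reference
	 * count, so a return value of 0 tells the caller the LSM was
	 * actually freed rather than merely released.)
	 */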
LASSERT(atomic_read(&lsm->lsm_refc) > 0); - refc = atomic_dec_return(&lsm->lsm_refc); - if (refc == 0) - lsm_op_find(lsm->lsm_magic)->lsm_free(lsm); - - return refc; -} - -/* Unpack LOV object metadata from disk storage. It is packed in LE byte - * order and is opaque to the networking layer. - */ -struct lov_stripe_md *lov_unpackmd(struct lov_obd *lov, struct lov_mds_md *lmm, - size_t lmm_size) -{ - struct lov_stripe_md *lsm; - u16 stripe_count; - u32 pattern; - u32 magic; - int rc; - - rc = lov_verify_lmm(lmm, lmm_size, &stripe_count); - if (rc) - return ERR_PTR(rc); - - magic = le32_to_cpu(lmm->lmm_magic); - pattern = le32_to_cpu(lmm->lmm_pattern); - - lsm = lov_lsm_alloc(stripe_count, pattern, magic); - if (IS_ERR(lsm)) - return lsm; - - LASSERT(lsm_op_find(magic)); - rc = lsm_op_find(magic)->lsm_unpackmd(lov, lsm, lmm); - if (rc) { - lov_free_memmd(&lsm); - return ERR_PTR(rc); - } - - return lsm; -} - -/* Retrieve object striping information. - * - * @lump is a pointer to an in-core struct with lmm_ost_count indicating - * the maximum number of OST indices which will fit in the user buffer. - * lmm_magic must be LOV_USER_MAGIC. - */ -int lov_getstripe(struct lov_object *obj, struct lov_stripe_md *lsm, - struct lov_user_md __user *lump) -{ - /* we use lov_user_md_v3 because it is larger than lov_user_md_v1 */ - struct lov_user_md_v3 lum; - struct lov_mds_md *lmmk; - u32 stripe_count; - ssize_t lmm_size; - size_t lmmk_size; - size_t lum_size; - int rc; - - if (!lsm) - return -ENODATA; - - if (lsm->lsm_magic != LOV_MAGIC_V1 && lsm->lsm_magic != LOV_MAGIC_V3) { - CERROR("bad LSM MAGIC: 0x%08X != 0x%08X nor 0x%08X\n", - lsm->lsm_magic, LOV_MAGIC_V1, LOV_MAGIC_V3); - rc = -EIO; - goto out; - } - - if (!lsm_is_released(lsm)) - stripe_count = lsm->lsm_stripe_count; - else - stripe_count = 0; - - /* we only need the header part from user space to get lmm_magic and - * lmm_stripe_count, (the header part is common to v1 and v3) - */ - lum_size = sizeof(struct lov_user_md_v1); - if (copy_from_user(&lum, lump, lum_size)) { - rc = -EFAULT; - goto out; - } - if (lum.lmm_magic != LOV_USER_MAGIC_V1 && - lum.lmm_magic != LOV_USER_MAGIC_V3 && - lum.lmm_magic != LOV_USER_MAGIC_SPECIFIC) { - rc = -EINVAL; - goto out; - } - - if (lum.lmm_stripe_count && - (lum.lmm_stripe_count < lsm->lsm_stripe_count)) { - /* Return right size of stripe to user */ - lum.lmm_stripe_count = stripe_count; - rc = copy_to_user(lump, &lum, lum_size); - rc = -EOVERFLOW; - goto out; - } - lmmk_size = lov_mds_md_size(stripe_count, lsm->lsm_magic); - - - lmmk = kvzalloc(lmmk_size, GFP_NOFS); - if (!lmmk) { - rc = -ENOMEM; - goto out; - } - - lmm_size = lov_lsm_pack(lsm, lmmk, lmmk_size); - if (lmm_size < 0) { - rc = lmm_size; - goto out_free; - } - - /* FIXME: Bug 1185 - copy fields properly when structs change */ - /* struct lov_user_md_v3 and struct lov_mds_md_v3 must be the same */ - BUILD_BUG_ON(sizeof(lum) != sizeof(struct lov_mds_md_v3)); - BUILD_BUG_ON(sizeof(lum.lmm_objects[0]) != sizeof(lmmk->lmm_objects[0])); - - if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC && - (lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) || - lmmk->lmm_magic == cpu_to_le32(LOV_MAGIC_V3))) { - lustre_swab_lov_mds_md(lmmk); - lustre_swab_lov_user_md_objects( - (struct lov_user_ost_data *)lmmk->lmm_objects, - lmmk->lmm_stripe_count); - } - - if (lum.lmm_magic == LOV_USER_MAGIC) { - /* User request for v1, we need skip lmm_pool_name */ - if (lmmk->lmm_magic == LOV_MAGIC_V3) { - memmove(((struct lov_mds_md_v1 *)lmmk)->lmm_objects, - ((struct lov_mds_md_v3 
*)lmmk)->lmm_objects, - lmmk->lmm_stripe_count * - sizeof(struct lov_ost_data_v1)); - lmm_size -= LOV_MAXPOOLNAME; - } - } else { - /* if v3 we just have to update the lum_size */ - lum_size = sizeof(struct lov_user_md_v3); - } - - /* User wasn't expecting this many OST entries */ - if (lum.lmm_stripe_count == 0) { - lmm_size = lum_size; - } else if (lum.lmm_stripe_count < lmmk->lmm_stripe_count) { - rc = -EOVERFLOW; - goto out_free; - } - /* - * Have a difference between lov_mds_md & lov_user_md. - * So we have to re-order the data before copy to user. - */ - lum.lmm_stripe_count = lmmk->lmm_stripe_count; - lum.lmm_layout_gen = lmmk->lmm_layout_gen; - ((struct lov_user_md *)lmmk)->lmm_layout_gen = lum.lmm_layout_gen; - ((struct lov_user_md *)lmmk)->lmm_stripe_count = lum.lmm_stripe_count; - if (copy_to_user(lump, lmmk, lmm_size)) - rc = -EFAULT; - else - rc = 0; - -out_free: - kvfree(lmmk); -out: - return rc; -} diff --git a/drivers/staging/lustre/lustre/lov/lov_page.c b/drivers/staging/lustre/lustre/lov/lov_page.c deleted file mode 100644 index cfae1294d77a463b0e0a5e2d1174e96a7269088a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_page.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for LOV layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lov page operations. 
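/*
 * Illustrative restatement of the endianness test used in lov_getstripe()
 * above: cpu_to_le32() (from <asm/byteorder.h>) is the identity on
 * little-endian hosts, so the inequality holds only on big-endian
 * machines, where the little-endian on-disk blob must be byte-swapped
 * before it is copied out to userspace. Hypothetical helper name:
 */
static inline bool host_is_big_endian(void)
{
	return cpu_to_le32(0x01020304) != 0x01020304;
}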
- * - */ - -static int lov_raid0_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct lov_page *lp = cl2lov_page(slice); - - return (*printer)(env, cookie, LUSTRE_LOV_NAME "-page@%p, raid0\n", lp); -} - -static const struct cl_page_operations lov_raid0_page_ops = { - .cpo_print = lov_raid0_page_print -}; - -int lov_page_init_raid0(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct lov_object *loo = cl2lov(obj); - struct lov_layout_raid0 *r0 = lov_r0(loo); - struct lov_io *lio = lov_env_io(env); - struct cl_object *subobj; - struct cl_object *o; - struct lov_io_sub *sub; - struct lov_page *lpg = cl_object_page_slice(obj, page); - loff_t offset; - u64 suboff; - int stripe; - int rc; - - offset = cl_offset(obj, index); - stripe = lov_stripe_number(loo->lo_lsm, offset); - LASSERT(stripe < r0->lo_nr); - rc = lov_stripe_offset(loo->lo_lsm, offset, stripe, &suboff); - LASSERT(rc == 0); - - lpg->lps_stripe = stripe; - cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_raid0_page_ops); - - sub = lov_sub_get(env, lio, stripe); - if (IS_ERR(sub)) - return PTR_ERR(sub); - - subobj = lovsub2cl(r0->lo_sub[stripe]); - list_for_each_entry(o, &subobj->co_lu.lo_header->loh_layers, - co_lu.lo_linkage) { - if (o->co_ops->coo_page_init) { - rc = o->co_ops->coo_page_init(sub->sub_env, o, page, - cl_index(subobj, suboff)); - if (rc != 0) - break; - } - } - - return rc; -} - -static int lov_empty_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct lov_page *lp = cl2lov_page(slice); - - return (*printer)(env, cookie, LUSTRE_LOV_NAME "-page@%p, empty.\n", - lp); -} - -static const struct cl_page_operations lov_empty_page_ops = { - .cpo_print = lov_empty_page_print -}; - -int lov_page_init_empty(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct lov_page *lpg = cl_object_page_slice(obj, page); - void *addr; - - cl_page_slice_add(page, &lpg->lps_cl, obj, index, &lov_empty_page_ops); - addr = kmap(page->cp_vmpage); - memset(addr, 0, cl_page_size(obj)); - kunmap(page->cp_vmpage); - cl_page_export(env, page, 1); - return 0; -} - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lov_pool.c b/drivers/staging/lustre/lustre/lov/lov_pool.c deleted file mode 100644 index b2a88ba72eb26484b5a57a94cadd1177f9f09da8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_pool.c +++ /dev/null @@ -1,546 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/lov/lov_pool.c - * - * OST pool methods - * - * Author: Jacques-Charles LAFOUCRIERE - * Author: Alex Lyashkov - * Author: Nathaniel Rutman - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include -#include "lov_internal.h" - -#define pool_tgt(_p, _i) \ - _p->pool_lobd->u.lov.lov_tgts[_p->pool_obds.op_array[_i]] - -static u32 pool_hashfh(const void *data, u32 len, u32 seed) -{ - const char *pool_name = data; - return hashlen_hash(hashlen_string((void*)(unsigned long)seed, pool_name)); -} - -static int pool_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) -{ - const struct pool_desc *pool = obj; - const char *pool_name = arg->key; - return strcmp(pool_name, pool->pool_name); -} - -static const struct rhashtable_params pools_hash_params = { - .key_len = 1, /* actually variable */ - .key_offset = offsetof(struct pool_desc, pool_name), - .head_offset = offsetof(struct pool_desc, pool_hash), - .hashfn = pool_hashfh, - .obj_cmpfn = pool_cmpfn, - .automatic_shrinking = true, -}; - -static void lov_pool_getref(struct pool_desc *pool) -{ - CDEBUG(D_INFO, "pool %p\n", pool); - atomic_inc(&pool->pool_refcount); -} - -void lov_pool_putref(struct pool_desc *pool) -{ - CDEBUG(D_INFO, "pool %p\n", pool); - if (atomic_dec_and_test(&pool->pool_refcount)) { - LASSERT(list_empty(&pool->pool_list)); - lov_ost_pool_free(&pool->pool_obds); - kfree_rcu(pool, rcu); - } -} - -/* - * pool debugfs seq_file methods - */ -/* - * iterator is used to go through the target pool entries - * index is the current entry index in the lp_array[] array - * index >= pos returned to the seq_file interface - * pos is from 0 to (pool->pool_obds.op_count - 1) - */ -#define POOL_IT_MAGIC 0xB001CEA0 -struct pool_iterator { - int magic; - struct pool_desc *pool; - int idx; /* from 0 to pool_tgt_size - 1 */ -}; - -static void *pool_proc_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct pool_iterator *iter = (struct pool_iterator *)s->private; - int prev_idx; - - LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); - - /* test if end of file */ - if (*pos >= pool_tgt_count(iter->pool)) - return NULL; - - /* iterate to find a non empty entry */ - prev_idx = iter->idx; - down_read(&pool_tgt_rw_sem(iter->pool)); - iter->idx++; - if (iter->idx == pool_tgt_count(iter->pool)) { - iter->idx = prev_idx; /* we stay on the last entry */ - up_read(&pool_tgt_rw_sem(iter->pool)); - return NULL; - } - up_read(&pool_tgt_rw_sem(iter->pool)); - (*pos)++; - /* return != NULL to continue */ - return iter; -} - -static void *pool_proc_start(struct seq_file *s, loff_t *pos) -{ - struct pool_desc *pool = (struct pool_desc *)s->private; - struct pool_iterator *iter; - - lov_pool_getref(pool); - if ((pool_tgt_count(pool) == 0) || - (*pos >= pool_tgt_count(pool))) { - /* iter is not created, so stop() has no way to - * find pool to dec ref - */ - lov_pool_putref(pool); - return NULL; - } - - iter = kzalloc(sizeof(*iter), GFP_NOFS); - if (!iter) - return ERR_PTR(-ENOMEM); - iter->magic = POOL_IT_MAGIC; - iter->pool = pool; - iter->idx = 0; - - /* we use seq_file private field to memorized iterator so - * we can free it at stop() - */ - /* /!\ do not forget to restore it to pool before freeing it */ - s->private = iter; - if (*pos > 0) { - loff_t i; - void *ptr; - - i = 0; - do { - ptr = pool_proc_next(s, &iter, &i); - } while ((i 
< *pos) && ptr); - return ptr; - } - return iter; -} - -static void pool_proc_stop(struct seq_file *s, void *v) -{ - struct pool_iterator *iter = (struct pool_iterator *)s->private; - - /* in some cases stop() method is called 2 times, without - * calling start() method (see seq_read() from fs/seq_file.c) - * we have to free only if s->private is an iterator - */ - if ((iter) && (iter->magic == POOL_IT_MAGIC)) { - /* we restore s->private so next call to pool_proc_start() - * will work - */ - s->private = iter->pool; - lov_pool_putref(iter->pool); - kfree(iter); - } -} - -static int pool_proc_show(struct seq_file *s, void *v) -{ - struct pool_iterator *iter = (struct pool_iterator *)v; - struct lov_tgt_desc *tgt; - - LASSERTF(iter->magic == POOL_IT_MAGIC, "%08X\n", iter->magic); - LASSERT(iter->pool); - LASSERT(iter->idx <= pool_tgt_count(iter->pool)); - - down_read(&pool_tgt_rw_sem(iter->pool)); - tgt = pool_tgt(iter->pool, iter->idx); - up_read(&pool_tgt_rw_sem(iter->pool)); - if (tgt) - seq_printf(s, "%s\n", obd_uuid2str(&tgt->ltd_uuid)); - - return 0; -} - -static const struct seq_operations pool_proc_ops = { - .start = pool_proc_start, - .next = pool_proc_next, - .stop = pool_proc_stop, - .show = pool_proc_show, -}; - -static int pool_proc_open(struct inode *inode, struct file *file) -{ - int rc; - - rc = seq_open(file, &pool_proc_ops); - if (!rc) { - struct seq_file *s = file->private_data; - - s->private = inode->i_private; - } - return rc; -} - -static const struct file_operations pool_proc_operations = { - .open = pool_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#define LOV_POOL_INIT_COUNT 2 -int lov_ost_pool_init(struct ost_pool *op, unsigned int count) -{ - if (count == 0) - count = LOV_POOL_INIT_COUNT; - op->op_array = NULL; - op->op_count = 0; - init_rwsem(&op->op_rw_sem); - op->op_size = count; - op->op_array = kcalloc(op->op_size, sizeof(op->op_array[0]), GFP_NOFS); - if (!op->op_array) { - op->op_size = 0; - return -ENOMEM; - } - return 0; -} - -/* Caller must hold write op_rwlock */ -int lov_ost_pool_extend(struct ost_pool *op, unsigned int min_count) -{ - __u32 *new; - int new_size; - - LASSERT(min_count != 0); - - if (op->op_count < op->op_size) - return 0; - - new_size = max(min_count, 2 * op->op_size); - new = kcalloc(new_size, sizeof(op->op_array[0]), GFP_NOFS); - if (!new) - return -ENOMEM; - - /* copy old array to new one */ - memcpy(new, op->op_array, op->op_size * sizeof(op->op_array[0])); - kfree(op->op_array); - op->op_array = new; - op->op_size = new_size; - return 0; -} - -int lov_ost_pool_add(struct ost_pool *op, __u32 idx, unsigned int min_count) -{ - int rc = 0, i; - - down_write(&op->op_rw_sem); - - rc = lov_ost_pool_extend(op, min_count); - if (rc) - goto out; - - /* search ost in pool array */ - for (i = 0; i < op->op_count; i++) { - if (op->op_array[i] == idx) { - rc = -EEXIST; - goto out; - } - } - /* ost not found we add it */ - op->op_array[op->op_count] = idx; - op->op_count++; -out: - up_write(&op->op_rw_sem); - return rc; -} - -int lov_ost_pool_remove(struct ost_pool *op, __u32 idx) -{ - int i; - - down_write(&op->op_rw_sem); - - for (i = 0; i < op->op_count; i++) { - if (op->op_array[i] == idx) { - memmove(&op->op_array[i], &op->op_array[i + 1], - (op->op_count - i - 1) * sizeof(op->op_array[0])); - op->op_count--; - up_write(&op->op_rw_sem); - return 0; - } - } - - up_write(&op->op_rw_sem); - return -EINVAL; -} - -int lov_ost_pool_free(struct ost_pool *op) -{ - if (op->op_size == 0) - return 0; - - 
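	/*
	 * A hedged usage sketch of the ost_pool API above (illustrative
	 * index values, error handling elided):
	 *
	 *	struct ost_pool op;
	 *
	 *	lov_ost_pool_init(&op, 0);     - sized to LOV_POOL_INIT_COUNT
	 *	lov_ost_pool_add(&op, 7, 8);   - doubles op_array as needed
	 *	lov_ost_pool_remove(&op, 7);   - memmove()s the tail down
	 *	lov_ost_pool_free(&op);
	 */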
down_write(&op->op_rw_sem); - - kfree(op->op_array); - op->op_array = NULL; - op->op_count = 0; - op->op_size = 0; - - up_write(&op->op_rw_sem); - return 0; -} - -static void -pools_hash_exit(void *vpool, void *data) -{ - struct pool_desc *pool = vpool; - lov_pool_putref(pool); -} - -int lov_pool_hash_init(struct rhashtable *tbl) -{ - return rhashtable_init(tbl, &pools_hash_params); -} - -void lov_pool_hash_destroy(struct rhashtable *tbl) -{ - rhashtable_free_and_destroy(tbl, pools_hash_exit, NULL); -} - -int lov_pool_new(struct obd_device *obd, char *poolname) -{ - struct lov_obd *lov; - struct pool_desc *new_pool; - int rc; - - lov = &obd->u.lov; - - if (strlen(poolname) > LOV_MAXPOOLNAME) - return -ENAMETOOLONG; - - new_pool = kzalloc(sizeof(*new_pool), GFP_NOFS); - if (!new_pool) - return -ENOMEM; - - strlcpy(new_pool->pool_name, poolname, sizeof(new_pool->pool_name)); - new_pool->pool_lobd = obd; - /* ref count init to 1 because when created a pool is always used - * up to deletion - */ - atomic_set(&new_pool->pool_refcount, 1); - rc = lov_ost_pool_init(&new_pool->pool_obds, 0); - if (rc) - goto out_err; - - /* get ref for debugfs file */ - lov_pool_getref(new_pool); - - new_pool->pool_debugfs_entry = debugfs_create_file(poolname, 0444, - lov->lov_pool_debugfs_entry, - new_pool, - &pool_proc_operations); - - spin_lock(&obd->obd_dev_lock); - list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); - lov->lov_pool_count++; - spin_unlock(&obd->obd_dev_lock); - - /* Add to hash table only when it is fully ready. */ - rc = rhashtable_lookup_insert_fast(&lov->lov_pools_hash_body, - &new_pool->pool_hash, pools_hash_params); - if (rc) { - if (rc != -EEXIST) - /* - * Hide -E2BIG and -EBUSY which - * are not helpful. - */ - rc = -ENOMEM; - goto out_err; - } - - CDEBUG(D_CONFIG, LOV_POOLNAMEF " is pool #%d\n", - poolname, lov->lov_pool_count); - - return 0; - -out_err: - spin_lock(&obd->obd_dev_lock); - list_del_init(&new_pool->pool_list); - lov->lov_pool_count--; - spin_unlock(&obd->obd_dev_lock); - debugfs_remove_recursive(new_pool->pool_debugfs_entry); - lov_ost_pool_free(&new_pool->pool_obds); - kfree(new_pool); - - return rc; -} - -int lov_pool_del(struct obd_device *obd, char *poolname) -{ - struct lov_obd *lov; - struct pool_desc *pool; - - lov = &obd->u.lov; - - /* lookup and kill hash reference */ - rcu_read_lock(); - pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, pools_hash_params); - if (pool) - if (rhashtable_remove_fast(&lov->lov_pools_hash_body, - &pool->pool_hash, pools_hash_params) != 0) - pool = NULL; - rcu_read_unlock(); - if (!pool) - return -ENOENT; - - debugfs_remove_recursive(pool->pool_debugfs_entry); - lov_pool_putref(pool); - - spin_lock(&obd->obd_dev_lock); - list_del_init(&pool->pool_list); - lov->lov_pool_count--; - spin_unlock(&obd->obd_dev_lock); - - /* release last reference */ - lov_pool_putref(pool); - - return 0; -} - -int lov_pool_add(struct obd_device *obd, char *poolname, char *ostname) -{ - struct obd_uuid ost_uuid; - struct lov_obd *lov; - struct pool_desc *pool; - unsigned int lov_idx; - int rc; - - lov = &obd->u.lov; - - rcu_read_lock(); - pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, pools_hash_params); - if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) - pool = NULL; - rcu_read_unlock(); - if (!pool) - return -ENOENT; - - obd_str2uuid(&ost_uuid, ostname); - - /* search ost in lov array */ - obd_getref(obd); - for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { - if (!lov->lov_tgts[lov_idx]) - continue; 
- if (obd_uuid_equals(&ost_uuid, - &lov->lov_tgts[lov_idx]->ltd_uuid)) - break; - } - /* test if ost found in lov */ - if (lov_idx == lov->desc.ld_tgt_count) { - rc = -EINVAL; - goto out; - } - - rc = lov_ost_pool_add(&pool->pool_obds, lov_idx, lov->lov_tgt_size); - if (rc) - goto out; - - CDEBUG(D_CONFIG, "Added %s to " LOV_POOLNAMEF " as member %d\n", - ostname, poolname, pool_tgt_count(pool)); - -out: - obd_putref(obd); - lov_pool_putref(pool); - return rc; -} - -int lov_pool_remove(struct obd_device *obd, char *poolname, char *ostname) -{ - struct obd_uuid ost_uuid; - struct lov_obd *lov; - struct pool_desc *pool; - unsigned int lov_idx; - int rc = 0; - - lov = &obd->u.lov; - - rcu_read_lock(); - pool = rhashtable_lookup(&lov->lov_pools_hash_body, poolname, pools_hash_params); - if (pool && !atomic_inc_not_zero(&pool->pool_refcount)) - pool = NULL; - rcu_read_unlock(); - if (!pool) - return -ENOENT; - - obd_str2uuid(&ost_uuid, ostname); - - obd_getref(obd); - /* search ost in lov array, to get index */ - for (lov_idx = 0; lov_idx < lov->desc.ld_tgt_count; lov_idx++) { - if (!lov->lov_tgts[lov_idx]) - continue; - - if (obd_uuid_equals(&ost_uuid, - &lov->lov_tgts[lov_idx]->ltd_uuid)) - break; - } - - /* test if ost found in lov */ - if (lov_idx == lov->desc.ld_tgt_count) { - rc = -EINVAL; - goto out; - } - - lov_ost_pool_remove(&pool->pool_obds, lov_idx); - - CDEBUG(D_CONFIG, "%s removed from " LOV_POOLNAMEF "\n", ostname, - poolname); - -out: - obd_putref(obd); - lov_pool_putref(pool); - return rc; -} diff --git a/drivers/staging/lustre/lustre/lov/lov_request.c b/drivers/staging/lustre/lustre/lov/lov_request.c deleted file mode 100644 index cb8567f20ea77ebf68600bdc12ad8748f3f5f753..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lov_request.c +++ /dev/null @@ -1,354 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
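/*
 * The lookup idiom used by lov_pool_add() and lov_pool_remove() above,
 * isolated as a hedged helper (the pool_lookup_get() name is hypothetical):
 */
static struct pool_desc *pool_lookup_get(struct rhashtable *tbl,
					 const char *poolname)
{
	struct pool_desc *pool;

	rcu_read_lock();
	pool = rhashtable_lookup(tbl, poolname, pools_hash_params);
	/*
	 * A pool found under RCU may already be half torn down; a bare
	 * atomic_inc() would resurrect it, while inc_not_zero() refuses
	 * once the count has hit zero, turning the race into "not found".
	 */
	if (pool && !atomic_inc_not_zero(&pool->pool_refcount))
		pool = NULL;
	rcu_read_unlock();
	return pool;
}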
- */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include -#include -#include "lov_internal.h" - -static void lov_init_set(struct lov_request_set *set) -{ - set->set_count = 0; - atomic_set(&set->set_completes, 0); - atomic_set(&set->set_success, 0); - INIT_LIST_HEAD(&set->set_list); -} - -static void lov_finish_set(struct lov_request_set *set) -{ - struct lov_request *req; - - LASSERT(set); - while ((req = list_first_entry_or_null(&set->set_list, - struct lov_request, - rq_link)) != NULL) { - list_del_init(&req->rq_link); - kfree(req->rq_oi.oi_osfs); - kfree(req); - } - kfree(set); -} - -static void lov_update_set(struct lov_request_set *set, - struct lov_request *req, int rc) -{ - atomic_inc(&set->set_completes); - if (rc == 0) - atomic_inc(&set->set_success); -} - -static void lov_set_add_req(struct lov_request *req, - struct lov_request_set *set) -{ - list_add_tail(&req->rq_link, &set->set_list); - set->set_count++; - req->rq_rqset = set; -} - -static int lov_check_set(struct lov_obd *lov, int idx) -{ - int rc; - struct lov_tgt_desc *tgt; - - mutex_lock(&lov->lov_lock); - tgt = lov->lov_tgts[idx]; - rc = !tgt || tgt->ltd_active || - (tgt->ltd_exp && - class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried); - mutex_unlock(&lov->lov_lock); - - return rc; -} - -/* Check if the OSC connection exists and is active. - * If the OSC has not yet had a chance to connect to the OST the first time, - * wait once for it to connect instead of returning an error. - */ -static int lov_check_and_wait_active(struct lov_obd *lov, int ost_idx) -{ - int cnt = 0; - struct lov_tgt_desc *tgt; - int rc = 0; - - mutex_lock(&lov->lov_lock); - - tgt = lov->lov_tgts[ost_idx]; - - if (unlikely(!tgt)) { - rc = 0; - goto out; - } - - if (likely(tgt->ltd_active)) { - rc = 1; - goto out; - } - - if (tgt->ltd_exp && class_exp2cliimp(tgt->ltd_exp)->imp_connect_tried) { - rc = 0; - goto out; - } - - mutex_unlock(&lov->lov_lock); - - while (cnt < obd_timeout && !lov_check_set(lov, ost_idx)) { - schedule_timeout_uninterruptible(HZ); - cnt++; - } - if (tgt->ltd_active) - return 1; - - return 0; - -out: - mutex_unlock(&lov->lov_lock); - return rc; -} - -#define LOV_U64_MAX ((__u64)~0ULL) -#define LOV_SUM_MAX(tot, add) \ - do { \ - if ((tot) + (add) < (tot)) \ - (tot) = LOV_U64_MAX; \ - else \ - (tot) += (add); \ - } while (0) - -static int lov_fini_statfs(struct obd_device *obd, struct obd_statfs *osfs, - int success) -{ - if (success) { - __u32 expected_stripes = lov_get_stripecnt(&obd->u.lov, - LOV_MAGIC, 0); - if (osfs->os_files != LOV_U64_MAX) - lov_do_div64(osfs->os_files, expected_stripes); - if (osfs->os_ffree != LOV_U64_MAX) - lov_do_div64(osfs->os_ffree, expected_stripes); - - spin_lock(&obd->obd_osfs_lock); - memcpy(&obd->obd_osfs, osfs, sizeof(*osfs)); - obd->obd_osfs_age = get_jiffies_64(); - spin_unlock(&obd->obd_osfs_lock); - return 0; - } - - return -EIO; -} - -int lov_fini_statfs_set(struct lov_request_set *set) -{ - int rc = 0; - - if (!set) - return 0; - - if (atomic_read(&set->set_completes)) { - rc = lov_fini_statfs(set->set_obd, set->set_oi->oi_osfs, - atomic_read(&set->set_success)); - } - - lov_finish_set(set); - - return rc; -} - -static void lov_update_statfs(struct obd_statfs *osfs, - struct obd_statfs *lov_sfs, - int success) -{ - int shift = 0, quit = 0; - __u64 tmp; - - if (success == 0) { - memcpy(osfs, lov_sfs, sizeof(*lov_sfs)); - } else { - if (osfs->os_bsize != lov_sfs->os_bsize) { - /* assume all block sizes are always powers of 2 */ - /* get the bits difference */ - tmp = osfs->os_bsize | 
lov_sfs->os_bsize; - for (shift = 0; shift <= 64; ++shift) { - if (tmp & 1) { - if (quit) - break; - quit = 1; - shift = 0; - } - tmp >>= 1; - } - } - - if (osfs->os_bsize < lov_sfs->os_bsize) { - osfs->os_bsize = lov_sfs->os_bsize; - - osfs->os_bfree >>= shift; - osfs->os_bavail >>= shift; - osfs->os_blocks >>= shift; - } else if (shift != 0) { - lov_sfs->os_bfree >>= shift; - lov_sfs->os_bavail >>= shift; - lov_sfs->os_blocks >>= shift; - } - osfs->os_bfree += lov_sfs->os_bfree; - osfs->os_bavail += lov_sfs->os_bavail; - osfs->os_blocks += lov_sfs->os_blocks; - /* XXX not sure about this one - depends on policy. - * - could be minimum if we always stripe on all OBDs - * (but that would be wrong for any other policy, - * if one of the OBDs has no more objects left) - * - could be sum if we stripe whole objects - * - could be average, just to give a nice number - * - * To give a "reasonable" (if not wholly accurate) - * number, we divide the total number of free objects - * by expected stripe count (watch out for overflow). - */ - LOV_SUM_MAX(osfs->os_files, lov_sfs->os_files); - LOV_SUM_MAX(osfs->os_ffree, lov_sfs->os_ffree); - } -} - -/* The callback for osc_statfs_async that finalizes a request info when a - * response is received. - */ -static int cb_statfs_update(void *cookie, int rc) -{ - struct obd_info *oinfo = cookie; - struct lov_request *lovreq; - struct lov_request_set *set; - struct obd_statfs *osfs, *lov_sfs; - struct lov_obd *lov; - struct lov_tgt_desc *tgt; - struct obd_device *lovobd, *tgtobd; - int success; - - lovreq = container_of(oinfo, struct lov_request, rq_oi); - set = lovreq->rq_rqset; - lovobd = set->set_obd; - lov = &lovobd->u.lov; - osfs = set->set_oi->oi_osfs; - lov_sfs = oinfo->oi_osfs; - success = atomic_read(&set->set_success); - /* XXX: the same is done in lov_update_common_set, however - * lovset->set_exp is not initialized. 
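/*
 * Hedged restatements of the two dense spots in the statfs aggregation
 * above (both helper names are hypothetical; ilog2() is <linux/log2.h>).
 * With power-of-two block sizes the bit-difference loop in
 * lov_update_statfs() is just a log2 delta: 1024 vs 4096 gives
 * shift = 12 - 10 = 2, so the 1024-byte-block counters are shifted right
 * by two and both sides then count in 4096-byte blocks.
 */
static inline int bsize_shift(u64 a, u64 b)
{
	return ilog2(max(a, b)) - ilog2(min(a, b));
}

/*
 * LOV_SUM_MAX() as a function: if the addition wraps, the result compares
 * smaller than "tot", so clamp to LOV_U64_MAX instead of wrapping.
 */
static inline u64 lov_sat_add(u64 tot, u64 add)
{
	return tot + add < tot ? LOV_U64_MAX : tot + add;
}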
- */ - lov_update_set(set, lovreq, rc); - if (rc) - goto out; - - obd_getref(lovobd); - tgt = lov->lov_tgts[lovreq->rq_idx]; - if (!tgt || !tgt->ltd_active) - goto out_update; - - tgtobd = class_exp2obd(tgt->ltd_exp); - spin_lock(&tgtobd->obd_osfs_lock); - memcpy(&tgtobd->obd_osfs, lov_sfs, sizeof(*lov_sfs)); - if ((oinfo->oi_flags & OBD_STATFS_FROM_CACHE) == 0) - tgtobd->obd_osfs_age = get_jiffies_64(); - spin_unlock(&tgtobd->obd_osfs_lock); - -out_update: - lov_update_statfs(osfs, lov_sfs, success); - obd_putref(lovobd); -out: - return 0; -} - -int lov_prep_statfs_set(struct obd_device *obd, struct obd_info *oinfo, - struct lov_request_set **reqset) -{ - struct lov_request_set *set; - struct lov_obd *lov = &obd->u.lov; - int rc = 0, i; - - set = kzalloc(sizeof(*set), GFP_NOFS); - if (!set) - return -ENOMEM; - lov_init_set(set); - - set->set_obd = obd; - set->set_oi = oinfo; - - /* We only get block data from the OBD */ - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - struct lov_request *req; - - if (!lov->lov_tgts[i] || - (oinfo->oi_flags & OBD_STATFS_NODELAY && - !lov->lov_tgts[i]->ltd_active)) { - CDEBUG(D_HA, "lov idx %d inactive\n", i); - continue; - } - - /* skip targets that have been explicitly disabled by the - * administrator - */ - if (!lov->lov_tgts[i]->ltd_exp) { - CDEBUG(D_HA, "lov idx %d administratively disabled\n", i); - continue; - } - - if (!lov->lov_tgts[i]->ltd_active) - lov_check_and_wait_active(lov, i); - - req = kzalloc(sizeof(*req), GFP_NOFS); - if (!req) { - rc = -ENOMEM; - goto out_set; - } - - req->rq_oi.oi_osfs = kzalloc(sizeof(*req->rq_oi.oi_osfs), - GFP_NOFS); - if (!req->rq_oi.oi_osfs) { - kfree(req); - rc = -ENOMEM; - goto out_set; - } - - req->rq_idx = i; - req->rq_oi.oi_cb_up = cb_statfs_update; - req->rq_oi.oi_flags = oinfo->oi_flags; - - lov_set_add_req(req, set); - } - if (!set->set_count) { - rc = -EIO; - goto out_set; - } - *reqset = set; - return rc; -out_set: - lov_fini_statfs_set(set); - return rc; -} diff --git a/drivers/staging/lustre/lustre/lov/lovsub_dev.c b/drivers/staging/lustre/lustre/lov/lovsub_dev.c deleted file mode 100644 index 7e89a2e485fc06cf8cd75085a84c15b5ffc26b72..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lovsub_dev.c +++ /dev/null @@ -1,147 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2013, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_device and cl_device_type for LOVSUB layer. 
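/*
 * Pulling the statfs pieces above together, the caller-side flow is
 * roughly (hedged sketch, error paths elided):
 *
 *	struct lov_request_set *set = NULL;
 *
 *	rc = lov_prep_statfs_set(obd, oinfo, &set);  - one request per
 *	                                               usable target
 *	... each osc_statfs_async() reply runs cb_statfs_update(), folding
 *	    that target's numbers into oinfo->oi_osfs ...
 *	rc = lov_fini_statfs_set(set);  - divides the file counts by the
 *	                                  expected stripe count, frees set
 */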
- * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lov-sub device and device type functions. - * - */ - -static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) -{ - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device_type *ldt; - int rc; - - next->ld_site = d->ld_site; - ldt = next->ld_type; - rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); - if (rc) { - next->ld_site = NULL; - return rc; - } - - lu_device_get(next); - lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); - lsd->acid_next = lu2cl_dev(next); - return rc; -} - -static struct lu_device *lovsub_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - struct lu_device *next; - struct lovsub_device *lsd; - - lsd = lu2lovsub_dev(d); - next = cl2lu_dev(lsd->acid_next); - lsd->acid_next = NULL; - return next; -} - -static struct lu_device *lovsub_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct lovsub_device *lsd = lu2lovsub_dev(d); - struct lu_device *next = cl2lu_dev(lsd->acid_next); - - if (atomic_read(&d->ld_ref) && d->ld_site) { - LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_ERROR, NULL); - lu_site_print(env, d->ld_site, &msgdata, lu_cdebug_printer); - } - cl_device_fini(lu2cl_dev(d)); - kfree(lsd); - return next; -} - -static const struct lu_device_operations lovsub_lu_ops = { - .ldo_object_alloc = lovsub_object_alloc, - .ldo_process_config = NULL, - .ldo_recovery_complete = NULL -}; - -static struct lu_device *lovsub_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct lu_device *d; - struct lovsub_device *lsd; - - lsd = kzalloc(sizeof(*lsd), GFP_NOFS); - if (lsd) { - int result; - - result = cl_device_init(&lsd->acid_cl, t); - if (result == 0) { - d = lovsub2lu_dev(lsd); - d->ld_ops = &lovsub_lu_ops; - } else { - d = ERR_PTR(result); - } - } else { - d = ERR_PTR(-ENOMEM); - } - return d; -} - -static const struct lu_device_type_operations lovsub_device_type_ops = { - .ldto_device_alloc = lovsub_device_alloc, - .ldto_device_free = lovsub_device_free, - - .ldto_device_init = lovsub_device_init, - .ldto_device_fini = lovsub_device_fini -}; - -#define LUSTRE_LOVSUB_NAME "lovsub" - -struct lu_device_type lovsub_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_LOVSUB_NAME, - .ldt_ops = &lovsub_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD -}; - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_lock.c b/drivers/staging/lustre/lustre/lov/lovsub_lock.c deleted file mode 100644 index ea492be2eef354db78caade48b08138606fbb7d5..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lovsub_lock.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for LOVSUB layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub lock operations. - * - */ - -static void lovsub_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct lovsub_lock *lsl; - - lsl = cl2lovsub_lock(slice); - kmem_cache_free(lovsub_lock_kmem, lsl); -} - -static const struct cl_lock_operations lovsub_lock_ops = { - .clo_fini = lovsub_lock_fini, -}; - -int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, - struct cl_lock *lock, const struct cl_io *io) -{ - struct lovsub_lock *lsk; - int result; - - lsk = kmem_cache_zalloc(lovsub_lock_kmem, GFP_NOFS); - if (lsk) { - cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); - result = 0; - } else { - result = -ENOMEM; - } - return result; -} - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_object.c b/drivers/staging/lustre/lustre/lov/lovsub_object.c deleted file mode 100644 index 13d452086b61f764e88624160d440fcb0747481f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lovsub_object.c +++ /dev/null @@ -1,180 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_object for LOVSUB layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub object operations. 
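/*
 * The lock code above is the smallest instance of the clio "slice" idiom
 * that repeats throughout these files: allocate the layer-private wrapper
 * from a slab, then splice it into the composite object together with the
 * layer's operation table. A hedged skeleton with hypothetical my_* names:
 */
static int my_lock_init(const struct lu_env *env, struct cl_object *obj,
			struct cl_lock *lock, const struct cl_io *io)
{
	struct my_lock *ml = kmem_cache_zalloc(my_lock_kmem, GFP_NOFS);

	if (!ml)
		return -ENOMEM;
	cl_lock_slice_add(lock, &ml->ml_cl, obj, &my_lock_ops);
	return 0;
}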
- * - */ - -int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); - struct lu_object *below; - struct lu_device *under; - - int result; - - under = &dev->acid_next->cd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); - if (below) { - lu_object_add(obj, below); - cl_object_page_init(lu2cl(obj), sizeof(struct lovsub_page)); - result = 0; - } else { - result = -ENOMEM; - } - return result; -} - -static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct lovsub_object *los = lu2lovsub(obj); - struct lov_object *lov = los->lso_super; - - /* We can't assume lov was assigned here, because of the shadow - * object handling in lu_object_find. - */ - if (lov) { - LASSERT(lov->lo_type == LLT_RAID0); - LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los); - spin_lock(&lov->u.raid0.lo_sub_lock); - lov->u.raid0.lo_sub[los->lso_index] = NULL; - spin_unlock(&lov->u.raid0.lo_sub_lock); - } - - lu_object_fini(obj); - lu_object_header_fini(&los->lso_header.coh_lu); - kmem_cache_free(lovsub_object_kmem, los); -} - -static int lovsub_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) -{ - struct lovsub_object *los = lu2lovsub(obj); - - return (*p)(env, cookie, "[%d]", los->lso_index); -} - -static int lovsub_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid) -{ - struct lov_object *lov = cl2lovsub(obj)->lso_super; - - lov_r0(lov)->lo_attr_valid = 0; - return 0; -} - -static int lovsub_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, - struct ost_lvb *lvb) -{ - struct lovsub_object *los = cl2lovsub(obj); - - return cl_object_glimpse(env, &los->lso_super->lo_cl, lvb); -} - -/** - * Implementation of struct cl_object_operations::coo_req_attr_set() for lovsub - * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx - * field, which is filled there. - */ -static void lovsub_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - struct lovsub_object *subobj = cl2lovsub(obj); - - cl_req_attr_set(env, &subobj->lso_super->lo_cl, attr); - - /* - * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it - * unconditionally. It never changes anyway. 
- */ - attr->cra_oa->o_stripe_idx = subobj->lso_index; -} - -static const struct cl_object_operations lovsub_ops = { - .coo_page_init = lovsub_page_init, - .coo_lock_init = lovsub_lock_init, - .coo_attr_update = lovsub_attr_update, - .coo_glimpse = lovsub_object_glimpse, - .coo_req_attr_set = lovsub_req_attr_set -}; - -static const struct lu_object_operations lovsub_lu_obj_ops = { - .loo_object_init = lovsub_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = lovsub_object_free, - .loo_object_print = lovsub_object_print, - .loo_object_invariant = NULL -}; - -struct lu_object *lovsub_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct lovsub_object *los; - struct lu_object *obj; - - los = kmem_cache_zalloc(lovsub_object_kmem, GFP_NOFS); - if (los) { - struct cl_object_header *hdr; - - obj = lovsub2lu(los); - hdr = &los->lso_header; - cl_object_header_init(hdr); - lu_object_init(obj, &hdr->coh_lu, dev); - lu_object_add_top(&hdr->coh_lu, obj); - los->lso_cl.co_ops = &lovsub_ops; - obj->lo_ops = &lovsub_lu_obj_ops; - } else { - obj = NULL; - } - return obj; -} - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lovsub_page.c b/drivers/staging/lustre/lustre/lov/lovsub_page.c deleted file mode 100644 index 915520bcdd60e51169997edfe6ca251c9c0bd156..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lovsub_page.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for LOVSUB layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_LOV - -#include "lov_cl_internal.h" - -/** \addtogroup lov - * @{ - */ - -/***************************************************************************** - * - * Lovsub page operations. 
- * - */ - -static void lovsub_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ -} - -static const struct cl_page_operations lovsub_page_ops = { - .cpo_fini = lovsub_page_fini -}; - -int lovsub_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct lovsub_page *lsb = cl_object_page_slice(obj, page); - - cl_page_slice_add(page, &lsb->lsb_cl, obj, index, &lovsub_page_ops); - return 0; -} - -/** @} lov */ diff --git a/drivers/staging/lustre/lustre/lov/lproc_lov.c b/drivers/staging/lustre/lustre/lov/lproc_lov.c deleted file mode 100644 index 721440feef728fc00cec2a8d268e3d109729fa5d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/lov/lproc_lov.c +++ /dev/null @@ -1,299 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include "lov_internal.h" - -static int lov_stripesize_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - seq_printf(m, "%llu\n", desc->ld_default_stripe_size); - return 0; -} - -static ssize_t lov_stripesize_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - __u64 val; - int rc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - rc = lprocfs_write_u64_helper(buffer, count, &val); - if (rc) - return rc; - - lov_fix_desc_stripe_size(&val); - desc->ld_default_stripe_size = val; - return count; -} - -LPROC_SEQ_FOPS(lov_stripesize); - -static int lov_stripeoffset_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - seq_printf(m, "%llu\n", desc->ld_default_stripe_offset); - return 0; -} - -static ssize_t lov_stripeoffset_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - __u64 val; - int rc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - rc = lprocfs_write_u64_helper(buffer, count, &val); - if (rc) - return rc; - - desc->ld_default_stripe_offset = val; - return count; -} - -LPROC_SEQ_FOPS(lov_stripeoffset); - -static int lov_stripetype_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - seq_printf(m, "%u\n", desc->ld_pattern); - return 0; -} - -static ssize_t lov_stripetype_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - int val, rc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - lov_fix_desc_pattern(&val); - desc->ld_pattern = val; - return count; -} - -LPROC_SEQ_FOPS(lov_stripetype); - -static int lov_stripecount_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_desc *desc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - seq_printf(m, "%d\n", (__s16)(desc->ld_default_stripe_count + 1) - 1); - return 0; -} - -static ssize_t lov_stripecount_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct lov_desc *desc; - int val, rc; - - LASSERT(dev); - desc = &dev->u.lov.desc; - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - lov_fix_desc_stripe_count(&val); - desc->ld_default_stripe_count = val; - return count; -} - -LPROC_SEQ_FOPS(lov_stripecount); - -static ssize_t numobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct lov_desc *desc; - - desc = &dev->u.lov.desc; - return sprintf(buf, "%u\n", desc->ld_tgt_count); -} -LUSTRE_RO_ATTR(numobd); - -static ssize_t activeobd_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = 
container_of(kobj, struct obd_device, - obd_kobj); - struct lov_desc *desc; - - desc = &dev->u.lov.desc; - return sprintf(buf, "%u\n", desc->ld_active_tgt_count); -} -LUSTRE_RO_ATTR(activeobd); - -static int lov_desc_uuid_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = (struct obd_device *)m->private; - struct lov_obd *lov; - - LASSERT(dev); - lov = &dev->u.lov; - seq_printf(m, "%s\n", lov->desc.ld_uuid.uuid); - return 0; -} - -LPROC_SEQ_FOPS_RO(lov_desc_uuid); - -static void *lov_tgt_seq_start(struct seq_file *p, loff_t *pos) -{ - struct obd_device *dev = p->private; - struct lov_obd *lov = &dev->u.lov; - - while (*pos < lov->desc.ld_tgt_count) { - if (lov->lov_tgts[*pos]) - return lov->lov_tgts[*pos]; - ++*pos; - } - return NULL; -} - -static void lov_tgt_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *lov_tgt_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - struct obd_device *dev = p->private; - struct lov_obd *lov = &dev->u.lov; - - while (++*pos < lov->desc.ld_tgt_count) { - if (lov->lov_tgts[*pos]) - return lov->lov_tgts[*pos]; - } - return NULL; -} - -static int lov_tgt_seq_show(struct seq_file *p, void *v) -{ - struct lov_tgt_desc *tgt = v; - - seq_printf(p, "%d: %s %sACTIVE\n", - tgt->ltd_index, obd_uuid2str(&tgt->ltd_uuid), - tgt->ltd_active ? "" : "IN"); - return 0; -} - -static const struct seq_operations lov_tgt_sops = { - .start = lov_tgt_seq_start, - .stop = lov_tgt_seq_stop, - .next = lov_tgt_seq_next, - .show = lov_tgt_seq_show, -}; - -static int lov_target_seq_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int rc; - - rc = seq_open(file, &lov_tgt_sops); - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - return 0; -} - -static struct lprocfs_vars lprocfs_lov_obd_vars[] = { - { "stripesize", &lov_stripesize_fops, NULL }, - { "stripeoffset", &lov_stripeoffset_fops, NULL }, - { "stripecount", &lov_stripecount_fops, NULL }, - { "stripetype", &lov_stripetype_fops, NULL }, - /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ - { "desc_uuid", &lov_desc_uuid_fops, NULL, 0 }, - { NULL } -}; - -static struct attribute *lov_attrs[] = { - &lustre_attr_activeobd.attr, - &lustre_attr_numobd.attr, - NULL, -}; - -static const struct attribute_group lov_attr_group = { - .attrs = lov_attrs, -}; - -void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->sysfs_vars = &lov_attr_group; - lvars->obd_vars = lprocfs_lov_obd_vars; -} - -const struct file_operations lov_proc_target_fops = { - .owner = THIS_MODULE, - .open = lov_target_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lprocfs_seq_release, -}; diff --git a/drivers/staging/lustre/lustre/mdc/Makefile b/drivers/staging/lustre/lustre/mdc/Makefile deleted file mode 100644 index c7bc3351ccb01d6cd3a5a96e6a459c32873fd38d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += mdc.o -mdc-y := mdc_request.o mdc_reint.o mdc_lib.o mdc_locks.o lproc_mdc.o diff --git a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c b/drivers/staging/lustre/lustre/mdc/lproc_mdc.c deleted file mode 100644 index 6cce32491eb57a561d64acad2be68ec9eca3037c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/lproc_mdc.c +++ /dev/null @@ -1,231 +0,0 @@ -// 
SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include "mdc_internal.h" - -static ssize_t active_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%u\n", !dev->u.cli.cl_import->imp_deactive); -} - -static ssize_t active_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - unsigned long val; - int rc; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val > 1) - return -ERANGE; - - /* opposite senses */ - if (dev->u.cli.cl_import->imp_deactive == val) { - rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); - if (rc) - count = rc; - } else { - CDEBUG(D_CONFIG, "activate %lu: ignoring repeat request\n", val); - } - return count; -} -LUSTRE_RW_ATTR(active); - -static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - int len; - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - __u32 max; - - max = obd_get_max_rpcs_in_flight(&dev->u.cli); - len = sprintf(buf, "%u\n", max); - - return len; -} - -static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - rc = obd_set_max_rpcs_in_flight(&dev->u.cli, val); - if (rc) - count = rc; - - return count; -} -LUSTRE_RW_ATTR(max_rpcs_in_flight); - -static ssize_t max_mod_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - u16 max; - int len; - - max = dev->u.cli.cl_max_mod_rpcs_in_flight; - len = sprintf(buf, "%hu\n", max); - - return len; -} - -static ssize_t max_mod_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - u16 val; - int rc; - - rc = kstrtou16(buffer, 10, &val); - if (rc) - return rc; - - rc = obd_set_max_mod_rpcs_in_flight(&dev->u.cli, val); - if (rc) - count = rc; - - return 
count; -} -LUSTRE_RW_ATTR(max_mod_rpcs_in_flight); - -static int mdc_rpc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct obd_device *dev = seq->private; - - return obd_mod_rpc_stats_seq_show(&dev->u.cli, seq); -} - -static ssize_t mdc_rpc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - - lprocfs_oh_clear(&cli->cl_mod_rpcs_hist); - - return len; -} -LPROC_SEQ_FOPS(mdc_rpc_stats); - -LPROC_SEQ_FOPS_WR_ONLY(mdc, ping); - -LPROC_SEQ_FOPS_RO_TYPE(mdc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(mdc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mdc, conn_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mdc, timeouts); -LPROC_SEQ_FOPS_RO_TYPE(mdc, state); - -/* - * Note: below sysfs entry is provided, but not currently in use, instead - * sbi->sb_md_brw_size is used, the per obd variable should be used - * when DNE is enabled, and dir pages are managed in MDC layer. - * Don't forget to enable sysfs store function then. - */ -static ssize_t max_pages_per_rpc_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - - return sprintf(buf, "%d\n", cli->cl_max_pages_per_rpc); -} -LUSTRE_RO_ATTR(max_pages_per_rpc); - -LPROC_SEQ_FOPS_RW_TYPE(mdc, import); -LPROC_SEQ_FOPS_RW_TYPE(mdc, pinger_recov); - -static struct lprocfs_vars lprocfs_mdc_obd_vars[] = { - { "ping", &mdc_ping_fops, NULL, 0222 }, - { "connect_flags", &mdc_connect_flags_fops, NULL, 0 }, - /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ - { "mds_server_uuid", &mdc_server_uuid_fops, NULL, 0 }, - { "mds_conn_uuid", &mdc_conn_uuid_fops, NULL, 0 }, - { "timeouts", &mdc_timeouts_fops, NULL, 0 }, - { "import", &mdc_import_fops, NULL, 0 }, - { "state", &mdc_state_fops, NULL, 0 }, - { "pinger_recov", &mdc_pinger_recov_fops, NULL, 0 }, - { .name = "rpc_stats", - .fops = &mdc_rpc_stats_fops }, - { NULL } -}; - -static struct attribute *mdc_attrs[] = { - &lustre_attr_active.attr, - &lustre_attr_max_rpcs_in_flight.attr, - &lustre_attr_max_mod_rpcs_in_flight.attr, - &lustre_attr_max_pages_per_rpc.attr, - NULL, -}; - -static const struct attribute_group mdc_attr_group = { - .attrs = mdc_attrs, -}; - -void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->sysfs_vars = &mdc_attr_group; - lvars->obd_vars = lprocfs_mdc_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_internal.h b/drivers/staging/lustre/lustre/mdc/mdc_internal.h deleted file mode 100644 index 28924e927b5034782bb59da0a0c5dad2301ebdc2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/mdc_internal.h +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _MDC_INTERNAL_H -#define _MDC_INTERNAL_H - -#include - -void lprocfs_mdc_init_vars(struct lprocfs_static_vars *lvars); - -void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, - __u64 valid, size_t ea_size, __u32 suppgid, u32 flags); -void mdc_swap_layouts_pack(struct ptlrpc_request *req, - struct md_op_data *op_data); -void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, - const struct lu_fid *fid); -void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, u32 flags, - struct md_op_data *data, size_t ea_size); -void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - void *ea, size_t ealen); -void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const void *data, size_t datalen, umode_t mode, uid_t uid, - gid_t gid, kernel_cap_t capability, __u64 rdev); -void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - umode_t mode, __u64 rdev, __u64 flags, const void *data, - size_t datalen); -void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data); -void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data); -void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen); -void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); - -/* mdc/mdc_locks.c */ -int mdc_set_lock_data(struct obd_export *exp, - const struct lustre_handle *lockh, - void *data, __u64 *bits); - -int mdc_null_inode(struct obd_export *exp, const struct lu_fid *fid); - -int mdc_intent_lock(struct obd_export *exp, - struct md_op_data *op_data, - struct lookup_intent *it, - struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, - __u64 extra_lock_flags); - -int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, - const union ldlm_policy_data *policy, - struct md_op_data *op_data, - struct lustre_handle *lockh, u64 extra_lock_flags); - -int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, - struct list_head *cancels, enum ldlm_mode mode, - __u64 bits); -/* mdc/mdc_request.c */ -int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp, - struct lu_fid *fid, struct md_op_data *op_data); -struct obd_client_handle; - -int mdc_set_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och, - struct lookup_intent *it); - -void mdc_commit_open(struct ptlrpc_request *req); -void mdc_replay_open(struct ptlrpc_request *req); - -int mdc_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, size_t datalen, umode_t mode, uid_t uid, - gid_t gid, kernel_cap_t capability, __u64 rdev, - struct ptlrpc_request **request); -int mdc_link(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request); -int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen, - struct ptlrpc_request **request); -int 
mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, size_t ealen, struct ptlrpc_request **request); -int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request); -int mdc_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, - union ldlm_policy_data *policy, enum ldlm_mode mode, - enum ldlm_cancel_flags flags, void *opaque); - -int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, - struct lu_fid *fid, __u64 *bits); - -int mdc_intent_getattr_async(struct obd_export *exp, - struct md_enqueue_info *minfo); - -enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, - const struct lu_fid *fid, enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh); - -static inline int mdc_prep_elc_req(struct obd_export *exp, - struct ptlrpc_request *req, int opc, - struct list_head *cancels, int count) -{ - return ldlm_prep_elc_req(exp, req, LUSTRE_MDS_VERSION, opc, 0, cancels, - count); -} - -static inline unsigned long hash_x_index(__u64 hash, int hash64) -{ - if (BITS_PER_LONG == 32 && hash64) - hash >>= 32; - /* save hash 0 with hash 1 */ - return ~0UL - (hash + !hash); -} - -#endif diff --git a/drivers/staging/lustre/lustre/mdc/mdc_lib.c b/drivers/staging/lustre/lustre/mdc/mdc_lib.c deleted file mode 100644 index d582968987ff27b8460dd7c78baead51a8fcaa2e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/mdc_lib.c +++ /dev/null @@ -1,498 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#define DEBUG_SUBSYSTEM S_MDC -#include -#include -#include "mdc_internal.h" - -static void set_mrc_cr_flags(struct mdt_rec_create *mrc, u64 flags) -{ - mrc->cr_flags_l = (u32)(flags & 0xFFFFFFFFUll); - mrc->cr_flags_h = (u32)(flags >> 32); -} - -static void __mdc_pack_body(struct mdt_body *b, __u32 suppgid) -{ - b->mbo_suppgid = suppgid; - b->mbo_uid = from_kuid(&init_user_ns, current_uid()); - b->mbo_gid = from_kgid(&init_user_ns, current_gid()); - b->mbo_fsuid = from_kuid(&init_user_ns, current_fsuid()); - b->mbo_fsgid = from_kgid(&init_user_ns, current_fsgid()); - b->mbo_capability = current_cap().cap[0]; -} - -void mdc_swap_layouts_pack(struct ptlrpc_request *req, - struct md_op_data *op_data) -{ - struct mdt_body *b = req_capsule_client_get(&req->rq_pill, - &RMF_MDT_BODY); - - __mdc_pack_body(b, op_data->op_suppgids[0]); - b->mbo_fid1 = op_data->op_fid1; - b->mbo_fid2 = op_data->op_fid2; - b->mbo_valid |= OBD_MD_FLID; -} - -void mdc_pack_body(struct ptlrpc_request *req, const struct lu_fid *fid, - __u64 valid, size_t ea_size, __u32 suppgid, u32 flags) -{ - struct mdt_body *b = req_capsule_client_get(&req->rq_pill, - &RMF_MDT_BODY); - b->mbo_valid = valid; - b->mbo_eadatasize = ea_size; - b->mbo_flags = flags; - __mdc_pack_body(b, suppgid); - if (fid) { - b->mbo_fid1 = *fid; - b->mbo_valid |= OBD_MD_FLID; - } -} - -/** - * Pack a name (path component) into a request - * - * \param[in] req request - * \param[in] field request field (usually RMF_NAME) - * \param[in] name path component - * \param[in] name_len length of path component - * - * \a field must be present in \a req and of size \a name_len + 1. - * - * \a name must be '\0' terminated of length \a name_len and represent - * a single path component (not contain '/'). - */ -static void mdc_pack_name(struct ptlrpc_request *req, - const struct req_msg_field *field, - const char *name, size_t name_len) -{ - size_t buf_size; - size_t cpy_len; - char *buf; - - buf = req_capsule_client_get(&req->rq_pill, field); - buf_size = req_capsule_get_size(&req->rq_pill, field, RCL_CLIENT); - - LASSERT(name && name_len && buf && buf_size == name_len + 1); - - cpy_len = strlcpy(buf, name, buf_size); - - LASSERT(cpy_len == name_len && lu_name_is_valid_2(buf, cpy_len)); -} - -void mdc_readdir_pack(struct ptlrpc_request *req, __u64 pgoff, size_t size, - const struct lu_fid *fid) -{ - struct mdt_body *b = req_capsule_client_get(&req->rq_pill, - &RMF_MDT_BODY); - b->mbo_fid1 = *fid; - b->mbo_valid |= OBD_MD_FLID; - b->mbo_size = pgoff; /* !! */ - b->mbo_nlink = size; /* !! 
*/ - __mdc_pack_body(b, -1); - b->mbo_mode = LUDA_FID | LUDA_TYPE; -} - -/* packing of MDS records */ -void mdc_create_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const void *data, size_t datalen, umode_t mode, - uid_t uid, gid_t gid, kernel_cap_t cap_effective, - __u64 rdev) -{ - struct mdt_rec_create *rec; - char *tmp; - __u64 flags; - - BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != sizeof(struct mdt_rec_create)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - rec->cr_opcode = REINT_CREATE; - rec->cr_fsuid = uid; - rec->cr_fsgid = gid; - rec->cr_cap = cap_effective.cap[0]; - rec->cr_fid1 = op_data->op_fid1; - rec->cr_fid2 = op_data->op_fid2; - rec->cr_mode = mode; - rec->cr_rdev = rdev; - rec->cr_time = op_data->op_mod_time; - rec->cr_suppgid1 = op_data->op_suppgids[0]; - rec->cr_suppgid2 = op_data->op_suppgids[1]; - flags = 0; - if (op_data->op_bias & MDS_CREATE_VOLATILE) - flags |= MDS_OPEN_VOLATILE; - set_mrc_cr_flags(rec, flags); - rec->cr_bias = op_data->op_bias; - rec->cr_umask = current_umask(); - - mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); - if (data) { - tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); - memcpy(tmp, data, datalen); - } -} - -static inline __u64 mds_pack_open_flags(__u64 flags) -{ - __u64 cr_flags = (flags & (FMODE_READ | FMODE_WRITE | - MDS_OPEN_FL_INTERNAL)); - if (flags & O_CREAT) - cr_flags |= MDS_OPEN_CREAT; - if (flags & O_EXCL) - cr_flags |= MDS_OPEN_EXCL; - if (flags & O_TRUNC) - cr_flags |= MDS_OPEN_TRUNC; - if (flags & O_APPEND) - cr_flags |= MDS_OPEN_APPEND; - if (flags & O_SYNC) - cr_flags |= MDS_OPEN_SYNC; - if (flags & O_DIRECTORY) - cr_flags |= MDS_OPEN_DIRECTORY; - if (flags & __FMODE_EXEC) - cr_flags |= MDS_FMODE_EXEC; - if (cl_is_lov_delay_create(flags)) - cr_flags |= MDS_OPEN_DELAY_CREATE; - - if (flags & O_NONBLOCK) - cr_flags |= MDS_OPEN_NORESTORE; - - return cr_flags; -} - -/* packing of MDS records */ -void mdc_open_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - umode_t mode, __u64 rdev, __u64 flags, const void *lmm, - size_t lmmlen) -{ - struct mdt_rec_create *rec; - char *tmp; - __u64 cr_flags; - - BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != sizeof(struct mdt_rec_create)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - /* XXX do something about time, uid, gid */ - rec->cr_opcode = REINT_OPEN; - rec->cr_fsuid = from_kuid(&init_user_ns, current_fsuid()); - rec->cr_fsgid = from_kgid(&init_user_ns, current_fsgid()); - rec->cr_cap = current_cap().cap[0]; - rec->cr_fid1 = op_data->op_fid1; - rec->cr_fid2 = op_data->op_fid2; - - rec->cr_mode = mode; - cr_flags = mds_pack_open_flags(flags); - rec->cr_rdev = rdev; - rec->cr_time = op_data->op_mod_time; - rec->cr_suppgid1 = op_data->op_suppgids[0]; - rec->cr_suppgid2 = op_data->op_suppgids[1]; - rec->cr_bias = op_data->op_bias; - rec->cr_umask = current_umask(); - rec->cr_old_handle = op_data->op_handle; - - if (op_data->op_name) { - mdc_pack_name(req, &RMF_NAME, op_data->op_name, - op_data->op_namelen); - - if (op_data->op_bias & MDS_CREATE_VOLATILE) - cr_flags |= MDS_OPEN_VOLATILE; - } - - if (lmm) { - cr_flags |= MDS_OPEN_HAS_EA; - tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); - memcpy(tmp, lmm, lmmlen); - } - set_mrc_cr_flags(rec, cr_flags); -} - -static inline __u64 attr_pack(unsigned int ia_valid) -{ - __u64 sa_valid = 0; - - if (ia_valid & ATTR_MODE) - sa_valid |= MDS_ATTR_MODE; - if (ia_valid & ATTR_UID) - sa_valid |= MDS_ATTR_UID; - if (ia_valid & ATTR_GID) - sa_valid 
|= MDS_ATTR_GID; - if (ia_valid & ATTR_SIZE) - sa_valid |= MDS_ATTR_SIZE; - if (ia_valid & ATTR_ATIME) - sa_valid |= MDS_ATTR_ATIME; - if (ia_valid & ATTR_MTIME) - sa_valid |= MDS_ATTR_MTIME; - if (ia_valid & ATTR_CTIME) - sa_valid |= MDS_ATTR_CTIME; - if (ia_valid & ATTR_ATIME_SET) - sa_valid |= MDS_ATTR_ATIME_SET; - if (ia_valid & ATTR_MTIME_SET) - sa_valid |= MDS_ATTR_MTIME_SET; - if (ia_valid & ATTR_FORCE) - sa_valid |= MDS_ATTR_FORCE; - if (ia_valid & ATTR_ATTR_FLAG) - sa_valid |= MDS_ATTR_ATTR_FLAG; - if (ia_valid & ATTR_KILL_SUID) - sa_valid |= MDS_ATTR_KILL_SUID; - if (ia_valid & ATTR_KILL_SGID) - sa_valid |= MDS_ATTR_KILL_SGID; - if (ia_valid & ATTR_CTIME_SET) - sa_valid |= MDS_ATTR_CTIME_SET; - if (ia_valid & ATTR_OPEN) - sa_valid |= MDS_ATTR_FROM_OPEN; - if (ia_valid & ATTR_BLOCKS) - sa_valid |= MDS_ATTR_BLOCKS; - if (ia_valid & MDS_OPEN_OWNEROVERRIDE) - /* NFSD hack (see bug 5781) */ - sa_valid |= MDS_OPEN_OWNEROVERRIDE; - return sa_valid; -} - -static void mdc_setattr_pack_rec(struct mdt_rec_setattr *rec, - struct md_op_data *op_data) -{ - rec->sa_opcode = REINT_SETATTR; - rec->sa_fsuid = from_kuid(&init_user_ns, current_fsuid()); - rec->sa_fsgid = from_kgid(&init_user_ns, current_fsgid()); - rec->sa_cap = current_cap().cap[0]; - rec->sa_suppgid = -1; - - rec->sa_fid = op_data->op_fid1; - rec->sa_valid = attr_pack(op_data->op_attr.ia_valid); - rec->sa_mode = op_data->op_attr.ia_mode; - rec->sa_uid = from_kuid(&init_user_ns, op_data->op_attr.ia_uid); - rec->sa_gid = from_kgid(&init_user_ns, op_data->op_attr.ia_gid); - rec->sa_size = op_data->op_attr.ia_size; - rec->sa_blocks = op_data->op_attr_blocks; - rec->sa_atime = LTIME_S(op_data->op_attr.ia_atime); - rec->sa_mtime = LTIME_S(op_data->op_attr.ia_mtime); - rec->sa_ctime = LTIME_S(op_data->op_attr.ia_ctime); - rec->sa_attr_flags = op_data->op_attr_flags; - if ((op_data->op_attr.ia_valid & ATTR_GID) && - in_group_p(op_data->op_attr.ia_gid)) - rec->sa_suppgid = - from_kgid(&init_user_ns, op_data->op_attr.ia_gid); - else - rec->sa_suppgid = op_data->op_suppgids[0]; - - rec->sa_bias = op_data->op_bias; -} - -static void mdc_ioepoch_pack(struct mdt_ioepoch *epoch, - struct md_op_data *op_data) -{ - epoch->mio_handle = op_data->op_handle; - epoch->mio_unused1 = 0; - epoch->mio_unused2 = 0; - epoch->mio_padding = 0; -} - -void mdc_setattr_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - void *ea, size_t ealen) -{ - struct mdt_rec_setattr *rec; - struct lov_user_md *lum = NULL; - - BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != - sizeof(struct mdt_rec_setattr)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - mdc_setattr_pack_rec(rec, op_data); - - if (ealen == 0) - return; - - lum = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); - if (!ea) { /* Remove LOV EA */ - lum->lmm_magic = cpu_to_le32(LOV_USER_MAGIC_V1); - lum->lmm_stripe_size = 0; - lum->lmm_stripe_count = 0; - lum->lmm_stripe_offset = (typeof(lum->lmm_stripe_offset))(-1); - } else { - memcpy(lum, ea, ealen); - } -} - -void mdc_unlink_pack(struct ptlrpc_request *req, struct md_op_data *op_data) -{ - struct mdt_rec_unlink *rec; - - BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != sizeof(struct mdt_rec_unlink)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - rec->ul_opcode = op_data->op_cli_flags & CLI_RM_ENTRY ? 
- REINT_RMENTRY : REINT_UNLINK; - rec->ul_fsuid = op_data->op_fsuid; - rec->ul_fsgid = op_data->op_fsgid; - rec->ul_cap = op_data->op_cap.cap[0]; - rec->ul_mode = op_data->op_mode; - rec->ul_suppgid1 = op_data->op_suppgids[0]; - rec->ul_suppgid2 = -1; - rec->ul_fid1 = op_data->op_fid1; - rec->ul_fid2 = op_data->op_fid2; - rec->ul_time = op_data->op_mod_time; - rec->ul_bias = op_data->op_bias; - - mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); -} - -void mdc_link_pack(struct ptlrpc_request *req, struct md_op_data *op_data) -{ - struct mdt_rec_link *rec; - - BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != sizeof(struct mdt_rec_link)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - rec->lk_opcode = REINT_LINK; - rec->lk_fsuid = op_data->op_fsuid; /* current->fsuid; */ - rec->lk_fsgid = op_data->op_fsgid; /* current->fsgid; */ - rec->lk_cap = op_data->op_cap.cap[0]; /* current->cap_effective; */ - rec->lk_suppgid1 = op_data->op_suppgids[0]; - rec->lk_suppgid2 = op_data->op_suppgids[1]; - rec->lk_fid1 = op_data->op_fid1; - rec->lk_fid2 = op_data->op_fid2; - rec->lk_time = op_data->op_mod_time; - rec->lk_bias = op_data->op_bias; - - mdc_pack_name(req, &RMF_NAME, op_data->op_name, op_data->op_namelen); -} - -static void mdc_intent_close_pack(struct ptlrpc_request *req, - struct md_op_data *op_data) -{ - enum mds_op_bias bias = op_data->op_bias; - struct close_data *data; - struct ldlm_lock *lock; - - if (!(bias & (MDS_HSM_RELEASE | MDS_CLOSE_LAYOUT_SWAP | - MDS_RENAME_MIGRATE))) - return; - - data = req_capsule_client_get(&req->rq_pill, &RMF_CLOSE_DATA); - LASSERT(data); - - lock = ldlm_handle2lock(&op_data->op_lease_handle); - if (lock) { - data->cd_handle = lock->l_remote_handle; - LDLM_LOCK_PUT(lock); - } - ldlm_cli_cancel(&op_data->op_lease_handle, LCF_LOCAL); - - data->cd_data_version = op_data->op_data_version; - data->cd_fid = op_data->op_fid2; -} - -void mdc_rename_pack(struct ptlrpc_request *req, struct md_op_data *op_data, - const char *old, size_t oldlen, - const char *new, size_t newlen) -{ - struct mdt_rec_rename *rec; - - BUILD_BUG_ON(sizeof(struct mdt_rec_reint) != sizeof(struct mdt_rec_rename)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - /* XXX do something about time, uid, gid */ - rec->rn_opcode = op_data->op_cli_flags & CLI_MIGRATE ? 
- REINT_MIGRATE : REINT_RENAME; - rec->rn_fsuid = op_data->op_fsuid; - rec->rn_fsgid = op_data->op_fsgid; - rec->rn_cap = op_data->op_cap.cap[0]; - rec->rn_suppgid1 = op_data->op_suppgids[0]; - rec->rn_suppgid2 = op_data->op_suppgids[1]; - rec->rn_fid1 = op_data->op_fid1; - rec->rn_fid2 = op_data->op_fid2; - rec->rn_time = op_data->op_mod_time; - rec->rn_mode = op_data->op_mode; - rec->rn_bias = op_data->op_bias; - - mdc_pack_name(req, &RMF_NAME, old, oldlen); - - if (new) - mdc_pack_name(req, &RMF_SYMTGT, new, newlen); - - if (op_data->op_cli_flags & CLI_MIGRATE && - op_data->op_bias & MDS_RENAME_MIGRATE) { - struct mdt_ioepoch *epoch; - - mdc_intent_close_pack(req, op_data); - epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); - mdc_ioepoch_pack(epoch, op_data); - } -} - -void mdc_getattr_pack(struct ptlrpc_request *req, __u64 valid, u32 flags, - struct md_op_data *op_data, size_t ea_size) -{ - struct mdt_body *b = req_capsule_client_get(&req->rq_pill, - &RMF_MDT_BODY); - - b->mbo_valid = valid; - if (op_data->op_bias & MDS_CHECK_SPLIT) - b->mbo_valid |= OBD_MD_FLCKSPLIT; - if (op_data->op_bias & MDS_CROSS_REF) - b->mbo_valid |= OBD_MD_FLCROSSREF; - b->mbo_eadatasize = ea_size; - b->mbo_flags = flags; - __mdc_pack_body(b, op_data->op_suppgids[0]); - - b->mbo_fid1 = op_data->op_fid1; - b->mbo_fid2 = op_data->op_fid2; - b->mbo_valid |= OBD_MD_FLID; - - if (op_data->op_name) - mdc_pack_name(req, &RMF_NAME, op_data->op_name, - op_data->op_namelen); -} - -void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data) -{ - struct mdt_ioepoch *epoch; - struct mdt_rec_setattr *rec; - - epoch = req_capsule_client_get(&req->rq_pill, &RMF_MDT_EPOCH); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - - mdc_setattr_pack_rec(rec, op_data); - /* - * The client will zero out local timestamps when losing the IBITS lock - * so any new RPC timestamps will update the client inode's timestamps. - * There was a defect on the server side which allowed the atime to be - * overwritten by a zeroed-out atime packed into the close RPC. - * - * Proactively clear the MDS_ATTR_ATIME flag in the RPC in this case - * to avoid zeroing the atime on old unpatched servers. See LU-8041. - */ - if (rec->sa_atime == 0) - rec->sa_valid &= ~MDS_ATTR_ATIME; - - mdc_ioepoch_pack(epoch, op_data); - mdc_intent_close_pack(req, op_data); -} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_locks.c b/drivers/staging/lustre/lustre/mdc/mdc_locks.c deleted file mode 100644 index a8aa0fa5e87ace1030403e0e062a30f8a196ffbb..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/mdc_locks.c +++ /dev/null @@ -1,1239 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_MDC - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mdc_internal.h" - -struct mdc_getattr_args { - struct obd_export *ga_exp; - struct md_enqueue_info *ga_minfo; -}; - -int it_open_error(int phase, struct lookup_intent *it) -{ - if (it_disposition(it, DISP_OPEN_LEASE)) { - if (phase >= DISP_OPEN_LEASE) - return it->it_status; - else - return 0; - } - if (it_disposition(it, DISP_OPEN_OPEN)) { - if (phase >= DISP_OPEN_OPEN) - return it->it_status; - else - return 0; - } - - if (it_disposition(it, DISP_OPEN_CREATE)) { - if (phase >= DISP_OPEN_CREATE) - return it->it_status; - else - return 0; - } - - if (it_disposition(it, DISP_LOOKUP_EXECD)) { - if (phase >= DISP_LOOKUP_EXECD) - return it->it_status; - else - return 0; - } - - if (it_disposition(it, DISP_IT_EXECD)) { - if (phase >= DISP_IT_EXECD) - return it->it_status; - else - return 0; - } - CERROR("it disp: %X, status: %d\n", it->it_disposition, - it->it_status); - LBUG(); - return 0; -} -EXPORT_SYMBOL(it_open_error); - -/* this must be called on a lockh that is known to have a referenced lock */ -int mdc_set_lock_data(struct obd_export *exp, const struct lustre_handle *lockh, - void *data, __u64 *bits) -{ - struct ldlm_lock *lock; - struct inode *new_inode = data; - - if (bits) - *bits = 0; - - if (!lustre_handle_is_used(lockh)) - return 0; - - lock = ldlm_handle2lock(lockh); - - LASSERT(lock); - lock_res_and_lock(lock); - if (lock->l_resource->lr_lvb_inode && - lock->l_resource->lr_lvb_inode != data) { - struct inode *old_inode = lock->l_resource->lr_lvb_inode; - - LASSERTF(old_inode->i_state & I_FREEING, - "Found existing inode %p/%lu/%u state %lu in lock: setting data to %p/%lu/%u\n", - old_inode, old_inode->i_ino, old_inode->i_generation, - old_inode->i_state, new_inode, new_inode->i_ino, - new_inode->i_generation); - } - lock->l_resource->lr_lvb_inode = new_inode; - if (bits) - *bits = lock->l_policy_data.l_inodebits.bits; - - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); - - return 0; -} - -enum ldlm_mode mdc_lock_match(struct obd_export *exp, __u64 flags, - const struct lu_fid *fid, enum ldlm_type type, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - struct lustre_handle *lockh) -{ - struct ldlm_res_id res_id; - enum ldlm_mode rc; - - fid_build_reg_res_name(fid, &res_id); - /* LU-4405: Clear bits not supported by server */ - policy->l_inodebits.bits &= exp_connect_ibits(exp); - rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, - &res_id, type, policy, mode, lockh, 0); - return rc; -} - -int mdc_cancel_unused(struct obd_export *exp, - const struct lu_fid *fid, - union ldlm_policy_data *policy, - enum ldlm_mode mode, - enum ldlm_cancel_flags flags, - void *opaque) -{ - struct ldlm_res_id res_id; - struct obd_device *obd = class_exp2obd(exp); - int rc; - - fid_build_reg_res_name(fid, &res_id); - rc = ldlm_cli_cancel_unused_resource(obd->obd_namespace, &res_id, - policy, mode, flags, opaque); - return rc; -} - -int 
mdc_null_inode(struct obd_export *exp, - const struct lu_fid *fid) -{ - struct ldlm_res_id res_id; - struct ldlm_resource *res; - struct ldlm_namespace *ns = class_exp2obd(exp)->obd_namespace; - - LASSERTF(ns, "no namespace passed\n"); - - fid_build_reg_res_name(fid, &res_id); - - res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (IS_ERR(res)) - return 0; - - lock_res(res); - res->lr_lvb_inode = NULL; - unlock_res(res); - - ldlm_resource_putref(res); - return 0; -} - -static inline void mdc_clear_replay_flag(struct ptlrpc_request *req, int rc) -{ - /* Don't hold error requests for replay. */ - if (req->rq_replay) { - spin_lock(&req->rq_lock); - req->rq_replay = 0; - spin_unlock(&req->rq_lock); - } - if (rc && req->rq_transno != 0) { - DEBUG_REQ(D_ERROR, req, "transno returned on error rc %d", rc); - LBUG(); - } -} - -/* Save a large LOV EA into the request buffer so that it is available - * for replay. We don't do this in the initial request because the - * original request doesn't need this buffer (at most it sends just the - * lov_mds_md) and it is a waste of RAM/bandwidth to send the empty - * buffer and may also be difficult to allocate and save a very large - * request buffer for each open. (bug 5707) - * - * OOM here may cause recovery failure if lmm is needed (only for the - * original open if the MDS crashed just when this client also OOM'd) - * but this is incredibly unlikely, and questionable whether the client - * could do MDS recovery under OOM anyways... - */ -static void mdc_realloc_openmsg(struct ptlrpc_request *req, - struct mdt_body *body) -{ - int rc; - - /* FIXME: remove this explicit offset. */ - rc = sptlrpc_cli_enlarge_reqbuf(req, DLM_INTENT_REC_OFF + 4, - body->mbo_eadatasize); - if (rc) { - CERROR("Can't enlarge segment %d size to %d\n", - DLM_INTENT_REC_OFF + 4, body->mbo_eadatasize); - body->mbo_valid &= ~OBD_MD_FLEASIZE; - body->mbo_eadatasize = 0; - } -} - -static struct ptlrpc_request * -mdc_intent_open_pack(struct obd_export *exp, struct lookup_intent *it, - struct md_op_data *op_data) -{ - struct ptlrpc_request *req; - struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_intent *lit; - const void *lmm = op_data->op_data; - u32 lmmsize = op_data->op_data_size; - LIST_HEAD(cancels); - int count = 0; - int mode; - int rc; - - it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; - - /* XXX: openlock is not cancelled for cross-refs. */ - /* If inode is known, cancel conflicting OPEN locks. */ - if (fid_is_sane(&op_data->op_fid2)) { - if (it->it_flags & MDS_OPEN_LEASE) { /* try to get lease */ - if (it->it_flags & FMODE_WRITE) - mode = LCK_EX; - else - mode = LCK_PR; - } else { - if (it->it_flags & (FMODE_WRITE | MDS_OPEN_TRUNC)) - mode = LCK_CW; - else if (it->it_flags & __FMODE_EXEC) - mode = LCK_PR; - else - mode = LCK_CR; - } - count = mdc_resource_get_unused(exp, &op_data->op_fid2, - &cancels, mode, - MDS_INODELOCK_OPEN); - } - - /* If CREATE, cancel parent's UPDATE lock. 
*/ - if (it->it_op & IT_CREAT) - mode = LCK_EX; - else - mode = LCK_CR; - count += mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, mode, - MDS_INODELOCK_UPDATE); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_OPEN); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return ERR_PTR(-ENOMEM); - } - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - max(lmmsize, obddev->u.cli.cl_default_mds_easize)); - - rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); - if (rc < 0) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - spin_lock(&req->rq_lock); - req->rq_replay = req->rq_import->imp_replayable; - spin_unlock(&req->rq_lock); - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; - - /* pack the intended request */ - mdc_open_pack(req, op_data, it->it_create_mode, 0, it->it_flags, lmm, - lmmsize); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obddev->u.cli.cl_max_mds_easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - - ptlrpc_request_set_replen(req); - return req; -} - -#define GA_DEFAULT_EA_NAME_LEN 20 -#define GA_DEFAULT_EA_VAL_LEN 250 -#define GA_DEFAULT_EA_NUM 10 - -static struct ptlrpc_request * -mdc_intent_getxattr_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *op_data) -{ - struct ptlrpc_request *req; - struct ldlm_intent *lit; - int rc, count = 0; - LIST_HEAD(cancels); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_GETXATTR); - if (!req) - return ERR_PTR(-ENOMEM); - - rc = ldlm_prep_enqueue_req(exp, req, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = IT_GETXATTR; - - /* pack the intended request */ - mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM, -1, 0); - - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, - GA_DEFAULT_EA_NAME_LEN * GA_DEFAULT_EA_NUM); - - req_capsule_set_size(&req->rq_pill, &RMF_EAVALS, RCL_SERVER, - GA_DEFAULT_EA_VAL_LEN * GA_DEFAULT_EA_NUM); - - req_capsule_set_size(&req->rq_pill, &RMF_EAVALS_LENS, RCL_SERVER, - sizeof(u32) * GA_DEFAULT_EA_NUM); - - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, 0); - - ptlrpc_request_set_replen(req); - - return req; -} - -static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *op_data) -{ - struct ptlrpc_request *req; - struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_intent *lit; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_UNLINK); - if (!req) - return ERR_PTR(-ENOMEM); - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; - - /* pack the intended request */ - mdc_unlink_pack(req, op_data); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obddev->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - return req; -} - 
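/* A minimal editor's sketch (not from the original file) of the shape
 * shared by the mdc_intent_*_pack() helpers in this file: allocate a
 * request in the matching capsule format, size the variable-length
 * client fields, let the LDLM layer pack the enqueue, pack the intent
 * opcode and the intended request, size the reply buffers, and seal
 * the message. It reuses only calls that appear in the surrounding
 * functions; RQF_LDLM_INTENT_EXAMPLE is a placeholder capsule format,
 * and the sketch is not buildable outside the (removed) lustre tree.
 */
static struct ptlrpc_request *
mdc_intent_example_pack(struct obd_export *exp, struct lookup_intent *it)
{
	struct ptlrpc_request *req;
	struct ldlm_intent *lit;
	int rc;

	/* 1. Allocate a request in the matching capsule format. */
	req = ptlrpc_request_alloc(class_exp2cliimp(exp),
				   &RQF_LDLM_INTENT_EXAMPLE);
	if (!req)
		return ERR_PTR(-ENOMEM);

	/* 2. Size any variable-length client-side fields (none here),
	 * then pack the LDLM enqueue, including early lock cancels.
	 */
	rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
	if (rc) {
		ptlrpc_request_free(req);
		return ERR_PTR(rc);
	}

	/* 3. Pack the intent opcode. */
	lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT);
	lit->opc = (__u64)it->it_op;

	/* 4. A real helper would pack the intended request body here
	 * (open/unlink/getattr/...) and size its reply buffers.
	 */

	/* 5. Seal the request so the reply length is known. */
	ptlrpc_request_set_replen(req);
	return req;
}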
-static struct ptlrpc_request *mdc_intent_getattr_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *op_data) -{ - struct ptlrpc_request *req; - struct obd_device *obddev = class_exp2obd(exp); - u64 valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | - OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA | - OBD_MD_MEA | OBD_MD_FLACL; - struct ldlm_intent *lit; - int rc; - u32 easize; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_GETATTR); - if (!req) - return ERR_PTR(-ENOMEM); - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; - - if (obddev->u.cli.cl_default_mds_easize > 0) - easize = obddev->u.cli.cl_default_mds_easize; - else - easize = obddev->u.cli.cl_max_mds_easize; - - /* pack the intended request */ - mdc_getattr_pack(req, valid, it->it_flags, op_data, easize); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, easize); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); - return req; -} - -static struct ptlrpc_request *mdc_intent_layout_pack(struct obd_export *exp, - struct lookup_intent *it, - struct md_op_data *unused) -{ - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req; - struct ldlm_intent *lit; - struct layout_intent *layout; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_LDLM_INTENT_LAYOUT); - if (!req) - return ERR_PTR(-ENOMEM); - - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, 0); - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - /* pack the intent */ - lit = req_capsule_client_get(&req->rq_pill, &RMF_LDLM_INTENT); - lit->opc = (__u64)it->it_op; - - /* pack the layout intent request */ - layout = req_capsule_client_get(&req->rq_pill, &RMF_LAYOUT_INTENT); - /* LAYOUT_INTENT_ACCESS is generic, specific operation will be - * set for replication - */ - layout->li_opc = LAYOUT_INTENT_ACCESS; - - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - return req; -} - -static struct ptlrpc_request * -mdc_enqueue_pack(struct obd_export *exp, int lvb_len) -{ - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_LDLM_ENQUEUE); - if (!req) - return ERR_PTR(-ENOMEM); - - rc = ldlm_prep_enqueue_req(exp, req, NULL, 0); - if (rc) { - ptlrpc_request_free(req); - return ERR_PTR(rc); - } - - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, lvb_len); - ptlrpc_request_set_replen(req); - return req; -} - -static int mdc_finish_enqueue(struct obd_export *exp, - struct ptlrpc_request *req, - struct ldlm_enqueue_info *einfo, - struct lookup_intent *it, - struct lustre_handle *lockh, - int rc) -{ - struct req_capsule *pill = &req->rq_pill; - struct ldlm_request *lockreq; - struct ldlm_reply *lockrep; - struct ldlm_lock *lock; - void *lvb_data = NULL; - u32 lvb_len = 0; - - LASSERT(rc >= 0); - /* Similarly, if we're going to replay this request, we don't want to - * actually get a lock, just perform the intent. 
- */ - if (req->rq_transno || req->rq_replay) { - lockreq = req_capsule_client_get(pill, &RMF_DLM_REQ); - lockreq->lock_flags |= ldlm_flags_to_wire(LDLM_FL_INTENT_ONLY); - } - - if (rc == ELDLM_LOCK_ABORTED) { - einfo->ei_mode = 0; - memset(lockh, 0, sizeof(*lockh)); - rc = 0; - } else { /* rc = 0 */ - lock = ldlm_handle2lock(lockh); - - /* If the server gave us back a different lock mode, we should - * fix up our variables. - */ - if (lock->l_req_mode != einfo->ei_mode) { - ldlm_lock_addref(lockh, lock->l_req_mode); - ldlm_lock_decref(lockh, einfo->ei_mode); - einfo->ei_mode = lock->l_req_mode; - } - LDLM_LOCK_PUT(lock); - } - - lockrep = req_capsule_server_get(pill, &RMF_DLM_REP); - - it->it_disposition = (int)lockrep->lock_policy_res1; - it->it_status = (int)lockrep->lock_policy_res2; - it->it_lock_mode = einfo->ei_mode; - it->it_lock_handle = lockh->cookie; - it->it_request = req; - - /* Technically speaking rq_transno must already be zero if - * it_status is in error, so the check is a bit redundant. - */ - if ((!req->rq_transno || it->it_status < 0) && req->rq_replay) - mdc_clear_replay_flag(req, it->it_status); - - /* If we're doing an IT_OPEN which did not result in an actual - * successful open, then we need to remove the bit which saves - * this request for unconditional replay. - * - * It's important that we do this first! Otherwise we might exit the - * function without doing so, and try to replay a failed create - * (bug 3440) - */ - if (it->it_op & IT_OPEN && req->rq_replay && - (!it_disposition(it, DISP_OPEN_OPEN) || it->it_status != 0)) - mdc_clear_replay_flag(req, it->it_status); - - DEBUG_REQ(D_RPCTRACE, req, "op: %d disposition: %x, status: %d", - it->it_op, it->it_disposition, it->it_status); - - /* We know what to expect, so we do any byte flipping required here */ - if (it_has_reply_body(it)) { - struct mdt_body *body; - - body = req_capsule_server_get(pill, &RMF_MDT_BODY); - if (!body) { - CERROR("Can't swab mdt_body\n"); - return -EPROTO; - } - - if (it_disposition(it, DISP_OPEN_OPEN) && - !it_open_error(DISP_OPEN_OPEN, it)) { - /* - * If this is a successful OPEN request, we need to set - * replay handler and data early, so that if replay - * happens immediately after swabbing below, new reply - * is swabbed by that handler correctly. - */ - mdc_set_open_replay_data(NULL, NULL, it); - } - - if ((body->mbo_valid & (OBD_MD_FLDIREA | OBD_MD_FLEASIZE)) != 0) { - void *eadata; - - mdc_update_max_ea_from_body(exp, body); - - /* - * The eadata is opaque; just check that it is there. - * Eventually, obd_unpackmd() will check the contents. - */ - eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, - body->mbo_eadatasize); - if (!eadata) - return -EPROTO; - - /* save lvb data and length in case this is for layout - * lock - */ - lvb_data = eadata; - lvb_len = body->mbo_eadatasize; - - /* - * We save the reply LOV EA in case we have to replay a - * create for recovery. If we didn't allocate a large - * enough request buffer above we need to reallocate it - * here to hold the actual LOV EA. - * - * Do not save the LOV EA if the request is not going - * to be replayed (for example, a failed one). 
- */ - if ((it->it_op & IT_OPEN) && req->rq_replay) { - void *lmm; - - if (req_capsule_get_size(pill, &RMF_EADATA, - RCL_CLIENT) < - body->mbo_eadatasize) - mdc_realloc_openmsg(req, body); - else - req_capsule_shrink(pill, &RMF_EADATA, - body->mbo_eadatasize, - RCL_CLIENT); - - req_capsule_set_size(pill, &RMF_EADATA, - RCL_CLIENT, - body->mbo_eadatasize); - - lmm = req_capsule_client_get(pill, &RMF_EADATA); - if (lmm) - memcpy(lmm, eadata, body->mbo_eadatasize); - } - } - } else if (it->it_op & IT_LAYOUT) { - /* maybe the lock was granted right away and layout - * is packed into RMF_DLM_LVB of req - */ - lvb_len = req_capsule_get_size(pill, &RMF_DLM_LVB, RCL_SERVER); - if (lvb_len > 0) { - lvb_data = req_capsule_server_sized_get(pill, - &RMF_DLM_LVB, - lvb_len); - if (!lvb_data) - return -EPROTO; - } - } - - /* fill in stripe data for layout lock */ - lock = ldlm_handle2lock(lockh); - if (lock && ldlm_has_layout(lock) && lvb_data) { - void *lmm; - - LDLM_DEBUG(lock, "layout lock returned by: %s, lvb_len: %d", - ldlm_it2str(it->it_op), lvb_len); - - lmm = kvzalloc(lvb_len, GFP_NOFS); - if (!lmm) { - LDLM_LOCK_PUT(lock); - return -ENOMEM; - } - memcpy(lmm, lvb_data, lvb_len); - - /* install lvb_data */ - lock_res_and_lock(lock); - if (!lock->l_lvb_data) { - lock->l_lvb_type = LVB_T_LAYOUT; - lock->l_lvb_data = lmm; - lock->l_lvb_len = lvb_len; - lmm = NULL; - } - unlock_res_and_lock(lock); - if (lmm) - kvfree(lmm); - } - if (lock) - LDLM_LOCK_PUT(lock); - - return rc; -} - -/* We always reserve enough space in the reply packet for a stripe MD, because - * we don't know in advance the file type. - */ -int mdc_enqueue_base(struct obd_export *exp, struct ldlm_enqueue_info *einfo, - const union ldlm_policy_data *policy, - struct lookup_intent *it, struct md_op_data *op_data, - struct lustre_handle *lockh, u64 extra_lock_flags) -{ - static const union ldlm_policy_data lookup_policy = { - .l_inodebits = { MDS_INODELOCK_LOOKUP } - }; - static const union ldlm_policy_data update_policy = { - .l_inodebits = { MDS_INODELOCK_UPDATE } - }; - static const union ldlm_policy_data layout_policy = { - .l_inodebits = { MDS_INODELOCK_LAYOUT } - }; - static const union ldlm_policy_data getxattr_policy = { - .l_inodebits = { MDS_INODELOCK_XATTR } - }; - struct obd_device *obddev = class_exp2obd(exp); - struct ptlrpc_request *req = NULL; - u64 flags, saved_flags = extra_lock_flags; - struct ldlm_res_id res_id; - int generation, resends = 0; - struct ldlm_reply *lockrep; - enum lvb_type lvb_type = LVB_T_NONE; - int rc; - - LASSERTF(!it || einfo->ei_type == LDLM_IBITS, "lock type %d\n", - einfo->ei_type); - fid_build_reg_res_name(&op_data->op_fid1, &res_id); - - if (it) { - LASSERT(!policy); - - saved_flags |= LDLM_FL_HAS_INTENT; - if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) - policy = &update_policy; - else if (it->it_op & IT_LAYOUT) - policy = &layout_policy; - else if (it->it_op & (IT_GETXATTR | IT_SETXATTR)) - policy = &getxattr_policy; - else - policy = &lookup_policy; - } - - generation = obddev->u.cli.cl_import->imp_generation; -resend: - flags = saved_flags; - if (!it) { - /* The only way right now is FLOCK. 
*/ - LASSERTF(einfo->ei_type == LDLM_FLOCK, "lock type %d\n", - einfo->ei_type); - res_id.name[3] = LDLM_FLOCK; - } else if (it->it_op & IT_OPEN) { - req = mdc_intent_open_pack(exp, it, op_data); - } else if (it->it_op & IT_UNLINK) { - req = mdc_intent_unlink_pack(exp, it, op_data); - } else if (it->it_op & (IT_GETATTR | IT_LOOKUP)) { - req = mdc_intent_getattr_pack(exp, it, op_data); - } else if (it->it_op & IT_READDIR) { - req = mdc_enqueue_pack(exp, 0); - } else if (it->it_op & IT_LAYOUT) { - if (!imp_connect_lvb_type(class_exp2cliimp(exp))) - return -EOPNOTSUPP; - req = mdc_intent_layout_pack(exp, it, op_data); - lvb_type = LVB_T_LAYOUT; - } else if (it->it_op & IT_GETXATTR) { - req = mdc_intent_getxattr_pack(exp, it, op_data); - } else { - LBUG(); - return -EINVAL; - } - - if (IS_ERR(req)) - return PTR_ERR(req); - - if (resends) { - req->rq_generation_set = 1; - req->rq_import_generation = generation; - req->rq_sent = ktime_get_real_seconds() + resends; - } - - /* It is important to obtain the modify RPC slot first (if applicable), - * so that threads waiting for a modify RPC slot do not pollute our - * RPCs-in-flight counter. - * We do not limit flock requests, though. - */ - if (it) { - mdc_get_mod_rpc_slot(req, it); - rc = obd_get_request_slot(&obddev->u.cli); - if (rc != 0) { - mdc_put_mod_rpc_slot(req, it); - mdc_clear_replay_flag(req, 0); - ptlrpc_req_finished(req); - return rc; - } - } - - rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, policy, &flags, NULL, - 0, lvb_type, lockh, 0); - if (!it) { - /* For flock requests we return immediately and let the caller - * deal with the rest, since the metadata processing in the - * remainder of this function makes no sense for flock requests. - * However, on a communication problem with the server - * (ETIMEDOUT) or a signal/kill attempt (EINTR) we cannot rely - * on the caller; this mainly matters for F_UNLCK requests - * (explicit, or generated by the kernel to clean up a - * process's flocks on exit), which must not be discarded. - */ - if (((rc == -EINTR) || (rc == -ETIMEDOUT)) && - (einfo->ei_type == LDLM_FLOCK) && - (einfo->ei_mode == LCK_NL)) - goto resend; - return rc; - } - - obd_put_request_slot(&obddev->u.cli); - mdc_put_mod_rpc_slot(req, it); - - if (rc < 0) { - CDEBUG(D_INFO, "%s: ldlm_cli_enqueue failed: rc = %d\n", - obddev->obd_name, rc); - - mdc_clear_replay_flag(req, rc); - ptlrpc_req_finished(req); - return rc; - } - - lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - - lockrep->lock_policy_res2 = - ptlrpc_status_ntoh(lockrep->lock_policy_res2); - - /* - * Retry indefinitely when the server returns -EINPROGRESS for the - * intent operation; when the server returns -EINPROGRESS for - * acquiring the intent lock, we'll retry in after_reply(). 
- */ - if (it->it_op && (int)lockrep->lock_policy_res2 == -EINPROGRESS) { - mdc_clear_replay_flag(req, rc); - ptlrpc_req_finished(req); - resends++; - - CDEBUG(D_HA, "%s: resend:%d op:%d " DFID "/" DFID "\n", - obddev->obd_name, resends, it->it_op, - PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); - - if (generation == obddev->u.cli.cl_import->imp_generation) { - goto resend; - } else { - CDEBUG(D_HA, "resend cross eviction\n"); - return -EIO; - } - } - - rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); - if (rc < 0) { - if (lustre_handle_is_used(lockh)) { - ldlm_lock_decref(lockh, einfo->ei_mode); - memset(lockh, 0, sizeof(*lockh)); - } - ptlrpc_req_finished(req); - - it->it_lock_handle = 0; - it->it_lock_mode = 0; - it->it_request = NULL; - } - - return rc; -} - -int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, - const union ldlm_policy_data *policy, - struct md_op_data *op_data, - struct lustre_handle *lockh, u64 extra_lock_flags) -{ - return mdc_enqueue_base(exp, einfo, policy, NULL, - op_data, lockh, extra_lock_flags); -} - -static int mdc_finish_intent_lock(struct obd_export *exp, - struct ptlrpc_request *request, - struct md_op_data *op_data, - struct lookup_intent *it, - struct lustre_handle *lockh) -{ - struct lustre_handle old_lock; - struct ldlm_lock *lock; - int rc = 0; - - LASSERT(request != LP_POISON); - LASSERT(request->rq_repmsg != LP_POISON); - - if (it->it_op & IT_READDIR) - return 0; - - if (it->it_op & (IT_GETXATTR | IT_LAYOUT)) { - if (it->it_status != 0) { - rc = it->it_status; - goto out; - } - goto matching_lock; - } - - if (!it_disposition(it, DISP_IT_EXECD)) { - /* The server failed before it even started executing the - * intent, i.e. because it couldn't unpack the request. - */ - LASSERT(it->it_status != 0); - rc = it->it_status; - goto out; - } - - rc = it_open_error(DISP_IT_EXECD, it); - if (rc) - goto out; - - rc = it_open_error(DISP_LOOKUP_EXECD, it); - if (rc) - goto out; - - /* Keep the request around for the multiple phases of the call; - * the DISP_XX dispositions checked here must guarantee that the - * corresponding call is actually reached. - */ - if (!it_disposition(it, DISP_ENQ_CREATE_REF) && - it_disposition(it, DISP_OPEN_CREATE) && - !it_open_error(DISP_OPEN_CREATE, it)) { - it_set_disposition(it, DISP_ENQ_CREATE_REF); - ptlrpc_request_addref(request); /* balanced in ll_create_node */ - } - if (!it_disposition(it, DISP_ENQ_OPEN_REF) && - it_disposition(it, DISP_OPEN_OPEN) && - !it_open_error(DISP_OPEN_OPEN, it)) { - it_set_disposition(it, DISP_ENQ_OPEN_REF); - ptlrpc_request_addref(request); /* balanced in ll_file_open */ - /* BUG 11546 - eviction in the middle of open rpc processing */ - OBD_FAIL_TIMEOUT(OBD_FAIL_MDC_ENQUEUE_PAUSE, obd_timeout); - } - - if (it->it_op & IT_CREAT) - /* XXX this belongs in ll_create_it */ - ; - else if (it->it_op == IT_OPEN) - LASSERT(!it_disposition(it, DISP_OPEN_CREATE)); - else - LASSERT(it->it_op & (IT_GETATTR | IT_LOOKUP)); - -matching_lock: - /* If we already have a matching lock, then cancel the new - * one. We have to set the data here instead of in - * mdc_enqueue, because we need to use the child's inode as - * the l_ast_data to match, and that's not available until - * intent_finish has performed the iget(). 
- */ - lock = ldlm_handle2lock(lockh); - if (lock) { - union ldlm_policy_data policy = lock->l_policy_data; - - LDLM_DEBUG(lock, "matching against this"); - - if (it_has_reply_body(it)) { - struct mdt_body *body; - - body = req_capsule_server_get(&request->rq_pill, - &RMF_MDT_BODY); - - /* mdc_enqueue checked */ - LASSERT(body); - LASSERTF(fid_res_name_eq(&body->mbo_fid1, - &lock->l_resource->lr_name), - "Lock res_id: " DLDLMRES ", fid: " DFID "\n", - PLDLMRES(lock->l_resource), - PFID(&body->mbo_fid1)); - } - LDLM_LOCK_PUT(lock); - - memcpy(&old_lock, lockh, sizeof(*lockh)); - if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, - LDLM_IBITS, &policy, LCK_NL, - &old_lock, 0)) { - ldlm_lock_decref_and_cancel(lockh, - it->it_lock_mode); - memcpy(lockh, &old_lock, sizeof(old_lock)); - it->it_lock_handle = lockh->cookie; - } - } -out: - CDEBUG(D_DENTRY, - "D_IT dentry %.*s intent: %s status %d disp %x rc %d\n", - (int)op_data->op_namelen, op_data->op_name, - ldlm_it2str(it->it_op), it->it_status, it->it_disposition, rc); - return rc; -} - -int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, - struct lu_fid *fid, __u64 *bits) -{ - /* We could just return 1 immediately, but since we should only - * be called in revalidate_it if we already have a lock, let's - * verify that. - */ - struct ldlm_res_id res_id; - struct lustre_handle lockh; - union ldlm_policy_data policy; - enum ldlm_mode mode; - - if (it->it_lock_handle) { - lockh.cookie = it->it_lock_handle; - mode = ldlm_revalidate_lock_handle(&lockh, bits); - } else { - fid_build_reg_res_name(fid, &res_id); - switch (it->it_op) { - case IT_GETATTR: - /* File attributes are held under multiple bits: - * nlink is under lookup lock, size and times are - * under UPDATE lock and recently we've also got - * a separate permissions lock for owner/group/acl that - * were protected by lookup lock before. - * Getattr must provide all of that information, - * so we need to ensure we have all of those locks. - * Unfortunately, if the bits are split across multiple - * locks, there's no easy way to match all of them here, - * so an extra RPC would be performed to fetch all - * of those bits at once for now. - */ - /* For new MDTs(> 2.4), UPDATE|PERM should be enough, - * but for old MDTs (< 2.4), permission is covered - * by LOOKUP lock, so it needs to match all bits here. - */ - policy.l_inodebits.bits = MDS_INODELOCK_UPDATE | - MDS_INODELOCK_LOOKUP | - MDS_INODELOCK_PERM; - break; - case IT_READDIR: - policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; - break; - case IT_LAYOUT: - policy.l_inodebits.bits = MDS_INODELOCK_LAYOUT; - break; - default: - policy.l_inodebits.bits = MDS_INODELOCK_LOOKUP; - break; - } - - mode = mdc_lock_match(exp, LDLM_FL_BLOCK_GRANTED, fid, - LDLM_IBITS, &policy, - LCK_CR | LCK_CW | LCK_PR | LCK_PW, - &lockh); - } - - if (mode) { - it->it_lock_handle = lockh.cookie; - it->it_lock_mode = mode; - } else { - it->it_lock_handle = 0; - it->it_lock_mode = 0; - } - - return !!mode; -} - -/* - * This long block is all about fixing up the lock and request state - * so that it is correct as of the moment _before_ the operation was - * applied; that way, the VFS will think that everything is normal and - * call Lustre's regular VFS methods. - * - * If we're performing a creation, that means that unless the creation - * failed with EEXIST, we should fake up a negative dentry. - * - * For everything else, we want the lookup to succeed. 
- * - * One additional note: if CREATE or OPEN succeeded, we add an extra - * reference to the request because we need to keep it around until - * ll_create/ll_open gets called. - * - * The server will return to us, in it_disposition, an indication of - * exactly what it_status refers to. - * - * If DISP_OPEN_OPEN is set, then it_status refers to the open() call, - * otherwise if DISP_OPEN_CREATE is set, then it_status is the - * creation failure mode. In either case, one of DISP_LOOKUP_NEG or - * DISP_LOOKUP_POS will be set, indicating whether the child lookup - * was successful. - * - * Else, if DISP_LOOKUP_EXECD then it_status is the rc of the - * child lookup. - */ -int mdc_intent_lock(struct obd_export *exp, struct md_op_data *op_data, - struct lookup_intent *it, struct ptlrpc_request **reqp, - ldlm_blocking_callback cb_blocking, __u64 extra_lock_flags) -{ - struct ldlm_enqueue_info einfo = { - .ei_type = LDLM_IBITS, - .ei_mode = it_to_lock_mode(it), - .ei_cb_bl = cb_blocking, - .ei_cb_cp = ldlm_completion_ast, - }; - struct lustre_handle lockh; - int rc = 0; - - LASSERT(it); - - CDEBUG(D_DLMTRACE, "(name: %.*s," DFID ") in obj " DFID - ", intent: %s flags %#Lo\n", (int)op_data->op_namelen, - op_data->op_name, PFID(&op_data->op_fid2), - PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), - it->it_flags); - - lockh.cookie = 0; - if (fid_is_sane(&op_data->op_fid2) && - (it->it_op & (IT_LOOKUP | IT_GETATTR | IT_READDIR))) { - /* We could just return 1 immediately, but since we should only - * be called in revalidate_it if we already have a lock, let's - * verify that. - */ - it->it_lock_handle = 0; - rc = mdc_revalidate_lock(exp, it, &op_data->op_fid2, NULL); - /* Only return failure if it was not GETATTR by cfid - * (from inode_revalidate) - */ - if (rc || op_data->op_namelen != 0) - return rc; - } - - /* For case if upper layer did not alloc fid, do it now. 
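 * (mdc_fid_alloc() may return 1 when switching to a new sequence, so only
 * rc < 0 is treated as an error below; mdc_create() documents the same
 * convention.)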
*/ - if (!fid_is_sane(&op_data->op_fid2) && it->it_op & IT_CREAT) { - rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc < 0) { - CERROR("Can't alloc new fid, rc %d\n", rc); - return rc; - } - } - - rc = mdc_enqueue_base(exp, &einfo, NULL, it, op_data, &lockh, - extra_lock_flags); - if (rc < 0) - return rc; - - *reqp = it->it_request; - rc = mdc_finish_intent_lock(exp, *reqp, op_data, it, &lockh); - return rc; -} - -static int mdc_intent_getattr_async_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *args, int rc) -{ - struct mdc_getattr_args *ga = args; - struct obd_export *exp = ga->ga_exp; - struct md_enqueue_info *minfo = ga->ga_minfo; - struct ldlm_enqueue_info *einfo = &minfo->mi_einfo; - struct lookup_intent *it; - struct lustre_handle *lockh; - struct obd_device *obddev; - struct ldlm_reply *lockrep; - __u64 flags = LDLM_FL_HAS_INTENT; - - it = &minfo->mi_it; - lockh = &minfo->mi_lockh; - - obddev = class_exp2obd(exp); - - obd_put_request_slot(&obddev->u.cli); - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE)) - rc = -ETIMEDOUT; - - rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode, - &flags, NULL, 0, lockh, rc); - if (rc < 0) { - CERROR("ldlm_cli_enqueue_fini: %d\n", rc); - mdc_clear_replay_flag(req, rc); - goto out; - } - - lockrep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - - lockrep->lock_policy_res2 = - ptlrpc_status_ntoh(lockrep->lock_policy_res2); - - rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc); - if (rc) - goto out; - - rc = mdc_finish_intent_lock(exp, req, &minfo->mi_data, it, lockh); - -out: - minfo->mi_cb(req, minfo, rc); - return 0; -} - -int mdc_intent_getattr_async(struct obd_export *exp, - struct md_enqueue_info *minfo) -{ - struct md_op_data *op_data = &minfo->mi_data; - struct lookup_intent *it = &minfo->mi_it; - struct ptlrpc_request *req; - struct mdc_getattr_args *ga; - struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_res_id res_id; - union ldlm_policy_data policy = { - .l_inodebits = { MDS_INODELOCK_LOOKUP | MDS_INODELOCK_UPDATE } - }; - int rc = 0; - __u64 flags = LDLM_FL_HAS_INTENT; - - CDEBUG(D_DLMTRACE, - "name: %.*s in inode " DFID ", intent: %s flags %#Lo\n", - (int)op_data->op_namelen, op_data->op_name, - PFID(&op_data->op_fid1), ldlm_it2str(it->it_op), it->it_flags); - - fid_build_reg_res_name(&op_data->op_fid1, &res_id); - req = mdc_intent_getattr_pack(exp, it, op_data); - if (IS_ERR(req)) - return PTR_ERR(req); - - rc = obd_get_request_slot(&obddev->u.cli); - if (rc != 0) { - ptlrpc_req_finished(req); - return rc; - } - - rc = ldlm_cli_enqueue(exp, &req, &minfo->mi_einfo, &res_id, &policy, - &flags, NULL, 0, LVB_T_NONE, &minfo->mi_lockh, 1); - if (rc < 0) { - obd_put_request_slot(&obddev->u.cli); - ptlrpc_req_finished(req); - return rc; - } - - BUILD_BUG_ON(sizeof(*ga) > sizeof(req->rq_async_args)); - ga = ptlrpc_req_async_args(req); - ga->ga_exp = exp; - ga->ga_minfo = minfo; - - req->rq_interpret_reply = mdc_intent_getattr_async_interpret; - ptlrpcd_add_req(req); - - return 0; -} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_reint.c b/drivers/staging/lustre/lustre/mdc/mdc_reint.c deleted file mode 100644 index e77c00df0693358fce3f2e9b23a36982600ddbba..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/mdc_reint.c +++ /dev/null @@ -1,421 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_MDC - -# include -# include - -#include -#include "mdc_internal.h" -#include - -/* mdc_setattr does its own semaphore handling */ -static int mdc_reint(struct ptlrpc_request *request, int level) -{ - int rc; - - request->rq_send_state = level; - - mdc_get_mod_rpc_slot(request, NULL); - rc = ptlrpc_queue_wait(request); - mdc_put_mod_rpc_slot(request, NULL); - if (rc) - CDEBUG(D_INFO, "error in handling %d\n", rc); - else if (!req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY)) - rc = -EPROTO; - - return rc; -} - -/* Find and cancel locally locks matched by inode @bits & @mode in the resource - * found by @fid. Found locks are added into @cancel list. Returns the amount of - * locks added to @cancels list. - */ -int mdc_resource_get_unused(struct obd_export *exp, const struct lu_fid *fid, - struct list_head *cancels, enum ldlm_mode mode, - __u64 bits) -{ - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - union ldlm_policy_data policy = {}; - struct ldlm_res_id res_id; - struct ldlm_resource *res; - int count; - - /* Return, i.e. cancel nothing, only if ELC is supported (flag in - * export) but disabled through procfs (flag in NS). - * - * This distinguishes from a case when ELC is not supported originally, - * when we still want to cancel locks in advance and just cancel them - * locally, without sending any RPC. - */ - if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) - return 0; - - fid_build_reg_res_name(fid, &res_id); - res = ldlm_resource_get(exp->exp_obd->obd_namespace, - NULL, &res_id, 0, 0); - if (IS_ERR(res)) - return 0; - LDLM_RESOURCE_ADDREF(res); - /* Initialize ibits lock policy. 
*/ - policy.l_inodebits.bits = bits; - count = ldlm_cancel_resource_local(res, cancels, &policy, - mode, 0, 0, NULL); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); - return count; -} - -int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, - void *ea, size_t ealen, struct ptlrpc_request **request) -{ - LIST_HEAD(cancels); - struct ptlrpc_request *req; - int count = 0, rc; - __u64 bits; - - bits = MDS_INODELOCK_UPDATE; - if (op_data->op_attr.ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) - bits |= MDS_INODELOCK_LOOKUP; - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, bits); - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_REINT_SETATTR); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - req_capsule_set_size(&req->rq_pill, &RMF_MDT_EPOCH, RCL_CLIENT, 0); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, ealen); - req_capsule_set_size(&req->rq_pill, &RMF_LOGCOOKIES, RCL_CLIENT, 0); - - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - if (op_data->op_attr.ia_valid & (ATTR_MTIME | ATTR_CTIME)) - CDEBUG(D_INODE, "setting mtime %ld, ctime %ld\n", - LTIME_S(op_data->op_attr.ia_mtime), - LTIME_S(op_data->op_attr.ia_ctime)); - mdc_setattr_pack(req, op_data, ea, ealen); - - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); - - rc = mdc_reint(req, LUSTRE_IMP_FULL); - - if (rc == -ERESTARTSYS) - rc = 0; - - *request = req; - - return rc; -} - -int mdc_create(struct obd_export *exp, struct md_op_data *op_data, - const void *data, size_t datalen, umode_t mode, - uid_t uid, gid_t gid, kernel_cap_t cap_effective, - __u64 rdev, struct ptlrpc_request **request) -{ - struct ptlrpc_request *req; - int level, rc; - int count, resends = 0; - struct obd_import *import = exp->exp_obd->u.cli.cl_import; - int generation = import->imp_generation; - LIST_HEAD(cancels); - - /* For case if upper layer did not alloc fid, do it now. */ - if (!fid_is_sane(&op_data->op_fid2)) { - /* - * mdc_fid_alloc() may return errno 1 in case of switch to new - * sequence, handle this. - */ - rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc < 0) - return rc; - } - -rebuild: - count = 0; - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_REINT_CREATE_ACL); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - data && datalen ? datalen : 0); - - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - /* - * mdc_create_pack() fills msg->bufs[1] with name and msg->bufs[2] with - * tgt, for symlinks or lov MD data. 
- */ - mdc_create_pack(req, op_data, data, datalen, mode, uid, - gid, cap_effective, rdev); - - ptlrpc_request_set_replen(req); - - /* ask ptlrpc not to resend on EINPROGRESS since we have our own retry - * logic here - */ - req->rq_no_retry_einprogress = 1; - - if (resends) { - req->rq_generation_set = 1; - req->rq_import_generation = generation; - req->rq_sent = ktime_get_real_seconds() + resends; - } - level = LUSTRE_IMP_FULL; - resend: - rc = mdc_reint(req, level); - - /* Resend if we were told to. */ - if (rc == -ERESTARTSYS) { - level = LUSTRE_IMP_RECOVER; - goto resend; - } else if (rc == -EINPROGRESS) { - /* Retry create infinitely until succeed or get other - * error code. - */ - ptlrpc_req_finished(req); - resends++; - - CDEBUG(D_HA, "%s: resend:%d create on " DFID "/" DFID "\n", - exp->exp_obd->obd_name, resends, - PFID(&op_data->op_fid1), PFID(&op_data->op_fid2)); - - if (generation == import->imp_generation) { - goto rebuild; - } else { - CDEBUG(D_HA, "resend cross eviction\n"); - return -EIO; - } - } - - *request = req; - return rc; -} - -int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - LIST_HEAD(cancels); - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req = *request; - int count = 0, rc; - - LASSERT(!req); - - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && - (fid_is_sane(&op_data->op_fid3))) - count += mdc_resource_get_unused(exp, &op_data->op_fid3, - &cancels, LCK_EX, - MDS_INODELOCK_FULL); - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_REINT_UNLINK); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_unlink_pack(req, op_data); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - - *request = req; - - rc = mdc_reint(req, LUSTRE_IMP_FULL); - if (rc == -ERESTARTSYS) - rc = 0; - return rc; -} - -int mdc_link(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - LIST_HEAD(cancels); - struct ptlrpc_request *req; - int count = 0, rc; - - if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && - (fid_is_sane(&op_data->op_fid2))) - count = mdc_resource_get_unused(exp, &op_data->op_fid2, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count += mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_REINT_LINK); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_link_pack(req, op_data); - ptlrpc_request_set_replen(req); - - rc = mdc_reint(req, LUSTRE_IMP_FULL); - *request = req; - if (rc == -ERESTARTSYS) - rc = 0; - - return rc; -} - -int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, - const 
char *old, size_t oldlen, const char *new, size_t newlen, - struct ptlrpc_request **request) -{ - LIST_HEAD(cancels); - struct obd_device *obd = exp->exp_obd; - struct ptlrpc_request *req; - int count = 0, rc; - - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && - (fid_is_sane(&op_data->op_fid1))) - count = mdc_resource_get_unused(exp, &op_data->op_fid1, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID2) && - (fid_is_sane(&op_data->op_fid2))) - count += mdc_resource_get_unused(exp, &op_data->op_fid2, - &cancels, LCK_EX, - MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && - (fid_is_sane(&op_data->op_fid3))) - count += mdc_resource_get_unused(exp, &op_data->op_fid3, - &cancels, LCK_EX, - MDS_INODELOCK_LOOKUP); - if ((op_data->op_flags & MF_MDC_CANCEL_FID4) && - (fid_is_sane(&op_data->op_fid4))) - count += mdc_resource_get_unused(exp, &op_data->op_fid4, - &cancels, LCK_EX, - MDS_INODELOCK_FULL); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - op_data->op_cli_flags & CLI_MIGRATE ? - &RQF_MDS_REINT_MIGRATE : &RQF_MDS_REINT_RENAME); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, oldlen + 1); - req_capsule_set_size(&req->rq_pill, &RMF_SYMTGT, RCL_CLIENT, - newlen + 1); - - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - if (op_data->op_cli_flags & CLI_MIGRATE && op_data->op_data) { - struct md_open_data *mod = op_data->op_data; - - LASSERTF(mod->mod_open_req && - mod->mod_open_req->rq_type != LI_POISON, - "POISONED open %p!\n", mod->mod_open_req); - - DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); - /* - * We no longer want to preserve this open for replay even - * though the open was committed. b=3632, b=3633 - */ - spin_lock(&mod->mod_open_req->rq_lock); - mod->mod_open_req->rq_replay = 0; - spin_unlock(&mod->mod_open_req->rq_lock); - } - - if (exp_connect_cancelset(exp) && req) - ldlm_cli_cancel_list(&cancels, count, req, 0); - - mdc_rename_pack(req, op_data, old, oldlen, new, newlen); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); - ptlrpc_request_set_replen(req); - - rc = mdc_reint(req, LUSTRE_IMP_FULL); - *request = req; - if (rc == -ERESTARTSYS) - rc = 0; - - return rc; -} diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c deleted file mode 100644 index cff31cb0a9ace637f8fc0e6245576fb8b446766c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ /dev/null @@ -1,2770 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_MDC - -# include -# include -# include -# include -# include -# include -# include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mdc_internal.h" - -#define REQUEST_MINOR 244 - -static int mdc_cleanup(struct obd_device *obd); - -static inline int mdc_queue_wait(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - int rc; - - /* obd_get_request_slot() ensures that this client has no more - * than cl_max_rpcs_in_flight RPCs simultaneously in flight - * against an MDT. - */ - rc = obd_get_request_slot(cli); - if (rc != 0) - return rc; - - rc = ptlrpc_queue_wait(req); - obd_put_request_slot(cli); - - return rc; -} - -static int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid) -{ - struct ptlrpc_request *req; - struct mdt_body *body; - int rc; - - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_MDS_GETSTATUS, - LUSTRE_MDS_VERSION, MDS_GETSTATUS); - if (!req) - return -ENOMEM; - - mdc_pack_body(req, NULL, 0, 0, -1, 0); - req->rq_send_state = LUSTRE_IMP_FULL; - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - *rootfid = body->mbo_fid1; - CDEBUG(D_NET, - "root fid=" DFID ", last_committed=%llu\n", - PFID(rootfid), - lustre_msg_get_last_committed(req->rq_repmsg)); -out: - ptlrpc_req_finished(req); - return rc; -} - -/* - * This function is known to always say that it will receive 4 buffers from - * the server. Even when acl_size and md_size are zero, the RPC header will - * contain 4 fields and the RPC itself will contain zero-size fields. This is - * because mdt_getattr*() _always_ returns 4 fields, but if the ACL is not - * needed and thus zero, it shrinks that field to zero size. The same story - * applies to md_size. This is the cause of problems when the client waits - * for a smaller number of fields. This issue will be fixed later when the - * client becomes aware of RPC layouts. --umka - */ -static int mdc_getattr_common(struct obd_export *exp, - struct ptlrpc_request *req) -{ - struct req_capsule *pill = &req->rq_pill; - struct mdt_body *body; - void *eadata; - int rc; - - /* Request message already built.
*/ - rc = ptlrpc_queue_wait(req); - if (rc != 0) - return rc; - - /* sanity check for the reply */ - body = req_capsule_server_get(pill, &RMF_MDT_BODY); - if (!body) - return -EPROTO; - - CDEBUG(D_NET, "mode: %o\n", body->mbo_mode); - - mdc_update_max_ea_from_body(exp, body); - if (body->mbo_eadatasize != 0) { - eadata = req_capsule_server_sized_get(pill, &RMF_MDT_MD, - body->mbo_eadatasize); - if (!eadata) - return -EPROTO; - } - - return 0; -} - -static int mdc_getattr(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - struct ptlrpc_request *req; - int rc; - - /* Single MDS without an LMV case */ - if (op_data->op_flags & MF_GET_MDT_IDX) { - op_data->op_mds = 0; - return 0; - } - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_GETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - op_data->op_mode, -1, 0); - - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - ptlrpc_request_set_replen(req); - - rc = mdc_getattr_common(exp, req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - return rc; -} - -static int mdc_getattr_name(struct obd_export *exp, struct md_op_data *op_data, - struct ptlrpc_request **request) -{ - struct ptlrpc_request *req; - int rc; - - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_GETATTR_NAME); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - op_data->op_namelen + 1); - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GETATTR_NAME); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, &op_data->op_fid1, op_data->op_valid, - op_data->op_mode, op_data->op_suppgids[0], 0); - - if (op_data->op_name) { - char *name = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - - LASSERT(strnlen(op_data->op_name, op_data->op_namelen) == - op_data->op_namelen); - memcpy(name, op_data->op_name, op_data->op_namelen); - } - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - op_data->op_mode); - req_capsule_set_size(&req->rq_pill, &RMF_ACL, RCL_SERVER, - req->rq_import->imp_connect_data.ocd_max_easize); - ptlrpc_request_set_replen(req); - - rc = mdc_getattr_common(exp, req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - return rc; -} - -static int mdc_xattr_common(struct obd_export *exp, - const struct req_format *fmt, - const struct lu_fid *fid, - int opcode, u64 valid, - const char *xattr_name, const char *input, - int input_size, int output_size, int flags, - __u32 suppgid, struct ptlrpc_request **request) -{ - struct ptlrpc_request *req; - int xattr_namelen = 0; - char *tmp; - int rc; - - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), fmt); - if (!req) - return -ENOMEM; - - if (xattr_name) { - xattr_namelen = strlen(xattr_name) + 1; - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - xattr_namelen); - } - if (input_size) { - LASSERT(input); - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_CLIENT, - input_size); - } - - /* Flush local XATTR locks to get rid of a possible cancel RPC */ - if (opcode == MDS_REINT && fid_is_sane(fid) && - exp->exp_connect_data.ocd_ibits_known & MDS_INODELOCK_XATTR) { - 
LIST_HEAD(cancels); - int count; - - /* Without that packing would fail */ - if (input_size == 0) - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, - RCL_CLIENT, 0); - - count = mdc_resource_get_unused(exp, fid, - &cancels, LCK_EX, - MDS_INODELOCK_XATTR); - - rc = mdc_prep_elc_req(exp, req, MDS_REINT, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - } else { - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, opcode); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - } - - if (opcode == MDS_REINT) { - struct mdt_rec_setxattr *rec; - - BUILD_BUG_ON(sizeof(struct mdt_rec_setxattr) != - sizeof(struct mdt_rec_reint)); - rec = req_capsule_client_get(&req->rq_pill, &RMF_REC_REINT); - rec->sx_opcode = REINT_SETXATTR; - rec->sx_fsuid = from_kuid(&init_user_ns, current_fsuid()); - rec->sx_fsgid = from_kgid(&init_user_ns, current_fsgid()); - rec->sx_cap = current_cap().cap[0]; - rec->sx_suppgid1 = suppgid; - rec->sx_suppgid2 = -1; - rec->sx_fid = *fid; - rec->sx_valid = valid | OBD_MD_FLCTIME; - rec->sx_time = ktime_get_real_seconds(); - rec->sx_size = output_size; - rec->sx_flags = flags; - - } else { - mdc_pack_body(req, fid, valid, output_size, suppgid, flags); - } - - if (xattr_name) { - tmp = req_capsule_client_get(&req->rq_pill, &RMF_NAME); - memcpy(tmp, xattr_name, xattr_namelen); - } - if (input_size) { - tmp = req_capsule_client_get(&req->rq_pill, &RMF_EADATA); - memcpy(tmp, input, input_size); - } - - if (req_capsule_has_field(&req->rq_pill, &RMF_EADATA, RCL_SERVER)) - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, - RCL_SERVER, output_size); - ptlrpc_request_set_replen(req); - - /* make rpc */ - if (opcode == MDS_REINT) - mdc_get_mod_rpc_slot(req, NULL); - - rc = ptlrpc_queue_wait(req); - - if (opcode == MDS_REINT) - mdc_put_mod_rpc_slot(req, NULL); - - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - return rc; -} - -static int mdc_setxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, - const void *value, size_t value_size, - unsigned int xattr_flags, u32 suppgid, - struct ptlrpc_request **req) -{ - LASSERT(obd_md_valid == OBD_MD_FLXATTR || - obd_md_valid == OBD_MD_FLXATTRRM); - - return mdc_xattr_common(exp, &RQF_MDS_REINT_SETXATTR, - fid, MDS_REINT, obd_md_valid, name, - value, value_size, 0, xattr_flags, suppgid, - req); -} - -static int mdc_getxattr(struct obd_export *exp, const struct lu_fid *fid, - u64 obd_md_valid, const char *name, size_t buf_size, - struct ptlrpc_request **req) -{ - LASSERT(obd_md_valid == OBD_MD_FLXATTR || - obd_md_valid == OBD_MD_FLXATTRLS); - - return mdc_xattr_common(exp, &RQF_MDS_GETXATTR, fid, MDS_GETXATTR, - obd_md_valid, name, NULL, 0, buf_size, 0, -1, - req); -} - -#ifdef CONFIG_FS_POSIX_ACL -static int mdc_unpack_acl(struct ptlrpc_request *req, struct lustre_md *md) -{ - struct req_capsule *pill = &req->rq_pill; - struct mdt_body *body = md->body; - struct posix_acl *acl; - void *buf; - int rc; - - if (!body->mbo_aclsize) - return 0; - - buf = req_capsule_server_sized_get(pill, &RMF_ACL, body->mbo_aclsize); - - if (!buf) - return -EPROTO; - - acl = posix_acl_from_xattr(&init_user_ns, buf, body->mbo_aclsize); - if (!acl) - return 0; - - if (IS_ERR(acl)) { - rc = PTR_ERR(acl); - CERROR("convert xattr to acl: %d\n", rc); - return rc; - } - - rc = posix_acl_valid(&init_user_ns, acl); - if (rc) { - CERROR("validate acl: %d\n", rc); - posix_acl_release(acl); - return rc; - } - - md->posix_acl = acl; - return 0; -} -#else -#define mdc_unpack_acl(req, md) 0 -#endif 
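The ACL path above follows the kernel's standard conversion of a raw xattr blob into a validated struct posix_acl. A minimal standalone sketch of that pattern follows; acl_from_reply_buf and its buf/size parameters are illustrative stand-ins for the reply-capsule plumbing in mdc_unpack_acl(), while the posix_acl_* calls are the real kernel API used above.

#include <linux/err.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/user_namespace.h>

/* Sketch only: convert an ACL xattr blob (e.g. the RMF_ACL segment of an
 * MDS reply) into a validated posix_acl.  Returns NULL when no ACL is
 * present and an ERR_PTR on failure, mirroring mdc_unpack_acl() above.
 */
static struct posix_acl *acl_from_reply_buf(const void *buf, size_t size)
{
	struct posix_acl *acl;
	int rc;

	if (!size)
		return NULL;		/* FLACL set but aclsize == 0: no segment */

	acl = posix_acl_from_xattr(&init_user_ns, buf, size);
	if (IS_ERR_OR_NULL(acl))
		return acl;		/* conversion failed or blob was empty */

	rc = posix_acl_valid(&init_user_ns, acl);
	if (rc) {
		posix_acl_release(acl);	/* drop the reference taken above */
		return ERR_PTR(rc);
	}
	return acl;
}

The caller then owns the posix_acl reference and must posix_acl_release() it on its error paths, which is exactly what the out: label in mdc_get_lustre_md() below does.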
- -static int mdc_get_lustre_md(struct obd_export *exp, - struct ptlrpc_request *req, - struct obd_export *dt_exp, - struct obd_export *md_exp, - struct lustre_md *md) -{ - struct req_capsule *pill = &req->rq_pill; - int rc; - - LASSERT(md); - memset(md, 0, sizeof(*md)); - - md->body = req_capsule_server_get(pill, &RMF_MDT_BODY); - - if (md->body->mbo_valid & OBD_MD_FLEASIZE) { - if (!S_ISREG(md->body->mbo_mode)) { - CDEBUG(D_INFO, - "OBD_MD_FLEASIZE set, should be a regular file, but is not\n"); - rc = -EPROTO; - goto out; - } - - if (md->body->mbo_eadatasize == 0) { - CDEBUG(D_INFO, - "OBD_MD_FLEASIZE set, but eadatasize 0\n"); - rc = -EPROTO; - goto out; - } - - md->layout.lb_len = md->body->mbo_eadatasize; - md->layout.lb_buf = req_capsule_server_sized_get(pill, - &RMF_MDT_MD, - md->layout.lb_len); - if (!md->layout.lb_buf) { - rc = -EPROTO; - goto out; - } - } else if (md->body->mbo_valid & OBD_MD_FLDIREA) { - const union lmv_mds_md *lmv; - size_t lmv_size; - - if (!S_ISDIR(md->body->mbo_mode)) { - CDEBUG(D_INFO, - "OBD_MD_FLDIREA set, should be a directory, but is not\n"); - rc = -EPROTO; - goto out; - } - - lmv_size = md->body->mbo_eadatasize; - if (!lmv_size) { - CDEBUG(D_INFO, - "OBD_MD_FLDIREA is set, but eadatasize 0\n"); - return -EPROTO; - } - if (md->body->mbo_valid & OBD_MD_MEA) { - lmv = req_capsule_server_sized_get(pill, &RMF_MDT_MD, - lmv_size); - if (!lmv) { - rc = -EPROTO; - goto out; - } - - rc = md_unpackmd(md_exp, &md->lmv, lmv, lmv_size); - if (rc < 0) - goto out; - - if (rc < (typeof(rc))sizeof(*md->lmv)) { - CDEBUG(D_INFO, - "size too small: rc < sizeof(*md->lmv) (%d < %d)\n", - rc, (int)sizeof(*md->lmv)); - rc = -EPROTO; - goto out; - } - } - } - rc = 0; - - if (md->body->mbo_valid & OBD_MD_FLACL) { - /* for ACL, it's possible that FLACL is set but aclsize is zero. - * only when aclsize != 0 there's an actual segment for ACL - * in reply buffer. 
- */ - if (md->body->mbo_aclsize) { - rc = mdc_unpack_acl(req, md); - if (rc) - goto out; -#ifdef CONFIG_FS_POSIX_ACL - } else { - md->posix_acl = NULL; -#endif - } - } - -out: - if (rc) { -#ifdef CONFIG_FS_POSIX_ACL - posix_acl_release(md->posix_acl); -#endif - } - return rc; -} - -static int mdc_free_lustre_md(struct obd_export *exp, struct lustre_md *md) -{ - return 0; -} - -void mdc_replay_open(struct ptlrpc_request *req) -{ - struct md_open_data *mod = req->rq_cb_data; - struct ptlrpc_request *close_req; - struct obd_client_handle *och; - struct lustre_handle old; - struct mdt_body *body; - - if (!mod) { - DEBUG_REQ(D_ERROR, req, - "Can't properly replay without open data."); - return; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - - och = mod->mod_och; - if (och) { - struct lustre_handle *file_fh; - - LASSERT(och->och_magic == OBD_CLIENT_HANDLE_MAGIC); - - file_fh = &och->och_fh; - CDEBUG(D_HA, "updating handle from %#llx to %#llx\n", - file_fh->cookie, body->mbo_handle.cookie); - old = *file_fh; - *file_fh = body->mbo_handle; - } - close_req = mod->mod_close_req; - if (close_req) { - __u32 opc = lustre_msg_get_opc(close_req->rq_reqmsg); - struct mdt_ioepoch *epoch; - - LASSERT(opc == MDS_CLOSE); - epoch = req_capsule_client_get(&close_req->rq_pill, - &RMF_MDT_EPOCH); - LASSERT(epoch); - - if (och) - LASSERT(!memcmp(&old, &epoch->mio_handle, sizeof(old))); - DEBUG_REQ(D_HA, close_req, "updating close body with new fh"); - epoch->mio_handle = body->mbo_handle; - } -} - -void mdc_commit_open(struct ptlrpc_request *req) -{ - struct md_open_data *mod = req->rq_cb_data; - - if (!mod) - return; - - /** - * No need to touch md_open_data::mod_och, it holds a reference on - * \var mod and will zero references to each other, \var mod will be - * freed after that when md_open_data::mod_och will put the reference. - */ - - /** - * Do not let open request to disappear as it still may be needed - * for close rpc to happen (it may happen on evict only, otherwise - * ptlrpc_request::rq_replay does not let mdc_commit_open() to be - * called), just mark this rpc as committed to distinguish these 2 - * cases, see mdc_close() for details. The open request reference will - * be put along with freeing \var mod. - */ - ptlrpc_request_addref(req); - spin_lock(&req->rq_lock); - req->rq_committed = 1; - spin_unlock(&req->rq_lock); - req->rq_cb_data = NULL; - obd_mod_put(mod); -} - -int mdc_set_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och, - struct lookup_intent *it) -{ - struct md_open_data *mod; - struct mdt_rec_create *rec; - struct mdt_body *body; - struct ptlrpc_request *open_req = it->it_request; - struct obd_import *imp = open_req->rq_import; - - if (!open_req->rq_replay) - return 0; - - rec = req_capsule_client_get(&open_req->rq_pill, &RMF_REC_REINT); - body = req_capsule_server_get(&open_req->rq_pill, &RMF_MDT_BODY); - LASSERT(rec); - /* Incoming message in my byte order (it's been swabbed). */ - /* Outgoing messages always in my byte order. */ - LASSERT(body); - - /* Only if the import is replayable, we set replay_open data */ - if (och && imp->imp_replayable) { - mod = obd_mod_alloc(); - if (!mod) { - DEBUG_REQ(D_ERROR, open_req, - "Can't allocate md_open_data"); - return 0; - } - - /** - * Take a reference on \var mod, to be freed on mdc_close(). - * It protects \var mod from being freed on eviction (commit - * callback is called despite rq_replay flag). - * Another reference for \var och. 
- */ - obd_mod_get(mod); - obd_mod_get(mod); - - spin_lock(&open_req->rq_lock); - och->och_mod = mod; - mod->mod_och = och; - mod->mod_is_create = it_disposition(it, DISP_OPEN_CREATE) || - it_disposition(it, DISP_OPEN_STRIPE); - mod->mod_open_req = open_req; - open_req->rq_cb_data = mod; - open_req->rq_commit_cb = mdc_commit_open; - spin_unlock(&open_req->rq_lock); - } - - rec->cr_fid2 = body->mbo_fid1; - rec->cr_ioepoch = body->mbo_ioepoch; - rec->cr_old_handle.cookie = body->mbo_handle.cookie; - open_req->rq_replay_cb = mdc_replay_open; - if (!fid_is_sane(&body->mbo_fid1)) { - DEBUG_REQ(D_ERROR, open_req, - "Saving replay request with insane fid"); - LBUG(); - } - - DEBUG_REQ(D_RPCTRACE, open_req, "Set up open replay data"); - return 0; -} - -static void mdc_free_open(struct md_open_data *mod) -{ - int committed = 0; - - if (mod->mod_is_create == 0 && - imp_connect_disp_stripe(mod->mod_open_req->rq_import)) - committed = 1; - - /* - * No reason to assert here if the open request has - * rq_replay == 1. It means that mdc_close failed, and the - * close request wasn't sent. It is not fatal to the client. - * The worst case is an eviction if the client holds an open lock. - */ - DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, - "free open request rq_replay = %d\n", - mod->mod_open_req->rq_replay); - - ptlrpc_request_committed(mod->mod_open_req, committed); - if (mod->mod_close_req) - ptlrpc_request_committed(mod->mod_close_req, committed); -} - -static int mdc_clear_open_replay_data(struct obd_export *exp, - struct obd_client_handle *och) -{ - struct md_open_data *mod = och->och_mod; - - /** - * It is possible to not have \var mod in a case of eviction between - * lookup and ll_file_open(). - **/ - if (!mod) - return 0; - - LASSERT(mod != LP_POISON); - LASSERT(mod->mod_open_req); - mdc_free_open(mod); - - mod->mod_och = NULL; - och->och_mod = NULL; - obd_mod_put(mod); - - return 0; -} - -static int mdc_close(struct obd_export *exp, struct md_op_data *op_data, - struct md_open_data *mod, struct ptlrpc_request **request) -{ - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req; - struct req_format *req_fmt; - int rc; - int saved_rc = 0; - - if (op_data->op_bias & MDS_HSM_RELEASE) { - req_fmt = &RQF_MDS_INTENT_CLOSE; - - /* allocate a FID for volatile file */ - rc = mdc_fid_alloc(NULL, exp, &op_data->op_fid2, op_data); - if (rc < 0) { - CERROR("%s: " DFID " failed to allocate FID: %d\n", - obd->obd_name, PFID(&op_data->op_fid1), rc); - /* save the errcode and proceed to close */ - saved_rc = rc; - } - } else if (op_data->op_bias & MDS_CLOSE_LAYOUT_SWAP) { - req_fmt = &RQF_MDS_INTENT_CLOSE; - } else { - req_fmt = &RQF_MDS_CLOSE; - } - - *request = NULL; - if (OBD_FAIL_CHECK(OBD_FAIL_MDC_CLOSE)) - req = NULL; - else - req = ptlrpc_request_alloc(class_exp2cliimp(exp), req_fmt); - - /* Ensure that this close's handle is fixed up during replay. */ - if (likely(mod)) { - LASSERTF(mod->mod_open_req && - mod->mod_open_req->rq_type != LI_POISON, - "POISONED open %p!\n", mod->mod_open_req); - - mod->mod_close_req = req; - - DEBUG_REQ(D_HA, mod->mod_open_req, "matched open"); - /* We no longer want to preserve this open for replay even - * though the open was committed.
b=3632, b=3633 - */ - spin_lock(&mod->mod_open_req->rq_lock); - mod->mod_open_req->rq_replay = 0; - spin_unlock(&mod->mod_open_req->rq_lock); - } else { - CDEBUG(D_HA, - "couldn't find open req; expecting close error\n"); - } - if (!req) { - /* - * TODO: repeat close after errors - */ - CWARN("%s: close of FID " DFID " failed, file reference will be dropped when this client unmounts or is evicted\n", - obd->obd_name, PFID(&op_data->op_fid1)); - rc = -ENOMEM; - goto out; - } - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_CLOSE); - if (rc) { - ptlrpc_request_free(req); - req = NULL; - goto out; - } - - /* - * To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a - * portal whose threads are not taking any DLM locks and are therefore - * always progressing - */ - req->rq_request_portal = MDS_READPAGE_PORTAL; - ptlrpc_at_set_req_timeout(req); - - mdc_close_pack(req, op_data); - - req_capsule_set_size(&req->rq_pill, &RMF_MDT_MD, RCL_SERVER, - obd->u.cli.cl_default_mds_easize); - - ptlrpc_request_set_replen(req); - - mdc_get_mod_rpc_slot(req, NULL); - rc = ptlrpc_queue_wait(req); - mdc_put_mod_rpc_slot(req, NULL); - - if (!req->rq_repmsg) { - CDEBUG(D_RPCTRACE, "request failed to send: %p, %d\n", req, - req->rq_status); - if (rc == 0) - rc = req->rq_status ?: -EIO; - } else if (rc == 0 || rc == -EAGAIN) { - struct mdt_body *body; - - rc = lustre_msg_get_status(req->rq_repmsg); - if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { - DEBUG_REQ(D_ERROR, req, - "type == PTL_RPC_MSG_ERR, err = %d", rc); - if (rc > 0) - rc = -rc; - } - body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY); - if (!body) - rc = -EPROTO; - } else if (rc == -ESTALE) { - /** - * it can be allowed error after 3633 if open was committed and - * server failed before close was sent. Let's check if mod - * exists and return no error in that case - */ - if (mod) { - DEBUG_REQ(D_HA, req, "Reset ESTALE = %d", rc); - if (mod->mod_open_req->rq_committed) - rc = 0; - } - } - -out: - if (mod) { - if (rc != 0) - mod->mod_close_req = NULL; - /* Since now, mod is accessed through open_req only, - * thus close req does not keep a reference on mod anymore. - */ - obd_mod_put(mod); - } - *request = req; - return rc < 0 ? 
rc : saved_rc; -} - -static int mdc_getpage(struct obd_export *exp, const struct lu_fid *fid, - u64 offset, struct page **pages, int npages, - struct ptlrpc_request **request) -{ - struct ptlrpc_bulk_desc *desc; - struct ptlrpc_request *req; - wait_queue_head_t waitq; - int resends = 0; - int rc; - int i; - - *request = NULL; - init_waitqueue_head(&waitq); - -restart_bulk: - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_READPAGE); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_READPAGE); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - req->rq_request_portal = MDS_READPAGE_PORTAL; - ptlrpc_at_set_req_timeout(req); - - desc = ptlrpc_prep_bulk_imp(req, npages, 1, - PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV, - MDS_BULK_PORTAL, - &ptlrpc_bulk_kiov_pin_ops); - if (!desc) { - ptlrpc_request_free(req); - return -ENOMEM; - } - - /* NB req now owns desc and will free it when it gets freed */ - for (i = 0; i < npages; i++) - desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, PAGE_SIZE); - - mdc_readdir_pack(req, offset, PAGE_SIZE * npages, fid); - - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc) { - ptlrpc_req_finished(req); - if (rc != -ETIMEDOUT) - return rc; - - resends++; - if (!client_should_resend(resends, &exp->exp_obd->u.cli)) { - CERROR("%s: too many resend retries: rc = %d\n", - exp->exp_obd->obd_name, -EIO); - return -EIO; - } - wait_event_idle_timeout(waitq, 0, resends * HZ); - - goto restart_bulk; - } - - rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, - req->rq_bulk->bd_nob_transferred); - if (rc < 0) { - ptlrpc_req_finished(req); - return rc; - } - - if (req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK) { - CERROR("%s: unexpected bytes transferred: %d (%ld expected)\n", - exp->exp_obd->obd_name, req->rq_bulk->bd_nob_transferred, - PAGE_SIZE * npages); - ptlrpc_req_finished(req); - return -EPROTO; - } - - *request = req; - return 0; -} - -static void mdc_release_page(struct page *page, int remove) -{ - if (remove) { - lock_page(page); - if (likely(page->mapping)) - truncate_complete_page(page->mapping, page); - unlock_page(page); - } - put_page(page); -} - -static struct page *mdc_page_locate(struct address_space *mapping, __u64 *hash, - __u64 *start, __u64 *end, int hash64) -{ - /* - * Complement of hash is used as an index so that - * radix_tree_gang_lookup() can be used to find a page with starting - * hash _smaller_ than one we are looking for. - */ - unsigned long offset = hash_x_index(*hash, hash64); - struct page *page; - int found; - - xa_lock_irq(&mapping->i_pages); - found = radix_tree_gang_lookup(&mapping->i_pages, - (void **)&page, offset, 1); - if (found > 0 && !radix_tree_exceptional_entry(page)) { - struct lu_dirpage *dp; - - get_page(page); - xa_unlock_irq(&mapping->i_pages); - /* - * In contrast to find_lock_page() we are sure that directory - * page cannot be truncated (while DLM lock is held) and, - * hence, can avoid restart. - * - * In fact, page cannot be locked here at all, because - * mdc_read_page_remote does synchronous io. 
- */ - wait_on_page_locked(page); - if (PageUptodate(page)) { - dp = kmap(page); - if (BITS_PER_LONG == 32 && hash64) { - *start = le64_to_cpu(dp->ldp_hash_start) >> 32; - *end = le64_to_cpu(dp->ldp_hash_end) >> 32; - *hash = *hash >> 32; - } else { - *start = le64_to_cpu(dp->ldp_hash_start); - *end = le64_to_cpu(dp->ldp_hash_end); - } - if (unlikely(*start == 1 && *hash == 0)) - *hash = *start; - else - LASSERTF(*start <= *hash, "start = %#llx,end = %#llx,hash = %#llx\n", - *start, *end, *hash); - CDEBUG(D_VFSTRACE, "offset %lx [%#llx %#llx], hash %#llx\n", - offset, *start, *end, *hash); - if (*hash > *end) { - kunmap(page); - mdc_release_page(page, 0); - page = NULL; - } else if (*end != *start && *hash == *end) { - /* - * upon hash collision, remove this page, - * otherwise put page reference, and - * mdc_read_page_remote() will issue RPC to - * fetch the page we want. - */ - kunmap(page); - mdc_release_page(page, - le32_to_cpu(dp->ldp_flags) & LDF_COLLIDE); - page = NULL; - } - } else { - put_page(page); - page = ERR_PTR(-EIO); - } - } else { - xa_unlock_irq(&mapping->i_pages); - page = NULL; - } - return page; -} - -/* - * Adjust a set of pages, each page containing an array of lu_dirpages, - * so that each page can be used as a single logical lu_dirpage. - * - * A lu_dirpage is laid out as follows, where s = ldp_hash_start, - * e = ldp_hash_end, f = ldp_flags, p = padding, and each "ent" is a - * struct lu_dirent. It has size up to LU_PAGE_SIZE. The ldp_hash_end - * value is used as a cookie to request the next lu_dirpage in a - * directory listing that spans multiple pages (two in this example): - * ________ - * | | - * .|--------v------- -----. - * |s|e|f|p|ent|ent| ... |ent| - * '--|-------------- -----' Each PAGE contains a single - * '------. lu_dirpage. - * .---------v------- -----. - * |s|e|f|p|ent| 0 | ... | 0 | - * '----------------- -----' - * - * However, on hosts where the native VM page size (PAGE_SIZE) is - * larger than LU_PAGE_SIZE, a single host page may contain multiple - * lu_dirpages. After reading the lu_dirpages from the MDS, the - * ldp_hash_end of the first lu_dirpage refers to the one immediately - * after it in the same PAGE (arrows simplified for brevity, but - * in general e0==s1, e1==s2, etc.): - * - * .-------------------- -----. - * |s0|e0|f0|p|ent|ent| ... |ent| - * |---v---------------- -----| - * |s1|e1|f1|p|ent|ent| ... |ent| - * |---v---------------- -----| Here, each PAGE contains - * ... multiple lu_dirpages. - * |---v---------------- -----| - * |s'|e'|f'|p|ent|ent| ... |ent| - * '---|---------------- -----' - * v - * .----------------------------. - * | next PAGE | - * - * This structure is transformed into a single logical lu_dirpage as follows: - * - * - Replace e0 with e' so the request for the next lu_dirpage gets the page - * labeled 'next PAGE'. - * - * - Copy the LDF_COLLIDE flag from f' to f0 to correctly reflect whether - * a hash collision with the next page exists. - * - * - Adjust the lde_reclen of the ending entry of each lu_dirpage to span - * to the first entry of the next lu_dirpage. 
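 * For example, with 64KiB VM pages (PAGE_SIZE = 65536) and the fixed 4KiB
 * LU_PAGE_SIZE, sixteen lu_dirpages are stitched into each logical dirpage
 * this way; with 4KiB VM pages the #if below compiles the adjustment out.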
- */ -#if PAGE_SIZE > LU_PAGE_SIZE -static void mdc_adjust_dirpages(struct page **pages, int cfs_pgs, int lu_pgs) -{ - int i; - - for (i = 0; i < cfs_pgs; i++) { - struct lu_dirpage *dp = kmap(pages[i]); - __u64 hash_end = le64_to_cpu(dp->ldp_hash_end); - __u32 flags = le32_to_cpu(dp->ldp_flags); - struct lu_dirpage *first = dp; - - while (--lu_pgs > 0) { - struct lu_dirent *end_dirent = NULL; - struct lu_dirent *ent; - - for (ent = lu_dirent_start(dp); ent; - ent = lu_dirent_next(ent)) - end_dirent = ent; - - /* Advance dp to next lu_dirpage. */ - dp = (struct lu_dirpage *)((char *)dp + LU_PAGE_SIZE); - - /* Check if we've reached the end of the CFS_PAGE. */ - if (!((unsigned long)dp & ~PAGE_MASK)) - break; - - /* Save the hash and flags of this lu_dirpage. */ - hash_end = le64_to_cpu(dp->ldp_hash_end); - flags = le32_to_cpu(dp->ldp_flags); - - /* Check if lu_dirpage contains no entries. */ - if (!end_dirent) - break; - - /* - * Enlarge the end entry lde_reclen from 0 to - * first entry of next lu_dirpage. - */ - LASSERT(!le16_to_cpu(end_dirent->lde_reclen)); - end_dirent->lde_reclen = - cpu_to_le16((char *)(dp->ldp_entries) - - (char *)end_dirent); - } - - first->ldp_hash_end = hash_end; - first->ldp_flags &= ~cpu_to_le32(LDF_COLLIDE); - first->ldp_flags |= flags & cpu_to_le32(LDF_COLLIDE); - - kunmap(pages[i]); - } - LASSERTF(lu_pgs == 0, "left = %d", lu_pgs); -} -#else -#define mdc_adjust_dirpages(pages, cfs_pgs, lu_pgs) do {} while (0) -#endif /* PAGE_SIZE > LU_PAGE_SIZE */ - -/* parameters for readdir page */ -struct readpage_param { - struct md_op_data *rp_mod; - __u64 rp_off; - int rp_hash64; - struct obd_export *rp_exp; - struct md_callback *rp_cb; -}; - -/** - * Read pages from server. - * - * Page in MDS_READPAGE RPC is packed in LU_PAGE_SIZE, and each page contains - * a header lu_dirpage which describes the start/end hash, and whether this - * page is empty (contains no dir entry) or hash collide with next page. - * After client receives reply, several pages will be integrated into dir page - * in PAGE_SIZE (if PAGE_SIZE greater than LU_PAGE_SIZE), and the - * lu_dirpage for this integrated page will be adjusted. 
- **/ -static int mdc_read_page_remote(void *data, struct page *page0) -{ - struct readpage_param *rp = data; - struct page **page_pool; - struct page *page; - struct lu_dirpage *dp; - int rd_pgs = 0; /* number of pages read actually */ - int npages; - struct md_op_data *op_data = rp->rp_mod; - struct ptlrpc_request *req; - int max_pages = op_data->op_max_pages; - struct inode *inode; - struct lu_fid *fid; - int i; - int rc; - - LASSERT(max_pages > 0 && max_pages <= PTLRPC_MAX_BRW_PAGES); - inode = op_data->op_data; - fid = &op_data->op_fid1; - LASSERT(inode); - - page_pool = kcalloc(max_pages, sizeof(page), GFP_NOFS); - if (page_pool) { - page_pool[0] = page0; - } else { - page_pool = &page0; - max_pages = 1; - } - - for (npages = 1; npages < max_pages; npages++) { - page = page_cache_alloc(inode->i_mapping); - if (!page) - break; - page_pool[npages] = page; - } - - rc = mdc_getpage(rp->rp_exp, fid, rp->rp_off, page_pool, npages, &req); - if (!rc) { - int lu_pgs = req->rq_bulk->bd_nob_transferred; - - rd_pgs = (req->rq_bulk->bd_nob_transferred + - PAGE_SIZE - 1) >> PAGE_SHIFT; - lu_pgs >>= LU_PAGE_SHIFT; - LASSERT(!(req->rq_bulk->bd_nob_transferred & ~LU_PAGE_MASK)); - - CDEBUG(D_INODE, "read %d(%d) pages\n", rd_pgs, lu_pgs); - - mdc_adjust_dirpages(page_pool, rd_pgs, lu_pgs); - - SetPageUptodate(page0); - } - - unlock_page(page0); - ptlrpc_req_finished(req); - CDEBUG(D_CACHE, "read %d/%d pages\n", rd_pgs, npages); - for (i = 1; i < npages; i++) { - unsigned long offset; - __u64 hash; - int ret; - - page = page_pool[i]; - - if (rc < 0 || i >= rd_pgs) { - put_page(page); - continue; - } - - SetPageUptodate(page); - - dp = kmap(page); - hash = le64_to_cpu(dp->ldp_hash_start); - kunmap(page); - - offset = hash_x_index(hash, rp->rp_hash64); - - prefetchw(&page->flags); - ret = add_to_page_cache_lru(page, inode->i_mapping, offset, - GFP_KERNEL); - if (!ret) - unlock_page(page); - else - CDEBUG(D_VFSTRACE, "page %lu add to page cache failed: rc = %d\n", - offset, ret); - put_page(page); - } - - if (page_pool != &page0) - kfree(page_pool); - - return rc; -} - -/** - * Read dir page from cache first, if it can not find it, read it from - * server and add into the cache. - * - * \param[in] exp MDC export - * \param[in] op_data client MD stack parameters, transferring parameters - * between different layers on client MD stack. 
- * \param[in] cb_op callback required for ldlm lock enqueue during - * read page - * \param[in] hash_offset the hash offset of the page to be read - * \param[in] ppage the page to be read - * - * retval = 0 get the page successfully - * errno(<0) get the page failed - */ -static int mdc_read_page(struct obd_export *exp, struct md_op_data *op_data, - struct md_callback *cb_op, __u64 hash_offset, - struct page **ppage) -{ - struct lookup_intent it = { .it_op = IT_READDIR }; - struct page *page; - struct inode *dir = op_data->op_data; - struct address_space *mapping; - struct lu_dirpage *dp; - __u64 start = 0; - __u64 end = 0; - struct lustre_handle lockh; - struct ptlrpc_request *enq_req = NULL; - struct readpage_param rp_param; - int rc; - - *ppage = NULL; - - LASSERT(dir); - mapping = dir->i_mapping; - - rc = mdc_intent_lock(exp, op_data, &it, &enq_req, - cb_op->md_blocking_ast, 0); - if (enq_req) - ptlrpc_req_finished(enq_req); - - if (rc < 0) { - CERROR("%s: " DFID " lock enqueue fails: rc = %d\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1), rc); - return rc; - } - - rc = 0; - lockh.cookie = it.it_lock_handle; - mdc_set_lock_data(exp, &lockh, dir, NULL); - - rp_param.rp_off = hash_offset; - rp_param.rp_hash64 = op_data->op_cli_flags & CLI_HASH64; - page = mdc_page_locate(mapping, &rp_param.rp_off, &start, &end, - rp_param.rp_hash64); - if (IS_ERR(page)) { - CDEBUG(D_INFO, "%s: dir page locate: " DFID " at %llu: rc %ld\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1), - rp_param.rp_off, PTR_ERR(page)); - rc = PTR_ERR(page); - goto out_unlock; - } else if (page) { - /* - * XXX nikita: not entirely correct handling of a corner case: - * suppose hash chain of entries with hash value HASH crosses - * border between pages P0 and P1. First both P0 and P1 are - * cached, seekdir() is called for some entry from the P0 part - * of the chain. Later P0 goes out of cache. telldir(HASH) - * happens and finds P1, as it starts with matching hash - * value. Remaining entries from P0 part of the chain are - * skipped. (Is that really a bug?) - * - * Possible solutions: 0. don't cache P1 is such case, handle - * it as an "overflow" page. 1. invalidate all pages at - * once. 2. use HASH|1 as an index for P1. 
- */ - goto hash_collision; - - rp_param.rp_exp = exp; - rp_param.rp_mod = op_data; - page = read_cache_page(mapping, - hash_x_index(rp_param.rp_off, - rp_param.rp_hash64), - mdc_read_page_remote, &rp_param); - if (IS_ERR(page)) { - CERROR("%s: read cache page: " DFID " at %llu: rc %ld\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1), - rp_param.rp_off, PTR_ERR(page)); - rc = PTR_ERR(page); - goto out_unlock; - } - - wait_on_page_locked(page); - (void)kmap(page); - if (!PageUptodate(page)) { - CERROR("%s: page not updated: " DFID " at %llu: rc %d\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1), - rp_param.rp_off, -5); - goto fail; - } - if (!PageChecked(page)) - SetPageChecked(page); - if (PageError(page)) { - CERROR("%s: page error: " DFID " at %llu: rc %d\n", - exp->exp_obd->obd_name, PFID(&op_data->op_fid1), - rp_param.rp_off, -5); - goto fail; - } - -hash_collision: - dp = page_address(page); - if (BITS_PER_LONG == 32 && rp_param.rp_hash64) { - start = le64_to_cpu(dp->ldp_hash_start) >> 32; - end = le64_to_cpu(dp->ldp_hash_end) >> 32; - rp_param.rp_off = hash_offset >> 32; - } else { - start = le64_to_cpu(dp->ldp_hash_start); - end = le64_to_cpu(dp->ldp_hash_end); - rp_param.rp_off = hash_offset; - } - if (end == start) { - LASSERT(start == rp_param.rp_off); - CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); -#if BITS_PER_LONG == 32 - CWARN("Real page-wide hash collision at [%llu %llu] with hash %llu\n", - le64_to_cpu(dp->ldp_hash_start), - le64_to_cpu(dp->ldp_hash_end), hash_offset); -#endif - /* - * Fetch whole overflow chain... - * - * XXX not yet. - */ - goto fail; - } - *ppage = page; -out_unlock: - ldlm_lock_decref(&lockh, it.it_lock_mode); - return rc; -fail: - kunmap(page); - mdc_release_page(page, 1); - rc = -EIO; - goto out_unlock; -} - -static int mdc_statfs(const struct lu_env *env, - struct obd_export *exp, struct obd_statfs *osfs, - __u64 max_age, __u32 flags) -{ - struct obd_device *obd = class_exp2obd(exp); - struct ptlrpc_request *req; - struct obd_statfs *msfs; - struct obd_import *imp = NULL; - int rc; - - /* - * Since the request might also come from lprocfs, we need to - * sync this with client_disconnect_export (Bug15684). - */ - down_read(&obd->u.cli.cl_sem); - if (obd->u.cli.cl_import) - imp = class_import_get(obd->u.cli.cl_import); - up_read(&obd->u.cli.cl_sem); - if (!imp) - return -ENODEV; - - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_STATFS, - LUSTRE_MDS_VERSION, MDS_STATFS); - if (!req) { - rc = -ENOMEM; - goto output; - } - - ptlrpc_request_set_replen(req); - - if (flags & OBD_STATFS_NODELAY) { - /* procfs requests must not wait, to avoid deadlock */ - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } - - rc = ptlrpc_queue_wait(req); - if (rc) { - /* check connection error first */ - if (imp->imp_connect_error) - rc = imp->imp_connect_error; - goto out; - } - - msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS); - if (!msfs) { - rc = -EPROTO; - goto out; - } - - *osfs = *msfs; -out: - ptlrpc_req_finished(req); -output: - class_import_put(imp); - return rc; -} - -static int mdc_ioc_fid2path(struct obd_export *exp, struct getinfo_fid2path *gf) -{ - __u32 keylen, vallen; - void *key; - int rc; - - if (gf->gf_pathlen > PATH_MAX) - return -ENAMETOOLONG; - if (gf->gf_pathlen < 2) - return -EOVERFLOW; - - /* Key is KEY_FID2PATH + getinfo_fid2path description */ - keylen = cfs_size_round(sizeof(KEY_FID2PATH)) + sizeof(*gf); - key = kzalloc(keylen, GFP_NOFS); - if (!key) - return -ENOMEM; - memcpy(key, KEY_FID2PATH,
sizeof(KEY_FID2PATH)); - memcpy(key + cfs_size_round(sizeof(KEY_FID2PATH)), gf, sizeof(*gf)); - - CDEBUG(D_IOCTL, "path get " DFID " from %llu #%d\n", - PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno); - - if (!fid_is_sane(&gf->gf_fid)) { - rc = -EINVAL; - goto out; - } - - /* Val is struct getinfo_fid2path result plus path */ - vallen = sizeof(*gf) + gf->gf_pathlen; - - rc = obd_get_info(NULL, exp, keylen, key, &vallen, gf); - if (rc != 0 && rc != -EREMOTE) - goto out; - - if (vallen <= sizeof(*gf)) { - rc = -EPROTO; - goto out; - } else if (vallen > sizeof(*gf) + gf->gf_pathlen) { - rc = -EOVERFLOW; - goto out; - } - - CDEBUG(D_IOCTL, "path got " DFID " from %llu #%d: %s\n", - PFID(&gf->gf_fid), gf->gf_recno, gf->gf_linkno, - gf->gf_pathlen < 512 ? gf->gf_path : - /* only log the last 512 characters of the path */ - gf->gf_path + gf->gf_pathlen - 512); - -out: - kfree(key); - return rc; -} - -static int mdc_ioc_hsm_progress(struct obd_export *exp, - struct hsm_progress_kernel *hpk) -{ - struct obd_import *imp = class_exp2cliimp(exp); - struct hsm_progress_kernel *req_hpk; - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_PROGRESS, - LUSTRE_MDS_VERSION, MDS_HSM_PROGRESS); - if (!req) { - rc = -ENOMEM; - goto out; - } - - mdc_pack_body(req, NULL, 0, 0, -1, 0); - - /* Copy hsm_progress struct */ - req_hpk = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_PROGRESS); - if (!req_hpk) { - rc = -EPROTO; - goto out; - } - - *req_hpk = *hpk; - req_hpk->hpk_errval = lustre_errno_hton(hpk->hpk_errval); - - ptlrpc_request_set_replen(req); - - mdc_get_mod_rpc_slot(req, NULL); - rc = ptlrpc_queue_wait(req); - mdc_put_mod_rpc_slot(req, NULL); -out: - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_ioc_hsm_ct_register(struct obd_import *imp, __u32 archives) -{ - __u32 *archive_mask; - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_REGISTER, - LUSTRE_MDS_VERSION, - MDS_HSM_CT_REGISTER); - if (!req) { - rc = -ENOMEM; - goto out; - } - - mdc_pack_body(req, NULL, 0, 0, -1, 0); - - /* Copy hsm_progress struct */ - archive_mask = req_capsule_client_get(&req->rq_pill, - &RMF_MDS_HSM_ARCHIVE); - if (!archive_mask) { - rc = -EPROTO; - goto out; - } - - *archive_mask = archives; - - ptlrpc_request_set_replen(req); - - rc = mdc_queue_wait(req); -out: - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_ioc_hsm_current_action(struct obd_export *exp, - struct md_op_data *op_data) -{ - struct hsm_current_action *hca = op_data->op_data; - struct hsm_current_action *req_hca; - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_HSM_ACTION); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_ACTION); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, &op_data->op_fid1, 0, 0, - op_data->op_suppgids[0], 0); - - ptlrpc_request_set_replen(req); - - rc = mdc_queue_wait(req); - if (rc) - goto out; - - req_hca = req_capsule_server_get(&req->rq_pill, - &RMF_MDS_HSM_CURRENT_ACTION); - if (!req_hca) { - rc = -EPROTO; - goto out; - } - - *hca = *req_hca; - -out: - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_ioc_hsm_ct_unregister(struct obd_import *imp) -{ - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_HSM_CT_UNREGISTER, - LUSTRE_MDS_VERSION, - MDS_HSM_CT_UNREGISTER); - if (!req) { - rc = -ENOMEM; - goto out; - } - - mdc_pack_body(req, NULL, 
0, 0, -1, 0); - - ptlrpc_request_set_replen(req); - - rc = mdc_queue_wait(req); -out: - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_ioc_hsm_state_get(struct obd_export *exp, - struct md_op_data *op_data) -{ - struct hsm_user_state *hus = op_data->op_data; - struct hsm_user_state *req_hus; - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_HSM_STATE_GET); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_GET); - if (rc != 0) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, &op_data->op_fid1, 0, 0, - op_data->op_suppgids[0], 0); - - ptlrpc_request_set_replen(req); - - rc = mdc_queue_wait(req); - if (rc) - goto out; - - req_hus = req_capsule_server_get(&req->rq_pill, &RMF_HSM_USER_STATE); - if (!req_hus) { - rc = -EPROTO; - goto out; - } - - *hus = *req_hus; - -out: - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_ioc_hsm_state_set(struct obd_export *exp, - struct md_op_data *op_data) -{ - struct hsm_state_set *hss = op_data->op_data; - struct hsm_state_set *req_hss; - struct ptlrpc_request *req; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_HSM_STATE_SET); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_STATE_SET); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, &op_data->op_fid1, 0, 0, - op_data->op_suppgids[0], 0); - - /* Copy states */ - req_hss = req_capsule_client_get(&req->rq_pill, &RMF_HSM_STATE_SET); - if (!req_hss) { - rc = -EPROTO; - goto out; - } - *req_hss = *hss; - - ptlrpc_request_set_replen(req); - - mdc_get_mod_rpc_slot(req, NULL); - rc = ptlrpc_queue_wait(req); - mdc_put_mod_rpc_slot(req, NULL); -out: - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_ioc_hsm_request(struct obd_export *exp, - struct hsm_user_request *hur) -{ - struct obd_import *imp = class_exp2cliimp(exp); - struct ptlrpc_request *req; - struct hsm_request *req_hr; - struct hsm_user_item *req_hui; - char *req_opaque; - int rc; - - req = ptlrpc_request_alloc(imp, &RQF_MDS_HSM_REQUEST); - if (!req) { - rc = -ENOMEM; - goto out; - } - - req_capsule_set_size(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM, RCL_CLIENT, - hur->hur_request.hr_itemcount - * sizeof(struct hsm_user_item)); - req_capsule_set_size(&req->rq_pill, &RMF_GENERIC_DATA, RCL_CLIENT, - hur->hur_request.hr_data_len); - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_HSM_REQUEST); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, NULL, 0, 0, -1, 0); - - /* Copy hsm_request struct */ - req_hr = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_REQUEST); - if (!req_hr) { - rc = -EPROTO; - goto out; - } - *req_hr = hur->hur_request; - - /* Copy hsm_user_item structs */ - req_hui = req_capsule_client_get(&req->rq_pill, &RMF_MDS_HSM_USER_ITEM); - if (!req_hui) { - rc = -EPROTO; - goto out; - } - memcpy(req_hui, hur->hur_user_item, - hur->hur_request.hr_itemcount * sizeof(struct hsm_user_item)); - - /* Copy opaque field */ - req_opaque = req_capsule_client_get(&req->rq_pill, &RMF_GENERIC_DATA); - if (!req_opaque) { - rc = -EPROTO; - goto out; - } - memcpy(req_opaque, hur_data(hur), hur->hur_request.hr_data_len); - - ptlrpc_request_set_replen(req); - - mdc_get_mod_rpc_slot(req, NULL); - rc = ptlrpc_queue_wait(req); - mdc_put_mod_rpc_slot(req, NULL); -out: - ptlrpc_req_finished(req); - return rc; -} - -static struct kuc_hdr *changelog_kuc_hdr(char *buf, size_t 
len, u32 flags) -{ - struct kuc_hdr *lh = (struct kuc_hdr *)buf; - - LASSERT(len <= KUC_CHANGELOG_MSG_MAXSIZE); - - lh->kuc_magic = KUC_MAGIC; - lh->kuc_transport = KUC_TRANSPORT_CHANGELOG; - lh->kuc_flags = flags; - lh->kuc_msgtype = CL_RECORD; - lh->kuc_msglen = len; - return lh; -} - -struct changelog_show { - __u64 cs_startrec; - enum changelog_send_flag cs_flags; - struct file *cs_fp; - char *cs_buf; - struct obd_device *cs_obd; -}; - -static inline char *cs_obd_name(struct changelog_show *cs) -{ - return cs->cs_obd->obd_name; -} - -static int changelog_kkuc_cb(const struct lu_env *env, struct llog_handle *llh, - struct llog_rec_hdr *hdr, void *data) -{ - struct changelog_show *cs = data; - struct llog_changelog_rec *rec = (struct llog_changelog_rec *)hdr; - struct kuc_hdr *lh; - size_t len; - int rc; - - if (rec->cr_hdr.lrh_type != CHANGELOG_REC) { - rc = -EINVAL; - CERROR("%s: not a changelog rec %x/%d: rc = %d\n", - cs_obd_name(cs), rec->cr_hdr.lrh_type, - rec->cr.cr_type, rc); - return rc; - } - - if (rec->cr.cr_index < cs->cs_startrec) { - /* Skip entries earlier than what we are interested in */ - CDEBUG(D_HSM, "rec=%llu start=%llu\n", - rec->cr.cr_index, cs->cs_startrec); - return 0; - } - - CDEBUG(D_HSM, "%llu %02d%-5s %llu 0x%x t=" DFID " p=" DFID - " %.*s\n", rec->cr.cr_index, rec->cr.cr_type, - changelog_type2str(rec->cr.cr_type), rec->cr.cr_time, - rec->cr.cr_flags & CLF_FLAGMASK, - PFID(&rec->cr.cr_tfid), PFID(&rec->cr.cr_pfid), - rec->cr.cr_namelen, changelog_rec_name(&rec->cr)); - - len = sizeof(*lh) + changelog_rec_size(&rec->cr) + rec->cr.cr_namelen; - - /* Set up the message */ - lh = changelog_kuc_hdr(cs->cs_buf, len, cs->cs_flags); - memcpy(lh + 1, &rec->cr, len - sizeof(*lh)); - - rc = libcfs_kkuc_msg_put(cs->cs_fp, lh); - CDEBUG(D_HSM, "kucmsg fp %p len %zu rc %d\n", cs->cs_fp, len, rc); - - return rc; -} - -static int mdc_changelog_send_thread(void *csdata) -{ - enum llog_flag flags = LLOG_F_IS_CAT; - struct changelog_show *cs = csdata; - struct llog_ctxt *ctxt = NULL; - struct llog_handle *llh = NULL; - struct kuc_hdr *kuch; - int rc; - - CDEBUG(D_HSM, "changelog to fp=%p start %llu\n", - cs->cs_fp, cs->cs_startrec); - - cs->cs_buf = kzalloc(KUC_CHANGELOG_MSG_MAXSIZE, GFP_NOFS); - if (!cs->cs_buf) { - rc = -ENOMEM; - goto out; - } - - /* Set up the remote catalog handle */ - ctxt = llog_get_context(cs->cs_obd, LLOG_CHANGELOG_REPL_CTXT); - if (!ctxt) { - rc = -ENOENT; - goto out; - } - rc = llog_open(NULL, ctxt, &llh, NULL, CHANGELOG_CATALOG, - LLOG_OPEN_EXISTS); - if (rc) { - CERROR("%s: fail to open changelog catalog: rc = %d\n", - cs_obd_name(cs), rc); - goto out; - } - - if (cs->cs_flags & CHANGELOG_FLAG_JOBID) - flags |= LLOG_F_EXT_JOBID; - - rc = llog_init_handle(NULL, llh, flags, NULL); - if (rc) { - CERROR("llog_init_handle failed %d\n", rc); - goto out; - } - - rc = llog_cat_process(NULL, llh, changelog_kkuc_cb, cs, 0, 0); - - /* Send EOF no matter what our result */ - kuch = changelog_kuc_hdr(cs->cs_buf, sizeof(*kuch), cs->cs_flags); - kuch->kuc_msgtype = CL_EOF; - libcfs_kkuc_msg_put(cs->cs_fp, kuch); - -out: - fput(cs->cs_fp); - if (llh) - llog_cat_close(NULL, llh); - if (ctxt) - llog_ctxt_put(ctxt); - kfree(cs->cs_buf); - kfree(cs); - return rc; -} - -static int mdc_ioc_changelog_send(struct obd_device *obd, - struct ioc_changelog *icc) -{ - struct changelog_show *cs; - struct task_struct *task; - int rc; - - /* Freed in mdc_changelog_send_thread */ - cs = kzalloc(sizeof(*cs), GFP_NOFS); - if (!cs) - return -ENOMEM; - - cs->cs_obd = obd; - 
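/* carry the ioctl arguments over for the changelog writer thread */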
cs->cs_startrec = icc->icc_recno;
-        /* matching fput in mdc_changelog_send_thread */
-        cs->cs_fp = fget(icc->icc_id);
-        cs->cs_flags = icc->icc_flags;
-
-        /*
-         * Start a new thread because we must return to the user
-         * application before writing into our pipe
-         */
-        task = kthread_run(mdc_changelog_send_thread, cs,
-                           "mdc_clg_send_thread");
-        if (IS_ERR(task)) {
-                rc = PTR_ERR(task);
-                CERROR("%s: can't start changelog thread: rc = %d\n",
-                       cs_obd_name(cs), rc);
-                kfree(cs);
-        } else {
-                rc = 0;
-                CDEBUG(D_HSM, "%s: started changelog thread\n",
-                       cs_obd_name(cs));
-        }
-
-        return rc;
-}
-
-static int mdc_ioc_hsm_ct_start(struct obd_export *exp,
-                                struct lustre_kernelcomm *lk);
-
-static int mdc_quotactl(struct obd_device *unused, struct obd_export *exp,
-                        struct obd_quotactl *oqctl)
-{
-        struct ptlrpc_request *req;
-        struct obd_quotactl *oqc;
-        int rc;
-
-        req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp),
-                                        &RQF_MDS_QUOTACTL, LUSTRE_MDS_VERSION,
-                                        MDS_QUOTACTL);
-        if (!req)
-                return -ENOMEM;
-
-        oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
-        *oqc = *oqctl;
-
-        ptlrpc_request_set_replen(req);
-        ptlrpc_at_set_req_timeout(req);
-        req->rq_no_resend = 1;
-
-        rc = ptlrpc_queue_wait(req);
-        if (rc)
-                CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc);
-
-        if (req->rq_repmsg) {
-                oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL);
-                if (oqc) {
-                        *oqctl = *oqc;
-                } else if (!rc) {
-                        CERROR("Can't unpack obd_quotactl\n");
-                        rc = -EPROTO;
-                }
-        } else if (!rc) {
-                CERROR("Can't unpack obd_quotactl\n");
-                rc = -EPROTO;
-        }
-        ptlrpc_req_finished(req);
-
-        return rc;
-}
-
-static int mdc_ioc_swap_layouts(struct obd_export *exp,
-                                struct md_op_data *op_data)
-{
-        LIST_HEAD(cancels);
-        struct ptlrpc_request *req;
-        int rc, count;
-        struct mdc_swap_layouts *msl, *payload;
-
-        msl = op_data->op_data;
-
-        /* When the MDT gets the MDS_SWAP_LAYOUTS RPC, the
-         * first thing it will do is cancel the two layout
-         * locks held by this client.
- * So the client must cancel its layout locks on the 2 fids - * with the request RPC to avoid extra RPC round trips - */ - count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, - LCK_CR, MDS_INODELOCK_LAYOUT | - MDS_INODELOCK_XATTR); - count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, - LCK_CR, MDS_INODELOCK_LAYOUT | - MDS_INODELOCK_XATTR); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_MDS_SWAP_LAYOUTS); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - - rc = mdc_prep_elc_req(exp, req, MDS_SWAP_LAYOUTS, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_swap_layouts_pack(req, op_data); - - payload = req_capsule_client_get(&req->rq_pill, &RMF_SWAP_LAYOUTS); - LASSERT(payload); - - *payload = *msl; - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - - ptlrpc_req_finished(req); - return rc; -} - -static int mdc_iocontrol(unsigned int cmd, struct obd_export *exp, int len, - void *karg, void __user *uarg) -{ - struct obd_device *obd = exp->exp_obd; - struct obd_ioctl_data *data = karg; - struct obd_import *imp = obd->u.cli.cl_import; - int rc; - - if (!try_module_get(THIS_MODULE)) { - CERROR("%s: cannot get module '%s'\n", obd->obd_name, - module_name(THIS_MODULE)); - return -EINVAL; - } - switch (cmd) { - case OBD_IOC_CHANGELOG_SEND: - rc = mdc_ioc_changelog_send(obd, karg); - goto out; - case OBD_IOC_CHANGELOG_CLEAR: { - struct ioc_changelog *icc = karg; - struct changelog_setinfo cs = { - .cs_recno = icc->icc_recno, - .cs_id = icc->icc_id - }; - - rc = obd_set_info_async(NULL, exp, strlen(KEY_CHANGELOG_CLEAR), - KEY_CHANGELOG_CLEAR, sizeof(cs), &cs, - NULL); - goto out; - } - case OBD_IOC_FID2PATH: - rc = mdc_ioc_fid2path(exp, karg); - goto out; - case LL_IOC_HSM_CT_START: - rc = mdc_ioc_hsm_ct_start(exp, karg); - /* ignore if it was already registered on this MDS. */ - if (rc == -EEXIST) - rc = 0; - goto out; - case LL_IOC_HSM_PROGRESS: - rc = mdc_ioc_hsm_progress(exp, karg); - goto out; - case LL_IOC_HSM_STATE_GET: - rc = mdc_ioc_hsm_state_get(exp, karg); - goto out; - case LL_IOC_HSM_STATE_SET: - rc = mdc_ioc_hsm_state_set(exp, karg); - goto out; - case LL_IOC_HSM_ACTION: - rc = mdc_ioc_hsm_current_action(exp, karg); - goto out; - case LL_IOC_HSM_REQUEST: - rc = mdc_ioc_hsm_request(exp, karg); - goto out; - case OBD_IOC_CLIENT_RECOVER: - rc = ptlrpc_recover_import(imp, data->ioc_inlbuf1, 0); - if (rc < 0) - goto out; - rc = 0; - goto out; - case IOC_OSC_SET_ACTIVE: - rc = ptlrpc_set_import_active(imp, data->ioc_offset); - goto out; - case OBD_IOC_PING_TARGET: - rc = ptlrpc_obd_ping(obd); - goto out; - /* - * Normally IOC_OBD_STATFS, OBD_IOC_QUOTACTL iocontrol are handled by - * LMV instead of MDC. But when the cluster is upgraded from 1.8, - * there'd be no LMV layer thus we might be called here. Eventually - * this code should be removed. - * bz20731, LU-592. 
- */ - case IOC_OBD_STATFS: { - struct obd_statfs stat_buf = {0}; - - if (*((__u32 *)data->ioc_inlbuf2) != 0) { - rc = -ENODEV; - goto out; - } - - /* copy UUID */ - if (copy_to_user(data->ioc_pbuf2, obd2cli_tgt(obd), - min_t(size_t, data->ioc_plen2, - sizeof(struct obd_uuid)))) { - rc = -EFAULT; - goto out; - } - - rc = mdc_statfs(NULL, obd->obd_self_export, &stat_buf, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - 0); - if (rc != 0) - goto out; - - if (copy_to_user(data->ioc_pbuf1, &stat_buf, - min_t(size_t, data->ioc_plen1, - sizeof(stat_buf)))) { - rc = -EFAULT; - goto out; - } - - rc = 0; - goto out; - } - case OBD_IOC_QUOTACTL: { - struct if_quotactl *qctl = karg; - struct obd_quotactl *oqctl; - - oqctl = kzalloc(sizeof(*oqctl), GFP_NOFS); - if (!oqctl) { - rc = -ENOMEM; - goto out; - } - - QCTL_COPY(oqctl, qctl); - rc = obd_quotactl(exp, oqctl); - if (rc == 0) { - QCTL_COPY(qctl, oqctl); - qctl->qc_valid = QC_MDTIDX; - qctl->obd_uuid = obd->u.cli.cl_target_uuid; - } - - kfree(oqctl); - goto out; - } - case LL_IOC_GET_CONNECT_FLAGS: - if (copy_to_user(uarg, exp_connect_flags_ptr(exp), - sizeof(*exp_connect_flags_ptr(exp)))) { - rc = -EFAULT; - goto out; - } - - rc = 0; - goto out; - case LL_IOC_LOV_SWAP_LAYOUTS: - rc = mdc_ioc_swap_layouts(exp, karg); - goto out; - default: - CERROR("unrecognised ioctl: cmd = %#x\n", cmd); - rc = -ENOTTY; - goto out; - } -out: - module_put(THIS_MODULE); - - return rc; -} - -static int mdc_get_info_rpc(struct obd_export *exp, - u32 keylen, void *key, - int vallen, void *val) -{ - struct obd_import *imp = class_exp2cliimp(exp); - struct ptlrpc_request *req; - char *tmp; - int rc = -EINVAL; - - req = ptlrpc_request_alloc(imp, &RQF_MDS_GET_INFO); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_KEY, - RCL_CLIENT, keylen); - req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VALLEN, - RCL_CLIENT, sizeof(__u32)); - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_GET_INFO); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_KEY); - memcpy(tmp, key, keylen); - tmp = req_capsule_client_get(&req->rq_pill, &RMF_GETINFO_VALLEN); - memcpy(tmp, &vallen, sizeof(__u32)); - - req_capsule_set_size(&req->rq_pill, &RMF_GETINFO_VAL, - RCL_SERVER, vallen); - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - /* -EREMOTE means the get_info result is partial, and it needs to - * continue on another MDT, see fid2path part in lmv_iocontrol - */ - if (rc == 0 || rc == -EREMOTE) { - tmp = req_capsule_server_get(&req->rq_pill, &RMF_GETINFO_VAL); - memcpy(val, tmp, vallen); - if (ptlrpc_rep_need_swab(req)) { - if (KEY_IS(KEY_FID2PATH)) - lustre_swab_fid2path(val); - } - } - ptlrpc_req_finished(req); - - return rc; -} - -static void lustre_swab_hai(struct hsm_action_item *h) -{ - __swab32s(&h->hai_len); - __swab32s(&h->hai_action); - lustre_swab_lu_fid(&h->hai_fid); - lustre_swab_lu_fid(&h->hai_dfid); - __swab64s(&h->hai_cookie); - __swab64s(&h->hai_extent.offset); - __swab64s(&h->hai_extent.length); - __swab64s(&h->hai_gid); -} - -static void lustre_swab_hal(struct hsm_action_list *h) -{ - struct hsm_action_item *hai; - u32 i; - - __swab32s(&h->hal_version); - __swab32s(&h->hal_count); - __swab32s(&h->hal_archive_id); - __swab64s(&h->hal_flags); - hai = hai_first(h); - for (i = 0; i < h->hal_count; i++, hai = hai_next(hai)) - lustre_swab_hai(hai); -} - -static void lustre_swab_kuch(struct kuc_hdr *l) -{ - __swab16s(&l->kuc_magic); - /* __u8 
l->kuc_transport */ - __swab16s(&l->kuc_msgtype); - __swab16s(&l->kuc_msglen); -} - -static int mdc_ioc_hsm_ct_start(struct obd_export *exp, - struct lustre_kernelcomm *lk) -{ - struct obd_import *imp = class_exp2cliimp(exp); - __u32 archive = lk->lk_data; - int rc = 0; - - if (lk->lk_group != KUC_GRP_HSM) { - CERROR("Bad copytool group %d\n", lk->lk_group); - return -EINVAL; - } - - CDEBUG(D_HSM, "CT start r%d w%d u%d g%d f%#x\n", lk->lk_rfd, lk->lk_wfd, - lk->lk_uid, lk->lk_group, lk->lk_flags); - - if (lk->lk_flags & LK_FLG_STOP) { - /* Unregister with the coordinator */ - rc = mdc_ioc_hsm_ct_unregister(imp); - } else { - rc = mdc_ioc_hsm_ct_register(imp, archive); - } - - return rc; -} - -/** - * Send a message to any listening copytools - * @param val KUC message (kuc_hdr + hsm_action_list) - * @param len total length of message - */ -static int mdc_hsm_copytool_send(size_t len, void *val) -{ - struct kuc_hdr *lh = (struct kuc_hdr *)val; - struct hsm_action_list *hal = (struct hsm_action_list *)(lh + 1); - - if (len < sizeof(*lh) + sizeof(*hal)) { - CERROR("Short HSM message %zu < %zu\n", len, - sizeof(*lh) + sizeof(*hal)); - return -EPROTO; - } - if (lh->kuc_magic == __swab16(KUC_MAGIC)) { - lustre_swab_kuch(lh); - lustre_swab_hal(hal); - } else if (lh->kuc_magic != KUC_MAGIC) { - CERROR("Bad magic %x!=%x\n", lh->kuc_magic, KUC_MAGIC); - return -EPROTO; - } - - CDEBUG(D_HSM, - "Received message mg=%x t=%d m=%d l=%d actions=%d on %s\n", - lh->kuc_magic, lh->kuc_transport, lh->kuc_msgtype, - lh->kuc_msglen, hal->hal_count, hal->hal_fsname); - - /* Broadcast to HSM listeners */ - return libcfs_kkuc_group_put(KUC_GRP_HSM, lh); -} - -/** - * callback function passed to kuc for re-registering each HSM copytool - * running on MDC, after MDT shutdown/recovery. - * @param data copytool registration data - * @param cb_arg callback argument (obd_import) - */ -static int mdc_hsm_ct_reregister(void *data, void *cb_arg) -{ - struct kkuc_ct_data *kcd = data; - struct obd_import *imp = (struct obd_import *)cb_arg; - int rc; - - if (!kcd || kcd->kcd_magic != KKUC_CT_DATA_MAGIC) - return -EPROTO; - - if (!obd_uuid_equals(&kcd->kcd_uuid, &imp->imp_obd->obd_uuid)) - return 0; - - CDEBUG(D_HA, "%s: recover copytool registration to MDT (archive=%#x)\n", - imp->imp_obd->obd_name, kcd->kcd_archive); - rc = mdc_ioc_hsm_ct_register(imp, kcd->kcd_archive); - - /* ignore error if the copytool is already registered */ - return (rc == -EEXIST) ? 
0 : rc; -} - -static int mdc_set_info_async(const struct lu_env *env, - struct obd_export *exp, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set) -{ - struct obd_import *imp = class_exp2cliimp(exp); - int rc; - - if (KEY_IS(KEY_READ_ONLY)) { - if (vallen != sizeof(int)) - return -EINVAL; - - spin_lock(&imp->imp_lock); - if (*((int *)val)) { - imp->imp_connect_flags_orig |= OBD_CONNECT_RDONLY; - imp->imp_connect_data.ocd_connect_flags |= - OBD_CONNECT_RDONLY; - } else { - imp->imp_connect_flags_orig &= ~OBD_CONNECT_RDONLY; - imp->imp_connect_data.ocd_connect_flags &= - ~OBD_CONNECT_RDONLY; - } - spin_unlock(&imp->imp_lock); - - return do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, - keylen, key, vallen, val, set); - } - if (KEY_IS(KEY_SPTLRPC_CONF)) { - sptlrpc_conf_client_adapt(exp->exp_obd); - return 0; - } - if (KEY_IS(KEY_FLUSH_CTX)) { - sptlrpc_import_flush_my_ctx(imp); - return 0; - } - if (KEY_IS(KEY_CHANGELOG_CLEAR)) { - rc = do_set_info_async(imp, MDS_SET_INFO, LUSTRE_MDS_VERSION, - keylen, key, vallen, val, set); - return rc; - } - if (KEY_IS(KEY_HSM_COPYTOOL_SEND)) { - rc = mdc_hsm_copytool_send(vallen, val); - return rc; - } - if (KEY_IS(KEY_DEFAULT_EASIZE)) { - u32 *default_easize = val; - - exp->exp_obd->u.cli.cl_default_mds_easize = *default_easize; - return 0; - } - - CERROR("Unknown key %s\n", (char *)key); - return -EINVAL; -} - -static int mdc_get_info(const struct lu_env *env, struct obd_export *exp, - __u32 keylen, void *key, __u32 *vallen, void *val) -{ - int rc = -EINVAL; - - if (KEY_IS(KEY_MAX_EASIZE)) { - u32 mdsize, *max_easize; - - if (*vallen != sizeof(int)) - return -EINVAL; - mdsize = *(u32 *)val; - if (mdsize > exp->exp_obd->u.cli.cl_max_mds_easize) - exp->exp_obd->u.cli.cl_max_mds_easize = mdsize; - max_easize = val; - *max_easize = exp->exp_obd->u.cli.cl_max_mds_easize; - return 0; - } else if (KEY_IS(KEY_DEFAULT_EASIZE)) { - u32 *default_easize; - - if (*vallen != sizeof(int)) - return -EINVAL; - default_easize = val; - *default_easize = exp->exp_obd->u.cli.cl_default_mds_easize; - return 0; - } else if (KEY_IS(KEY_CONN_DATA)) { - struct obd_import *imp = class_exp2cliimp(exp); - struct obd_connect_data *data = val; - - if (*vallen != sizeof(*data)) - return -EINVAL; - - *data = imp->imp_connect_data; - return 0; - } else if (KEY_IS(KEY_TGT_COUNT)) { - *((u32 *)val) = 1; - return 0; - } - - rc = mdc_get_info_rpc(exp, keylen, key, *vallen, val); - - return rc; -} - -static int mdc_sync(struct obd_export *exp, const struct lu_fid *fid, - struct ptlrpc_request **request) -{ - struct ptlrpc_request *req; - int rc; - - *request = NULL; - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_MDS_SYNC); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_MDS_VERSION, MDS_SYNC); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - mdc_pack_body(req, fid, 0, 0, -1, 0); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - ptlrpc_req_finished(req); - else - *request = req; - return rc; -} - -static int mdc_import_event(struct obd_device *obd, struct obd_import *imp, - enum obd_import_event event) -{ - int rc = 0; - - LASSERT(imp->imp_obd == obd); - - switch (event) { - case IMP_EVENT_INACTIVE: { - struct client_obd *cli = &obd->u.cli; - /* - * Flush current sequence to make client obtain new one - * from server in case of disconnect/reconnect. 
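-                 * (i.e. stale, locally cached sequence state must not be
-                 * reused once the import is re-established)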
-                 */
-                if (cli->cl_seq)
-                        seq_client_flush(cli->cl_seq);
-
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL);
-                break;
-        }
-        case IMP_EVENT_INVALIDATE: {
-                struct ldlm_namespace *ns = obd->obd_namespace;
-
-                ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
-
-                break;
-        }
-        case IMP_EVENT_ACTIVE:
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL);
-                /* redo the kuc registration after reconnecting */
-                if (rc == 0)
-                        /* re-register HSM agents */
-                        rc = libcfs_kkuc_group_foreach(KUC_GRP_HSM,
-                                                       mdc_hsm_ct_reregister,
-                                                       (void *)imp);
-                break;
-        case IMP_EVENT_OCD:
-                rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL);
-                break;
-        case IMP_EVENT_DISCON:
-        case IMP_EVENT_DEACTIVATE:
-        case IMP_EVENT_ACTIVATE:
-                break;
-        default:
-                CERROR("Unknown import event %x\n", event);
-                LBUG();
-        }
-        return rc;
-}
-
-int mdc_fid_alloc(const struct lu_env *env, struct obd_export *exp,
-                  struct lu_fid *fid, struct md_op_data *op_data)
-{
-        struct client_obd *cli = &exp->exp_obd->u.cli;
-        struct lu_client_seq *seq = cli->cl_seq;
-
-        return seq_client_alloc_fid(env, seq, fid);
-}
-
-static struct obd_uuid *mdc_get_uuid(struct obd_export *exp)
-{
-        struct client_obd *cli = &exp->exp_obd->u.cli;
-
-        return &cli->cl_target_uuid;
-}
-
-/**
- * Determine whether a lock can be canceled before replaying it during
- * recovery; returns non-zero if the lock can be canceled, zero otherwise.
- */
-static int mdc_cancel_weight(struct ldlm_lock *lock)
-{
-        if (lock->l_resource->lr_type != LDLM_IBITS)
-                return 0;
-
-        /* FIXME: if we ever get into a situation where there are too many
-         * opened files with open locks on a single node, then we really
-         * should replay these open locks to reacquire them
-         */
-        if (lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_OPEN)
-                return 0;
-
-        return 1;
-}
-
-static int mdc_resource_inode_free(struct ldlm_resource *res)
-{
-        if (res->lr_lvb_inode)
-                res->lr_lvb_inode = NULL;
-
-        return 0;
-}
-
-static struct ldlm_valblock_ops inode_lvbo = {
-        .lvbo_free = mdc_resource_inode_free,
-};
-
-static int mdc_llog_init(struct obd_device *obd)
-{
-        struct obd_llog_group *olg = &obd->obd_olg;
-        struct llog_ctxt *ctxt;
-        int rc;
-
-        rc = llog_setup(NULL, obd, olg, LLOG_CHANGELOG_REPL_CTXT, obd,
-                        &llog_client_ops);
-        if (rc)
-                return rc;
-
-        ctxt = llog_group_get_ctxt(olg, LLOG_CHANGELOG_REPL_CTXT);
-        llog_initiator_connect(ctxt);
-        llog_ctxt_put(ctxt);
-
-        return 0;
-}
-
-static void mdc_llog_finish(struct obd_device *obd)
-{
-        struct llog_ctxt *ctxt;
-
-        ctxt = llog_get_context(obd, LLOG_CHANGELOG_REPL_CTXT);
-        if (ctxt)
-                llog_cleanup(NULL, ctxt);
-}
-
-static int mdc_setup(struct obd_device *obd, struct lustre_cfg *cfg)
-{
-        struct lprocfs_static_vars lvars = { NULL };
-        int rc;
-
-        rc = ptlrpcd_addref();
-        if (rc < 0)
-                return rc;
-
-        rc = client_obd_setup(obd, cfg);
-        if (rc)
-                goto err_ptlrpcd_decref;
-
-        lprocfs_mdc_init_vars(&lvars);
-        lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars);
-        sptlrpc_lprocfs_cliobd_attach(obd);
-        ptlrpc_lprocfs_register_obd(obd);
-
-        ns_register_cancel(obd->obd_namespace, mdc_cancel_weight);
-
-        obd->obd_namespace->ns_lvbo = &inode_lvbo;
-
-        rc = mdc_llog_init(obd);
-        if (rc) {
-                mdc_cleanup(obd);
-                CERROR("failed to setup llogging subsystems\n");
-                return rc;
-        }
-
-        return rc;
-
-err_ptlrpcd_decref:
-        ptlrpcd_decref();
-        return rc;
-}
-
-/* Initialize the default and maximum LOV EA sizes.
This allows - * us to make MDS RPCs with large enough reply buffers to hold a default - * sized EA without having to calculate this (via a call into the - * LOV + OSCs) each time we make an RPC. The maximum size is also tracked - * but not used to avoid wastefully vmalloc()'ing large reply buffers when - * a large number of stripes is possible. If a larger reply buffer is - * required it will be reallocated in the ptlrpc layer due to overflow. - */ -static int mdc_init_ea_size(struct obd_export *exp, u32 easize, u32 def_easize) -{ - struct obd_device *obd = exp->exp_obd; - struct client_obd *cli = &obd->u.cli; - - if (cli->cl_max_mds_easize < easize) - cli->cl_max_mds_easize = easize; - - if (cli->cl_default_mds_easize < def_easize) - cli->cl_default_mds_easize = def_easize; - - return 0; -} - -static int mdc_precleanup(struct obd_device *obd) -{ - /* Failsafe, ok if racy */ - if (obd->obd_type->typ_refcnt <= 1) - libcfs_kkuc_group_rem(0, KUC_GRP_HSM); - - obd_cleanup_client_import(obd); - ptlrpc_lprocfs_unregister_obd(obd); - lprocfs_obd_cleanup(obd); - mdc_llog_finish(obd); - return 0; -} - -static int mdc_cleanup(struct obd_device *obd) -{ - ptlrpcd_decref(); - - return client_obd_cleanup(obd); -} - -static int mdc_process_config(struct obd_device *obd, u32 len, void *buf) -{ - struct lustre_cfg *lcfg = buf; - struct lprocfs_static_vars lvars = { NULL }; - int rc = 0; - - lprocfs_mdc_init_vars(&lvars); - switch (lcfg->lcfg_command) { - default: - rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; - break; - } - return rc; -} - -static struct obd_ops mdc_obd_ops = { - .owner = THIS_MODULE, - .setup = mdc_setup, - .precleanup = mdc_precleanup, - .cleanup = mdc_cleanup, - .add_conn = client_import_add_conn, - .del_conn = client_import_del_conn, - .connect = client_connect_import, - .disconnect = client_disconnect_export, - .iocontrol = mdc_iocontrol, - .set_info_async = mdc_set_info_async, - .statfs = mdc_statfs, - .fid_init = client_fid_init, - .fid_fini = client_fid_fini, - .fid_alloc = mdc_fid_alloc, - .import_event = mdc_import_event, - .get_info = mdc_get_info, - .process_config = mdc_process_config, - .get_uuid = mdc_get_uuid, - .quotactl = mdc_quotactl, -}; - -static struct md_ops mdc_md_ops = { - .getstatus = mdc_getstatus, - .null_inode = mdc_null_inode, - .close = mdc_close, - .create = mdc_create, - .enqueue = mdc_enqueue, - .getattr = mdc_getattr, - .getattr_name = mdc_getattr_name, - .intent_lock = mdc_intent_lock, - .link = mdc_link, - .rename = mdc_rename, - .setattr = mdc_setattr, - .setxattr = mdc_setxattr, - .getxattr = mdc_getxattr, - .sync = mdc_sync, - .read_page = mdc_read_page, - .unlink = mdc_unlink, - .cancel_unused = mdc_cancel_unused, - .init_ea_size = mdc_init_ea_size, - .set_lock_data = mdc_set_lock_data, - .lock_match = mdc_lock_match, - .get_lustre_md = mdc_get_lustre_md, - .free_lustre_md = mdc_free_lustre_md, - .set_open_replay_data = mdc_set_open_replay_data, - .clear_open_replay_data = mdc_clear_open_replay_data, - .intent_getattr_async = mdc_intent_getattr_async, - .revalidate_lock = mdc_revalidate_lock -}; - -static int __init mdc_init(void) -{ - struct lprocfs_static_vars lvars = { NULL }; - int rc; - - rc = libcfs_setup(); - if (rc) - return rc; - - lprocfs_mdc_init_vars(&lvars); - - return class_register_type(&mdc_obd_ops, &mdc_md_ops, - LUSTRE_MDC_NAME, NULL); -} - -static void /*__exit*/ mdc_exit(void) -{ - class_unregister_type(LUSTRE_MDC_NAME); -} - -MODULE_AUTHOR("OpenSFS, Inc. 
"); -MODULE_DESCRIPTION("Lustre Metadata Client"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(mdc_init); -module_exit(mdc_exit); diff --git a/drivers/staging/lustre/lustre/mgc/Makefile b/drivers/staging/lustre/lustre/mgc/Makefile deleted file mode 100644 index 8abf108dbcf78642a7188be26764a995615e7e9b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mgc/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += mgc.o -mgc-y := mgc_request.o lproc_mgc.o diff --git a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c b/drivers/staging/lustre/lustre/mgc/lproc_mgc.c deleted file mode 100644 index 636770624e8f3538b21c2383549acf126793d421..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mgc/lproc_mgc.c +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include "mgc_internal.h" - -LPROC_SEQ_FOPS_RO_TYPE(mgc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(mgc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, conn_uuid); -LPROC_SEQ_FOPS_RO_TYPE(mgc, import); -LPROC_SEQ_FOPS_RO_TYPE(mgc, state); - -LPROC_SEQ_FOPS_WR_ONLY(mgc, ping); - -static int mgc_ir_state_seq_show(struct seq_file *m, void *v) -{ - return lprocfs_mgc_rd_ir_state(m, m->private); -} - -LPROC_SEQ_FOPS_RO(mgc_ir_state); - -static struct lprocfs_vars lprocfs_mgc_obd_vars[] = { - { "ping", &mgc_ping_fops, NULL, 0222 }, - { "connect_flags", &mgc_connect_flags_fops, NULL, 0 }, - { "mgs_server_uuid", &mgc_server_uuid_fops, NULL, 0 }, - { "mgs_conn_uuid", &mgc_conn_uuid_fops, NULL, 0 }, - { "import", &mgc_import_fops, NULL, 0 }, - { "state", &mgc_state_fops, NULL, 0 }, - { "ir_state", &mgc_ir_state_fops, NULL, 0 }, - { NULL } -}; - -void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->obd_vars = lprocfs_mgc_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/mgc/mgc_internal.h b/drivers/staging/lustre/lustre/mgc/mgc_internal.h deleted file mode 100644 index 9541892b67c7560aa1321b37e4b876ea6c65ccaf..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mgc/mgc_internal.h +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef _MGC_INTERNAL_H -#define _MGC_INTERNAL_H - -#include -#include -#include -#include - -void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars); -int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data); - -int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); - -static inline int cld_is_sptlrpc(struct config_llog_data *cld) -{ - return cld->cld_type == CONFIG_T_SPTLRPC; -} - -static inline int cld_is_recover(struct config_llog_data *cld) -{ - return cld->cld_type == CONFIG_T_RECOVER; -} - -#endif /* _MGC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/mgc/mgc_request.c b/drivers/staging/lustre/lustre/mgc/mgc_request.c deleted file mode 100644 index 32df804614d3588a297dadab8a7ee67bdab5275d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/mgc/mgc_request.c +++ /dev/null @@ -1,1851 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/mgc/mgc_request.c - * - * Author: Nathan Rutman - */ - -#define DEBUG_SUBSYSTEM S_MGC -#define D_MGC D_CONFIG /*|D_WARNING*/ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "mgc_internal.h" - -static int mgc_name2resid(char *name, int len, struct ldlm_res_id *res_id, - int type) -{ - __u64 resname = 0; - - if (len > sizeof(resname)) { - CERROR("name too long: %s\n", name); - return -EINVAL; - } - if (len <= 0) { - CERROR("missing name: %s\n", name); - return -EINVAL; - } - memcpy(&resname, name, len); - - /* Always use the same endianness for the resid */ - memset(res_id, 0, sizeof(*res_id)); - res_id->name[0] = cpu_to_le64(resname); - /* XXX: unfortunately, sptlprc and config llog share one lock */ - switch (type) { - case CONFIG_T_CONFIG: - case CONFIG_T_SPTLRPC: - resname = 0; - break; - case CONFIG_T_RECOVER: - case CONFIG_T_PARAMS: - resname = type; - break; - default: - LBUG(); - } - res_id->name[1] = cpu_to_le64(resname); - CDEBUG(D_MGC, "log %s to resid %#llx/%#llx (%.8s)\n", name, - res_id->name[0], res_id->name[1], (char *)&res_id->name[0]); - return 0; -} - -int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type) -{ - /* fsname is at most 8 chars long, maybe contain "-". - * e.g. "lustre", "SUN-000" - */ - return mgc_name2resid(fsname, strlen(fsname), res_id, type); -} -EXPORT_SYMBOL(mgc_fsname2resid); - -static int mgc_logname2resid(char *logname, struct ldlm_res_id *res_id, int type) -{ - char *name_end; - int len; - - /* logname consists of "fsname-nodetype". - * e.g. "lustre-MDT0001", "SUN-000-client" - * there is an exception: llog "params" - */ - name_end = strrchr(logname, '-'); - if (!name_end) - len = strlen(logname); - else - len = name_end - logname; - return mgc_name2resid(logname, len, res_id, type); -} - -/********************** config llog list **********************/ -static LIST_HEAD(config_llog_list); -static DEFINE_SPINLOCK(config_list_lock); - -/* Take a reference to a config log */ -static int config_log_get(struct config_llog_data *cld) -{ - atomic_inc(&cld->cld_refcount); - CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, - atomic_read(&cld->cld_refcount)); - return 0; -} - -/* Drop a reference to a config log. 
When no longer referenced, - * we can free the config log data - */ -static void config_log_put(struct config_llog_data *cld) -{ - CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, - atomic_read(&cld->cld_refcount)); - LASSERT(atomic_read(&cld->cld_refcount) > 0); - - /* spinlock to make sure no item with 0 refcount in the list */ - if (atomic_dec_and_lock(&cld->cld_refcount, &config_list_lock)) { - list_del(&cld->cld_list_chain); - spin_unlock(&config_list_lock); - - CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); - - if (cld->cld_recover) - config_log_put(cld->cld_recover); - if (cld->cld_params) - config_log_put(cld->cld_params); - if (cld->cld_sptlrpc) - config_log_put(cld->cld_sptlrpc); - if (cld_is_sptlrpc(cld)) - sptlrpc_conf_log_stop(cld->cld_logname); - - class_export_put(cld->cld_mgcexp); - kfree(cld); - } -} - -/* Find a config log by name */ -static -struct config_llog_data *config_log_find(char *logname, - struct config_llog_instance *cfg) -{ - struct config_llog_data *cld; - struct config_llog_data *found = NULL; - void *instance; - - LASSERT(logname); - - instance = cfg ? cfg->cfg_instance : NULL; - spin_lock(&config_list_lock); - list_for_each_entry(cld, &config_llog_list, cld_list_chain) { - /* check if instance equals */ - if (instance != cld->cld_cfg.cfg_instance) - continue; - - /* instance may be NULL, should check name */ - if (strcmp(logname, cld->cld_logname) == 0) { - found = cld; - config_log_get(found); - break; - } - } - spin_unlock(&config_list_lock); - return found; -} - -static -struct config_llog_data *do_config_log_add(struct obd_device *obd, - char *logname, - int type, - struct config_llog_instance *cfg, - struct super_block *sb) -{ - struct config_llog_data *cld; - int rc; - - CDEBUG(D_MGC, "do adding config log %s:%p\n", logname, - cfg ? 
cfg->cfg_instance : NULL); - - cld = kzalloc(sizeof(*cld) + strlen(logname) + 1, GFP_NOFS); - if (!cld) - return ERR_PTR(-ENOMEM); - - rc = mgc_logname2resid(logname, &cld->cld_resid, type); - if (rc) { - kfree(cld); - return ERR_PTR(rc); - } - - strcpy(cld->cld_logname, logname); - if (cfg) - cld->cld_cfg = *cfg; - else - cld->cld_cfg.cfg_callback = class_config_llog_handler; - mutex_init(&cld->cld_lock); - cld->cld_cfg.cfg_last_idx = 0; - cld->cld_cfg.cfg_flags = 0; - cld->cld_cfg.cfg_sb = sb; - cld->cld_type = type; - atomic_set(&cld->cld_refcount, 1); - - /* Keep the mgc around until we are done */ - cld->cld_mgcexp = class_export_get(obd->obd_self_export); - - if (cld_is_sptlrpc(cld)) { - sptlrpc_conf_log_start(logname); - cld->cld_cfg.cfg_obdname = obd->obd_name; - } - - spin_lock(&config_list_lock); - list_add(&cld->cld_list_chain, &config_llog_list); - spin_unlock(&config_list_lock); - - if (cld_is_sptlrpc(cld)) { - rc = mgc_process_log(obd, cld); - if (rc && rc != -ENOENT) - CERROR("failed processing sptlrpc log: %d\n", rc); - } - - return cld; -} - -static struct config_llog_data * -config_recover_log_add(struct obd_device *obd, char *fsname, - struct config_llog_instance *cfg, - struct super_block *sb) -{ - struct config_llog_instance lcfg = *cfg; - struct config_llog_data *cld; - char logname[32]; - - /* we have to use different llog for clients and mdts for cmd - * where only clients are notified if one of cmd server restarts - */ - LASSERT(strlen(fsname) < sizeof(logname) / 2); - strcpy(logname, fsname); - LASSERT(lcfg.cfg_instance); - strcat(logname, "-cliir"); - - cld = do_config_log_add(obd, logname, CONFIG_T_RECOVER, &lcfg, sb); - return cld; -} - -static struct config_llog_data * -config_params_log_add(struct obd_device *obd, - struct config_llog_instance *cfg, struct super_block *sb) -{ - struct config_llog_instance lcfg = *cfg; - struct config_llog_data *cld; - - lcfg.cfg_instance = sb; - - cld = do_config_log_add(obd, PARAMS_FILENAME, CONFIG_T_PARAMS, - &lcfg, sb); - - return cld; -} - -/** Add this log to the list of active logs watched by an MGC. - * Active means we're watching for updates. - * We have one active log per "mount" - client instance or servername. - * Each instance may be at a different point in the log. - */ -static struct config_llog_data * -config_log_add(struct obd_device *obd, char *logname, - struct config_llog_instance *cfg, struct super_block *sb) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - struct config_llog_data *cld; - struct config_llog_data *sptlrpc_cld; - struct config_llog_data *params_cld; - struct config_llog_data *recover_cld = NULL; - char seclogname[32]; - char *ptr; - int rc; - - CDEBUG(D_MGC, "adding config log %s:%p\n", logname, cfg->cfg_instance); - - /* - * for each regular log, the depended sptlrpc log name is - * -sptlrpc. multiple regular logs may share one sptlrpc log. 
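-         * e.g. "lustre-client" and "lustre-MDT0000" would both map to the
-         * shared log "lustre-sptlrpc".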
- */ - ptr = strrchr(logname, '-'); - if (!ptr || ptr - logname > 8) { - CERROR("logname %s is too long\n", logname); - return ERR_PTR(-EINVAL); - } - - memcpy(seclogname, logname, ptr - logname); - strcpy(seclogname + (ptr - logname), "-sptlrpc"); - - sptlrpc_cld = config_log_find(seclogname, NULL); - if (!sptlrpc_cld) { - sptlrpc_cld = do_config_log_add(obd, seclogname, - CONFIG_T_SPTLRPC, NULL, NULL); - if (IS_ERR(sptlrpc_cld)) { - CERROR("can't create sptlrpc log: %s\n", seclogname); - rc = PTR_ERR(sptlrpc_cld); - goto out_err; - } - } - params_cld = config_params_log_add(obd, cfg, sb); - if (IS_ERR(params_cld)) { - rc = PTR_ERR(params_cld); - CERROR("%s: can't create params log: rc = %d\n", - obd->obd_name, rc); - goto out_sptlrpc; - } - - cld = do_config_log_add(obd, logname, CONFIG_T_CONFIG, cfg, sb); - if (IS_ERR(cld)) { - CERROR("can't create log: %s\n", logname); - rc = PTR_ERR(cld); - goto out_params; - } - - LASSERT(lsi->lsi_lmd); - if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)) { - ptr = strrchr(seclogname, '-'); - if (ptr) { - *ptr = 0; - } else { - CERROR("%s: sptlrpc log name not correct, %s: rc = %d\n", - obd->obd_name, seclogname, -EINVAL); - rc = -EINVAL; - goto out_cld; - } - recover_cld = config_recover_log_add(obd, seclogname, cfg, sb); - if (IS_ERR(recover_cld)) { - rc = PTR_ERR(recover_cld); - goto out_cld; - } - } - - mutex_lock(&cld->cld_lock); - cld->cld_recover = recover_cld; - cld->cld_params = params_cld; - cld->cld_sptlrpc = sptlrpc_cld; - mutex_unlock(&cld->cld_lock); - - return cld; - -out_cld: - config_log_put(cld); - -out_params: - config_log_put(params_cld); - -out_sptlrpc: - config_log_put(sptlrpc_cld); - -out_err: - return ERR_PTR(rc); -} - -static DEFINE_MUTEX(llog_process_lock); - -static inline void config_mark_cld_stop(struct config_llog_data *cld) -{ - mutex_lock(&cld->cld_lock); - spin_lock(&config_list_lock); - cld->cld_stopping = 1; - spin_unlock(&config_list_lock); - mutex_unlock(&cld->cld_lock); -} - -/** Stop watching for updates on this log. - */ -static int config_log_end(char *logname, struct config_llog_instance *cfg) -{ - struct config_llog_data *cld; - struct config_llog_data *cld_sptlrpc = NULL; - struct config_llog_data *cld_params = NULL; - struct config_llog_data *cld_recover = NULL; - int rc = 0; - - cld = config_log_find(logname, cfg); - if (!cld) - return -ENOENT; - - mutex_lock(&cld->cld_lock); - /* - * if cld_stopping is set, it means we didn't start the log thus - * not owning the start ref. this can happen after previous umount: - * the cld still hanging there waiting for lock cancel, and we - * remount again but failed in the middle and call log_end without - * calling start_log. 
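-         * In that case the start reference was never taken, so only the
-         * reference obtained by the lookup above needs to be dropped.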
- */ - if (unlikely(cld->cld_stopping)) { - mutex_unlock(&cld->cld_lock); - /* drop the ref from the find */ - config_log_put(cld); - return rc; - } - - spin_lock(&config_list_lock); - cld->cld_stopping = 1; - spin_unlock(&config_list_lock); - - cld_recover = cld->cld_recover; - cld->cld_recover = NULL; - - cld_params = cld->cld_params; - cld->cld_params = NULL; - cld_sptlrpc = cld->cld_sptlrpc; - cld->cld_sptlrpc = NULL; - mutex_unlock(&cld->cld_lock); - - if (cld_recover) { - config_mark_cld_stop(cld_recover); - config_log_put(cld_recover); - } - - if (cld_params) { - config_mark_cld_stop(cld_params); - config_log_put(cld_params); - } - - if (cld_sptlrpc) - config_log_put(cld_sptlrpc); - - /* drop the ref from the find */ - config_log_put(cld); - /* drop the start ref */ - config_log_put(cld); - - CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", - rc); - return rc; -} - -int lprocfs_mgc_rd_ir_state(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct obd_import *imp; - struct obd_connect_data *ocd; - struct config_llog_data *cld; - int rc; - - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - imp = obd->u.cli.cl_import; - ocd = &imp->imp_connect_data; - - seq_printf(m, "imperative_recovery: %s\n", - OCD_HAS_FLAG(ocd, IMP_RECOV) ? "ENABLED" : "DISABLED"); - seq_printf(m, "client_state:\n"); - - spin_lock(&config_list_lock); - list_for_each_entry(cld, &config_llog_list, cld_list_chain) { - if (!cld->cld_recover) - continue; - seq_printf(m, " - { client: %s, nidtbl_version: %u }\n", - cld->cld_logname, - cld->cld_recover->cld_cfg.cfg_last_idx); - } - spin_unlock(&config_list_lock); - - up_read(&obd->u.cli.cl_sem); - return 0; -} - -/* reenqueue any lost locks */ -#define RQ_RUNNING 0x1 -#define RQ_NOW 0x2 -#define RQ_LATER 0x4 -#define RQ_STOP 0x8 -#define RQ_PRECLEANUP 0x10 -static int rq_state; -static wait_queue_head_t rq_waitq; -static DECLARE_COMPLETION(rq_exit); -static DECLARE_COMPLETION(rq_start); - -static void do_requeue(struct config_llog_data *cld) -{ - LASSERT(atomic_read(&cld->cld_refcount) > 0); - - /* Do not run mgc_process_log on a disconnected export or an - * export which is being disconnected. Take the client - * semaphore to make the check non-racy. - */ - down_read_nested(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem, - OBD_CLI_SEM_MGC); - - if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { - int rc; - - CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); - rc = mgc_process_log(cld->cld_mgcexp->exp_obd, cld); - if (rc && rc != -ENOENT) - CERROR("failed processing log: %d\n", rc); - } else { - CDEBUG(D_MGC, "disconnecting, won't update log %s\n", - cld->cld_logname); - } - up_read(&cld->cld_mgcexp->exp_obd->u.cli.cl_sem); -} - -/* this timeout represents how many seconds MGC should wait before - * requeue config and recover lock to the MGS. We need to randomize this - * in order to not flood the MGS. 
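- * e.g. with the defaults below (a 5 second minimum plus up to 500
- * centiseconds of random delay) each client requeues 5-10 seconds later.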
- */ -#define MGC_TIMEOUT_MIN_SECONDS 5 -#define MGC_TIMEOUT_RAND_CENTISEC 500 - -static int mgc_requeue_thread(void *data) -{ - bool first = true; - - CDEBUG(D_MGC, "Starting requeue thread\n"); - - /* Keep trying failed locks periodically */ - spin_lock(&config_list_lock); - rq_state |= RQ_RUNNING; - while (!(rq_state & RQ_STOP)) { - struct config_llog_data *cld, *cld_prev; - int rand = prandom_u32_max(MGC_TIMEOUT_RAND_CENTISEC); - int to; - - /* Any new or requeued lostlocks will change the state */ - rq_state &= ~(RQ_NOW | RQ_LATER); - spin_unlock(&config_list_lock); - - if (first) { - first = false; - complete(&rq_start); - } - - /* Always wait a few seconds to allow the server who - * caused the lock revocation to finish its setup, plus some - * random so everyone doesn't try to reconnect at once. - */ - to = msecs_to_jiffies(MGC_TIMEOUT_MIN_SECONDS * MSEC_PER_SEC); - /* rand is centi-seconds */ - to += msecs_to_jiffies(rand * MSEC_PER_SEC / 100); - wait_event_idle_timeout(rq_waitq, - rq_state & (RQ_STOP | RQ_PRECLEANUP), - to); - - /* - * iterate & processing through the list. for each cld, process - * its depending sptlrpc cld firstly (if any) and then itself. - * - * it's guaranteed any item in the list must have - * reference > 0; and if cld_lostlock is set, at - * least one reference is taken by the previous enqueue. - */ - cld_prev = NULL; - - spin_lock(&config_list_lock); - rq_state &= ~RQ_PRECLEANUP; - list_for_each_entry(cld, &config_llog_list, cld_list_chain) { - if (!cld->cld_lostlock || cld->cld_stopping) - continue; - - /* - * hold reference to avoid being freed during - * subsequent processing. - */ - config_log_get(cld); - cld->cld_lostlock = 0; - spin_unlock(&config_list_lock); - - if (cld_prev) - config_log_put(cld_prev); - cld_prev = cld; - - if (likely(!(rq_state & RQ_STOP))) { - do_requeue(cld); - spin_lock(&config_list_lock); - } else { - spin_lock(&config_list_lock); - break; - } - } - spin_unlock(&config_list_lock); - if (cld_prev) - config_log_put(cld_prev); - - /* Wait a bit to see if anyone else needs a requeue */ - wait_event_idle(rq_waitq, rq_state & (RQ_NOW | RQ_STOP)); - spin_lock(&config_list_lock); - } - - /* spinlock and while guarantee RQ_NOW and RQ_LATER are not set */ - rq_state &= ~RQ_RUNNING; - spin_unlock(&config_list_lock); - - complete(&rq_exit); - - CDEBUG(D_MGC, "Ending requeue thread\n"); - return 0; -} - -/* Add a cld to the list to requeue. Start the requeue thread if needed. - * We are responsible for dropping the config log reference from here on out. 
- */ -static void mgc_requeue_add(struct config_llog_data *cld) -{ - bool wakeup = false; - - CDEBUG(D_INFO, "log %s: requeue (r=%d sp=%d st=%x)\n", - cld->cld_logname, atomic_read(&cld->cld_refcount), - cld->cld_stopping, rq_state); - LASSERT(atomic_read(&cld->cld_refcount) > 0); - - mutex_lock(&cld->cld_lock); - spin_lock(&config_list_lock); - if (!(rq_state & RQ_STOP) && !cld->cld_stopping && !cld->cld_lostlock) { - cld->cld_lostlock = 1; - rq_state |= RQ_NOW; - wakeup = true; - } - spin_unlock(&config_list_lock); - mutex_unlock(&cld->cld_lock); - if (wakeup) - wake_up(&rq_waitq); -} - -static int mgc_llog_init(const struct lu_env *env, struct obd_device *obd) -{ - struct llog_ctxt *ctxt; - int rc; - - /* setup only remote ctxt, the local disk context is switched per each - * filesystem during mgc_fs_setup() - */ - rc = llog_setup(env, obd, &obd->obd_olg, LLOG_CONFIG_REPL_CTXT, obd, - &llog_client_ops); - if (rc) - return rc; - - ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); - LASSERT(ctxt); - - llog_initiator_connect(ctxt); - llog_ctxt_put(ctxt); - - return 0; -} - -static int mgc_llog_fini(const struct lu_env *env, struct obd_device *obd) -{ - struct llog_ctxt *ctxt; - - ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); - if (ctxt) - llog_cleanup(env, ctxt); - - return 0; -} - -static atomic_t mgc_count = ATOMIC_INIT(0); -static int mgc_precleanup(struct obd_device *obd) -{ - int rc = 0; - int temp; - - if (atomic_dec_and_test(&mgc_count)) { - LASSERT(rq_state & RQ_RUNNING); - /* stop requeue thread */ - temp = RQ_STOP; - } else { - /* wakeup requeue thread to clean our cld */ - temp = RQ_NOW | RQ_PRECLEANUP; - } - - spin_lock(&config_list_lock); - rq_state |= temp; - spin_unlock(&config_list_lock); - wake_up(&rq_waitq); - - if (temp & RQ_STOP) - wait_for_completion(&rq_exit); - obd_cleanup_client_import(obd); - - rc = mgc_llog_fini(NULL, obd); - if (rc) - CERROR("failed to cleanup llogging subsystems\n"); - - return rc; -} - -static int mgc_cleanup(struct obd_device *obd) -{ - /* COMPAT_146 - old config logs may have added profiles we don't - * know about - */ - if (obd->obd_type->typ_refcnt <= 1) - /* Only for the last mgc */ - class_del_profiles(); - - lprocfs_obd_cleanup(obd); - ptlrpcd_decref(); - - return client_obd_cleanup(obd); -} - -static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lprocfs_static_vars lvars = { NULL }; - struct task_struct *task; - int rc; - - rc = ptlrpcd_addref(); - if (rc < 0) - goto err_noref; - - rc = client_obd_setup(obd, lcfg); - if (rc) - goto err_decref; - - rc = mgc_llog_init(NULL, obd); - if (rc) { - CERROR("failed to setup llogging subsystems\n"); - goto err_cleanup; - } - - lprocfs_mgc_init_vars(&lvars); - lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars); - sptlrpc_lprocfs_cliobd_attach(obd); - - if (atomic_inc_return(&mgc_count) == 1) { - rq_state = 0; - init_waitqueue_head(&rq_waitq); - - /* start requeue thread */ - task = kthread_run(mgc_requeue_thread, NULL, "ll_cfg_requeue"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("%s: cannot start requeue thread: rc = %d; no more log updates\n", - obd->obd_name, rc); - goto err_cleanup; - } - /* rc is the task_struct pointer of mgc_requeue_thread. 
*/ - rc = 0; - wait_for_completion(&rq_start); - } - - return rc; - -err_cleanup: - client_obd_cleanup(obd); -err_decref: - ptlrpcd_decref(); -err_noref: - return rc; -} - -/* based on ll_mdc_blocking_ast */ -static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) -{ - struct lustre_handle lockh; - struct config_llog_data *cld = data; - int rc = 0; - - switch (flag) { - case LDLM_CB_BLOCKING: - /* mgs wants the lock, give it up... */ - LDLM_DEBUG(lock, "MGC blocking CB"); - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh, LCF_ASYNC); - break; - case LDLM_CB_CANCELING: - /* We've given up the lock, prepare ourselves to update. */ - LDLM_DEBUG(lock, "MGC cancel CB"); - - CDEBUG(D_MGC, "Lock res " DLDLMRES " (%.8s)\n", - PLDLMRES(lock->l_resource), - (char *)&lock->l_resource->lr_name.name[0]); - - if (!cld) { - CDEBUG(D_INFO, "missing data, won't requeue\n"); - break; - } - - /* held at mgc_process_log(). */ - LASSERT(atomic_read(&cld->cld_refcount) > 0); - - lock->l_ast_data = NULL; - /* Are we done with this log? */ - if (cld->cld_stopping) { - CDEBUG(D_MGC, "log %s: stopping, won't requeue\n", - cld->cld_logname); - config_log_put(cld); - break; - } - /* Make sure not to re-enqueue when the mgc is stopping - * (we get called from client_disconnect_export) - */ - if (!lock->l_conn_export || - !lock->l_conn_export->exp_obd->u.cli.cl_conn_count) { - CDEBUG(D_MGC, "log %.8s: disconnecting, won't requeue\n", - cld->cld_logname); - config_log_put(cld); - break; - } - - /* Re-enqueue now */ - mgc_requeue_add(cld); - config_log_put(cld); - break; - default: - LBUG(); - } - - return rc; -} - -/* Not sure where this should go... */ -/* This is the timeout value for MGS_CONNECT request plus a ping interval, such - * that we can have a chance to try the secondary MGS if any. - */ -#define MGC_ENQUEUE_LIMIT (INITIAL_CONNECT_TIMEOUT + (AT_OFF ? 
0 : at_min) \ - + PING_INTERVAL) -#define MGC_TARGET_REG_LIMIT 10 -#define MGC_SEND_PARAM_LIMIT 10 - -/* Send parameter to MGS*/ -static int mgc_set_mgs_param(struct obd_export *exp, - struct mgs_send_param *msp) -{ - struct ptlrpc_request *req; - struct mgs_send_param *req_msp, *rep_msp; - int rc; - - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_MGS_SET_INFO, LUSTRE_MGS_VERSION, - MGS_SET_INFO); - if (!req) - return -ENOMEM; - - req_msp = req_capsule_client_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); - if (!req_msp) { - ptlrpc_req_finished(req); - return -ENOMEM; - } - - memcpy(req_msp, msp, sizeof(*req_msp)); - ptlrpc_request_set_replen(req); - - /* Limit how long we will wait for the enqueue to complete */ - req->rq_delay_limit = MGC_SEND_PARAM_LIMIT; - rc = ptlrpc_queue_wait(req); - if (!rc) { - rep_msp = req_capsule_server_get(&req->rq_pill, &RMF_MGS_SEND_PARAM); - memcpy(msp, rep_msp, sizeof(*rep_msp)); - } - - ptlrpc_req_finished(req); - - return rc; -} - -/* Take a config lock so we can get cancel notifications */ -static int mgc_enqueue(struct obd_export *exp, __u32 type, - union ldlm_policy_data *policy, __u32 mode, - __u64 *flags, void *bl_cb, void *cp_cb, void *gl_cb, - void *data, __u32 lvb_len, void *lvb_swabber, - struct lustre_handle *lockh) -{ - struct config_llog_data *cld = data; - struct ldlm_enqueue_info einfo = { - .ei_type = type, - .ei_mode = mode, - .ei_cb_bl = mgc_blocking_ast, - .ei_cb_cp = ldlm_completion_ast, - }; - struct ptlrpc_request *req; - int short_limit = cld_is_sptlrpc(cld); - int rc; - - CDEBUG(D_MGC, "Enqueue for %s (res %#llx)\n", cld->cld_logname, - cld->cld_resid.name[0]); - - /* We need a callback for every lockholder, so don't try to - * ldlm_lock_match (see rev 1.1.2.11.2.47) - */ - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_LDLM_ENQUEUE, LUSTRE_DLM_VERSION, - LDLM_ENQUEUE); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, 0); - ptlrpc_request_set_replen(req); - - /* Limit how long we will wait for the enqueue to complete */ - req->rq_delay_limit = short_limit ? 5 : MGC_ENQUEUE_LIMIT; - rc = ldlm_cli_enqueue(exp, &req, &einfo, &cld->cld_resid, NULL, flags, - NULL, 0, LVB_T_NONE, lockh, 0); - /* A failed enqueue should still call the mgc_blocking_ast, - * where it will be requeued if needed ("grant failed"). - */ - ptlrpc_req_finished(req); - return rc; -} - -static void mgc_notify_active(struct obd_device *unused) -{ - /* wakeup mgc_requeue_thread to requeue mgc lock */ - spin_lock(&config_list_lock); - rq_state |= RQ_NOW; - spin_unlock(&config_list_lock); - wake_up(&rq_waitq); - - /* TODO: Help the MGS rebuild nidtbl. 
-jay */ -} - -/* Send target_reg message to MGS */ -static int mgc_target_register(struct obd_export *exp, - struct mgs_target_info *mti) -{ - struct ptlrpc_request *req; - struct mgs_target_info *req_mti, *rep_mti; - int rc; - - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_MGS_TARGET_REG, LUSTRE_MGS_VERSION, - MGS_TARGET_REG); - if (!req) - return -ENOMEM; - - req_mti = req_capsule_client_get(&req->rq_pill, &RMF_MGS_TARGET_INFO); - if (!req_mti) { - ptlrpc_req_finished(req); - return -ENOMEM; - } - - memcpy(req_mti, mti, sizeof(*req_mti)); - ptlrpc_request_set_replen(req); - CDEBUG(D_MGC, "register %s\n", mti->mti_svname); - /* Limit how long we will wait for the enqueue to complete */ - req->rq_delay_limit = MGC_TARGET_REG_LIMIT; - - rc = ptlrpc_queue_wait(req); - if (!rc) { - rep_mti = req_capsule_server_get(&req->rq_pill, - &RMF_MGS_TARGET_INFO); - memcpy(mti, rep_mti, sizeof(*rep_mti)); - CDEBUG(D_MGC, "register %s got index = %d\n", - mti->mti_svname, mti->mti_stripe_index); - } - ptlrpc_req_finished(req); - - return rc; -} - -static int mgc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, - void *val, struct ptlrpc_request_set *set) -{ - int rc = -EINVAL; - - /* Turn off initial_recov after we try all backup servers once */ - if (KEY_IS(KEY_INIT_RECOV_BACKUP)) { - struct obd_import *imp = class_exp2cliimp(exp); - int value; - - if (vallen != sizeof(int)) - return -EINVAL; - value = *(int *)val; - CDEBUG(D_MGC, "InitRecov %s %d/d%d:i%d:r%d:or%d:%s\n", - imp->imp_obd->obd_name, value, - imp->imp_deactive, imp->imp_invalid, - imp->imp_replayable, imp->imp_obd->obd_replayable, - ptlrpc_import_state_name(imp->imp_state)); - /* Resurrect if we previously died */ - if ((imp->imp_state != LUSTRE_IMP_FULL && - imp->imp_state != LUSTRE_IMP_NEW) || value > 1) - ptlrpc_reconnect_import(imp); - return 0; - } - if (KEY_IS(KEY_SET_INFO)) { - struct mgs_send_param *msp; - - msp = val; - rc = mgc_set_mgs_param(exp, msp); - return rc; - } - if (KEY_IS(KEY_MGSSEC)) { - struct client_obd *cli = &exp->exp_obd->u.cli; - struct sptlrpc_flavor flvr; - - /* - * empty string means using current flavor, if which haven't - * been set yet, set it as null. - * - * if flavor has been set previously, check the asking flavor - * must match the existing one. 
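- *
- * illustrative sequence (flavor names are examples only): a first
- * call with val = "plain" stores that flavor; a later call asking
- * for val = "krb5i" then mismatches and fails with -EPERM below.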
- */ - if (vallen == 0) { - if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) - return 0; - val = "null"; - vallen = 4; - } - - rc = sptlrpc_parse_flavor(val, &flvr); - if (rc) { - CERROR("invalid sptlrpc flavor %s to MGS\n", - (char *)val); - return rc; - } - - /* - * caller already hold a mutex - */ - if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { - cli->cl_flvr_mgc = flvr; - } else if (memcmp(&cli->cl_flvr_mgc, &flvr, - sizeof(flvr)) != 0) { - char str[20]; - - sptlrpc_flavor2name(&cli->cl_flvr_mgc, - str, sizeof(str)); - LCONSOLE_ERROR("asking sptlrpc flavor %s to MGS but currently %s is in use\n", - (char *)val, str); - rc = -EPERM; - } - return rc; - } - - return rc; -} - -static int mgc_get_info(const struct lu_env *env, struct obd_export *exp, - __u32 keylen, void *key, __u32 *vallen, void *val) -{ - int rc = -EINVAL; - - if (KEY_IS(KEY_CONN_DATA)) { - struct obd_import *imp = class_exp2cliimp(exp); - struct obd_connect_data *data = val; - - if (*vallen == sizeof(*data)) { - *data = imp->imp_connect_data; - rc = 0; - } - } - - return rc; -} - -static int mgc_import_event(struct obd_device *obd, - struct obd_import *imp, - enum obd_import_event event) -{ - LASSERT(imp->imp_obd == obd); - CDEBUG(D_MGC, "import event %#x\n", event); - - switch (event) { - case IMP_EVENT_DISCON: - /* MGC imports should not wait for recovery */ - if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) - ptlrpc_pinger_ir_down(); - break; - case IMP_EVENT_INACTIVE: - break; - case IMP_EVENT_INVALIDATE: { - struct ldlm_namespace *ns = obd->obd_namespace; - - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - break; - } - case IMP_EVENT_ACTIVE: - CDEBUG(D_INFO, "%s: Reactivating import\n", obd->obd_name); - /* Clearing obd_no_recov allows us to continue pinging */ - obd->obd_no_recov = 0; - mgc_notify_active(obd); - if (OCD_HAS_FLAG(&imp->imp_connect_data, IMP_RECOV)) - ptlrpc_pinger_ir_up(); - break; - case IMP_EVENT_OCD: - break; - case IMP_EVENT_DEACTIVATE: - case IMP_EVENT_ACTIVATE: - break; - default: - CERROR("Unknown import event %#x\n", event); - LBUG(); - } - return 0; -} - -enum { - CONFIG_READ_NRPAGES_INIT = 1 << (20 - PAGE_SHIFT), - CONFIG_READ_NRPAGES = 4 -}; - -static int mgc_apply_recover_logs(struct obd_device *mgc, - struct config_llog_data *cld, - __u64 max_version, - void *data, int datalen, bool mne_swab) -{ - struct config_llog_instance *cfg = &cld->cld_cfg; - struct mgs_nidtbl_entry *entry; - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs bufs; - u64 prev_version = 0; - char *inst; - char *buf; - int bufsz; - int pos; - int rc = 0; - int off = 0; - - LASSERT(cfg->cfg_instance); - LASSERT(cfg->cfg_sb == cfg->cfg_instance); - - inst = kzalloc(PAGE_SIZE, GFP_KERNEL); - if (!inst) - return -ENOMEM; - - pos = snprintf(inst, PAGE_SIZE, "%p", cfg->cfg_instance); - if (pos >= PAGE_SIZE) { - kfree(inst); - return -E2BIG; - } - - ++pos; - buf = inst + pos; - bufsz = PAGE_SIZE - pos; - - while (datalen > 0) { - int entry_len = sizeof(*entry); - int is_ost, i; - struct obd_device *obd; - char *obdname; - char *cname; - char *params; - char *uuid; - size_t len; - - rc = -EINVAL; - if (datalen < sizeof(*entry)) - break; - - entry = (typeof(entry))(data + off); - - /* sanity check */ - if (entry->mne_nid_type != 0) /* only support type 0 for ipv4 */ - break; - if (entry->mne_nid_count == 0) /* at least one nid entry */ - break; - if (entry->mne_nid_size != sizeof(lnet_nid_t)) - break; - - entry_len += entry->mne_nid_count * entry->mne_nid_size; - if (datalen < entry_len) /* must have 
entry_len at least */ - break; - - /* Keep this swab for normal mixed endian handling. LU-1644 */ - if (mne_swab) - lustre_swab_mgs_nidtbl_entry(entry); - if (entry->mne_length > PAGE_SIZE) { - CERROR("MNE too large (%u)\n", entry->mne_length); - break; - } - - if (entry->mne_length < entry_len) - break; - - off += entry->mne_length; - datalen -= entry->mne_length; - if (datalen < 0) - break; - - if (entry->mne_version > max_version) { - CERROR("entry index(%lld) is over max_index(%lld)\n", - entry->mne_version, max_version); - break; - } - - if (prev_version >= entry->mne_version) { - CERROR("index unsorted, prev %lld, now %lld\n", - prev_version, entry->mne_version); - break; - } - prev_version = entry->mne_version; - - /* - * Write a string with format "nid::instance" to - * lustre//--/import. - */ - - is_ost = entry->mne_type == LDD_F_SV_TYPE_OST; - memset(buf, 0, bufsz); - obdname = buf; - pos = 0; - - /* lustre-OST0001-osc- */ - strcpy(obdname, cld->cld_logname); - cname = strrchr(obdname, '-'); - if (!cname) { - CERROR("mgc %s: invalid logname %s\n", - mgc->obd_name, obdname); - break; - } - - pos = cname - obdname; - obdname[pos] = 0; - pos += sprintf(obdname + pos, "-%s%04x", - is_ost ? "OST" : "MDT", entry->mne_index); - - cname = is_ost ? "osc" : "mdc"; - pos += sprintf(obdname + pos, "-%s-%s", cname, inst); - lustre_cfg_bufs_reset(&bufs, obdname); - - /* find the obd by obdname */ - obd = class_name2obd(obdname); - if (!obd) { - CDEBUG(D_INFO, "mgc %s: cannot find obdname %s\n", - mgc->obd_name, obdname); - rc = 0; - /* this is a safe race, when the ost is starting up...*/ - continue; - } - - /* osc.import = "connection=::" */ - ++pos; - params = buf + pos; - pos += sprintf(params, "%s.import=%s", cname, "connection="); - uuid = buf + pos; - - down_read(&obd->u.cli.cl_sem); - if (!obd->u.cli.cl_import) { - /* client does not connect to the OST yet */ - up_read(&obd->u.cli.cl_sem); - rc = 0; - continue; - } - - /* iterate all nids to find one */ - /* find uuid by nid */ - rc = -ENOENT; - for (i = 0; i < entry->mne_nid_count; i++) { - rc = client_import_find_conn(obd->u.cli.cl_import, - entry->u.nids[0], - (struct obd_uuid *)uuid); - if (!rc) - break; - } - - up_read(&obd->u.cli.cl_sem); - if (rc < 0) { - CERROR("mgc: cannot find uuid by nid %s\n", - libcfs_nid2str(entry->u.nids[0])); - break; - } - - CDEBUG(D_INFO, "Find uuid %s by nid %s\n", - uuid, libcfs_nid2str(entry->u.nids[0])); - - pos += strlen(uuid); - pos += sprintf(buf + pos, "::%u", entry->mne_instance); - LASSERT(pos < bufsz); - - lustre_cfg_bufs_set_string(&bufs, 1, params); - - rc = -ENOMEM; - len = lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen); - lcfg = kzalloc(len, GFP_NOFS); - if (!lcfg) { - rc = -ENOMEM; - break; - } - lustre_cfg_init(lcfg, LCFG_PARAM, &bufs); - - CDEBUG(D_INFO, "ir apply logs %lld/%lld for %s -> %s\n", - prev_version, max_version, obdname, params); - - rc = class_process_config(lcfg); - kfree(lcfg); - if (rc) - CDEBUG(D_INFO, "process config for %s error %d\n", - obdname, rc); - - /* continue, even one with error */ - } - - kfree(inst); - return rc; -} - -/** - * This function is called if this client was notified for target restarting - * by the MGS. A CONFIG_READ RPC is going to send to fetch recovery logs. 
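- *
- * The logs are fetched page-wise: mcb_offset starts at
- * cfg_last_idx + 1 and the request is re-issued (see the "again:"
- * label below) until mcr_offset == mcr_size, i.e. EOF.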
- */ -static int mgc_process_recover_log(struct obd_device *obd, - struct config_llog_data *cld) -{ - struct ptlrpc_request *req = NULL; - struct config_llog_instance *cfg = &cld->cld_cfg; - struct mgs_config_body *body; - struct mgs_config_res *res; - struct ptlrpc_bulk_desc *desc; - struct page **pages; - int nrpages; - bool eof = true; - bool mne_swab; - int i; - int ealen; - int rc; - - /* allocate buffer for bulk transfer. - * if this is the first time for this mgs to read logs, - * CONFIG_READ_NRPAGES_INIT will be used since it will read all logs - * once; otherwise, it only reads increment of logs, this should be - * small and CONFIG_READ_NRPAGES will be used. - */ - nrpages = CONFIG_READ_NRPAGES; - if (cfg->cfg_last_idx == 0) /* the first time */ - nrpages = CONFIG_READ_NRPAGES_INIT; - - pages = kcalloc(nrpages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - rc = -ENOMEM; - goto out; - } - - for (i = 0; i < nrpages; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) { - rc = -ENOMEM; - goto out; - } - } - -again: - LASSERT(cld_is_recover(cld)); - LASSERT(mutex_is_locked(&cld->cld_lock)); - req = ptlrpc_request_alloc(class_exp2cliimp(cld->cld_mgcexp), - &RQF_MGS_CONFIG_READ); - if (!req) { - rc = -ENOMEM; - goto out; - } - - rc = ptlrpc_request_pack(req, LUSTRE_MGS_VERSION, MGS_CONFIG_READ); - if (rc) - goto out; - - /* pack request */ - body = req_capsule_client_get(&req->rq_pill, &RMF_MGS_CONFIG_BODY); - LASSERT(sizeof(body->mcb_name) > strlen(cld->cld_logname)); - if (strlcpy(body->mcb_name, cld->cld_logname, sizeof(body->mcb_name)) - >= sizeof(body->mcb_name)) { - rc = -E2BIG; - goto out; - } - body->mcb_offset = cfg->cfg_last_idx + 1; - body->mcb_type = cld->cld_type; - body->mcb_bits = PAGE_SHIFT; - body->mcb_units = nrpages; - - /* allocate bulk transfer descriptor */ - desc = ptlrpc_prep_bulk_imp(req, nrpages, 1, - PTLRPC_BULK_PUT_SINK | PTLRPC_BULK_BUF_KIOV, - MGS_BULK_PORTAL, - &ptlrpc_bulk_kiov_pin_ops); - if (!desc) { - rc = -ENOMEM; - goto out; - } - - for (i = 0; i < nrpages; i++) - desc->bd_frag_ops->add_kiov_frag(desc, pages[i], 0, PAGE_SIZE); - - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - res = req_capsule_server_get(&req->rq_pill, &RMF_MGS_CONFIG_RES); - if (res->mcr_size < res->mcr_offset) { - rc = -EINVAL; - goto out; - } - - /* always update the index even though it might have errors with - * handling the recover logs - */ - cfg->cfg_last_idx = res->mcr_offset; - eof = res->mcr_offset == res->mcr_size; - - CDEBUG(D_INFO, "Latest version %lld, more %d.\n", - res->mcr_offset, eof == false); - - ealen = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, 0); - if (ealen < 0) { - rc = ealen; - goto out; - } - - if (ealen > nrpages << PAGE_SHIFT) { - rc = -EINVAL; - goto out; - } - - if (ealen == 0) { /* no logs transferred */ - if (!eof) - rc = -EINVAL; - goto out; - } - - mne_swab = !!ptlrpc_rep_need_swab(req); -#if OBD_OCD_VERSION(3, 0, 53, 0) > LUSTRE_VERSION_CODE - /* This import flag means the server did an extra swab of IR MNE - * records (fixed in LU-1252), reverse it here if needed. 
LU-1644 - */ - if (unlikely(req->rq_import->imp_need_mne_swab)) - mne_swab = !mne_swab; -#endif - - for (i = 0; i < nrpages && ealen > 0; i++) { - int rc2; - void *ptr; - - ptr = kmap(pages[i]); - rc2 = mgc_apply_recover_logs(obd, cld, res->mcr_offset, ptr, - min_t(int, ealen, PAGE_SIZE), - mne_swab); - kunmap(pages[i]); - if (rc2 < 0) { - CWARN("Process recover log %s error %d\n", - cld->cld_logname, rc2); - break; - } - - ealen -= PAGE_SIZE; - } - -out: - if (req) - ptlrpc_req_finished(req); - - if (rc == 0 && !eof) - goto again; - - if (pages) { - for (i = 0; i < nrpages; i++) { - if (!pages[i]) - break; - __free_page(pages[i]); - } - kfree(pages); - } - return rc; -} - -/* local_only means it cannot get remote llogs */ -static int mgc_process_cfg_log(struct obd_device *mgc, - struct config_llog_data *cld, int local_only) -{ - struct llog_ctxt *ctxt; - struct lustre_sb_info *lsi = NULL; - int rc = 0; - bool sptlrpc_started = false; - struct lu_env *env; - - LASSERT(cld); - LASSERT(mutex_is_locked(&cld->cld_lock)); - - /* - * local copy of sptlrpc log is controlled elsewhere, don't try to - * read it up here. - */ - if (cld_is_sptlrpc(cld) && local_only) - return 0; - - if (cld->cld_cfg.cfg_sb) - lsi = s2lsi(cld->cld_cfg.cfg_sb); - - env = kzalloc(sizeof(*env), GFP_KERNEL); - if (!env) - return -ENOMEM; - - rc = lu_env_init(env, LCT_MG_THREAD); - if (rc) - goto out_free; - - ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); - LASSERT(ctxt); - - if (local_only) /* no local log at client side */ { - rc = -EIO; - goto out_pop; - } - - if (cld_is_sptlrpc(cld)) { - sptlrpc_conf_log_update_begin(cld->cld_logname); - sptlrpc_started = true; - } - - /* logname and instance info should be the same, so use our - * copy of the instance for the update. The cfg_last_idx will - * be updated here. - */ - rc = class_config_parse_llog(env, ctxt, cld->cld_logname, - &cld->cld_cfg); - -out_pop: - __llog_ctxt_put(env, ctxt); - - /* - * update settings on existing OBDs. doing it inside - * of llog_process_lock so no device is attaching/detaching - * in parallel. - * the logname must be -sptlrpc - */ - if (sptlrpc_started) { - LASSERT(cld_is_sptlrpc(cld)); - sptlrpc_conf_log_update_end(cld->cld_logname); - class_notify_sptlrpc_conf(cld->cld_logname, - strlen(cld->cld_logname) - - strlen("-sptlrpc")); - } - - lu_env_fini(env); -out_free: - kfree(env); - return rc; -} - -static bool mgc_import_in_recovery(struct obd_import *imp) -{ - bool in_recovery = true; - - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL || - imp->imp_state == LUSTRE_IMP_CLOSED) - in_recovery = false; - spin_unlock(&imp->imp_lock); - - return in_recovery; -} - -/** - * Get a configuration log from the MGS and process it. - * - * This function is called for both clients and servers to process the - * configuration log from the MGS. The MGC enqueues a DLM lock on the - * log from the MGS, and if the lock gets revoked the MGC will be notified - * by the lock cancellation callback that the config log has changed, - * and will enqueue another MGS lock on it, and then continue processing - * the new additions to the end of the log. - * - * Since the MGC import is not replayable, if the import is being evicted - * (rcl == -ESHUTDOWN, \see ptlrpc_import_delay_req()), retry to process - * the log until recovery is finished or the import is closed. - * - * Make a local copy of the log before parsing it if appropriate (non-MGS - * server) so that the server can start even when the MGS is down. 
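- *
- * A minimal illustrative call (sketch; \a mgc and \a cld are assumed
- * to be set up by the caller, error handling beyond the return code
- * is elided):
- *
- * \code
- *	rc = mgc_process_log(mgc, cld);
- *	if (rc)
- *		CERROR("log %s failed: rc = %d\n", cld->cld_logname, rc);
- * \endcode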
- *
- * There shouldn't be multiple processes running process_log at once --
- * sounds like badness. It actually might be fine, as long as they're not
- * trying to update from the same log simultaneously, in which case we
- * should use a per-log semaphore instead of cld_lock.
- *
- * \param[in] mgc MGC device by which to fetch the configuration log
- * \param[in] cld log processing state (stored in lock callback data)
- *
- * \retval 0 on success
- * \retval negative errno on failure
- */
-int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld)
-{
-	struct lustre_handle lockh = { 0 };
-	__u64 flags = LDLM_FL_NO_LRU;
-	bool retry = false;
-	int rc = 0, rcl;
-
-	LASSERT(cld);
-
-	/* I don't want multiple processes running process_log at once --
-	 * sounds like badness. It actually might be fine, as long as
-	 * we're not trying to update from the same log
-	 * simultaneously (in which case we should use a per-log sem.)
-	 */
-restart:
-	mutex_lock(&cld->cld_lock);
-	if (cld->cld_stopping) {
-		mutex_unlock(&cld->cld_lock);
-		return 0;
-	}
-
-	OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20);
-
-	CDEBUG(D_MGC, "Process log %s:%p from %d\n", cld->cld_logname,
-	       cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1);
-
-	/* Get the cfg lock on the llog */
-	rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, LDLM_PLAIN, NULL,
-			  LCK_CR, &flags, NULL, NULL, NULL,
-			  cld, 0, NULL, &lockh);
-	if (rcl == 0) {
-		/* Get the cld, it will be released in mgc_blocking_ast. */
-		config_log_get(cld);
-		rc = ldlm_lock_set_data(&lockh, (void *)cld);
-		LASSERT(rc == 0);
-	} else {
-		CDEBUG(D_MGC, "Can't get cfg lock: %d\n", rcl);
-
-		if (rcl == -ESHUTDOWN &&
-		    atomic_read(&mgc->u.cli.cl_mgc_refcount) > 0 && !retry) {
-			struct obd_import *imp;
-
-			mutex_unlock(&cld->cld_lock);
-			imp = class_exp2cliimp(mgc->u.cli.cl_mgc_mgsexp);
-
-			/*
-			 * Force the pinger and wait for the import to be
-			 * connected. Note: since the mgc import is
-			 * non-replayable, even a disconnected import state
-			 * does not mean that "recovery" has stopped, so we
-			 * keep waiting until we time out or the import state
-			 * is FULL or CLOSED.
-			 */
-			ptlrpc_pinger_force(imp);
-
-			wait_event_idle_timeout(imp->imp_recovery_waitq,
-						!mgc_import_in_recovery(imp),
-						obd_timeout * HZ);
-
-			if (imp->imp_state == LUSTRE_IMP_FULL) {
-				retry = true;
-				goto restart;
-			} else {
-				mutex_lock(&cld->cld_lock);
-				spin_lock(&config_list_lock);
-				cld->cld_lostlock = 1;
-				spin_unlock(&config_list_lock);
-			}
-		} else {
-			/* mark cld_lostlock so that it will requeue
-			 * after MGC becomes available.
-			 */
-			spin_lock(&config_list_lock);
-			cld->cld_lostlock = 1;
-			spin_unlock(&config_list_lock);
-		}
-	}
-
-	if (cld_is_recover(cld)) {
-		rc = 0; /* this is not a fatal error for recover log */
-		if (!rcl) {
-			rc = mgc_process_recover_log(mgc, cld);
-			if (rc) {
-				CERROR("%s: recover log %s failed, not fatal: rc = %d\n",
-				       mgc->obd_name, cld->cld_logname, rc);
-				rc = 0;
-				spin_lock(&config_list_lock);
-				cld->cld_lostlock = 1;
-				spin_unlock(&config_list_lock);
-			}
-		}
-	} else {
-		rc = mgc_process_cfg_log(mgc, cld, rcl != 0);
-	}
-
-	CDEBUG(D_MGC, "%s: configuration from log '%s' %sed (%d).\n",
-	       mgc->obd_name, cld->cld_logname, rc ? "fail" : "succeed", rc);
-
-	mutex_unlock(&cld->cld_lock);
-
-	/* Now drop the lock so MGS can revoke it */
-	if (!rcl)
-		ldlm_lock_decref(&lockh, LCK_CR);
-
-	return rc;
-}
-
-/** Called from lustre_process_log.
- * LCFG_LOG_START gets the config log from the MGS, processes it to start - * any services, and adds it to the list logs to watch (follow). - */ -static int mgc_process_config(struct obd_device *obd, u32 len, void *buf) -{ - struct lustre_cfg *lcfg = buf; - struct config_llog_instance *cfg = NULL; - char *logname; - int rc = 0; - - switch (lcfg->lcfg_command) { - case LCFG_LOV_ADD_OBD: { - /* Overloading this cfg command: register a new target */ - struct mgs_target_info *mti; - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) != - sizeof(struct mgs_target_info)) { - rc = -EINVAL; - goto out; - } - - mti = (struct mgs_target_info *)lustre_cfg_buf(lcfg, 1); - CDEBUG(D_MGC, "add_target %s %#x\n", - mti->mti_svname, mti->mti_flags); - rc = mgc_target_register(obd->u.cli.cl_mgc_mgsexp, mti); - break; - } - case LCFG_LOV_DEL_OBD: - /* Unregister has no meaning at the moment. */ - CERROR("lov_del_obd unimplemented\n"); - rc = -ENOSYS; - break; - case LCFG_SPTLRPC_CONF: { - rc = sptlrpc_process_config(lcfg); - break; - } - case LCFG_LOG_START: { - struct config_llog_data *cld; - struct super_block *sb; - - logname = lustre_cfg_string(lcfg, 1); - cfg = (struct config_llog_instance *)lustre_cfg_buf(lcfg, 2); - sb = *(struct super_block **)lustre_cfg_buf(lcfg, 3); - - CDEBUG(D_MGC, "parse_log %s from %d\n", logname, - cfg->cfg_last_idx); - - /* We're only called through here on the initial mount */ - cld = config_log_add(obd, logname, cfg, sb); - if (IS_ERR(cld)) { - rc = PTR_ERR(cld); - break; - } - - /* COMPAT_146 */ - /* FIXME only set this for old logs! Right now this forces - * us to always skip the "inside markers" check - */ - cld->cld_cfg.cfg_flags |= CFG_F_COMPAT146; - - rc = mgc_process_log(obd, cld); - if (rc == 0 && cld->cld_recover) { - if (OCD_HAS_FLAG(&obd->u.cli.cl_import-> - imp_connect_data, IMP_RECOV)) { - rc = mgc_process_log(obd, cld->cld_recover); - } else { - struct config_llog_data *cir; - - mutex_lock(&cld->cld_lock); - cir = cld->cld_recover; - cld->cld_recover = NULL; - mutex_unlock(&cld->cld_lock); - config_log_put(cir); - } - - if (rc) - CERROR("Cannot process recover llog %d\n", rc); - } - - if (rc == 0 && cld->cld_params) { - rc = mgc_process_log(obd, cld->cld_params); - if (rc == -ENOENT) { - CDEBUG(D_MGC, - "There is no params config file yet\n"); - rc = 0; - } - /* params log is optional */ - if (rc) - CERROR( - "%s: can't process params llog: rc = %d\n", - obd->obd_name, rc); - } - - break; - } - case LCFG_LOG_END: { - logname = lustre_cfg_string(lcfg, 1); - - if (lcfg->lcfg_bufcount >= 2) - cfg = (struct config_llog_instance *)lustre_cfg_buf( - lcfg, 2); - rc = config_log_end(logname, cfg); - break; - } - default: { - CERROR("Unknown command: %d\n", lcfg->lcfg_command); - rc = -EINVAL; - goto out; - } - } -out: - return rc; -} - -static struct obd_ops mgc_obd_ops = { - .owner = THIS_MODULE, - .setup = mgc_setup, - .precleanup = mgc_precleanup, - .cleanup = mgc_cleanup, - .add_conn = client_import_add_conn, - .del_conn = client_import_del_conn, - .connect = client_connect_import, - .disconnect = client_disconnect_export, - .set_info_async = mgc_set_info_async, - .get_info = mgc_get_info, - .import_event = mgc_import_event, - .process_config = mgc_process_config, -}; - -static int __init mgc_init(void) -{ - int rc; - - rc = libcfs_setup(); - if (rc) - return rc; - - return class_register_type(&mgc_obd_ops, NULL, - LUSTRE_MGC_NAME, NULL); -} - -static void /*__exit*/ mgc_exit(void) -{ - class_unregister_type(LUSTRE_MGC_NAME); -} - -MODULE_AUTHOR("OpenSFS, Inc. 
"); -MODULE_DESCRIPTION("Lustre Management Client"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(mgc_init); -module_exit(mgc_exit); diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile deleted file mode 100644 index e3fa9acff4c4238438cf02e8da4d33b23621e3d4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += obdclass.o - -obdclass-y := linux/linux-module.o linux/linux-sysctl.o \ - llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ - genops.o uuid.o lprocfs_status.o lprocfs_counters.o \ - lustre_handles.o lustre_peer.o statfs_pack.o linkea.o \ - obdo.o obd_config.o obd_mount.o lu_object.o lu_ref.o \ - cl_object.o cl_page.o cl_lock.o cl_io.o kernelcomm.o diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h deleted file mode 100644 index a0db830ca84187a4f6247079d2a72066373e1193..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/cl_internal.h +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Internal cl interfaces. - * - * Author: Nikita Danilov - */ -#ifndef _CL_INTERNAL_H -#define _CL_INTERNAL_H - -#define CLT_PVEC_SIZE (14) - -/** - * Possible levels of the nesting. Currently this is 2: there are "top" - * entities (files, extent locks), and "sub" entities (stripes and stripe - * locks). This is used only for debugging counters right now. - */ -enum clt_nesting_level { - CNL_TOP, - CNL_SUB, - CNL_NR -}; - -/** - * Thread local state internal for generic cl-code. - */ -struct cl_thread_info { - /* - * Common fields. - */ - struct cl_io clt_io; - struct cl_2queue clt_queue; - - /* - * Fields used by cl_lock.c - */ - struct cl_lock_descr clt_descr; - struct cl_page_list clt_list; - /** @} debugging */ - - /* - * Fields used by cl_page.c - */ - struct cl_page *clt_pvec[CLT_PVEC_SIZE]; - - /* - * Fields used by cl_io.c - */ - /** - * Pointer to the topmost ongoing IO in this thread. - */ - struct cl_io *clt_current_io; - /** - * Used for submitting a sync io. 
- */ - struct cl_sync_io clt_anchor; - /** - * Fields used by cl_lock_discard_pages(). - */ - pgoff_t clt_next_index; - pgoff_t clt_fn_index; /* first non-overlapped index */ -}; - -struct cl_thread_info *cl_env_info(const struct lu_env *env); - -#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c deleted file mode 100644 index fcdae602925826a2fa31d4b4bcc6d72c008934c1..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/cl_io.c +++ /dev/null @@ -1,1151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Client IO. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include -#include "cl_internal.h" - -/***************************************************************************** - * - * cl_io interface. - * - */ - -#define cl_io_for_each(slice, io) \ - list_for_each_entry((slice), &io->ci_layers, cis_linkage) -#define cl_io_for_each_reverse(slice, io) \ - list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) - -static inline int cl_io_type_is_valid(enum cl_io_type type) -{ - return CIT_READ <= type && type < CIT_OP_NR; -} - -static inline int cl_io_is_loopable(const struct cl_io *io) -{ - return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; -} - -/** - * Returns true iff there is an IO ongoing in the given environment. - */ -int cl_io_is_going(const struct lu_env *env) -{ - return cl_env_info(env)->clt_current_io != NULL; -} - -/** - * cl_io invariant that holds at all times when exported cl_io_*() functions - * are entered and left. - */ -static int cl_io_invariant(const struct cl_io *io) -{ - struct cl_io *up; - - up = io->ci_parent; - return - /* - * io can own pages only when it is ongoing. Sub-io might - * still be in CIS_LOCKED state when top-io is in - * CIS_IO_GOING. - */ - ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || - (io->ci_state == CIS_LOCKED && up)); -} - -/** - * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. 
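- *
- * An illustrative pairing with cl_io_init() (sketch; \a iot must be
- * a loopable type for cl_io_loop() to be usable):
- *
- * \code
- *	rc = cl_io_init(env, io, iot, obj);
- *	if (rc == 0)
- *		rc = cl_io_loop(env, io);
- *	cl_io_fini(env, io);	/* required even if cl_io_init() failed */
- * \endcode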
- */ -void cl_io_fini(const struct lu_env *env, struct cl_io *io) -{ - struct cl_io_slice *slice; - struct cl_thread_info *info; - - LINVRNT(cl_io_type_is_valid(io->ci_type)); - LINVRNT(cl_io_invariant(io)); - - while (!list_empty(&io->ci_layers)) { - slice = container_of(io->ci_layers.prev, struct cl_io_slice, - cis_linkage); - list_del_init(&slice->cis_linkage); - if (slice->cis_iop->op[io->ci_type].cio_fini) - slice->cis_iop->op[io->ci_type].cio_fini(env, slice); - /* - * Invalidate slice to catch use after free. This assumes that - * slices are allocated within session and can be touched - * after ->cio_fini() returns. - */ - slice->cis_io = NULL; - } - io->ci_state = CIS_FINI; - info = cl_env_info(env); - if (info->clt_current_io == io) - info->clt_current_io = NULL; - - /* sanity check for layout change */ - switch (io->ci_type) { - case CIT_READ: - case CIT_WRITE: - case CIT_DATA_VERSION: - break; - case CIT_FAULT: - break; - case CIT_FSYNC: - LASSERT(!io->ci_need_restart); - break; - case CIT_SETATTR: - case CIT_MISC: - /* Check ignore layout change conf */ - LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, - !io->ci_need_restart)); - break; - default: - LBUG(); - } -} -EXPORT_SYMBOL(cl_io_fini); - -static int cl_io_init0(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, struct cl_object *obj) -{ - struct cl_object *scan; - int result; - - LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); - LINVRNT(cl_io_type_is_valid(iot)); - LINVRNT(cl_io_invariant(io)); - - io->ci_type = iot; - INIT_LIST_HEAD(&io->ci_lockset.cls_todo); - INIT_LIST_HEAD(&io->ci_lockset.cls_done); - INIT_LIST_HEAD(&io->ci_layers); - - result = 0; - cl_object_for_each(scan, obj) { - if (scan->co_ops->coo_io_init) { - result = scan->co_ops->coo_io_init(env, scan, io); - if (result != 0) - break; - } - } - if (result == 0) - io->ci_state = CIS_INIT; - return result; -} - -/** - * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. - * - * \pre obj != cl_object_top(obj) - */ -int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, struct cl_object *obj) -{ - struct cl_thread_info *info = cl_env_info(env); - - LASSERT(obj != cl_object_top(obj)); - if (!info->clt_current_io) - info->clt_current_io = io; - return cl_io_init0(env, io, iot, obj); -} -EXPORT_SYMBOL(cl_io_sub_init); - -/** - * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. - * - * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter - * what the latter returned. - * - * \pre obj == cl_object_top(obj) - * \pre cl_io_type_is_valid(iot) - * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot - */ -int cl_io_init(const struct lu_env *env, struct cl_io *io, - enum cl_io_type iot, struct cl_object *obj) -{ - struct cl_thread_info *info = cl_env_info(env); - - LASSERT(obj == cl_object_top(obj)); - LASSERT(!info->clt_current_io); - - info->clt_current_io = io; - return cl_io_init0(env, io, iot, obj); -} -EXPORT_SYMBOL(cl_io_init); - -/** - * Initialize read or write io. 
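- *
- * An illustrative call (sketch; assumes the caller has already set
- * io->ci_obj):
- *
- * \code
- *	rc = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
- * \endcode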
- *
- * \pre iot == CIT_READ || iot == CIT_WRITE
- */
-int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
-		  enum cl_io_type iot, loff_t pos, size_t count)
-{
-	LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
-	LINVRNT(io->ci_obj);
-
-	LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
-			 "io range: %u [%llu, %llu) %u %u\n",
-			 iot, (__u64)pos, (__u64)pos + count,
-			 io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
-	io->u.ci_rw.crw_pos = pos;
-	io->u.ci_rw.crw_count = count;
-	return cl_io_init(env, io, iot, io->ci_obj);
-}
-EXPORT_SYMBOL(cl_io_rw_init);
-
-static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
-			      const struct cl_lock_descr *d1)
-{
-	return lu_fid_cmp(lu_object_fid(&d0->cld_obj->co_lu),
-			  lu_object_fid(&d1->cld_obj->co_lu));
-}
-
-/*
- * Sort locks in lexicographical order of their (fid, start-offset) pairs.
- */
-static void cl_io_locks_sort(struct cl_io *io)
-{
-	int done = 0;
-
-	/* hidden treasure: bubble sort for now. */
-	do {
-		struct cl_io_lock_link *curr;
-		struct cl_io_lock_link *prev;
-		struct cl_io_lock_link *temp;
-
-		done = 1;
-		prev = NULL;
-
-		list_for_each_entry_safe(curr, temp,
-					 &io->ci_lockset.cls_todo,
-					 cill_linkage) {
-			if (prev) {
-				switch (cl_lock_descr_sort(&prev->cill_descr,
-							   &curr->cill_descr)) {
-				case 0:
-					/*
-					 * IMPOSSIBLE: Identical locks are
-					 *             already removed at
-					 *             this point.
-					 */
-				default:
-					LBUG();
-				case 1:
-					list_move_tail(&curr->cill_linkage,
-						       &prev->cill_linkage);
-					done = 0;
-					continue; /* don't change prev: it's
-						   * still "previous"
-						   */
-				case -1: /* already in order */
-					break;
-				}
-			}
-			prev = curr;
-		}
-	} while (!done);
-}
-
-static void cl_lock_descr_merge(struct cl_lock_descr *d0,
-				const struct cl_lock_descr *d1)
-{
-	d0->cld_start = min(d0->cld_start, d1->cld_start);
-	d0->cld_end = max(d0->cld_end, d1->cld_end);
-
-	if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE)
-		d0->cld_mode = CLM_WRITE;
-
-	if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP)
-		d0->cld_mode = CLM_GROUP;
-}
-
-static int cl_lockset_merge(const struct cl_lockset *set,
-			    const struct cl_lock_descr *need)
-{
-	struct cl_io_lock_link *scan;
-
-	list_for_each_entry(scan, &set->cls_todo, cill_linkage) {
-		if (!cl_object_same(scan->cill_descr.cld_obj, need->cld_obj))
-			continue;
-
-		/* Merge locks for the same object, because the ldlm lock
-		 * server may expand the lock extent; otherwise there is a
-		 * deadlock case if two conflicting locks are queued for the
-		 * same object and the lock server expands one lock to
-		 * overlap another. The side effect is that this can generate
-		 * a multi-stripe lock that may cause cascading problems.
-		 */
-		cl_lock_descr_merge(&scan->cill_descr, need);
-		CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
-		       scan->cill_descr.cld_mode, scan->cill_descr.cld_start,
-		       scan->cill_descr.cld_end);
-		return 1;
-	}
-	return 0;
-}
-
-static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
-			   struct cl_lockset *set)
-{
-	struct cl_io_lock_link *link;
-	struct cl_io_lock_link *temp;
-	int result;
-
-	result = 0;
-	list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
-		result = cl_lock_request(env, io, &link->cill_lock);
-		if (result < 0)
-			break;
-
-		list_move(&link->cill_linkage, &set->cls_done);
-	}
-	return result;
-}
-
-/**
- * Takes locks necessary for the current iteration of io.
- *
- * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required
- * by layers for the current iteration, then sorts the locks (to avoid
- * deadlocks) and acquires them.
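- *
- * Layers typically queue locks from their cio_lock() methods with
- * cl_io_lock_alloc_add(); an illustrative sketch (names assumed):
- *
- * \code
- *	struct cl_lock_descr descr = {
- *		.cld_obj   = obj,
- *		.cld_mode  = CLM_READ,
- *		.cld_start = start,
- *		.cld_end   = end,
- *	};
- *
- *	rc = cl_io_lock_alloc_add(env, io, &descr);
- * \endcode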
- */ -int cl_io_lock(const struct lu_env *env, struct cl_io *io) -{ - const struct cl_io_slice *scan; - int result = 0; - - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(io->ci_state == CIS_IT_STARTED); - LINVRNT(cl_io_invariant(io)); - - cl_io_for_each(scan, io) { - if (!scan->cis_iop->op[io->ci_type].cio_lock) - continue; - result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); - if (result != 0) - break; - } - if (result == 0) { - cl_io_locks_sort(io); - result = cl_lockset_lock(env, io, &io->ci_lockset); - } - if (result != 0) - cl_io_unlock(env, io); - else - io->ci_state = CIS_LOCKED; - return result; -} -EXPORT_SYMBOL(cl_io_lock); - -/** - * Release locks takes by io. - */ -void cl_io_unlock(const struct lu_env *env, struct cl_io *io) -{ - struct cl_lockset *set; - struct cl_io_lock_link *link; - struct cl_io_lock_link *temp; - const struct cl_io_slice *scan; - - LASSERT(cl_io_is_loopable(io)); - LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); - LINVRNT(cl_io_invariant(io)); - - set = &io->ci_lockset; - - list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { - list_del_init(&link->cill_linkage); - if (link->cill_fini) - link->cill_fini(env, link); - } - - list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { - list_del_init(&link->cill_linkage); - cl_lock_release(env, &link->cill_lock); - if (link->cill_fini) - link->cill_fini(env, link); - } - - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_unlock) - scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); - } - io->ci_state = CIS_UNLOCKED; -} -EXPORT_SYMBOL(cl_io_unlock); - -/** - * Prepares next iteration of io. - * - * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give - * layers a chance to modify io parameters, e.g., so that lov can restrict io - * to a single stripe. - */ -int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) -{ - const struct cl_io_slice *scan; - int result; - - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); - LINVRNT(cl_io_invariant(io)); - - result = 0; - cl_io_for_each(scan, io) { - if (!scan->cis_iop->op[io->ci_type].cio_iter_init) - continue; - result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, - scan); - if (result != 0) - break; - } - if (result == 0) - io->ci_state = CIS_IT_STARTED; - return result; -} -EXPORT_SYMBOL(cl_io_iter_init); - -/** - * Finalizes io iteration. - * - * Calls cl_io_operations::cio_iter_fini() bottom-to-top. - */ -void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) -{ - const struct cl_io_slice *scan; - - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(io->ci_state == CIS_UNLOCKED); - LINVRNT(cl_io_invariant(io)); - - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_iter_fini) - scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); - } - io->ci_state = CIS_IT_ENDED; -} -EXPORT_SYMBOL(cl_io_iter_fini); - -/** - * Records that read or write io progressed \a nob bytes forward. - */ -static void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, - size_t nob) -{ - const struct cl_io_slice *scan; - - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || - nob == 0); - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(cl_io_invariant(io)); - - io->u.ci_rw.crw_pos += nob; - io->u.ci_rw.crw_count -= nob; - - /* layers have to be notified. 
*/ - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_advance) - scan->cis_iop->op[io->ci_type].cio_advance(env, scan, - nob); - } -} - -/** - * Adds a lock to a lockset. - */ -int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, - struct cl_io_lock_link *link) -{ - int result; - - if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) { - result = 1; - } else { - list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); - result = 0; - } - return result; -} -EXPORT_SYMBOL(cl_io_lock_add); - -static void cl_free_io_lock_link(const struct lu_env *env, - struct cl_io_lock_link *link) -{ - kfree(link); -} - -/** - * Allocates new lock link, and uses it to add a lock to a lockset. - */ -int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, - struct cl_lock_descr *descr) -{ - struct cl_io_lock_link *link; - int result; - - link = kzalloc(sizeof(*link), GFP_NOFS); - if (link) { - link->cill_descr = *descr; - link->cill_fini = cl_free_io_lock_link; - result = cl_io_lock_add(env, io, link); - if (result) /* lock match */ - link->cill_fini(env, link); - } else { - result = -ENOMEM; - } - - return result; -} -EXPORT_SYMBOL(cl_io_lock_alloc_add); - -/** - * Starts io by calling cl_io_operations::cio_start() top-to-bottom. - */ -int cl_io_start(const struct lu_env *env, struct cl_io *io) -{ - const struct cl_io_slice *scan; - int result = 0; - - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(io->ci_state == CIS_LOCKED); - LINVRNT(cl_io_invariant(io)); - - io->ci_state = CIS_IO_GOING; - cl_io_for_each(scan, io) { - if (!scan->cis_iop->op[io->ci_type].cio_start) - continue; - result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); - if (result != 0) - break; - } - if (result >= 0) - result = 0; - return result; -} -EXPORT_SYMBOL(cl_io_start); - -/** - * Wait until current io iteration is finished by calling - * cl_io_operations::cio_end() bottom-to-top. - */ -void cl_io_end(const struct lu_env *env, struct cl_io *io) -{ - const struct cl_io_slice *scan; - - LINVRNT(cl_io_is_loopable(io)); - LINVRNT(io->ci_state == CIS_IO_GOING); - LINVRNT(cl_io_invariant(io)); - - cl_io_for_each_reverse(scan, io) { - if (scan->cis_iop->op[io->ci_type].cio_end) - scan->cis_iop->op[io->ci_type].cio_end(env, scan); - /* TODO: error handling. */ - } - io->ci_state = CIS_IO_FINISHED; -} -EXPORT_SYMBOL(cl_io_end); - -/** - * Called by read io, to decide the readahead extent - * - * \see cl_io_operations::cio_read_ahead() - */ -int cl_io_read_ahead(const struct lu_env *env, struct cl_io *io, - pgoff_t start, struct cl_read_ahead *ra) -{ - const struct cl_io_slice *scan; - int result = 0; - - LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); - LINVRNT(cl_io_invariant(io)); - - cl_io_for_each(scan, io) { - if (!scan->cis_iop->cio_read_ahead) - continue; - - result = scan->cis_iop->cio_read_ahead(env, scan, start, ra); - if (result) - break; - } - return result > 0 ? 0 : result; -} -EXPORT_SYMBOL(cl_io_read_ahead); - -/** - * Commit a list of contiguous pages into writeback cache. - * - * \returns 0 if all pages committed, or errcode if error occurred. 
- * \see cl_io_operations::cio_commit_async()
- */
-int cl_io_commit_async(const struct lu_env *env, struct cl_io *io,
-		       struct cl_page_list *queue, int from, int to,
-		       cl_commit_cbt cb)
-{
-	const struct cl_io_slice *scan;
-	int result = 0;
-
-	cl_io_for_each(scan, io) {
-		if (!scan->cis_iop->cio_commit_async)
-			continue;
-		result = scan->cis_iop->cio_commit_async(env, scan, queue,
-							 from, to, cb);
-		if (result != 0)
-			break;
-	}
-	return result;
-}
-EXPORT_SYMBOL(cl_io_commit_async);
-
-/**
- * Submits a list of pages for immediate io.
- *
- * On return, the submitted pages have been moved to the queue->c2_qout
- * queue, while queue->c2_qin holds both the pages that did not need to be
- * submitted and the pages that failed to submit.
- *
- * \returns 0 if at least one page was submitted, error code otherwise.
- * \see cl_io_operations::cio_submit()
- */
-int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
-		    enum cl_req_type crt, struct cl_2queue *queue)
-{
-	const struct cl_io_slice *scan;
-	int result = 0;
-
-	cl_io_for_each(scan, io) {
-		if (!scan->cis_iop->cio_submit)
-			continue;
-		result = scan->cis_iop->cio_submit(env, scan, crt, queue);
-		if (result != 0)
-			break;
-	}
-	/*
-	 * If ->cio_submit() failed, no pages were sent.
-	 */
-	LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages)));
-	return result;
-}
-EXPORT_SYMBOL(cl_io_submit_rw);
-
-static void cl_page_list_assume(const struct lu_env *env,
-				struct cl_io *io, struct cl_page_list *plist);
-
-/**
- * Submit a sync_io and wait for the IO to finish, or for an error to
- * happen. If \a timeout is zero, wait for the IO unconditionally.
- */
-int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
-		      enum cl_req_type iot, struct cl_2queue *queue,
-		      long timeout)
-{
-	struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
-	struct cl_page *pg;
-	int rc;
-
-	cl_page_list_for_each(pg, &queue->c2_qin) {
-		LASSERT(!pg->cp_sync_io);
-		pg->cp_sync_io = anchor;
-	}
-
-	cl_sync_io_init(anchor, queue->c2_qin.pl_nr, &cl_sync_io_end);
-	rc = cl_io_submit_rw(env, io, iot, queue);
-	if (rc == 0) {
-		/*
-		 * If some pages weren't sent for any reason (e.g.,
-		 * read found up-to-date pages in the cache, or write found
-		 * clean pages), count them as completed to avoid infinite
-		 * wait.
-		 */
-		cl_page_list_for_each(pg, &queue->c2_qin) {
-			pg->cp_sync_io = NULL;
-			cl_sync_io_note(env, anchor, 1);
-		}
-
-		/* wait for the IO to be finished. */
-		rc = cl_sync_io_wait(env, anchor, timeout);
-		cl_page_list_assume(env, io, &queue->c2_qout);
-	} else {
-		LASSERT(list_empty(&queue->c2_qout.pl_pages));
-		cl_page_list_for_each(pg, &queue->c2_qin)
-			pg->cp_sync_io = NULL;
-	}
-	return rc;
-}
-EXPORT_SYMBOL(cl_io_submit_sync);
-
-/**
- * Main io loop.
- *
- * Pumps io through iterations calling
- *
- *   - cl_io_iter_init()
- *
- *   - cl_io_lock()
- *
- *   - cl_io_start()
- *
- *   - cl_io_end()
- *
- *   - cl_io_unlock()
- *
- *   - cl_io_iter_fini()
- *
- * repeatedly until there is no more io to do.
- */
-int cl_io_loop(const struct lu_env *env, struct cl_io *io)
-{
-	int result = 0;
-
-	LINVRNT(cl_io_is_loopable(io));
-
-	do {
-		size_t nob;
-
-		io->ci_continue = 0;
-		result = cl_io_iter_init(env, io);
-		if (result == 0) {
-			nob = io->ci_nob;
-			result = cl_io_lock(env, io);
-			if (result == 0) {
-				/*
-				 * Notify layers that locks have been taken,
-				 * and do actual i/o.
-				 *
-				 *   - llite: kms, short read;
-				 *   - llite: generic_file_read();
-				 */
-				result = cl_io_start(env, io);
-				/*
-				 * Send any remaining pending
-				 * io, etc.
- * - * - llite: ll_rw_stats_tally. - */ - cl_io_end(env, io); - cl_io_unlock(env, io); - cl_io_rw_advance(env, io, io->ci_nob - nob); - } - } - cl_io_iter_fini(env, io); - } while (result == 0 && io->ci_continue); - if (result == 0) - result = io->ci_result; - return result < 0 ? result : 0; -} -EXPORT_SYMBOL(cl_io_loop); - -/** - * Adds io slice to the cl_io. - * - * This is called by cl_object_operations::coo_io_init() methods to add a - * per-layer state to the io. New state is added at the end of - * cl_io::ci_layers list, that is, it is at the bottom of the stack. - * - * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() - */ -void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, - struct cl_object *obj, - const struct cl_io_operations *ops) -{ - struct list_head *linkage = &slice->cis_linkage; - - LASSERT((!linkage->prev && !linkage->next) || - list_empty(linkage)); - - list_add_tail(linkage, &io->ci_layers); - slice->cis_io = io; - slice->cis_obj = obj; - slice->cis_iop = ops; -} -EXPORT_SYMBOL(cl_io_slice_add); - -/** - * Initializes page list. - */ -void cl_page_list_init(struct cl_page_list *plist) -{ - plist->pl_nr = 0; - INIT_LIST_HEAD(&plist->pl_pages); - plist->pl_owner = current; -} -EXPORT_SYMBOL(cl_page_list_init); - -/** - * Adds a page to a page list. - */ -void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) -{ - /* it would be better to check that page is owned by "current" io, but - * it is not passed here. - */ - LASSERT(page->cp_owner); - LINVRNT(plist->pl_owner == current); - - LASSERT(list_empty(&page->cp_batch)); - list_add_tail(&page->cp_batch, &plist->pl_pages); - ++plist->pl_nr; - lu_ref_add_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); - cl_page_get(page); -} -EXPORT_SYMBOL(cl_page_list_add); - -/** - * Removes a page from a page list. - */ -void cl_page_list_del(const struct lu_env *env, struct cl_page_list *plist, - struct cl_page *page) -{ - LASSERT(plist->pl_nr > 0); - LASSERT(cl_page_is_vmlocked(env, page)); - LINVRNT(plist->pl_owner == current); - - list_del_init(&page->cp_batch); - --plist->pl_nr; - lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", plist); - cl_page_put(env, page); -} -EXPORT_SYMBOL(cl_page_list_del); - -/** - * Moves a page from one page list to another. - */ -void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, - struct cl_page *page) -{ - LASSERT(src->pl_nr > 0); - LINVRNT(dst->pl_owner == current); - LINVRNT(src->pl_owner == current); - - list_move_tail(&page->cp_batch, &dst->pl_pages); - --src->pl_nr; - ++dst->pl_nr; - lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", - src, dst); -} -EXPORT_SYMBOL(cl_page_list_move); - -/** - * Moves a page from one page list to the head of another list. 
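- *
- * An illustrative sketch (names assumed; \a page must be owned by the
- * current io, and both lists by the current thread):
- *
- * \code
- *	cl_page_list_init(&src);
- *	cl_page_list_init(&dst);
- *	cl_page_list_add(&src, page);
- *	cl_page_list_move_head(&dst, &src, page);
- * \endcode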
- */ -void cl_page_list_move_head(struct cl_page_list *dst, struct cl_page_list *src, - struct cl_page *page) -{ - LASSERT(src->pl_nr > 0); - LINVRNT(dst->pl_owner == current); - LINVRNT(src->pl_owner == current); - - list_move(&page->cp_batch, &dst->pl_pages); - --src->pl_nr; - ++dst->pl_nr; - lu_ref_set_at(&page->cp_reference, &page->cp_queue_ref, "queue", - src, dst); -} -EXPORT_SYMBOL(cl_page_list_move_head); - -/** - * splice the cl_page_list, just as list head does - */ -void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) -{ - struct cl_page *page; - struct cl_page *tmp; - - LINVRNT(list->pl_owner == current); - LINVRNT(head->pl_owner == current); - - cl_page_list_for_each_safe(page, tmp, list) - cl_page_list_move(head, list, page); -} -EXPORT_SYMBOL(cl_page_list_splice); - - -/** - * Disowns pages in a queue. - */ -void cl_page_list_disown(const struct lu_env *env, - struct cl_io *io, struct cl_page_list *plist) -{ - struct cl_page *page; - struct cl_page *temp; - - LINVRNT(plist->pl_owner == current); - - cl_page_list_for_each_safe(page, temp, plist) { - LASSERT(plist->pl_nr > 0); - - list_del_init(&page->cp_batch); - --plist->pl_nr; - /* - * cl_page_disown0 rather than usual cl_page_disown() is used, - * because pages are possibly in CPS_FREEING state already due - * to the call to cl_page_list_discard(). - */ - /* - * XXX cl_page_disown0() will fail if page is not locked. - */ - cl_page_disown0(env, io, page); - lu_ref_del_at(&page->cp_reference, &page->cp_queue_ref, "queue", - plist); - cl_page_put(env, page); - } -} -EXPORT_SYMBOL(cl_page_list_disown); - -/** - * Releases pages from queue. - */ -void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) -{ - struct cl_page *page; - struct cl_page *temp; - - LINVRNT(plist->pl_owner == current); - - cl_page_list_for_each_safe(page, temp, plist) - cl_page_list_del(env, plist, page); - LASSERT(plist->pl_nr == 0); -} -EXPORT_SYMBOL(cl_page_list_fini); - -/** - * Assumes all pages in a queue. - */ -static void cl_page_list_assume(const struct lu_env *env, - struct cl_io *io, struct cl_page_list *plist) -{ - struct cl_page *page; - - LINVRNT(plist->pl_owner == current); - - cl_page_list_for_each(page, plist) - cl_page_assume(env, io, page); -} - -/** - * Discards all pages in a queue. - */ -static void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, - struct cl_page_list *plist) -{ - struct cl_page *page; - - LINVRNT(plist->pl_owner == current); - cl_page_list_for_each(page, plist) - cl_page_discard(env, io, page); -} - -/** - * Initialize dual page queue. - */ -void cl_2queue_init(struct cl_2queue *queue) -{ - cl_page_list_init(&queue->c2_qin); - cl_page_list_init(&queue->c2_qout); -} -EXPORT_SYMBOL(cl_2queue_init); - -/** - * Disown pages in both lists of a 2-queue. - */ -void cl_2queue_disown(const struct lu_env *env, - struct cl_io *io, struct cl_2queue *queue) -{ - cl_page_list_disown(env, io, &queue->c2_qin); - cl_page_list_disown(env, io, &queue->c2_qout); -} -EXPORT_SYMBOL(cl_2queue_disown); - -/** - * Discard (truncate) pages in both lists of a 2-queue. - */ -void cl_2queue_discard(const struct lu_env *env, - struct cl_io *io, struct cl_2queue *queue) -{ - cl_page_list_discard(env, io, &queue->c2_qin); - cl_page_list_discard(env, io, &queue->c2_qout); -} -EXPORT_SYMBOL(cl_2queue_discard); - -/** - * Finalize both page lists of a 2-queue. 
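- *
- * An illustrative single-page submission round-trip (sketch; error
- * handling elided, \a io and \a page prepared by the caller):
- *
- * \code
- *	struct cl_2queue queue;
- *
- *	cl_2queue_init_page(&queue, page);
- *	rc = cl_io_submit_sync(env, io, CRT_READ, &queue, 0);
- *	cl_2queue_disown(env, io, &queue);
- *	cl_2queue_fini(env, &queue);
- * \endcode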
- */ -void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) -{ - cl_page_list_fini(env, &queue->c2_qout); - cl_page_list_fini(env, &queue->c2_qin); -} -EXPORT_SYMBOL(cl_2queue_fini); - -/** - * Initialize a 2-queue to contain \a page in its incoming page list. - */ -void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) -{ - cl_2queue_init(queue); - /* - * Add a page to the incoming page list of 2-queue. - */ - cl_page_list_add(&queue->c2_qin, page); -} -EXPORT_SYMBOL(cl_2queue_init_page); - -/** - * Returns top-level io. - * - * \see cl_object_top() - */ -struct cl_io *cl_io_top(struct cl_io *io) -{ - while (io->ci_parent) - io = io->ci_parent; - return io; -} -EXPORT_SYMBOL(cl_io_top); - -/** - * Fills in attributes that are passed to server together with transfer. Only - * attributes from \a flags may be touched. This can be called multiple times - * for the same request. - */ -void cl_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - struct cl_object *scan; - - cl_object_for_each(scan, obj) { - if (scan->co_ops->coo_req_attr_set) - scan->co_ops->coo_req_attr_set(env, scan, attr); - } -} -EXPORT_SYMBOL(cl_req_attr_set); - -/* cl_sync_io_callback assumes the caller must call cl_sync_io_wait() to - * wait for the IO to finish. - */ -void cl_sync_io_end(const struct lu_env *env, struct cl_sync_io *anchor) -{ - wake_up_all(&anchor->csi_waitq); - - /* it's safe to nuke or reuse anchor now */ - atomic_set(&anchor->csi_barrier, 0); -} -EXPORT_SYMBOL(cl_sync_io_end); - -/** - * Initialize synchronous io wait anchor - */ -void cl_sync_io_init(struct cl_sync_io *anchor, int nr, - void (*end)(const struct lu_env *, struct cl_sync_io *)) -{ - memset(anchor, 0, sizeof(*anchor)); - init_waitqueue_head(&anchor->csi_waitq); - atomic_set(&anchor->csi_sync_nr, nr); - atomic_set(&anchor->csi_barrier, nr > 0); - anchor->csi_sync_rc = 0; - anchor->csi_end_io = end; - LASSERT(end); -} -EXPORT_SYMBOL(cl_sync_io_init); - -/** - * Wait until all IO completes. Transfer completion routine has to call - * cl_sync_io_note() for every entity. - */ -int cl_sync_io_wait(const struct lu_env *env, struct cl_sync_io *anchor, - long timeout) -{ - int rc = 1; - - LASSERT(timeout >= 0); - - if (timeout == 0) - wait_event_idle(anchor->csi_waitq, - atomic_read(&anchor->csi_sync_nr) == 0); - else - rc = wait_event_idle_timeout(anchor->csi_waitq, - atomic_read(&anchor->csi_sync_nr) == 0, - timeout * HZ); - if (rc == 0) { - rc = -ETIMEDOUT; - CERROR("IO failed: %d, still wait for %d remaining entries\n", - rc, atomic_read(&anchor->csi_sync_nr)); - - wait_event_idle(anchor->csi_waitq, - atomic_read(&anchor->csi_sync_nr) == 0); - } else { - rc = anchor->csi_sync_rc; - } - LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); - - /* wait until cl_sync_io_note() has done wakeup */ - while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) - cpu_relax(); - - - return rc; -} -EXPORT_SYMBOL(cl_sync_io_wait); - -/** - * Indicate that transfer of a single page completed. - */ -void cl_sync_io_note(const struct lu_env *env, struct cl_sync_io *anchor, - int ioret) -{ - if (anchor->csi_sync_rc == 0 && ioret < 0) - anchor->csi_sync_rc = ioret; - /* - * Synchronous IO done without releasing page lock (e.g., as a part of - * ->{prepare,commit}_write(). Completion is used to signal the end of - * IO. 
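- *
- * The anchor is typically initialized with cl_sync_io_init() and
- * waited on with cl_sync_io_wait(); each completed transfer must
- * call this function exactly once.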
- */ - LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); - if (atomic_dec_and_test(&anchor->csi_sync_nr)) { - LASSERT(anchor->csi_end_io); - anchor->csi_end_io(env, anchor); - /* Can't access anchor any more */ - } -} -EXPORT_SYMBOL(cl_sync_io_note); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c deleted file mode 100644 index 9ca29a26a38b4b2e0ae314a88303eb261d495908..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/cl_lock.c +++ /dev/null @@ -1,275 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Client Extent Lock. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include "cl_internal.h" - -static void cl_lock_trace0(int level, const struct lu_env *env, - const char *prefix, const struct cl_lock *lock, - const char *func, const int line) -{ - struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); - - CDEBUG(level, "%s: %p (%p/%d) at %s():%d\n", - prefix, lock, env, h->coh_nesting, func, line); -} -#define cl_lock_trace(level, env, prefix, lock) \ - cl_lock_trace0(level, env, prefix, lock, __func__, __LINE__) - -/** - * Adds lock slice to the compound lock. - * - * This is called by cl_object_operations::coo_lock_init() methods to add a - * per-layer state to the lock. New state is added at the end of - * cl_lock::cll_layers list, that is, it is at the bottom of the stack. 
- * - * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() - */ -void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, - struct cl_object *obj, - const struct cl_lock_operations *ops) -{ - slice->cls_lock = lock; - list_add_tail(&slice->cls_linkage, &lock->cll_layers); - slice->cls_obj = obj; - slice->cls_ops = ops; -} -EXPORT_SYMBOL(cl_lock_slice_add); - -void cl_lock_fini(const struct lu_env *env, struct cl_lock *lock) -{ - struct cl_lock_slice *slice; - cl_lock_trace(D_DLMTRACE, env, "destroy lock", lock); - - while ((slice = list_first_entry_or_null(&lock->cll_layers, - struct cl_lock_slice, - cls_linkage)) != NULL) { - list_del_init(lock->cll_layers.next); - slice->cls_ops->clo_fini(env, slice); - } - POISON(lock, 0x5a, sizeof(*lock)); -} -EXPORT_SYMBOL(cl_lock_fini); - -int cl_lock_init(const struct lu_env *env, struct cl_lock *lock, - const struct cl_io *io) -{ - struct cl_object *obj = lock->cll_descr.cld_obj; - struct cl_object *scan; - int result = 0; - - /* Make sure cl_lock::cll_descr is initialized. */ - LASSERT(obj); - - INIT_LIST_HEAD(&lock->cll_layers); - list_for_each_entry(scan, &obj->co_lu.lo_header->loh_layers, - co_lu.lo_linkage) { - result = scan->co_ops->coo_lock_init(env, scan, lock, io); - if (result != 0) { - cl_lock_fini(env, lock); - break; - } - } - - return result; -} -EXPORT_SYMBOL(cl_lock_init); - -/** - * Returns a slice with a lock, corresponding to the given layer in the - * device stack. - * - * \see cl_page_at() - */ -const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, - const struct lu_device_type *dtype) -{ - const struct cl_lock_slice *slice; - - list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { - if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) - return slice; - } - return NULL; -} -EXPORT_SYMBOL(cl_lock_at); - -void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) -{ - const struct cl_lock_slice *slice; - - cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); - list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { - if (slice->cls_ops->clo_cancel) - slice->cls_ops->clo_cancel(env, slice); - } -} -EXPORT_SYMBOL(cl_lock_cancel); - -/** - * Enqueue a lock. - * \param anchor: if we need to wait for resources before getting the lock, - * use @anchor for the purpose. - * \retval 0 enqueue successfully - * \retval <0 error code - */ -int cl_lock_enqueue(const struct lu_env *env, struct cl_io *io, - struct cl_lock *lock, struct cl_sync_io *anchor) -{ - const struct cl_lock_slice *slice; - int rc = -ENOSYS; - - list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { - if (!slice->cls_ops->clo_enqueue) - continue; - - rc = slice->cls_ops->clo_enqueue(env, slice, io, anchor); - if (rc != 0) - break; - } - return rc; -} -EXPORT_SYMBOL(cl_lock_enqueue); - -/** - * Main high-level entry point of cl_lock interface that finds existing or - * enqueues new lock matching given description. 
- */ -int cl_lock_request(const struct lu_env *env, struct cl_io *io, - struct cl_lock *lock) -{ - struct cl_sync_io *anchor = NULL; - __u32 enq_flags = lock->cll_descr.cld_enq_flags; - int rc; - - rc = cl_lock_init(env, lock, io); - if (rc < 0) - return rc; - - if ((enq_flags & CEF_ASYNC) && !(enq_flags & CEF_AGL)) { - anchor = &cl_env_info(env)->clt_anchor; - cl_sync_io_init(anchor, 1, cl_sync_io_end); - } - - rc = cl_lock_enqueue(env, io, lock, anchor); - - if (anchor) { - int rc2; - - /* drop the reference count held at initialization time */ - cl_sync_io_note(env, anchor, 0); - rc2 = cl_sync_io_wait(env, anchor, 0); - if (rc2 < 0 && rc == 0) - rc = rc2; - } - - if (rc < 0) - cl_lock_release(env, lock); - - return rc; -} -EXPORT_SYMBOL(cl_lock_request); - -/** - * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). - */ -void cl_lock_release(const struct lu_env *env, struct cl_lock *lock) -{ - cl_lock_trace(D_DLMTRACE, env, "release lock", lock); - cl_lock_cancel(env, lock); - cl_lock_fini(env, lock); -} -EXPORT_SYMBOL(cl_lock_release); - -const char *cl_lock_mode_name(const enum cl_lock_mode mode) -{ - static const char * const names[] = { - [CLM_READ] = "R", - [CLM_WRITE] = "W", - [CLM_GROUP] = "G" - }; - if (0 <= mode && mode < ARRAY_SIZE(names)) - return names[mode]; - else - return "U"; -} -EXPORT_SYMBOL(cl_lock_mode_name); - -/** - * Prints human readable representation of a lock description. - */ -void cl_lock_descr_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, - const struct cl_lock_descr *descr) -{ - const struct lu_fid *fid; - - fid = lu_object_fid(&descr->cld_obj->co_lu); - (*printer)(env, cookie, DDESCR "@" DFID, PDESCR(descr), PFID(fid)); -} -EXPORT_SYMBOL(cl_lock_descr_print); - -/** - * Prints human readable representation of \a lock to the \a f. - */ -void cl_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct cl_lock *lock) -{ - const struct cl_lock_slice *slice; - - (*printer)(env, cookie, "lock@%p", lock); - cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); - (*printer)(env, cookie, " {\n"); - - list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { - (*printer)(env, cookie, " %s@%p: ", - slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, - slice); - if (slice->cls_ops->clo_print) - slice->cls_ops->clo_print(env, cookie, printer, slice); - (*printer)(env, cookie, "\n"); - } - (*printer)(env, cookie, "} lock@%p\n", lock); -} -EXPORT_SYMBOL(cl_lock_print); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c deleted file mode 100644 index 42cce2dc5a45d2ee1237a7bcedc15b053eb01b0d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/cl_object.c +++ /dev/null @@ -1,1059 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Client Lustre Object. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -/* - * Locking. - * - * i_mutex - * PG_locked - * ->coh_attr_guard - * ->ls_guard - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -/* class_put_type() */ -#include -#include -#include -#include -#include -#include -#include "cl_internal.h" - -static struct kmem_cache *cl_env_kmem; - -/** Lock class of cl_object_header::coh_attr_guard */ -static struct lock_class_key cl_attr_guard_class; - -/** - * Initialize cl_object_header. - */ -int cl_object_header_init(struct cl_object_header *h) -{ - int result; - - result = lu_object_header_init(&h->coh_lu); - if (result == 0) { - spin_lock_init(&h->coh_attr_guard); - lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); - h->coh_page_bufsize = 0; - } - return result; -} -EXPORT_SYMBOL(cl_object_header_init); - -/** - * Returns a cl_object with a given \a fid. - * - * Returns either cached or newly created object. Additional reference on the - * returned object is acquired. - * - * \see lu_object_find(), cl_page_find(), cl_lock_find() - */ -struct cl_object *cl_object_find(const struct lu_env *env, - struct cl_device *cd, const struct lu_fid *fid, - const struct cl_object_conf *c) -{ - might_sleep(); - return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); -} -EXPORT_SYMBOL(cl_object_find); - -/** - * Releases a reference on \a o. - * - * When last reference is released object is returned to the cache, unless - * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. - * - * \see cl_page_put(), cl_lock_put(). - */ -void cl_object_put(const struct lu_env *env, struct cl_object *o) -{ - lu_object_put(env, &o->co_lu); -} -EXPORT_SYMBOL(cl_object_put); - -/** - * Acquire an additional reference to the object \a o. - * - * This can only be used to acquire _additional_ reference, i.e., caller - * already has to possess at least one reference to \a o before calling this. - * - * \see cl_page_get(), cl_lock_get(). - */ -void cl_object_get(struct cl_object *o) -{ - lu_object_get(&o->co_lu); -} -EXPORT_SYMBOL(cl_object_get); - -/** - * Returns the top-object for a given \a o. - * - * \see cl_io_top() - */ -struct cl_object *cl_object_top(struct cl_object *o) -{ - struct cl_object_header *hdr = cl_object_header(o); - struct cl_object *top; - - while (hdr->coh_parent) - hdr = hdr->coh_parent; - - top = lu2cl(lu_object_top(&hdr->coh_lu)); - CDEBUG(D_TRACE, "%p -> %p\n", o, top); - return top; -} -EXPORT_SYMBOL(cl_object_top); - -/** - * Returns pointer to the lock protecting data-attributes for the given object - * \a o. - * - * Data-attributes are protected by the cl_object_header::coh_attr_guard - * spin-lock in the top-object. - * - * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). - */ -static spinlock_t *cl_object_attr_guard(struct cl_object *o) -{ - return &cl_object_header(cl_object_top(o))->coh_attr_guard; -} - -/** - * Locks data-attributes. 
- * - * Prevents data-attributes from changing, until lock is released by - * cl_object_attr_unlock(). This has to be called before calls to - * cl_object_attr_get(), cl_object_attr_update(). - */ -void cl_object_attr_lock(struct cl_object *o) - __acquires(cl_object_attr_guard(o)) -{ - spin_lock(cl_object_attr_guard(o)); -} -EXPORT_SYMBOL(cl_object_attr_lock); - -/** - * Releases data-attributes lock, acquired by cl_object_attr_lock(). - */ -void cl_object_attr_unlock(struct cl_object *o) - __releases(cl_object_attr_guard(o)) -{ - spin_unlock(cl_object_attr_guard(o)); -} -EXPORT_SYMBOL(cl_object_attr_unlock); - -/** - * Returns data-attributes of an object \a obj. - * - * Every layer is asked (by calling cl_object_operations::coo_attr_get()) - * top-to-bottom to fill in parts of \a attr that this layer is responsible - * for. - */ -int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct lu_object_header *top; - int result; - - assert_spin_locked(cl_object_attr_guard(obj)); - - top = obj->co_lu.lo_header; - result = 0; - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_attr_get) { - result = obj->co_ops->coo_attr_get(env, obj, attr); - if (result != 0) { - if (result > 0) - result = 0; - break; - } - } - } - return result; -} -EXPORT_SYMBOL(cl_object_attr_get); - -/** - * Updates data-attributes of an object \a obj. - * - * Only attributes, mentioned in a validness bit-mask \a v are - * updated. Calls cl_object_operations::coo_attr_update() on every layer, - * bottom to top. - */ -int cl_object_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int v) -{ - struct lu_object_header *top; - int result; - - assert_spin_locked(cl_object_attr_guard(obj)); - - top = obj->co_lu.lo_header; - result = 0; - list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_attr_update) { - result = obj->co_ops->coo_attr_update(env, obj, attr, - v); - if (result != 0) { - if (result > 0) - result = 0; - break; - } - } - } - return result; -} -EXPORT_SYMBOL(cl_object_attr_update); - -/** - * Notifies layers (bottom-to-top) that glimpse AST was received. - * - * Layers have to fill \a lvb fields with information that will be shipped - * back to glimpse issuer. - * - * \see cl_lock_operations::clo_glimpse() - */ -int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, - struct ost_lvb *lvb) -{ - struct lu_object_header *top; - int result; - - top = obj->co_lu.lo_header; - result = 0; - list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_glimpse) { - result = obj->co_ops->coo_glimpse(env, obj, lvb); - if (result != 0) - break; - } - } - LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), - "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu\n", - lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, - lvb->lvb_ctime, lvb->lvb_blocks); - return result; -} -EXPORT_SYMBOL(cl_object_glimpse); - -/** - * Updates a configuration of an object \a obj. 
- */ -int cl_conf_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf) -{ - struct lu_object_header *top; - int result; - - top = obj->co_lu.lo_header; - result = 0; - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_conf_set) { - result = obj->co_ops->coo_conf_set(env, obj, conf); - if (result != 0) - break; - } - } - return result; -} -EXPORT_SYMBOL(cl_conf_set); - -/** - * Prunes caches of pages and locks for this object. - */ -int cl_object_prune(const struct lu_env *env, struct cl_object *obj) -{ - struct lu_object_header *top; - struct cl_object *o; - int result; - - top = obj->co_lu.lo_header; - result = 0; - list_for_each_entry(o, &top->loh_layers, co_lu.lo_linkage) { - if (o->co_ops->coo_prune) { - result = o->co_ops->coo_prune(env, o); - if (result != 0) - break; - } - } - - return result; -} -EXPORT_SYMBOL(cl_object_prune); - -/** - * Get stripe information of this object. - */ -int cl_object_getstripe(const struct lu_env *env, struct cl_object *obj, - struct lov_user_md __user *uarg) -{ - struct lu_object_header *top; - int result = 0; - - top = obj->co_lu.lo_header; - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_getstripe) { - result = obj->co_ops->coo_getstripe(env, obj, uarg); - if (result) - break; - } - } - return result; -} -EXPORT_SYMBOL(cl_object_getstripe); - -/** - * Get fiemap extents from file object. - * - * \param env [in] lustre environment - * \param obj [in] file object - * \param key [in] fiemap request argument - * \param fiemap [out] fiemap extents mapping retrived - * \param buflen [in] max buffer length of @fiemap - * - * \retval 0 success - * \retval < 0 error - */ -int cl_object_fiemap(const struct lu_env *env, struct cl_object *obj, - struct ll_fiemap_info_key *key, - struct fiemap *fiemap, size_t *buflen) -{ - struct lu_object_header *top; - int result = 0; - - top = obj->co_lu.lo_header; - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_fiemap) { - result = obj->co_ops->coo_fiemap(env, obj, key, fiemap, - buflen); - if (result) - break; - } - } - return result; -} -EXPORT_SYMBOL(cl_object_fiemap); - -int cl_object_layout_get(const struct lu_env *env, struct cl_object *obj, - struct cl_layout *cl) -{ - struct lu_object_header *top = obj->co_lu.lo_header; - - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_layout_get) - return obj->co_ops->coo_layout_get(env, obj, cl); - } - - return -EOPNOTSUPP; -} -EXPORT_SYMBOL(cl_object_layout_get); - -loff_t cl_object_maxbytes(struct cl_object *obj) -{ - struct lu_object_header *top = obj->co_lu.lo_header; - loff_t maxbytes = LLONG_MAX; - - list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { - if (obj->co_ops->coo_maxbytes) - maxbytes = min_t(loff_t, obj->co_ops->coo_maxbytes(obj), - maxbytes); - } - - return maxbytes; -} -EXPORT_SYMBOL(cl_object_maxbytes); - -/** - * Helper function removing all object locks, and marking object for - * deletion. All object pages must have been deleted at this point. - * - * This is called by cl_inode_fini() and lov_object_delete() to destroy top- - * and sub- objects respectively. 
- */ -void cl_object_kill(const struct lu_env *env, struct cl_object *obj) -{ - struct cl_object_header *hdr = cl_object_header(obj); - - set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); -} -EXPORT_SYMBOL(cl_object_kill); - -void cache_stats_init(struct cache_stats *cs, const char *name) -{ - int i; - - cs->cs_name = name; - for (i = 0; i < CS_NR; i++) - atomic_set(&cs->cs_stats[i], 0); -} - -static int cache_stats_print(const struct cache_stats *cs, - struct seq_file *m, int h) -{ - int i; - /* - * lookup hit total cached create - * env: ...... ...... ...... ...... ...... - */ - if (h) { - const char *names[CS_NR] = CS_NAMES; - - seq_printf(m, "%6s", " "); - for (i = 0; i < CS_NR; i++) - seq_printf(m, "%8s", names[i]); - seq_printf(m, "\n"); - } - - seq_printf(m, "%5.5s:", cs->cs_name); - for (i = 0; i < CS_NR; i++) - seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); - return 0; -} - -static void cl_env_percpu_refill(void); - -/** - * Initialize client site. - * - * Perform common initialization (lu_site_init()), and initialize statistical - * counters. Also perform global initializations on the first call. - */ -int cl_site_init(struct cl_site *s, struct cl_device *d) -{ - size_t i; - int result; - - result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); - if (result == 0) { - cache_stats_init(&s->cs_pages, "pages"); - for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) - atomic_set(&s->cs_pages_state[0], 0); - cl_env_percpu_refill(); - } - return result; -} -EXPORT_SYMBOL(cl_site_init); - -/** - * Finalize client site. Dual to cl_site_init(). - */ -void cl_site_fini(struct cl_site *s) -{ - lu_site_fini(&s->cs_lu); -} -EXPORT_SYMBOL(cl_site_fini); - -static struct cache_stats cl_env_stats = { - .cs_name = "envs", - .cs_stats = { ATOMIC_INIT(0), } -}; - -/** - * Outputs client site statistical counters into a buffer. Suitable for - * ll_rd_*()-style functions. - */ -int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) -{ - size_t i; - static const char * const pstate[] = { - [CPS_CACHED] = "c", - [CPS_OWNED] = "o", - [CPS_PAGEOUT] = "w", - [CPS_PAGEIN] = "r", - [CPS_FREEING] = "f" - }; -/* - lookup hit total busy create -pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] -locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] - env: ...... ...... ...... ...... ...... - */ - lu_site_stats_print(&site->cs_lu, m); - cache_stats_print(&site->cs_pages, m, 1); - seq_puts(m, " ["); - for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) - seq_printf(m, "%s: %u ", pstate[i], - atomic_read(&site->cs_pages_state[i])); - seq_puts(m, "]\n"); - cache_stats_print(&cl_env_stats, m, 0); - seq_puts(m, "\n"); - return 0; -} -EXPORT_SYMBOL(cl_site_stats_print); - -/***************************************************************************** - * - * lu_env handling on client. - * - */ - -/** - * The most efficient way is to store cl_env pointer in task specific - * structures. On Linux, it wont' be easy to use task_struct->journal_info - * because Lustre code may call into other fs which has certain assumptions - * about journal_info. Currently following fields in task_struct are identified - * can be used for this purpose: - * - tux_info: only on RedHat kernel. - * - ... - * \note As long as we use task_struct to store cl_env, we assume that once - * called into Lustre, we'll never call into the other part of the kernel - * which will use those fields in task_struct without explicitly exiting - * Lustre. 
- * - * If there's no space in task_struct is available, hash will be used. - * bz20044, bz22683. - */ - -static unsigned int cl_envs_cached_max = 32; /* XXX: prototype: arbitrary limit - * for now. - */ -static struct cl_env_cache { - rwlock_t cec_guard; - unsigned int cec_count; - struct list_head cec_envs; -} *cl_envs = NULL; - -struct cl_env { - void *ce_magic; - struct lu_env ce_lu; - struct lu_context ce_ses; - - /* - * Linkage into global list of all client environments. Used for - * garbage collection. - */ - struct list_head ce_linkage; - /* - * - */ - int ce_ref; - /* - * Debugging field: address of the caller who made original - * allocation. - */ - void *ce_debug; -}; - -#define CL_ENV_INC(counter) -#define CL_ENV_DEC(counter) - -static void cl_env_init0(struct cl_env *cle, void *debug) -{ - LASSERT(cle->ce_ref == 0); - LASSERT(cle->ce_magic == &cl_env_init0); - LASSERT(!cle->ce_debug); - - cle->ce_ref = 1; - cle->ce_debug = debug; - CL_ENV_INC(busy); -} - -static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) -{ - struct lu_env *env; - struct cl_env *cle; - - cle = kmem_cache_zalloc(cl_env_kmem, GFP_NOFS); - if (cle) { - int rc; - - INIT_LIST_HEAD(&cle->ce_linkage); - cle->ce_magic = &cl_env_init0; - env = &cle->ce_lu; - rc = lu_env_init(env, ctx_tags | LCT_CL_THREAD); - if (rc == 0) { - rc = lu_context_init(&cle->ce_ses, - ses_tags | LCT_SESSION); - if (rc == 0) { - lu_context_enter(&cle->ce_ses); - env->le_ses = &cle->ce_ses; - cl_env_init0(cle, debug); - } else { - lu_env_fini(env); - } - } - if (rc != 0) { - kmem_cache_free(cl_env_kmem, cle); - env = ERR_PTR(rc); - } else { - CL_ENV_INC(create); - CL_ENV_INC(total); - } - } else { - env = ERR_PTR(-ENOMEM); - } - return env; -} - -static void cl_env_fini(struct cl_env *cle) -{ - CL_ENV_DEC(total); - lu_context_fini(&cle->ce_lu.le_ctx); - lu_context_fini(&cle->ce_ses); - kmem_cache_free(cl_env_kmem, cle); -} - -static struct lu_env *cl_env_obtain(void *debug) -{ - struct cl_env *cle; - struct lu_env *env; - int cpu = get_cpu(); - - read_lock(&cl_envs[cpu].cec_guard); - LASSERT(equi(cl_envs[cpu].cec_count == 0, - list_empty(&cl_envs[cpu].cec_envs))); - if (cl_envs[cpu].cec_count > 0) { - int rc; - - cle = container_of(cl_envs[cpu].cec_envs.next, struct cl_env, - ce_linkage); - list_del_init(&cle->ce_linkage); - cl_envs[cpu].cec_count--; - read_unlock(&cl_envs[cpu].cec_guard); - put_cpu(); - - env = &cle->ce_lu; - rc = lu_env_refill(env); - if (rc == 0) { - cl_env_init0(cle, debug); - lu_context_enter(&env->le_ctx); - lu_context_enter(&cle->ce_ses); - } else { - cl_env_fini(cle); - env = ERR_PTR(rc); - } - } else { - read_unlock(&cl_envs[cpu].cec_guard); - put_cpu(); - env = cl_env_new(lu_context_tags_default, - lu_session_tags_default, debug); - } - return env; -} - -static inline struct cl_env *cl_env_container(struct lu_env *env) -{ - return container_of(env, struct cl_env, ce_lu); -} - -/** - * Returns lu_env: if there already is an environment associated with the - * current thread, it is returned, otherwise, new environment is allocated. - * - * Allocations are amortized through the global cache of environments. - * - * \param refcheck pointer to a counter used to detect environment leaks. In - * the usual case cl_env_get() and cl_env_put() are called in the same lexical - * scope and pointer to the same integer is passed as \a refcheck. This is - * used to detect missed cl_env_put(). 
- * - * \see cl_env_put() - */ -struct lu_env *cl_env_get(u16 *refcheck) -{ - struct lu_env *env; - - env = cl_env_obtain(__builtin_return_address(0)); - if (!IS_ERR(env)) { - struct cl_env *cle; - - cle = cl_env_container(env); - *refcheck = cle->ce_ref; - CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); - } - return env; -} -EXPORT_SYMBOL(cl_env_get); - -/** - * Forces an allocation of a fresh environment with given tags. - * - * \see cl_env_get() - */ -struct lu_env *cl_env_alloc(u16 *refcheck, u32 tags) -{ - struct lu_env *env; - - env = cl_env_new(tags, tags, __builtin_return_address(0)); - if (!IS_ERR(env)) { - struct cl_env *cle; - - cle = cl_env_container(env); - *refcheck = cle->ce_ref; - CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); - } - return env; -} -EXPORT_SYMBOL(cl_env_alloc); - -static void cl_env_exit(struct cl_env *cle) -{ - lu_context_exit(&cle->ce_lu.le_ctx); - lu_context_exit(&cle->ce_ses); -} - -/** - * Finalizes and frees a given number of cached environments. This is done to - * (1) free some memory (not currently hooked into VM), or (2) release - * references to modules. - */ -unsigned int cl_env_cache_purge(unsigned int nr) -{ - struct cl_env *cle; - unsigned int i; - - for_each_possible_cpu(i) { - write_lock(&cl_envs[i].cec_guard); - for (; !list_empty(&cl_envs[i].cec_envs) && nr > 0; --nr) { - cle = container_of(cl_envs[i].cec_envs.next, - struct cl_env, ce_linkage); - list_del_init(&cle->ce_linkage); - LASSERT(cl_envs[i].cec_count > 0); - cl_envs[i].cec_count--; - write_unlock(&cl_envs[i].cec_guard); - - cl_env_fini(cle); - write_lock(&cl_envs[i].cec_guard); - } - LASSERT(equi(cl_envs[i].cec_count == 0, - list_empty(&cl_envs[i].cec_envs))); - write_unlock(&cl_envs[i].cec_guard); - } - return nr; -} -EXPORT_SYMBOL(cl_env_cache_purge); - -/** - * Release an environment. - * - * Decrement \a env reference counter. When counter drops to 0, nothing in - * this thread is using environment and it is returned to the allocation - * cache, or freed straight away, if cache is large enough. - */ -void cl_env_put(struct lu_env *env, u16 *refcheck) -{ - struct cl_env *cle; - - cle = cl_env_container(env); - - LASSERT(cle->ce_ref > 0); - LASSERT(ergo(refcheck, cle->ce_ref == *refcheck)); - - CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); - if (--cle->ce_ref == 0) { - int cpu = get_cpu(); - - CL_ENV_DEC(busy); - cle->ce_debug = NULL; - cl_env_exit(cle); - /* - * Don't bother to take a lock here. - * - * Return environment to the cache only when it was allocated - * with the standard tags. - */ - if (cl_envs[cpu].cec_count < cl_envs_cached_max && - (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD && - (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) { - read_lock(&cl_envs[cpu].cec_guard); - list_add(&cle->ce_linkage, &cl_envs[cpu].cec_envs); - cl_envs[cpu].cec_count++; - read_unlock(&cl_envs[cpu].cec_guard); - } else { - cl_env_fini(cle); - } - put_cpu(); - } -} -EXPORT_SYMBOL(cl_env_put); - -/** - * Converts struct ost_lvb to struct cl_attr. 
- * - * \see cl_attr2lvb - */ -void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) -{ - attr->cat_size = lvb->lvb_size; - attr->cat_mtime = lvb->lvb_mtime; - attr->cat_atime = lvb->lvb_atime; - attr->cat_ctime = lvb->lvb_ctime; - attr->cat_blocks = lvb->lvb_blocks; -} -EXPORT_SYMBOL(cl_lvb2attr); - -static struct cl_env cl_env_percpu[NR_CPUS]; - -static int cl_env_percpu_init(void) -{ - struct cl_env *cle; - int tags = LCT_REMEMBER | LCT_NOREF; - int i, j; - int rc = 0; - - for_each_possible_cpu(i) { - struct lu_env *env; - - rwlock_init(&cl_envs[i].cec_guard); - INIT_LIST_HEAD(&cl_envs[i].cec_envs); - cl_envs[i].cec_count = 0; - - cle = &cl_env_percpu[i]; - env = &cle->ce_lu; - - INIT_LIST_HEAD(&cle->ce_linkage); - cle->ce_magic = &cl_env_init0; - rc = lu_env_init(env, LCT_CL_THREAD | tags); - if (rc == 0) { - rc = lu_context_init(&cle->ce_ses, LCT_SESSION | tags); - if (rc == 0) { - lu_context_enter(&cle->ce_ses); - env->le_ses = &cle->ce_ses; - } else { - lu_env_fini(env); - } - } - if (rc != 0) - break; - } - if (rc != 0) { - /* Indices 0 to i (excluding i) were correctly initialized, - * thus we must uninitialize up to i, the rest are undefined. - */ - for (j = 0; j < i; j++) { - cle = &cl_env_percpu[j]; - lu_context_exit(&cle->ce_ses); - lu_context_fini(&cle->ce_ses); - lu_env_fini(&cle->ce_lu); - } - } - - return rc; -} - -static void cl_env_percpu_fini(void) -{ - int i; - - for_each_possible_cpu(i) { - struct cl_env *cle = &cl_env_percpu[i]; - - lu_context_exit(&cle->ce_ses); - lu_context_fini(&cle->ce_ses); - lu_env_fini(&cle->ce_lu); - } -} - -static void cl_env_percpu_refill(void) -{ - int i; - - for_each_possible_cpu(i) - lu_env_refill(&cl_env_percpu[i].ce_lu); -} - -void cl_env_percpu_put(struct lu_env *env) -{ - struct cl_env *cle; - int cpu; - - cpu = smp_processor_id(); - cle = cl_env_container(env); - LASSERT(cle == &cl_env_percpu[cpu]); - - cle->ce_ref--; - LASSERT(cle->ce_ref == 0); - - CL_ENV_DEC(busy); - cle->ce_debug = NULL; - - put_cpu(); -} -EXPORT_SYMBOL(cl_env_percpu_put); - -struct lu_env *cl_env_percpu_get(void) -{ - struct cl_env *cle; - - cle = &cl_env_percpu[get_cpu()]; - cl_env_init0(cle, __builtin_return_address(0)); - - return &cle->ce_lu; -} -EXPORT_SYMBOL(cl_env_percpu_get); - -/***************************************************************************** - * - * Temporary prototype thing: mirror obd-devices into cl devices. - * - */ - -struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, - struct lu_device_type *ldt, - struct lu_device *next) -{ - const char *typename; - struct lu_device *d; - - typename = ldt->ldt_name; - d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); - if (!IS_ERR(d)) { - int rc; - - if (site) - d->ld_site = site; - rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); - if (rc == 0) { - lu_device_get(d); - lu_ref_add(&d->ld_reference, - "lu-stack", &lu_site_init); - } else { - ldt->ldt_ops->ldto_device_free(env, d); - CERROR("can't init device '%s', %d\n", typename, rc); - d = ERR_PTR(rc); - } - } else { - CERROR("Cannot allocate device: '%s'\n", typename); - } - return lu2cl_dev(d); -} -EXPORT_SYMBOL(cl_type_setup); - -/** - * Finalize device stack by calling lu_stack_fini(). 
- */ -void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) -{ - lu_stack_fini(env, cl2lu_dev(cl)); -} -EXPORT_SYMBOL(cl_stack_fini); - -static struct lu_context_key cl_key; - -struct cl_thread_info *cl_env_info(const struct lu_env *env) -{ - return lu_context_key_get(&env->le_ctx, &cl_key); -} - -/* defines cl0_key_{init,fini}() */ -LU_KEY_INIT_FINI(cl0, struct cl_thread_info); - -static void *cl_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - return cl0_key_init(ctx, key); -} - -static void cl_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - cl0_key_fini(ctx, key, data); -} - -static struct lu_context_key cl_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = cl_key_init, - .lct_fini = cl_key_fini, -}; - -static struct lu_kmem_descr cl_object_caches[] = { - { - .ckd_cache = &cl_env_kmem, - .ckd_name = "cl_env_kmem", - .ckd_size = sizeof(struct cl_env) - }, - { - .ckd_cache = NULL - } -}; - -/** - * Global initialization of cl-data. Create kmem caches, register - * lu_context_key's, etc. - * - * \see cl_global_fini() - */ -int cl_global_init(void) -{ - int result; - - cl_envs = kcalloc(num_possible_cpus(), sizeof(*cl_envs), GFP_KERNEL); - if (!cl_envs) { - result = -ENOMEM; - goto out; - } - - result = lu_kmem_init(cl_object_caches); - if (result) - goto out_envs; - - LU_CONTEXT_KEY_INIT(&cl_key); - result = lu_context_key_register(&cl_key); - if (result) - goto out_kmem; - - result = cl_env_percpu_init(); - if (result) - /* no cl_env_percpu_fini on error */ - goto out_keys; - - return 0; - -out_keys: - lu_context_key_degister(&cl_key); -out_kmem: - lu_kmem_fini(cl_object_caches); -out_envs: - kfree(cl_envs); -out: - return result; -} - -/** - * Finalization of global cl-data. Dual to cl_global_init(). - */ -void cl_global_fini(void) -{ - cl_env_percpu_fini(); - lu_context_key_degister(&cl_key); - lu_kmem_fini(cl_object_caches); - kfree(cl_envs); -} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c deleted file mode 100644 index 916cf81c5997a8483750ac6ca4fee396d386970d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/cl_page.c +++ /dev/null @@ -1,1045 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Client Lustre Page. 
- * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -#include -#include "cl_internal.h" - -static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg); - -# define PASSERT(env, page, expr) \ - do { \ - if (unlikely(!(expr))) { \ - CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ - LASSERT(0); \ - } \ - } while (0) - -# define PINVRNT(env, page, exp) \ - ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) - -/** - * Internal version of cl_page_get(). - * - * This function can be used to obtain initial reference to previously - * unreferenced cached object. It can be called only if concurrent page - * reclamation is somehow prevented, e.g., by keeping a lock on a VM page, - * associated with \a page. - * - * Use with care! Not exported. - */ -static void cl_page_get_trust(struct cl_page *page) -{ - LASSERT(atomic_read(&page->cp_ref) > 0); - atomic_inc(&page->cp_ref); -} - -/** - * Returns a slice within a page, corresponding to the given layer in the - * device stack. - * - * \see cl_lock_at() - */ -static const struct cl_page_slice * -cl_page_at_trusted(const struct cl_page *page, - const struct lu_device_type *dtype) -{ - const struct cl_page_slice *slice; - - list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { - if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) - return slice; - } - return NULL; -} - -static void cl_page_free(const struct lu_env *env, struct cl_page *page) -{ - struct cl_object *obj = page->cp_obj; - - PASSERT(env, page, list_empty(&page->cp_batch)); - PASSERT(env, page, !page->cp_owner); - PASSERT(env, page, page->cp_state == CPS_FREEING); - - while (!list_empty(&page->cp_layers)) { - struct cl_page_slice *slice; - - slice = list_entry(page->cp_layers.next, - struct cl_page_slice, cpl_linkage); - list_del_init(page->cp_layers.next); - if (unlikely(slice->cpl_ops->cpo_fini)) - slice->cpl_ops->cpo_fini(env, slice); - } - lu_object_ref_del_at(&obj->co_lu, &page->cp_obj_ref, "cl_page", page); - cl_object_put(env, obj); - lu_ref_fini(&page->cp_reference); - kfree(page); -} - -/** - * Helper function updating page state. This is the only place in the code - * where cl_page::cp_state field is mutated. - */ -static inline void cl_page_state_set_trust(struct cl_page *page, - enum cl_page_state state) -{ - /* bypass const. 
*/ - *(enum cl_page_state *)&page->cp_state = state; -} - -struct cl_page *cl_page_alloc(const struct lu_env *env, - struct cl_object *o, pgoff_t ind, - struct page *vmpage, - enum cl_page_type type) -{ - struct cl_page *page; - struct lu_object_header *head; - - page = kzalloc(cl_object_header(o)->coh_page_bufsize, GFP_NOFS); - if (page) { - int result = 0; - - atomic_set(&page->cp_ref, 1); - page->cp_obj = o; - cl_object_get(o); - lu_object_ref_add_at(&o->co_lu, &page->cp_obj_ref, "cl_page", - page); - page->cp_vmpage = vmpage; - cl_page_state_set_trust(page, CPS_CACHED); - page->cp_type = type; - INIT_LIST_HEAD(&page->cp_layers); - INIT_LIST_HEAD(&page->cp_batch); - lu_ref_init(&page->cp_reference); - head = o->co_lu.lo_header; - list_for_each_entry(o, &head->loh_layers, co_lu.lo_linkage) { - if (o->co_ops->coo_page_init) { - result = o->co_ops->coo_page_init(env, o, page, - ind); - if (result != 0) { - cl_page_delete0(env, page); - cl_page_free(env, page); - page = ERR_PTR(result); - break; - } - } - } - } else { - page = ERR_PTR(-ENOMEM); - } - return page; -} - -/** - * Returns a cl_page with index \a idx at the object \a o, and associated with - * the VM page \a vmpage. - * - * This is the main entry point into the cl_page caching interface. First, a - * cache (implemented as a per-object radix tree) is consulted. If page is - * found there, it is returned immediately. Otherwise new page is allocated - * and returned. In any case, additional reference to page is acquired. - * - * \see cl_object_find(), cl_lock_find() - */ -struct cl_page *cl_page_find(const struct lu_env *env, - struct cl_object *o, - pgoff_t idx, struct page *vmpage, - enum cl_page_type type) -{ - struct cl_page *page = NULL; - struct cl_object_header *hdr; - - LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); - might_sleep(); - - hdr = cl_object_header(o); - - CDEBUG(D_PAGE, "%lu@" DFID " %p %lx %d\n", - idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); - /* fast path. */ - if (type == CPT_CACHEABLE) { - /* - * vmpage lock is used to protect the child/parent - * relationship - */ - LASSERT(PageLocked(vmpage)); - /* - * cl_vmpage_page() can be called here without any locks as - * - * - "vmpage" is locked (which prevents ->private from - * concurrent updates), and - * - * - "o" cannot be destroyed while current thread holds a - * reference on it. - */ - page = cl_vmpage_page(vmpage, o); - - if (page) - return page; - } - - /* allocate and initialize cl_page */ - page = cl_page_alloc(env, o, idx, vmpage, type); - return page; -} -EXPORT_SYMBOL(cl_page_find); - -static inline int cl_page_invariant(const struct cl_page *pg) -{ - return cl_page_in_use_noref(pg); -} - -static void cl_page_state_set0(const struct lu_env *env, - struct cl_page *page, enum cl_page_state state) -{ - enum cl_page_state old; - - /* - * Matrix of allowed state transitions [old][new], for sanity - * checking. 
- */ - static const int allowed_transitions[CPS_NR][CPS_NR] = { - [CPS_CACHED] = { - [CPS_CACHED] = 0, - [CPS_OWNED] = 1, /* io finds existing cached page */ - [CPS_PAGEIN] = 0, - [CPS_PAGEOUT] = 1, /* write-out from the cache */ - [CPS_FREEING] = 1, /* eviction on the memory pressure */ - }, - [CPS_OWNED] = { - [CPS_CACHED] = 1, /* release to the cache */ - [CPS_OWNED] = 0, - [CPS_PAGEIN] = 1, /* start read immediately */ - [CPS_PAGEOUT] = 1, /* start write immediately */ - [CPS_FREEING] = 1, /* lock invalidation or truncate */ - }, - [CPS_PAGEIN] = { - [CPS_CACHED] = 1, /* io completion */ - [CPS_OWNED] = 0, - [CPS_PAGEIN] = 0, - [CPS_PAGEOUT] = 0, - [CPS_FREEING] = 0, - }, - [CPS_PAGEOUT] = { - [CPS_CACHED] = 1, /* io completion */ - [CPS_OWNED] = 0, - [CPS_PAGEIN] = 0, - [CPS_PAGEOUT] = 0, - [CPS_FREEING] = 0, - }, - [CPS_FREEING] = { - [CPS_CACHED] = 0, - [CPS_OWNED] = 0, - [CPS_PAGEIN] = 0, - [CPS_PAGEOUT] = 0, - [CPS_FREEING] = 0, - } - }; - - old = page->cp_state; - PASSERT(env, page, allowed_transitions[old][state]); - CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); - PASSERT(env, page, page->cp_state == old); - PASSERT(env, page, equi(state == CPS_OWNED, page->cp_owner)); - cl_page_state_set_trust(page, state); -} - -static void cl_page_state_set(const struct lu_env *env, - struct cl_page *page, enum cl_page_state state) -{ - cl_page_state_set0(env, page, state); -} - -/** - * Acquires an additional reference to a page. - * - * This can be called only by caller already possessing a reference to \a - * page. - * - * \see cl_object_get(), cl_lock_get(). - */ -void cl_page_get(struct cl_page *page) -{ - cl_page_get_trust(page); -} -EXPORT_SYMBOL(cl_page_get); - -/** - * Releases a reference to a page. - * - * When last reference is released, page is returned to the cache, unless it - * is in cl_page_state::CPS_FREEING state, in which case it is immediately - * destroyed. - * - * \see cl_object_put(), cl_lock_put(). - */ -void cl_page_put(const struct lu_env *env, struct cl_page *page) -{ - CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", - atomic_read(&page->cp_ref)); - - if (atomic_dec_and_test(&page->cp_ref)) { - LASSERT(page->cp_state == CPS_FREEING); - - LASSERT(atomic_read(&page->cp_ref) == 0); - PASSERT(env, page, !page->cp_owner); - PASSERT(env, page, list_empty(&page->cp_batch)); - /* - * Page is no longer reachable by other threads. Tear - * it down. - */ - cl_page_free(env, page); - } -} -EXPORT_SYMBOL(cl_page_put); - -/** - * Returns a cl_page associated with a VM page, and given cl_object. - */ -struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) -{ - struct cl_page *page; - - LASSERT(PageLocked(vmpage)); - - /* - * NOTE: absence of races and liveness of data are guaranteed by page - * lock on a "vmpage". That works because object destruction has - * bottom-to-top pass. - */ - - page = (struct cl_page *)vmpage->private; - if (page) { - cl_page_get_trust(page); - LASSERT(page->cp_type == CPT_CACHEABLE); - } - return page; -} -EXPORT_SYMBOL(cl_vmpage_page); - -const struct cl_page_slice *cl_page_at(const struct cl_page *page, - const struct lu_device_type *dtype) -{ - return cl_page_at_trusted(page, dtype); -} -EXPORT_SYMBOL(cl_page_at); - -#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) - -#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) 
\ -({ \ - const struct lu_env *__env = (_env); \ - struct cl_page *__page = (_page); \ - const struct cl_page_slice *__scan; \ - int __result; \ - ptrdiff_t __op = (_op); \ - int (*__method)_proto; \ - \ - __result = 0; \ - list_for_each_entry(__scan, &__page->cp_layers, cpl_linkage) { \ - __method = *(void **)((char *)__scan->cpl_ops + __op); \ - if (__method) { \ - __result = (*__method)(__env, __scan, ## __VA_ARGS__); \ - if (__result != 0) \ - break; \ - } \ - } \ - if (__result > 0) \ - __result = 0; \ - __result; \ -}) - -#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \ -do { \ - const struct lu_env *__env = (_env); \ - struct cl_page *__page = (_page); \ - const struct cl_page_slice *__scan; \ - ptrdiff_t __op = (_op); \ - void (*__method)_proto; \ - \ - list_for_each_entry(__scan, &__page->cp_layers, cpl_linkage) { \ - __method = *(void **)((char *)__scan->cpl_ops + __op); \ - if (__method) \ - (*__method)(__env, __scan, ## __VA_ARGS__); \ - } \ -} while (0) - -#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \ -do { \ - const struct lu_env *__env = (_env); \ - struct cl_page *__page = (_page); \ - const struct cl_page_slice *__scan; \ - ptrdiff_t __op = (_op); \ - void (*__method)_proto; \ - \ - list_for_each_entry_reverse(__scan, &__page->cp_layers, cpl_linkage) { \ - __method = *(void **)((char *)__scan->cpl_ops + __op); \ - if (__method) \ - (*__method)(__env, __scan, ## __VA_ARGS__); \ - } \ -} while (0) - -static int cl_page_invoke(const struct lu_env *env, - struct cl_io *io, struct cl_page *page, ptrdiff_t op) - -{ - PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); - return CL_PAGE_INVOKE(env, page, op, - (const struct lu_env *, - const struct cl_page_slice *, struct cl_io *), - io); -} - -static void cl_page_invoid(const struct lu_env *env, - struct cl_io *io, struct cl_page *page, ptrdiff_t op) - -{ - PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); - CL_PAGE_INVOID(env, page, op, - (const struct lu_env *, - const struct cl_page_slice *, struct cl_io *), io); -} - -static void cl_page_owner_clear(struct cl_page *page) -{ - if (page->cp_owner) { - LASSERT(page->cp_owner->ci_owned_nr > 0); - page->cp_owner->ci_owned_nr--; - page->cp_owner = NULL; - } -} - -static void cl_page_owner_set(struct cl_page *page) -{ - page->cp_owner->ci_owned_nr++; -} - -void cl_page_disown0(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg) -{ - enum cl_page_state state; - - state = pg->cp_state; - PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); - PINVRNT(env, pg, cl_page_invariant(pg) || state == CPS_FREEING); - cl_page_owner_clear(pg); - - if (state == CPS_OWNED) - cl_page_state_set(env, pg, CPS_CACHED); - /* - * Completion call-backs are executed in the bottom-up order, so that - * uppermost layer (llite), responsible for VFS/VM interaction runs - * last and can release locks safely. - */ - CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), - (const struct lu_env *, - const struct cl_page_slice *, struct cl_io *), - io); -} - -/** - * returns true, iff page is owned by the given io. - */ -int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) -{ - struct cl_io *top = cl_io_top((struct cl_io *)io); - - LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); - return pg->cp_state == CPS_OWNED && pg->cp_owner == top; -} -EXPORT_SYMBOL(cl_page_is_owned); - -/** - * Try to own a page by IO. 
- * - * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it - * into cl_page_state::CPS_OWNED state. - * - * \pre !cl_page_is_owned(pg, io) - * \post result == 0 iff cl_page_is_owned(pg, io) - * - * \retval 0 success - * - * \retval -ve failure, e.g., page was destroyed (and landed in - * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). - * or, page was owned by another thread, or in IO. - * - * \see cl_page_disown() - * \see cl_page_operations::cpo_own() - * \see cl_page_own_try() - * \see cl_page_own - */ -static int cl_page_own0(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg, int nonblock) -{ - int result; - - PINVRNT(env, pg, !cl_page_is_owned(pg, io)); - - io = cl_io_top(io); - - if (pg->cp_state == CPS_FREEING) { - result = -ENOENT; - } else { - result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own), - (const struct lu_env *, - const struct cl_page_slice *, - struct cl_io *, int), - io, nonblock); - if (result == 0) { - PASSERT(env, pg, !pg->cp_owner); - pg->cp_owner = cl_io_top(io); - cl_page_owner_set(pg); - if (pg->cp_state != CPS_FREEING) { - cl_page_state_set(env, pg, CPS_OWNED); - } else { - cl_page_disown0(env, io, pg); - result = -ENOENT; - } - } - } - PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); - return result; -} - -/** - * Own a page, might be blocked. - * - * \see cl_page_own0() - */ -int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) -{ - return cl_page_own0(env, io, pg, 0); -} -EXPORT_SYMBOL(cl_page_own); - -/** - * Nonblock version of cl_page_own(). - * - * \see cl_page_own0() - */ -int cl_page_own_try(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg) -{ - return cl_page_own0(env, io, pg, 1); -} -EXPORT_SYMBOL(cl_page_own_try); - -/** - * Assume page ownership. - * - * Called when page is already locked by the hosting VM. - * - * \pre !cl_page_is_owned(pg, io) - * \post cl_page_is_owned(pg, io) - * - * \see cl_page_operations::cpo_assume() - */ -void cl_page_assume(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg) -{ - PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); - - io = cl_io_top(io); - - cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); - PASSERT(env, pg, !pg->cp_owner); - pg->cp_owner = cl_io_top(io); - cl_page_owner_set(pg); - cl_page_state_set(env, pg, CPS_OWNED); -} -EXPORT_SYMBOL(cl_page_assume); - -/** - * Releases page ownership without unlocking the page. - * - * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the - * underlying VM page (as VM is supposed to do this itself). - * - * \pre cl_page_is_owned(pg, io) - * \post !cl_page_is_owned(pg, io) - * - * \see cl_page_assume() - */ -void cl_page_unassume(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg) -{ - PINVRNT(env, pg, cl_page_is_owned(pg, io)); - PINVRNT(env, pg, cl_page_invariant(pg)); - - io = cl_io_top(io); - cl_page_owner_clear(pg); - cl_page_state_set(env, pg, CPS_CACHED); - CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume), - (const struct lu_env *, - const struct cl_page_slice *, struct cl_io *), - io); -} -EXPORT_SYMBOL(cl_page_unassume); - -/** - * Releases page ownership. - * - * Moves page into cl_page_state::CPS_CACHED. 
- * - * \pre cl_page_is_owned(pg, io) - * \post !cl_page_is_owned(pg, io) - * - * \see cl_page_own() - * \see cl_page_operations::cpo_disown() - */ -void cl_page_disown(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg) -{ - PINVRNT(env, pg, cl_page_is_owned(pg, io) || - pg->cp_state == CPS_FREEING); - - io = cl_io_top(io); - cl_page_disown0(env, io, pg); -} -EXPORT_SYMBOL(cl_page_disown); - -/** - * Called when page is to be removed from the object, e.g., as a result of - * truncate. - * - * Calls cl_page_operations::cpo_discard() top-to-bottom. - * - * \pre cl_page_is_owned(pg, io) - * - * \see cl_page_operations::cpo_discard() - */ -void cl_page_discard(const struct lu_env *env, - struct cl_io *io, struct cl_page *pg) -{ - PINVRNT(env, pg, cl_page_is_owned(pg, io)); - PINVRNT(env, pg, cl_page_invariant(pg)); - - cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard)); -} -EXPORT_SYMBOL(cl_page_discard); - -/** - * Version of cl_page_delete() that can be called for not fully constructed - * pages, e.g,. in a error handling cl_page_find()->cl_page_delete0() - * path. Doesn't check page invariant. - */ -static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg) -{ - PASSERT(env, pg, pg->cp_state != CPS_FREEING); - - /* - * Sever all ways to obtain new pointers to @pg. - */ - cl_page_owner_clear(pg); - - cl_page_state_set0(env, pg, CPS_FREEING); - - CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_delete), - (const struct lu_env *, - const struct cl_page_slice *)); -} - -/** - * Called when a decision is made to throw page out of memory. - * - * Notifies all layers about page destruction by calling - * cl_page_operations::cpo_delete() method top-to-bottom. - * - * Moves page into cl_page_state::CPS_FREEING state (this is the only place - * where transition to this state happens). - * - * Eliminates all venues through which new references to the page can be - * obtained: - * - * - removes page from the radix trees, - * - * - breaks linkage from VM page to cl_page. - * - * Once page reaches cl_page_state::CPS_FREEING, all remaining references will - * drain after some time, at which point page will be recycled. - * - * \pre VM page is locked - * \post pg->cp_state == CPS_FREEING - * - * \see cl_page_operations::cpo_delete() - */ -void cl_page_delete(const struct lu_env *env, struct cl_page *pg) -{ - PINVRNT(env, pg, cl_page_invariant(pg)); - cl_page_delete0(env, pg); -} -EXPORT_SYMBOL(cl_page_delete); - -/** - * Marks page up-to-date. - * - * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The - * layer responsible for VM interaction has to mark/clear page as up-to-date - * by the \a uptodate argument. - * - * \see cl_page_operations::cpo_export() - */ -void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) -{ - PINVRNT(env, pg, cl_page_invariant(pg)); - CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export), - (const struct lu_env *, - const struct cl_page_slice *, int), uptodate); -} -EXPORT_SYMBOL(cl_page_export); - -/** - * Returns true, iff \a pg is VM locked in a suitable sense by the calling - * thread. 
- */ -int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) -{ - int result; - const struct cl_page_slice *slice; - - slice = container_of(pg->cp_layers.next, - const struct cl_page_slice, cpl_linkage); - PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked); - /* - * Call ->cpo_is_vmlocked() directly instead of going through - * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by - * cl_page_invariant(). - */ - result = slice->cpl_ops->cpo_is_vmlocked(env, slice); - PASSERT(env, pg, result == -EBUSY || result == -ENODATA); - return result == -EBUSY; -} -EXPORT_SYMBOL(cl_page_is_vmlocked); - -static enum cl_page_state cl_req_type_state(enum cl_req_type crt) -{ - return crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN; -} - -static void cl_page_io_start(const struct lu_env *env, - struct cl_page *pg, enum cl_req_type crt) -{ - /* - * Page is queued for IO, change its state. - */ - cl_page_owner_clear(pg); - cl_page_state_set(env, pg, cl_req_type_state(crt)); -} - -/** - * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is - * called top-to-bottom. Every layer either agrees to submit this page (by - * returning 0), or requests to omit this page (by returning -EALREADY). Layer - * handling interactions with the VM also has to inform VM that page is under - * transfer now. - */ -int cl_page_prep(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg, enum cl_req_type crt) -{ - int result; - - PINVRNT(env, pg, cl_page_is_owned(pg, io)); - PINVRNT(env, pg, cl_page_invariant(pg)); - PINVRNT(env, pg, crt < CRT_NR); - - /* - * XXX this has to be called bottom-to-top, so that llite can set up - * PG_writeback without risking other layers deciding to skip this - * page. - */ - if (crt >= CRT_NR) - return -EINVAL; - result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep)); - if (result == 0) - cl_page_io_start(env, pg, crt); - - CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); - return result; -} -EXPORT_SYMBOL(cl_page_prep); - -/** - * Notify layers about transfer completion. - * - * Invoked by transfer sub-system (which is a part of osc) to notify layers - * that a transfer, of which this page is a part of has completed. - * - * Completion call-backs are executed in the bottom-up order, so that - * uppermost layer (llite), responsible for the VFS/VM interaction runs last - * and can release locks safely. - * - * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT - * \post pg->cp_state == CPS_CACHED - * - * \see cl_page_operations::cpo_completion() - */ -void cl_page_completion(const struct lu_env *env, - struct cl_page *pg, enum cl_req_type crt, int ioret) -{ - struct cl_sync_io *anchor = pg->cp_sync_io; - - PASSERT(env, pg, crt < CRT_NR); - PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); - - CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); - - cl_page_state_set(env, pg, CPS_CACHED); - if (crt >= CRT_NR) - return; - CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion), - (const struct lu_env *, - const struct cl_page_slice *, int), ioret); - if (anchor) { - LASSERT(pg->cp_sync_io == anchor); - pg->cp_sync_io = NULL; - cl_sync_io_note(env, anchor, ioret); - } -} -EXPORT_SYMBOL(cl_page_completion); - -/** - * Notify layers that transfer formation engine decided to yank this page from - * the cache and to make it a part of a transfer. 
- * - * \pre pg->cp_state == CPS_CACHED - * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT - * - * \see cl_page_operations::cpo_make_ready() - */ -int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, - enum cl_req_type crt) -{ - int result; - - PINVRNT(env, pg, crt < CRT_NR); - - if (crt >= CRT_NR) - return -EINVAL; - result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready), - (const struct lu_env *, - const struct cl_page_slice *)); - if (result == 0) { - PASSERT(env, pg, pg->cp_state == CPS_CACHED); - cl_page_io_start(env, pg, crt); - } - CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); - return result; -} -EXPORT_SYMBOL(cl_page_make_ready); - -/** - * Called if a pge is being written back by kernel's intention. - * - * \pre cl_page_is_owned(pg, io) - * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) - * - * \see cl_page_operations::cpo_flush() - */ -int cl_page_flush(const struct lu_env *env, struct cl_io *io, - struct cl_page *pg) -{ - int result; - - PINVRNT(env, pg, cl_page_is_owned(pg, io)); - PINVRNT(env, pg, cl_page_invariant(pg)); - - result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush)); - - CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); - return result; -} -EXPORT_SYMBOL(cl_page_flush); - -/** - * Tells transfer engine that only part of a page is to be transmitted. - * - * \see cl_page_operations::cpo_clip() - */ -void cl_page_clip(const struct lu_env *env, struct cl_page *pg, - int from, int to) -{ - PINVRNT(env, pg, cl_page_invariant(pg)); - - CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); - CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip), - (const struct lu_env *, - const struct cl_page_slice *, int, int), - from, to); -} -EXPORT_SYMBOL(cl_page_clip); - -/** - * Prints human readable representation of \a pg to the \a f. - */ -void cl_page_header_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct cl_page *pg) -{ - (*printer)(env, cookie, - "page@%p[%d %p %d %d %p]\n", - pg, atomic_read(&pg->cp_ref), pg->cp_obj, - pg->cp_state, pg->cp_type, - pg->cp_owner); -} -EXPORT_SYMBOL(cl_page_header_print); - -/** - * Prints human readable representation of \a pg to the \a f. - */ -void cl_page_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct cl_page *pg) -{ - cl_page_header_print(env, cookie, printer, pg); - CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print), - (const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t p), cookie, printer); - (*printer)(env, cookie, "end page@%p\n", pg); -} -EXPORT_SYMBOL(cl_page_print); - -/** - * Cancel a page which is still in a transfer. - */ -int cl_page_cancel(const struct lu_env *env, struct cl_page *page) -{ - return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel), - (const struct lu_env *, - const struct cl_page_slice *)); -} - -/** - * Converts a byte offset within object \a obj into a page index. - */ -loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) -{ - /* - * XXX for now. - */ - return (loff_t)idx << PAGE_SHIFT; -} -EXPORT_SYMBOL(cl_offset); - -/** - * Converts a page index into a byte offset within object \a obj. - */ -pgoff_t cl_index(const struct cl_object *obj, loff_t offset) -{ - /* - * XXX for now. - */ - return offset >> PAGE_SHIFT; -} -EXPORT_SYMBOL(cl_index); - -size_t cl_page_size(const struct cl_object *obj) -{ - return 1UL << PAGE_SHIFT; -} -EXPORT_SYMBOL(cl_page_size); - -/** - * Adds page slice to the compound page. 
- * - * This is called by cl_object_operations::coo_page_init() methods to add a - * per-layer state to the page. New state is added at the end of - * cl_page::cp_layers list, that is, it is at the bottom of the stack. - * - * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() - */ -void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, - struct cl_object *obj, pgoff_t index, - const struct cl_page_operations *ops) -{ - list_add_tail(&slice->cpl_linkage, &page->cp_layers); - slice->cpl_obj = obj; - slice->cpl_index = index; - slice->cpl_ops = ops; - slice->cpl_page = page; -} -EXPORT_SYMBOL(cl_page_slice_add); - -/** - * Allocate and initialize cl_cache, called by ll_init_sbi(). - */ -struct cl_client_cache *cl_cache_init(unsigned long lru_page_max) -{ - struct cl_client_cache *cache = NULL; - - cache = kzalloc(sizeof(*cache), GFP_KERNEL); - if (!cache) - return NULL; - - /* Initialize cache data */ - atomic_set(&cache->ccc_users, 1); - cache->ccc_lru_max = lru_page_max; - atomic_long_set(&cache->ccc_lru_left, lru_page_max); - spin_lock_init(&cache->ccc_lru_lock); - INIT_LIST_HEAD(&cache->ccc_lru); - - atomic_long_set(&cache->ccc_unstable_nr, 0); - init_waitqueue_head(&cache->ccc_unstable_waitq); - - return cache; -} -EXPORT_SYMBOL(cl_cache_init); - -/** - * Increase cl_cache refcount - */ -void cl_cache_incref(struct cl_client_cache *cache) -{ - atomic_inc(&cache->ccc_users); -} -EXPORT_SYMBOL(cl_cache_incref); - -/** - * Decrease cl_cache refcount and free the cache if refcount=0. - * Since llite, lov and osc all hold cl_cache refcount, - * the free will not cause race. (LU-6173) - */ -void cl_cache_decref(struct cl_client_cache *cache) -{ - if (atomic_dec_and_test(&cache->ccc_users)) - kfree(cache); -} -EXPORT_SYMBOL(cl_cache_decref); diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c deleted file mode 100644 index d6c46858941b8f22e53380ae07b7ab7ef826e11b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/class_obd.c +++ /dev/null @@ -1,544 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "llog_internal.h" - -struct obd_device *obd_devs[MAX_OBD_DEVICES]; -struct list_head obd_types; -DEFINE_RWLOCK(obd_dev_lock); - -/* The following are visible and mutable through /sys/fs/lustre. */ -unsigned int obd_debug_peer_on_timeout; -EXPORT_SYMBOL(obd_debug_peer_on_timeout); -unsigned int obd_dump_on_timeout; -EXPORT_SYMBOL(obd_dump_on_timeout); -unsigned int obd_dump_on_eviction; -EXPORT_SYMBOL(obd_dump_on_eviction); -unsigned long obd_max_dirty_pages; -EXPORT_SYMBOL(obd_max_dirty_pages); -atomic_long_t obd_dirty_pages; -EXPORT_SYMBOL(obd_dirty_pages); -unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ -EXPORT_SYMBOL(obd_timeout); -unsigned int obd_timeout_set; -EXPORT_SYMBOL(obd_timeout_set); -/* Adaptive timeout defs here instead of ptlrpc module for /sys/fs/ access */ -unsigned int at_min; -EXPORT_SYMBOL(at_min); -unsigned int at_max = 600; -EXPORT_SYMBOL(at_max); -unsigned int at_history = 600; -EXPORT_SYMBOL(at_history); -int at_early_margin = 5; -EXPORT_SYMBOL(at_early_margin); -int at_extra = 30; -EXPORT_SYMBOL(at_extra); - -atomic_long_t obd_dirty_transit_pages; -EXPORT_SYMBOL(obd_dirty_transit_pages); - -char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; -char obd_jobid_node[LUSTRE_JOBID_SIZE + 1]; - -/* Get jobid of current process from stored variable or calculate - * it from pid and user_id. - * - * Historically this was also done by reading the environment variable - * stored in between the "env_start" & "env_end" of task struct. - * This is now deprecated. - */ -int lustre_get_jobid(char *jobid) -{ - memset(jobid, 0, LUSTRE_JOBID_SIZE); - /* Jobstats isn't enabled */ - if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) - return 0; - - /* Use process name + fsuid as jobid */ - if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { - snprintf(jobid, LUSTRE_JOBID_SIZE, "%s.%u", - current->comm, - from_kuid(&init_user_ns, current_fsuid())); - return 0; - } - - /* Whole node dedicated to single job */ - if (strcmp(obd_jobid_var, JOBSTATS_NODELOCAL) == 0) { - strcpy(jobid, obd_jobid_node); - return 0; - } - - return -ENOENT; -} -EXPORT_SYMBOL(lustre_get_jobid); - -static int class_resolve_dev_name(__u32 len, const char *name) -{ - int rc; - int dev; - - if (!len || !name) { - CERROR("No name passed,!\n"); - rc = -EINVAL; - goto out; - } - if (name[len - 1] != 0) { - CERROR("Name not nul terminated!\n"); - rc = -EINVAL; - goto out; - } - - CDEBUG(D_IOCTL, "device name %s\n", name); - dev = class_name2dev(name); - if (dev == -1) { - CDEBUG(D_IOCTL, "No device for name %s!\n", name); - rc = -EINVAL; - goto out; - } - - CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); - rc = dev; - -out: - return rc; -} - -int class_handle_ioctl(unsigned int cmd, unsigned long arg) -{ - char *buf = NULL; - struct obd_ioctl_data *data; - struct libcfs_debug_ioctl_data *debug_data; - struct obd_device *obd = NULL; - int err = 0, len = 0; - - /* only for debugging */ - if (cmd == LIBCFS_IOC_DEBUG_MASK) { - debug_data = (struct libcfs_debug_ioctl_data *)arg; - libcfs_subsystem_debug = debug_data->subs; - libcfs_debug = debug_data->debug; - return 0; - } - - CDEBUG(D_IOCTL, "cmd = %x\n", cmd); - if (obd_ioctl_getdata(&buf, &len, (void __user *)arg)) { - CERROR("OBD ioctl: data error\n"); - return -EINVAL; - } - data = (struct obd_ioctl_data *)buf; - - switch (cmd) { - case 
OBD_IOC_PROCESS_CFG: { - struct lustre_cfg *lcfg; - - if (!data->ioc_plen1 || !data->ioc_pbuf1) { - CERROR("No config buffer passed!\n"); - err = -EINVAL; - goto out; - } - lcfg = kzalloc(data->ioc_plen1, GFP_NOFS); - if (!lcfg) { - err = -ENOMEM; - goto out; - } - if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1)) - err = -EFAULT; - if (!err) - err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); - if (!err) - err = class_process_config(lcfg); - - kfree(lcfg); - goto out; - } - - case OBD_GET_VERSION: - if (!data->ioc_inlbuf1) { - CERROR("No buffer passed in ioctl\n"); - err = -EINVAL; - goto out; - } - - if (strlen(LUSTRE_VERSION_STRING) + 1 > data->ioc_inllen1) { - CERROR("ioctl buffer too small to hold version\n"); - err = -EINVAL; - goto out; - } - - memcpy(data->ioc_bulk, LUSTRE_VERSION_STRING, - strlen(LUSTRE_VERSION_STRING) + 1); - - if (copy_to_user((void __user *)arg, data, len)) - err = -EFAULT; - goto out; - - case OBD_IOC_NAME2DEV: { - /* Resolve a device name. This does not change the - * currently selected device. - */ - int dev; - - dev = class_resolve_dev_name(data->ioc_inllen1, - data->ioc_inlbuf1); - data->ioc_dev = dev; - if (dev < 0) { - err = -EINVAL; - goto out; - } - - if (copy_to_user((void __user *)arg, data, sizeof(*data))) - err = -EFAULT; - goto out; - } - - case OBD_IOC_UUID2DEV: { - /* Resolve a device uuid. This does not change the - * currently selected device. - */ - int dev; - struct obd_uuid uuid; - - if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { - CERROR("No UUID passed!\n"); - err = -EINVAL; - goto out; - } - if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { - CERROR("UUID not NUL terminated!\n"); - err = -EINVAL; - goto out; - } - - CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); - obd_str2uuid(&uuid, data->ioc_inlbuf1); - dev = class_uuid2dev(&uuid); - data->ioc_dev = dev; - if (dev == -1) { - CDEBUG(D_IOCTL, "No device for UUID %s!\n", - data->ioc_inlbuf1); - err = -EINVAL; - goto out; - } - - CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, - dev); - - if (copy_to_user((void __user *)arg, data, sizeof(*data))) - err = -EFAULT; - goto out; - } - - case OBD_IOC_GETDEVICE: { - int index = data->ioc_count; - char *status, *str; - - if (!data->ioc_inlbuf1) { - CERROR("No buffer passed in ioctl\n"); - err = -EINVAL; - goto out; - } - if (data->ioc_inllen1 < 128) { - CERROR("ioctl buffer too small to hold version\n"); - err = -EINVAL; - goto out; - } - - obd = class_num2obd(index); - if (!obd) { - err = -ENOENT; - goto out; - } - - if (obd->obd_stopping) - status = "ST"; - else if (obd->obd_set_up) - status = "UP"; - else if (obd->obd_attached) - status = "AT"; - else - status = "--"; - str = (char *)data->ioc_bulk; - snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", - (int)index, status, obd->obd_type->typ_name, - obd->obd_name, obd->obd_uuid.uuid, - atomic_read(&obd->obd_refcount)); - - if (copy_to_user((void __user *)arg, data, len)) - err = -EFAULT; - goto out; - } - } - - if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { - if (data->ioc_inllen4 <= 0 || !data->ioc_inlbuf4) { - err = -EINVAL; - goto out; - } - if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) { - err = -EINVAL; - goto out; - } - obd = class_name2obd(data->ioc_inlbuf4); - } else if (data->ioc_dev < class_devno_max()) { - obd = class_num2obd(data->ioc_dev); - } else { - CERROR("OBD ioctl: No device\n"); - err = -EINVAL; - goto out; - } - - if (!obd) { - CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); - err = -EINVAL; - goto out; - } - 
LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - - if (!obd->obd_set_up || obd->obd_stopping) { - CERROR("OBD ioctl: device not setup %d\n", data->ioc_dev); - err = -EINVAL; - goto out; - } - - switch (cmd) { - case OBD_IOC_NO_TRANSNO: { - if (!obd->obd_attached) { - CERROR("Device %d not attached\n", obd->obd_minor); - err = -ENODEV; - goto out; - } - CDEBUG(D_HA, "%s: disabling committed-transno notification\n", - obd->obd_name); - obd->obd_no_transno = 1; - err = 0; - goto out; - } - - default: { - err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); - if (err) - goto out; - - if (copy_to_user((void __user *)arg, data, len)) - err = -EFAULT; - goto out; - } - } - - out: - kvfree(buf); - return err; -} /* class_handle_ioctl */ - -#define OBD_INIT_CHECK -static int obd_init_checks(void) -{ - __u64 u64val, div64val; - char buf[64]; - int len, ret = 0; - - CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", "%llu", "%lld", - "%#llx"); - - CDEBUG(D_INFO, "OBD_OBJECT_EOF = %#llx\n", (__u64)OBD_OBJECT_EOF); - - u64val = OBD_OBJECT_EOF; - CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); - if (u64val != OBD_OBJECT_EOF) { - CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", - u64val, (int)sizeof(u64val)); - ret = -EINVAL; - } - len = snprintf(buf, sizeof(buf), "%#llx", u64val); - if (len != 18) { - CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); - ret = -EINVAL; - } - - div64val = OBD_OBJECT_EOF; - CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = %#llx\n", u64val); - if (u64val != OBD_OBJECT_EOF) { - CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", - u64val, (int)sizeof(u64val)); - ret = -EOVERFLOW; - } - if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { - CERROR("__u64 %#llx(%d) != 0xffffffffffffffff\n", - u64val, (int)sizeof(u64val)); - return -EOVERFLOW; - } - if (do_div(div64val, 256) != (u64val & 255)) { - CERROR("do_div(%#llx,256) != %llu\n", u64val, u64val & 255); - return -EOVERFLOW; - } - if (u64val >> 8 != div64val) { - CERROR("do_div(%#llx,256) %llu != %llu\n", - u64val, div64val, u64val >> 8); - return -EOVERFLOW; - } - len = snprintf(buf, sizeof(buf), "%#llx", u64val); - if (len != 18) { - CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); - ret = -EINVAL; - } - len = snprintf(buf, sizeof(buf), "%llu", u64val); - if (len != 20) { - CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len); - ret = -EINVAL; - } - len = snprintf(buf, sizeof(buf), "%lld", u64val); - if (len != 2) { - CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len); - ret = -EINVAL; - } - if ((u64val & ~PAGE_MASK) >= PAGE_SIZE) { - CWARN("mask failed: u64val %llu >= %llu\n", u64val, - (__u64)PAGE_SIZE); - ret = -EINVAL; - } - - return ret; -} - -static int __init obdclass_init(void) -{ - int i, err; - - LCONSOLE_INFO("Lustre: Build Version: " LUSTRE_VERSION_STRING "\n"); - - spin_lock_init(&obd_types_lock); - - err = libcfs_setup(); - if (err) - return err; - - obd_zombie_impexp_init(); - - err = obd_init_checks(); - if (err) - return err; - - class_init_uuidlist(); - err = class_handle_init(); - if (err) - return err; - - INIT_LIST_HEAD(&obd_types); - - err = misc_register(&obd_psdev); - if (err) { - CERROR("cannot register OBD miscdevices: err %d\n", err); - return err; - } - - /* This struct is already zeroed for us (static global) */ - for (i = 0; i < class_devno_max(); i++) - obd_devs[i] = NULL; - - /* Default the dirty page cache cap to 1/2 of system memory. - * For clients with less memory, a larger fraction is needed - * for other purposes (mostly for BGL). 
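-	 *
-	 * Worked example of this heuristic (assuming 4 KiB pages): a client
-	 * with 16 GiB of RAM gets obd_max_dirty_pages = totalram_pages / 2,
-	 * i.e. an 8 GiB dirty cap, while a client with 512 MiB of RAM or
-	 * less gets totalram_pages / 4, i.e. at most a 128 MiB cap.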
- */ - if (totalram_pages <= 512 << (20 - PAGE_SHIFT)) - obd_max_dirty_pages = totalram_pages / 4; - else - obd_max_dirty_pages = totalram_pages / 2; - - err = obd_init_caches(); - if (err) - return err; - - err = class_procfs_init(); - if (err) - return err; - - err = obd_sysctl_init(); - if (err) - return err; - - err = lu_global_init(); - if (err) - return err; - - err = cl_global_init(); - if (err != 0) - return err; - - err = llog_info_init(); - if (err) - return err; - - err = lustre_register_fs(); - - return err; -} - -static void obdclass_exit(void) -{ - lustre_unregister_fs(); - - misc_deregister(&obd_psdev); - llog_info_fini(); - cl_global_fini(); - lu_global_fini(); - - obd_cleanup_caches(); - - class_procfs_clean(); - - class_handle_cleanup(); - class_exit_uuidlist(); - obd_zombie_impexp_stop(); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Class Driver"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(obdclass_init); -module_exit(obdclass_exit); diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c deleted file mode 100644 index 2156a82a613ac893eee25b3255372df0cccb0ca5..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/debug.c +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/debug.c - * - * Helper routines for dumping data structs for debugging. 
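- *
- * A sketch of the buffer layout these helpers assume (see
- * block_debug_setup() and block_debug_check() below; widths in bytes):
- *
- *	+---------+---------+---- ... ----+---------+---------+
- *	| off (8) | id (8)  |   payload   | off (8) | id (8)  |
- *	+---------+---------+---- ... ----+---------+---------+
- *	|<------------------------ len ----------------------->|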
- */ - -#define DEBUG_SUBSYSTEM D_OTHER - -#include - -#include -#include -#include - -#define LPDS sizeof(__u64) -int block_debug_setup(void *addr, int len, __u64 off, __u64 id) -{ - LASSERT(addr); - - put_unaligned_le64(off, addr); - put_unaligned_le64(id, addr + LPDS); - addr += len - LPDS - LPDS; - put_unaligned_le64(off, addr); - put_unaligned_le64(id, addr + LPDS); - - return 0; -} -EXPORT_SYMBOL(block_debug_setup); - -int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) -{ - __u64 ne_off; - int err = 0; - - LASSERT(addr); - - ne_off = le64_to_cpu(off); - id = le64_to_cpu(id); - if (memcmp(addr, (char *)&ne_off, LPDS)) { - CDEBUG(D_ERROR, "%s: id %#llx offset %llu off: %#llx != %#llx\n", - who, id, off, *(__u64 *)addr, ne_off); - err = -EINVAL; - } - if (memcmp(addr + LPDS, (char *)&id, LPDS)) { - CDEBUG(D_ERROR, "%s: id %#llx offset %llu id: %#llx != %#llx\n", - who, id, off, *(__u64 *)(addr + LPDS), id); - err = -EINVAL; - } - - addr += end - LPDS - LPDS; - if (memcmp(addr, (char *)&ne_off, LPDS)) { - CDEBUG(D_ERROR, "%s: id %#llx offset %llu end off: %#llx != %#llx\n", - who, id, off, *(__u64 *)addr, ne_off); - err = -EINVAL; - } - if (memcmp(addr + LPDS, (char *)&id, LPDS)) { - CDEBUG(D_ERROR, "%s: id %#llx offset %llu end id: %#llx != %#llx\n", - who, id, off, *(__u64 *)(addr + LPDS), id); - err = -EINVAL; - } - - return err; -} -EXPORT_SYMBOL(block_debug_check); -#undef LPDS diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c deleted file mode 100644 index 234f383ce6d9d95b9ec56d2be69d0e904470f4ce..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/genops.c +++ /dev/null @@ -1,1480 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/obdclass/genops.c - * - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS -#include -#include -#include - -spinlock_t obd_types_lock; - -static struct kmem_cache *obd_device_cachep; -struct kmem_cache *obdo_cachep; -EXPORT_SYMBOL(obdo_cachep); -static struct kmem_cache *import_cachep; - -static struct workqueue_struct *zombie_wq; -static void obd_zombie_export_add(struct obd_export *exp); -static void obd_zombie_import_add(struct obd_import *imp); - -int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); -EXPORT_SYMBOL(ptlrpc_put_connection_superhack); - -/* - * support functions: we could use inter-module communication, but this - * is more portable to other OS's - */ -static struct obd_device *obd_device_alloc(void) -{ - struct obd_device *obd; - - obd = kmem_cache_zalloc(obd_device_cachep, GFP_NOFS); - if (obd) - obd->obd_magic = OBD_DEVICE_MAGIC; - return obd; -} - -static void obd_device_free(struct obd_device *obd) -{ - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - if (obd->obd_namespace) { - CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", - obd, obd->obd_namespace, obd->obd_force); - LBUG(); - } - lu_ref_fini(&obd->obd_reference); - kmem_cache_free(obd_device_cachep, obd); -} - -static struct obd_type *class_search_type(const char *name) -{ - struct list_head *tmp; - struct obd_type *type; - - spin_lock(&obd_types_lock); - list_for_each(tmp, &obd_types) { - type = list_entry(tmp, struct obd_type, typ_chain); - if (strcmp(type->typ_name, name) == 0) { - spin_unlock(&obd_types_lock); - return type; - } - } - spin_unlock(&obd_types_lock); - return NULL; -} - -static struct obd_type *class_get_type(const char *name) -{ - struct obd_type *type = class_search_type(name); - - if (!type) { - const char *modname = name; - - if (!request_module("%s", modname)) { - CDEBUG(D_INFO, "Loaded module '%s'\n", modname); - type = class_search_type(name); - } else { - LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", - modname); - } - } - if (type) { - spin_lock(&type->obd_type_lock); - type->typ_refcnt++; - try_module_get(type->typ_dt_ops->owner); - spin_unlock(&type->obd_type_lock); - } - return type; -} - -void class_put_type(struct obd_type *type) -{ - LASSERT(type); - spin_lock(&type->obd_type_lock); - type->typ_refcnt--; - module_put(type->typ_dt_ops->owner); - spin_unlock(&type->obd_type_lock); -} - -#define CLASS_MAX_NAME 1024 - -int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops, - const char *name, - struct lu_device_type *ldt) -{ - struct obd_type *type; - int rc; - - /* sanity check */ - LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); - - if (class_search_type(name)) { - CDEBUG(D_IOCTL, "Type %s already registered\n", name); - return -EEXIST; - } - - rc = -ENOMEM; - type = kzalloc(sizeof(*type), GFP_NOFS); - if (!type) - return rc; - - type->typ_dt_ops = kzalloc(sizeof(*type->typ_dt_ops), GFP_NOFS); - type->typ_md_ops = kzalloc(sizeof(*type->typ_md_ops), GFP_NOFS); - type->typ_name = kzalloc(strlen(name) + 1, GFP_NOFS); - - if (!type->typ_dt_ops || - !type->typ_md_ops || - !type->typ_name) - goto failed; - - *type->typ_dt_ops = *dt_ops; - /* md_ops is optional */ - if (md_ops) - *type->typ_md_ops = *md_ops; - strcpy(type->typ_name, name); - spin_lock_init(&type->obd_type_lock); - - type->typ_debugfs_entry = 
debugfs_create_dir(type->typ_name,
-						   debugfs_lustre_root);
-
-	type->typ_kobj = kobject_create_and_add(type->typ_name, lustre_kobj);
-	if (!type->typ_kobj) {
-		rc = -ENOMEM;
-		goto failed;
-	}
-
-	if (ldt) {
-		type->typ_lu = ldt;
-		rc = lu_device_type_init(ldt);
-		if (rc != 0)
-			goto failed;
-	}
-
-	spin_lock(&obd_types_lock);
-	list_add(&type->typ_chain, &obd_types);
-	spin_unlock(&obd_types_lock);
-
-	return 0;
-
- failed:
-	if (type->typ_kobj)
-		kobject_put(type->typ_kobj);
-	kfree(type->typ_name);
-	kfree(type->typ_md_ops);
-	kfree(type->typ_dt_ops);
-	kfree(type);
-	return rc;
-}
-EXPORT_SYMBOL(class_register_type);
-
-int class_unregister_type(const char *name)
-{
-	struct obd_type *type = class_search_type(name);
-
-	if (!type) {
-		CERROR("unknown obd type\n");
-		return -EINVAL;
-	}
-
-	if (type->typ_refcnt) {
-		CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
-		/* This is a bad situation, let's make the best of it */
-		/* Remove ops, but leave the name for debugging */
-		kfree(type->typ_dt_ops);
-		kfree(type->typ_md_ops);
-		return -EBUSY;
-	}
-
-	if (type->typ_kobj)
-		kobject_put(type->typ_kobj);
-
-	debugfs_remove_recursive(type->typ_debugfs_entry);
-
-	if (type->typ_lu)
-		lu_device_type_fini(type->typ_lu);
-
-	spin_lock(&obd_types_lock);
-	list_del(&type->typ_chain);
-	spin_unlock(&obd_types_lock);
-	kfree(type->typ_name);
-	kfree(type->typ_dt_ops);
-	kfree(type->typ_md_ops);
-	kfree(type);
-	return 0;
-} /* class_unregister_type */
-EXPORT_SYMBOL(class_unregister_type);
-
-/**
- * Create a new obd device.
- *
- * Find an empty slot in ::obd_devs[], create a new obd device in it.
- *
- * \param[in] type_name obd device type string.
- * \param[in] name      obd device name.
- *
- * \retval ERR_PTR(-errno) if creation fails, otherwise return the obd
- * device pointer created.
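- *
- * Hedged usage sketch (the type and name strings are examples only):
- *
- *	struct obd_device *obd = class_newdev("osc", "foo-OST0000-osc");
- *
- *	if (IS_ERR(obd))
- *		return PTR_ERR(obd);
- *	... attach/setup, then class_release_dev(obd) on teardown ...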
- */ -struct obd_device *class_newdev(const char *type_name, const char *name) -{ - struct obd_device *result = NULL; - struct obd_device *newdev; - struct obd_type *type = NULL; - int i; - int new_obd_minor = 0; - - if (strlen(name) >= MAX_OBD_NAME) { - CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); - return ERR_PTR(-EINVAL); - } - - type = class_get_type(type_name); - if (!type) { - CERROR("OBD: unknown type: %s\n", type_name); - return ERR_PTR(-ENODEV); - } - - newdev = obd_device_alloc(); - if (!newdev) { - result = ERR_PTR(-ENOMEM); - goto out_type; - } - - LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); - - write_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd && (strcmp(name, obd->obd_name) == 0)) { - CERROR("Device %s already exists at %d, won't add\n", - name, i); - if (result) { - LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, - "%p obd_magic %08x != %08x\n", result, - result->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(result->obd_minor == new_obd_minor, - "%p obd_minor %d != %d\n", result, - result->obd_minor, new_obd_minor); - - obd_devs[result->obd_minor] = NULL; - result->obd_name[0] = '\0'; - } - result = ERR_PTR(-EEXIST); - break; - } - if (!result && !obd) { - result = newdev; - result->obd_minor = i; - new_obd_minor = i; - result->obd_type = type; - strncpy(result->obd_name, name, - sizeof(result->obd_name) - 1); - obd_devs[i] = result; - } - } - write_unlock(&obd_dev_lock); - - if (!result && i >= class_devno_max()) { - CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", - class_devno_max()); - result = ERR_PTR(-EOVERFLOW); - goto out; - } - - if (IS_ERR(result)) - goto out; - - CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", - result->obd_name, result); - - return result; -out: - obd_device_free(newdev); -out_type: - class_put_type(type); - return result; -} - -void class_release_dev(struct obd_device *obd) -{ - struct obd_type *obd_type = obd->obd_type; - - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", - obd, obd->obd_minor, obd_devs[obd->obd_minor]); - LASSERT(obd_type); - - CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", - obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); - - write_lock(&obd_dev_lock); - obd_devs[obd->obd_minor] = NULL; - write_unlock(&obd_dev_lock); - obd_device_free(obd); - - class_put_type(obd_type); -} - -int class_name2dev(const char *name) -{ - int i; - - if (!name) - return -1; - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd && strcmp(name, obd->obd_name) == 0) { - /* Make sure we finished attaching before we give - * out any references - */ - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_attached) { - read_unlock(&obd_dev_lock); - return i; - } - break; - } - } - read_unlock(&obd_dev_lock); - - return -1; -} - -struct obd_device *class_name2obd(const char *name) -{ - int dev = class_name2dev(name); - - if (dev < 0 || dev > class_devno_max()) - return NULL; - return class_num2obd(dev); -} -EXPORT_SYMBOL(class_name2obd); - -int class_uuid2dev(struct obd_uuid *uuid) -{ - int i; - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { - LASSERT(obd->obd_magic == 
OBD_DEVICE_MAGIC); - read_unlock(&obd_dev_lock); - return i; - } - } - read_unlock(&obd_dev_lock); - - return -1; -} - -/** - * Get obd device from ::obd_devs[] - * - * \param num [in] array index - * - * \retval NULL if ::obd_devs[\a num] does not contains an obd device - * otherwise return the obd device there. - */ -struct obd_device *class_num2obd(int num) -{ - struct obd_device *obd = NULL; - - if (num < class_devno_max()) { - obd = obd_devs[num]; - if (!obd) - return NULL; - - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, - "%p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(obd->obd_minor == num, - "%p obd_minor %0d != %0d\n", - obd, obd->obd_minor, num); - } - - return obd; -} - -/* Search for a client OBD connected to tgt_uuid. If grp_uuid is - * specified, then only the client with that uuid is returned, - * otherwise any client connected to the tgt is returned. - */ -struct obd_device *class_find_client_obd(struct obd_uuid *tgt_uuid, - const char *typ_name, - struct obd_uuid *grp_uuid) -{ - int i; - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (!obd) - continue; - if ((strncmp(obd->obd_type->typ_name, typ_name, - strlen(typ_name)) == 0)) { - if (obd_uuid_equals(tgt_uuid, - &obd->u.cli.cl_target_uuid) && - ((grp_uuid) ? obd_uuid_equals(grp_uuid, - &obd->obd_uuid) : 1)) { - read_unlock(&obd_dev_lock); - return obd; - } - } - } - read_unlock(&obd_dev_lock); - - return NULL; -} -EXPORT_SYMBOL(class_find_client_obd); - -/* Iterate the obd_device list looking devices have grp_uuid. Start - * searching at *next, and if a device is found, the next index to look - * at is saved in *next. If next is NULL, then the first matching device - * will always be returned. - */ -struct obd_device *class_devices_in_group(struct obd_uuid *grp_uuid, int *next) -{ - int i; - - if (!next) - i = 0; - else if (*next >= 0 && *next < class_devno_max()) - i = *next; - else - return NULL; - - read_lock(&obd_dev_lock); - for (; i < class_devno_max(); i++) { - struct obd_device *obd = class_num2obd(i); - - if (!obd) - continue; - if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { - if (next) - *next = i + 1; - read_unlock(&obd_dev_lock); - return obd; - } - } - read_unlock(&obd_dev_lock); - - return NULL; -} -EXPORT_SYMBOL(class_devices_in_group); - -/** - * to notify sptlrpc log for \a fsname has changed, let every relevant OBD - * adjust sptlrpc settings accordingly. - */ -int class_notify_sptlrpc_conf(const char *fsname, int namelen) -{ - struct obd_device *obd; - const char *type; - int i, rc = 0, rc2; - - LASSERT(namelen > 0); - - read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - obd = class_num2obd(i); - - if (!obd || obd->obd_set_up == 0 || obd->obd_stopping) - continue; - - /* only notify mdc, osc, mdt, ost */ - type = obd->obd_type->typ_name; - if (strcmp(type, LUSTRE_MDC_NAME) != 0 && - strcmp(type, LUSTRE_OSC_NAME) != 0 && - strcmp(type, LUSTRE_MDT_NAME) != 0 && - strcmp(type, LUSTRE_OST_NAME) != 0) - continue; - - if (strncmp(obd->obd_name, fsname, namelen)) - continue; - - class_incref(obd, __func__, obd); - read_unlock(&obd_dev_lock); - rc2 = obd_set_info_async(NULL, obd->obd_self_export, - sizeof(KEY_SPTLRPC_CONF), - KEY_SPTLRPC_CONF, 0, NULL, NULL); - rc = rc ? 
rc : rc2; - class_decref(obd, __func__, obd); - read_lock(&obd_dev_lock); - } - read_unlock(&obd_dev_lock); - return rc; -} -EXPORT_SYMBOL(class_notify_sptlrpc_conf); - -void obd_cleanup_caches(void) -{ - kmem_cache_destroy(obd_device_cachep); - obd_device_cachep = NULL; - kmem_cache_destroy(obdo_cachep); - obdo_cachep = NULL; - kmem_cache_destroy(import_cachep); - import_cachep = NULL; -} - -int obd_init_caches(void) -{ - LASSERT(!obd_device_cachep); - obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", - sizeof(struct obd_device), - 0, 0, NULL); - if (!obd_device_cachep) - goto out; - - LASSERT(!obdo_cachep); - obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), - 0, 0, NULL); - if (!obdo_cachep) - goto out; - - LASSERT(!import_cachep); - import_cachep = kmem_cache_create("ll_import_cache", - sizeof(struct obd_import), - 0, 0, NULL); - if (!import_cachep) - goto out; - - return 0; - out: - obd_cleanup_caches(); - return -ENOMEM; -} - -/* map connection to client */ -struct obd_export *class_conn2export(struct lustre_handle *conn) -{ - struct obd_export *export; - - if (!conn) { - CDEBUG(D_CACHE, "looking for null handle\n"); - return NULL; - } - - if (conn->cookie == -1) { /* this means assign a new connection */ - CDEBUG(D_CACHE, "want a new connection\n"); - return NULL; - } - - CDEBUG(D_INFO, "looking for export cookie %#llx\n", conn->cookie); - export = class_handle2object(conn->cookie, NULL); - return export; -} -EXPORT_SYMBOL(class_conn2export); - -struct obd_device *class_exp2obd(struct obd_export *exp) -{ - if (exp) - return exp->exp_obd; - return NULL; -} -EXPORT_SYMBOL(class_exp2obd); - -struct obd_import *class_exp2cliimp(struct obd_export *exp) -{ - struct obd_device *obd = exp->exp_obd; - - if (!obd) - return NULL; - return obd->u.cli.cl_import; -} -EXPORT_SYMBOL(class_exp2cliimp); - -/* Export management functions */ -static void class_export_destroy(struct obd_export *exp) -{ - struct obd_device *obd = exp->exp_obd; - - LASSERT_ATOMIC_ZERO(&exp->exp_refcount); - LASSERT(obd); - - CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, - exp->exp_client_uuid.uuid, obd->obd_name); - - /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. 
*/ - if (exp->exp_connection) - ptlrpc_put_connection_superhack(exp->exp_connection); - - LASSERT(list_empty(&exp->exp_outstanding_replies)); - LASSERT(list_empty(&exp->exp_uncommitted_replies)); - LASSERT(list_empty(&exp->exp_req_replay_queue)); - LASSERT(list_empty(&exp->exp_hp_rpcs)); - obd_destroy_export(exp); - class_decref(obd, "export", exp); - - OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); -} - -static void export_handle_addref(void *export) -{ - class_export_get(export); -} - -static struct portals_handle_ops export_handle_ops = { - .hop_addref = export_handle_addref, - .hop_free = NULL, -}; - -struct obd_export *class_export_get(struct obd_export *exp) -{ - atomic_inc(&exp->exp_refcount); - CDEBUG(D_INFO, "GETting export %p : new refcount %d\n", exp, - atomic_read(&exp->exp_refcount)); - return exp; -} -EXPORT_SYMBOL(class_export_get); - -void class_export_put(struct obd_export *exp) -{ - LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON); - CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp, - atomic_read(&exp->exp_refcount) - 1); - - if (atomic_dec_and_test(&exp->exp_refcount)) { - LASSERT(!list_empty(&exp->exp_obd_chain)); - CDEBUG(D_IOCTL, "final put %p/%s\n", - exp, exp->exp_client_uuid.uuid); - - /* release nid stat refererence */ - lprocfs_exp_cleanup(exp); - - obd_zombie_export_add(exp); - } -} -EXPORT_SYMBOL(class_export_put); - -static void obd_zombie_exp_cull(struct work_struct *ws) -{ - struct obd_export *export = container_of(ws, struct obd_export, exp_zombie_work); - - class_export_destroy(export); -} - -/* Creates a new export, adds it to the hash table, and returns a - * pointer to it. The refcount is 2: one for the hash reference, and - * one for the pointer returned by this function. - */ -struct obd_export *class_new_export(struct obd_device *obd, - struct obd_uuid *cluuid) -{ - struct obd_export *export; - int rc = 0; - - export = kzalloc(sizeof(*export), GFP_NOFS); - if (!export) - return ERR_PTR(-ENOMEM); - - export->exp_conn_cnt = 0; - atomic_set(&export->exp_refcount, 2); - atomic_set(&export->exp_rpc_count, 0); - atomic_set(&export->exp_cb_count, 0); - atomic_set(&export->exp_locks_count, 0); -#if LUSTRE_TRACKS_LOCK_EXP_REFS - INIT_LIST_HEAD(&export->exp_locks_list); - spin_lock_init(&export->exp_locks_list_guard); -#endif - atomic_set(&export->exp_replay_count, 0); - export->exp_obd = obd; - INIT_LIST_HEAD(&export->exp_outstanding_replies); - spin_lock_init(&export->exp_uncommitted_replies_lock); - INIT_LIST_HEAD(&export->exp_uncommitted_replies); - INIT_LIST_HEAD(&export->exp_req_replay_queue); - INIT_LIST_HEAD(&export->exp_handle.h_link); - INIT_LIST_HEAD(&export->exp_hp_rpcs); - class_handle_hash(&export->exp_handle, &export_handle_ops); - spin_lock_init(&export->exp_lock); - spin_lock_init(&export->exp_rpc_lock); - spin_lock_init(&export->exp_bl_list_lock); - INIT_LIST_HEAD(&export->exp_bl_list); - INIT_WORK(&export->exp_zombie_work, obd_zombie_exp_cull); - - export->exp_sp_peer = LUSTRE_SP_ANY; - export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; - export->exp_client_uuid = *cluuid; - obd_init_export(export); - - spin_lock(&obd->obd_dev_lock); - /* shouldn't happen, but might race */ - if (obd->obd_stopping) { - rc = -ENODEV; - goto exit_unlock; - } - - if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { - rc = obd_uuid_add(obd, export); - if (rc) { - LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", - obd->obd_name, cluuid->uuid, rc); - goto exit_unlock; - } - } - - class_incref(obd, "export", export); - 
list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports); - export->exp_obd->obd_num_exports++; - spin_unlock(&obd->obd_dev_lock); - return export; - -exit_unlock: - spin_unlock(&obd->obd_dev_lock); - class_handle_unhash(&export->exp_handle); - obd_destroy_export(export); - kfree(export); - return ERR_PTR(rc); -} -EXPORT_SYMBOL(class_new_export); - -void class_unlink_export(struct obd_export *exp) -{ - class_handle_unhash(&exp->exp_handle); - - spin_lock(&exp->exp_obd->obd_dev_lock); - /* delete an uuid-export hashitem from hashtables */ - if (exp != exp->exp_obd->obd_self_export) - obd_uuid_del(exp->exp_obd, exp); - - list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports); - exp->exp_obd->obd_num_exports--; - spin_unlock(&exp->exp_obd->obd_dev_lock); - class_export_put(exp); -} - -/* Import management functions */ -static void class_import_destroy(struct obd_import *imp) -{ - CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp, - imp->imp_obd->obd_name); - - LASSERT_ATOMIC_ZERO(&imp->imp_refcount); - - ptlrpc_put_connection_superhack(imp->imp_connection); - - while (!list_empty(&imp->imp_conn_list)) { - struct obd_import_conn *imp_conn; - - imp_conn = list_entry(imp->imp_conn_list.next, - struct obd_import_conn, oic_item); - list_del_init(&imp_conn->oic_item); - ptlrpc_put_connection_superhack(imp_conn->oic_conn); - kfree(imp_conn); - } - - LASSERT(!imp->imp_sec); - class_decref(imp->imp_obd, "import", imp); - OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle); -} - -static void import_handle_addref(void *import) -{ - class_import_get(import); -} - -static struct portals_handle_ops import_handle_ops = { - .hop_addref = import_handle_addref, - .hop_free = NULL, -}; - -struct obd_import *class_import_get(struct obd_import *import) -{ - atomic_inc(&import->imp_refcount); - CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, - atomic_read(&import->imp_refcount), - import->imp_obd->obd_name); - return import; -} -EXPORT_SYMBOL(class_import_get); - -void class_import_put(struct obd_import *imp) -{ - LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON); - - CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp, - atomic_read(&imp->imp_refcount) - 1, - imp->imp_obd->obd_name); - - if (atomic_dec_and_test(&imp->imp_refcount)) { - CDEBUG(D_INFO, "final put import %p\n", imp); - obd_zombie_import_add(imp); - } - - /* catch possible import put race */ - LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON); -} -EXPORT_SYMBOL(class_import_put); - -static void init_imp_at(struct imp_at *at) -{ - int i; - - at_init(&at->iat_net_latency, 0, 0); - for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { - /* max service estimates are tracked on the server side, so - * don't use the AT history here, just use the last reported - * val. 
(But keep hist for proc histogram, worst_ever) - */ - at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, - AT_FLG_NOHIST); - } -} - -static void obd_zombie_imp_cull(struct work_struct *ws) -{ - struct obd_import *import = container_of(ws, struct obd_import, imp_zombie_work); - - class_import_destroy(import); -} - -struct obd_import *class_new_import(struct obd_device *obd) -{ - struct obd_import *imp; - - imp = kzalloc(sizeof(*imp), GFP_NOFS); - if (!imp) - return NULL; - - INIT_LIST_HEAD(&imp->imp_pinger_chain); - INIT_LIST_HEAD(&imp->imp_replay_list); - INIT_LIST_HEAD(&imp->imp_sending_list); - INIT_LIST_HEAD(&imp->imp_delayed_list); - INIT_LIST_HEAD(&imp->imp_committed_list); - INIT_LIST_HEAD(&imp->imp_unreplied_list); - imp->imp_known_replied_xid = 0; - imp->imp_replay_cursor = &imp->imp_committed_list; - spin_lock_init(&imp->imp_lock); - imp->imp_last_success_conn = 0; - imp->imp_state = LUSTRE_IMP_NEW; - imp->imp_obd = class_incref(obd, "import", imp); - mutex_init(&imp->imp_sec_mutex); - init_waitqueue_head(&imp->imp_recovery_waitq); - INIT_WORK(&imp->imp_zombie_work, obd_zombie_imp_cull); - - atomic_set(&imp->imp_refcount, 2); - atomic_set(&imp->imp_unregistering, 0); - atomic_set(&imp->imp_inflight, 0); - atomic_set(&imp->imp_replay_inflight, 0); - atomic_set(&imp->imp_inval_count, 0); - INIT_LIST_HEAD(&imp->imp_conn_list); - INIT_LIST_HEAD(&imp->imp_handle.h_link); - class_handle_hash(&imp->imp_handle, &import_handle_ops); - init_imp_at(&imp->imp_at); - - /* the default magic is V2, will be used in connect RPC, and - * then adjusted according to the flags in request/reply. - */ - imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; - - return imp; -} -EXPORT_SYMBOL(class_new_import); - -void class_destroy_import(struct obd_import *import) -{ - LASSERT(import); - LASSERT(import != LP_POISON); - - class_handle_unhash(&import->imp_handle); - - spin_lock(&import->imp_lock); - import->imp_generation++; - spin_unlock(&import->imp_lock); - class_import_put(import); -} -EXPORT_SYMBOL(class_destroy_import); - -#if LUSTRE_TRACKS_LOCK_EXP_REFS - -void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) -{ - spin_lock(&exp->exp_locks_list_guard); - - LASSERT(lock->l_exp_refs_nr >= 0); - - if (lock->l_exp_refs_target && lock->l_exp_refs_target != exp) { - LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n", - exp, lock, lock->l_exp_refs_target); - } - if ((lock->l_exp_refs_nr++) == 0) { - list_add(&lock->l_exp_refs_link, &exp->exp_locks_list); - lock->l_exp_refs_target = exp; - } - CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", - lock, exp, lock->l_exp_refs_nr); - spin_unlock(&exp->exp_locks_list_guard); -} - -void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock) -{ - spin_lock(&exp->exp_locks_list_guard); - LASSERT(lock->l_exp_refs_nr > 0); - if (lock->l_exp_refs_target != exp) { - LCONSOLE_WARN("lock %p, mismatching export pointers: %p, %p\n", - lock, lock->l_exp_refs_target, exp); - } - if (-- lock->l_exp_refs_nr == 0) { - list_del_init(&lock->l_exp_refs_link); - lock->l_exp_refs_target = NULL; - } - CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n", - lock, exp, lock->l_exp_refs_nr); - spin_unlock(&exp->exp_locks_list_guard); -} -#endif - -/* A connection defines an export context in which preallocation can - * be managed. This releases the export pointer reference, and returns - * the export handle, so the export refcount is 1 when this function - * returns. 
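- *
- * Hedged usage sketch (variable names are hypothetical):
- *
- *	struct lustre_handle conn;
- *	int rc = class_connect(&conn, obd, &cluuid);
- *
- *	if (rc == 0)
- *		exp = class_conn2export(&conn);
- *
- * class_conn2export() (above) then looks the export up again by its handle
- * cookie and takes its own reference.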
- */
-int class_connect(struct lustre_handle *conn, struct obd_device *obd,
-		  struct obd_uuid *cluuid)
-{
-	struct obd_export *export;
-
-	LASSERT(conn);
-	LASSERT(obd);
-	LASSERT(cluuid);
-
-	export = class_new_export(obd, cluuid);
-	if (IS_ERR(export))
-		return PTR_ERR(export);
-
-	conn->cookie = export->exp_handle.h_cookie;
-	class_export_put(export);
-
-	CDEBUG(D_IOCTL, "connect: client %s, cookie %#llx\n",
-	       cluuid->uuid, conn->cookie);
-	return 0;
-}
-EXPORT_SYMBOL(class_connect);
-
-/* This function removes 1-3 references from the export:
- * 1 - the reference for the export pointer passed in,
- * and, if this is a real disconnect:
- * 2 - the reference taken when the export was hashed,
- * 3 - the reference dropped in class_unlink_export().
- * The export pointer passed to this function may be destroyed by the time
- * this function returns.
- */
-int class_disconnect(struct obd_export *export)
-{
-	int already_disconnected;
-
-	if (!export) {
-		CWARN("attempting to free NULL export %p\n", export);
-		return -EINVAL;
-	}
-
-	spin_lock(&export->exp_lock);
-	already_disconnected = export->exp_disconnected;
-	export->exp_disconnected = 1;
-	spin_unlock(&export->exp_lock);
-
-	/* class_cleanup(), abort_recovery(), and class_fail_export()
-	 * all end up in here, and if any of them race we shouldn't
-	 * call extra class_export_puts().
-	 */
-	if (already_disconnected)
-		goto no_disconn;
-
-	CDEBUG(D_IOCTL, "disconnect: cookie %#llx\n",
-	       export->exp_handle.h_cookie);
-
-	class_unlink_export(export);
-no_disconn:
-	class_export_put(export);
-	return 0;
-}
-EXPORT_SYMBOL(class_disconnect);
-
-void class_fail_export(struct obd_export *exp)
-{
-	int rc, already_failed;
-
-	spin_lock(&exp->exp_lock);
-	already_failed = exp->exp_failed;
-	exp->exp_failed = 1;
-	spin_unlock(&exp->exp_lock);
-
-	if (already_failed) {
-		CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
-		       exp, exp->exp_client_uuid.uuid);
-		return;
-	}
-
-	CDEBUG(D_HA, "disconnecting export %p/%s\n",
-	       exp, exp->exp_client_uuid.uuid);
-
-	if (obd_dump_on_timeout)
-		libcfs_debug_dumplog();
-
-	/* needed so it is safe to call CDEBUG after obd_disconnect */
-	class_export_get(exp);
-
-	/* Most callers into obd_disconnect are removing their own reference
-	 * (request, for example) in addition to the one from the hash table.
-	 * We don't have such a reference here, so make one.
-	 */
-	class_export_get(exp);
-	rc = obd_disconnect(exp);
-	if (rc)
-		CERROR("disconnecting export %p failed: %d\n", exp, rc);
-	else
-		CDEBUG(D_HA, "disconnected export %p/%s\n",
-		       exp, exp->exp_client_uuid.uuid);
-	class_export_put(exp);
-}
-EXPORT_SYMBOL(class_fail_export);
-
-#if LUSTRE_TRACKS_LOCK_EXP_REFS
-void (*class_export_dump_hook)(struct obd_export *) = NULL;
-#endif
-
-/**
- * Queue the export for destruction on the zombie workqueue and notify it.
- */
-static void obd_zombie_export_add(struct obd_export *exp)
-{
-	spin_lock(&exp->exp_obd->obd_dev_lock);
-	LASSERT(!list_empty(&exp->exp_obd_chain));
-	list_del_init(&exp->exp_obd_chain);
-	spin_unlock(&exp->exp_obd->obd_dev_lock);
-	queue_work(zombie_wq, &exp->exp_zombie_work);
-}
-
-/**
- * Queue the import for destruction on the zombie workqueue and notify it.
- */
-static void obd_zombie_import_add(struct obd_import *imp)
-{
-	LASSERT(!imp->imp_sec);
-	queue_work(zombie_wq, &imp->imp_zombie_work);
-}
-
-/**
- * Wait until the obd_zombie import/export queues become empty.
- */
-void obd_zombie_barrier(void)
-{
-	flush_workqueue(zombie_wq);
-}
-EXPORT_SYMBOL(obd_zombie_barrier);
-
-/**
- * Start the workqueue that destroys zombie imports/exports.
- */
-int obd_zombie_impexp_init(void)
-{
-	zombie_wq = alloc_workqueue("obd_zombid", 0, 0);
-	if (!zombie_wq)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/**
- * Stop the workqueue that destroys zombie imports/exports.
- */
-void obd_zombie_impexp_stop(void)
-{
-	destroy_workqueue(zombie_wq);
-}
-
-struct obd_request_slot_waiter {
-	struct list_head	orsw_entry;
-	wait_queue_head_t	orsw_waitq;
-	bool			orsw_signaled;
-};
-
-static bool obd_request_slot_avail(struct client_obd *cli,
-				   struct obd_request_slot_waiter *orsw)
-{
-	bool avail;
-
-	spin_lock(&cli->cl_loi_list_lock);
-	avail = !!list_empty(&orsw->orsw_entry);
-	spin_unlock(&cli->cl_loi_list_lock);
-
-	return avail;
-};
-
-/*
- * For network flow control, the RPC sponsor needs to acquire a credit
- * before sending the RPC. The number of credits for a connection is defined
- * by "cl_max_rpcs_in_flight". If all the credits are occupied, subsequent
- * RPC sponsors need to wait until other senders release their credits or
- * the administrator increases "cl_max_rpcs_in_flight".
- */
-int obd_get_request_slot(struct client_obd *cli)
-{
-	struct obd_request_slot_waiter orsw;
-	int rc;
-
-	spin_lock(&cli->cl_loi_list_lock);
-	if (cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight) {
-		cli->cl_r_in_flight++;
-		spin_unlock(&cli->cl_loi_list_lock);
-		return 0;
-	}
-
-	init_waitqueue_head(&orsw.orsw_waitq);
-	list_add_tail(&orsw.orsw_entry, &cli->cl_loi_read_list);
-	orsw.orsw_signaled = false;
-	spin_unlock(&cli->cl_loi_list_lock);
-
-	rc = l_wait_event_abortable(orsw.orsw_waitq,
-				    obd_request_slot_avail(cli, &orsw) ||
-				    orsw.orsw_signaled);
-
-	/*
-	 * Take the lock here to prevent the on-stack 'orsw' from being
-	 * freed while another thread (such as obd_put_request_slot()) is
-	 * still using it.
-	 */
-	spin_lock(&cli->cl_loi_list_lock);
-	if (rc) {
-		if (!orsw.orsw_signaled) {
-			if (list_empty(&orsw.orsw_entry))
-				cli->cl_r_in_flight--;
-			else
-				list_del(&orsw.orsw_entry);
-		}
-	}
-
-	if (orsw.orsw_signaled) {
-		LASSERT(list_empty(&orsw.orsw_entry));
-
-		rc = -EINTR;
-	}
-	spin_unlock(&cli->cl_loi_list_lock);
-
-	return rc;
-}
-EXPORT_SYMBOL(obd_get_request_slot);
-
-void obd_put_request_slot(struct client_obd *cli)
-{
-	struct obd_request_slot_waiter *orsw;
-
-	spin_lock(&cli->cl_loi_list_lock);
-	cli->cl_r_in_flight--;
-
-	/* If there is a free slot, wake up the first waiter.
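-	 *
-	 * A hedged caller-side sketch of the credit scheme documented above
-	 * obd_get_request_slot() (illustrative only):
-	 *
-	 *	rc = obd_get_request_slot(cli);
-	 *	if (rc)
-	 *		return rc;
-	 *	... send the RPC ...
-	 *	obd_put_request_slot(cli);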
*/ - if (!list_empty(&cli->cl_loi_read_list) && - likely(cli->cl_r_in_flight < cli->cl_max_rpcs_in_flight)) { - orsw = list_entry(cli->cl_loi_read_list.next, - struct obd_request_slot_waiter, orsw_entry); - list_del_init(&orsw->orsw_entry); - cli->cl_r_in_flight++; - wake_up(&orsw->orsw_waitq); - } - spin_unlock(&cli->cl_loi_list_lock); -} -EXPORT_SYMBOL(obd_put_request_slot); - -__u32 obd_get_max_rpcs_in_flight(struct client_obd *cli) -{ - return cli->cl_max_rpcs_in_flight; -} -EXPORT_SYMBOL(obd_get_max_rpcs_in_flight); - -int obd_set_max_rpcs_in_flight(struct client_obd *cli, __u32 max) -{ - struct obd_request_slot_waiter *orsw; - const char *typ_name; - __u32 old; - int diff; - int rc; - int i; - - if (max > OBD_MAX_RIF_MAX || max < 1) - return -ERANGE; - - typ_name = cli->cl_import->imp_obd->obd_type->typ_name; - if (!strcmp(typ_name, LUSTRE_MDC_NAME)) { - /* - * adjust max_mod_rpcs_in_flight to ensure it is always - * strictly lower that max_rpcs_in_flight - */ - if (max < 2) { - CERROR("%s: cannot set max_rpcs_in_flight to 1 because it must be higher than max_mod_rpcs_in_flight value\n", - cli->cl_import->imp_obd->obd_name); - return -ERANGE; - } - if (max <= cli->cl_max_mod_rpcs_in_flight) { - rc = obd_set_max_mod_rpcs_in_flight(cli, max - 1); - if (rc) - return rc; - } - } - - spin_lock(&cli->cl_loi_list_lock); - old = cli->cl_max_rpcs_in_flight; - cli->cl_max_rpcs_in_flight = max; - diff = max - old; - - /* We increase the max_rpcs_in_flight, then wakeup some waiters. */ - for (i = 0; i < diff; i++) { - if (list_empty(&cli->cl_loi_read_list)) - break; - - orsw = list_entry(cli->cl_loi_read_list.next, - struct obd_request_slot_waiter, orsw_entry); - list_del_init(&orsw->orsw_entry); - cli->cl_r_in_flight++; - wake_up(&orsw->orsw_waitq); - } - spin_unlock(&cli->cl_loi_list_lock); - - return 0; -} -EXPORT_SYMBOL(obd_set_max_rpcs_in_flight); - -int obd_set_max_mod_rpcs_in_flight(struct client_obd *cli, __u16 max) -{ - struct obd_connect_data *ocd; - u16 maxmodrpcs; - u16 prev; - - if (max > OBD_MAX_RIF_MAX || max < 1) - return -ERANGE; - - /* cannot exceed or equal max_rpcs_in_flight */ - if (max >= cli->cl_max_rpcs_in_flight) { - CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) higher or equal to max_rpcs_in_flight value (%u)\n", - cli->cl_import->imp_obd->obd_name, - max, cli->cl_max_rpcs_in_flight); - return -ERANGE; - } - - /* cannot exceed max modify RPCs in flight supported by the server */ - ocd = &cli->cl_import->imp_connect_data; - if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) - maxmodrpcs = ocd->ocd_maxmodrpcs; - else - maxmodrpcs = 1; - if (max > maxmodrpcs) { - CERROR("%s: can't set max_mod_rpcs_in_flight to a value (%hu) higher than max_mod_rpcs_per_client value (%hu) returned by the server at connection\n", - cli->cl_import->imp_obd->obd_name, - max, maxmodrpcs); - return -ERANGE; - } - - spin_lock(&cli->cl_mod_rpcs_lock); - - prev = cli->cl_max_mod_rpcs_in_flight; - cli->cl_max_mod_rpcs_in_flight = max; - - /* wakeup waiters if limit has been increased */ - if (cli->cl_max_mod_rpcs_in_flight > prev) - wake_up(&cli->cl_mod_rpcs_waitq); - - spin_unlock(&cli->cl_mod_rpcs_lock); - - return 0; -} -EXPORT_SYMBOL(obd_set_max_mod_rpcs_in_flight); - -#define pct(a, b) (b ? 
(a * 100) / b : 0) - -int obd_mod_rpc_stats_seq_show(struct client_obd *cli, struct seq_file *seq) -{ - unsigned long mod_tot = 0, mod_cum; - struct timespec64 now; - int i; - - ktime_get_real_ts64(&now); - - spin_lock(&cli->cl_mod_rpcs_lock); - - seq_printf(seq, "snapshot_time: %llu.%9lu (secs.nsecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "modify_RPCs_in_flight: %hu\n", - cli->cl_mod_rpcs_in_flight); - - seq_puts(seq, "\n\t\t\tmodify\n"); - seq_puts(seq, "rpcs in flight rpcs %% cum %%\n"); - - mod_tot = lprocfs_oh_sum(&cli->cl_mod_rpcs_hist); - - mod_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long mod = cli->cl_mod_rpcs_hist.oh_buckets[i]; - - mod_cum += mod; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu\n", - i, mod, pct(mod, mod_tot), - pct(mod_cum, mod_tot)); - if (mod_cum == mod_tot) - break; - } - - spin_unlock(&cli->cl_mod_rpcs_lock); - - return 0; -} -EXPORT_SYMBOL(obd_mod_rpc_stats_seq_show); -#undef pct - -/* - * The number of modify RPCs sent in parallel is limited - * because the server has a finite number of slots per client to - * store request result and ensure reply reconstruction when needed. - * On the client, this limit is stored in cl_max_mod_rpcs_in_flight - * that takes into account server limit and cl_max_rpcs_in_flight - * value. - * On the MDC client, to avoid a potential deadlock (see Bugzilla 3462), - * one close request is allowed above the maximum. - */ -static inline bool obd_mod_rpc_slot_avail_locked(struct client_obd *cli, - bool close_req) -{ - bool avail; - - /* A slot is available if - * - number of modify RPCs in flight is less than the max - * - it's a close RPC and no other close request is in flight - */ - avail = cli->cl_mod_rpcs_in_flight < cli->cl_max_mod_rpcs_in_flight || - (close_req && !cli->cl_close_rpcs_in_flight); - - return avail; -} - -static inline bool obd_mod_rpc_slot_avail(struct client_obd *cli, - bool close_req) -{ - bool avail; - - spin_lock(&cli->cl_mod_rpcs_lock); - avail = obd_mod_rpc_slot_avail_locked(cli, close_req); - spin_unlock(&cli->cl_mod_rpcs_lock); - return avail; -} - -/* Get a modify RPC slot from the obd client @cli according - * to the kind of operation @opc that is going to be sent - * and the intent @it of the operation if it applies. - * If the maximum number of modify RPCs in flight is reached - * the thread is put to sleep. - * Returns the tag to be set in the request message. Tag 0 - * is reserved for non-modifying requests. 
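- *
- * Hedged pairing sketch with obd_put_mod_rpc_slot() below (illustrative
- * only; cli, opc and it are the caller's values):
- *
- *	u16 tag = obd_get_mod_rpc_slot(cli, opc, it);
- *
- *	... send the modify RPC carrying this tag ...
- *	obd_put_mod_rpc_slot(cli, opc, it, tag);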
- */ -u16 obd_get_mod_rpc_slot(struct client_obd *cli, __u32 opc, - struct lookup_intent *it) -{ - bool close_req = false; - u16 i, max; - - /* read-only metadata RPCs don't consume a slot on MDT - * for reply reconstruction - */ - if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - return 0; - - if (opc == MDS_CLOSE) - close_req = true; - - do { - spin_lock(&cli->cl_mod_rpcs_lock); - max = cli->cl_max_mod_rpcs_in_flight; - if (obd_mod_rpc_slot_avail_locked(cli, close_req)) { - /* there is a slot available */ - cli->cl_mod_rpcs_in_flight++; - if (close_req) - cli->cl_close_rpcs_in_flight++; - lprocfs_oh_tally(&cli->cl_mod_rpcs_hist, - cli->cl_mod_rpcs_in_flight); - /* find a free tag */ - i = find_first_zero_bit(cli->cl_mod_tag_bitmap, - max + 1); - LASSERT(i < OBD_MAX_RIF_MAX); - LASSERT(!test_and_set_bit(i, cli->cl_mod_tag_bitmap)); - spin_unlock(&cli->cl_mod_rpcs_lock); - /* tag 0 is reserved for non-modify RPCs */ - return i + 1; - } - spin_unlock(&cli->cl_mod_rpcs_lock); - - CDEBUG(D_RPCTRACE, "%s: sleeping for a modify RPC slot opc %u, max %hu\n", - cli->cl_import->imp_obd->obd_name, opc, max); - - wait_event_idle(cli->cl_mod_rpcs_waitq, - obd_mod_rpc_slot_avail(cli, close_req)); - } while (true); -} -EXPORT_SYMBOL(obd_get_mod_rpc_slot); - -/* - * Put a modify RPC slot from the obd client @cli according - * to the kind of operation @opc that has been sent and the - * intent @it of the operation if it applies. - */ -void obd_put_mod_rpc_slot(struct client_obd *cli, u32 opc, - struct lookup_intent *it, u16 tag) -{ - bool close_req = false; - - if (it && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP || - it->it_op == IT_LAYOUT || it->it_op == IT_READDIR)) - return; - - if (opc == MDS_CLOSE) - close_req = true; - - spin_lock(&cli->cl_mod_rpcs_lock); - cli->cl_mod_rpcs_in_flight--; - if (close_req) - cli->cl_close_rpcs_in_flight--; - /* release the tag in the bitmap */ - LASSERT(tag - 1 < OBD_MAX_RIF_MAX); - LASSERT(test_and_clear_bit(tag - 1, cli->cl_mod_tag_bitmap) != 0); - spin_unlock(&cli->cl_mod_rpcs_lock); - wake_up(&cli->cl_mod_rpcs_waitq); -} -EXPORT_SYMBOL(obd_put_mod_rpc_slot); diff --git a/drivers/staging/lustre/lustre/obdclass/kernelcomm.c b/drivers/staging/lustre/lustre/obdclass/kernelcomm.c deleted file mode 100644 index 63067a7f1e199e2bb4a7fd247f5ea062a8ceb092..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/kernelcomm.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Author: Nathan Rutman - * - * Kernel <-> userspace communication routines. - * Using pipes for all arches. - */ - -#define DEBUG_SUBSYSTEM S_CLASS -#define D_KUC D_OTHER - -#include -#include -#include -#include - -/** - * libcfs_kkuc_msg_put - send an message from kernel to userspace - * @param fp to send the message to - * @param payload Payload data. First field of payload is always - * struct kuc_hdr - */ -int libcfs_kkuc_msg_put(struct file *filp, void *payload) -{ - struct kuc_hdr *kuch = (struct kuc_hdr *)payload; - ssize_t count = kuch->kuc_msglen; - loff_t offset = 0; - int rc = -ENXIO; - - if (IS_ERR_OR_NULL(filp)) - return -EBADF; - - if (kuch->kuc_magic != KUC_MAGIC) { - CERROR("KernelComm: bad magic %x\n", kuch->kuc_magic); - return rc; - } - - while (count > 0) { - rc = kernel_write(filp, payload, count, &offset); - if (rc < 0) - break; - count -= rc; - payload += rc; - rc = 0; - } - - if (rc < 0) - CWARN("message send failed (%d)\n", rc); - else - CDEBUG(D_KUC, "Sent message rc=%d, fp=%p\n", rc, filp); - - return rc; -} -EXPORT_SYMBOL(libcfs_kkuc_msg_put); - -/* - * Broadcast groups are global across all mounted filesystems; - * i.e. registering for a group on 1 fs will get messages for that - * group from any fs - */ -/** A single group registration has a uid and a file pointer */ -struct kkuc_reg { - struct list_head kr_chain; - int kr_uid; - struct file *kr_fp; - char kr_data[0]; -}; - -static struct list_head kkuc_groups[KUC_GRP_MAX + 1] = {}; -/* Protect message sending against remove and adds */ -static DECLARE_RWSEM(kg_sem); - -/** Add a receiver to a broadcast group - * @param filp pipe to write into - * @param uid identifier for this receiver - * @param group group number - * @param data user data - */ -int libcfs_kkuc_group_add(struct file *filp, int uid, unsigned int group, - void *data, size_t data_len) -{ - struct kkuc_reg *reg; - - if (group > KUC_GRP_MAX) { - CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); - return -EINVAL; - } - - /* fput in group_rem */ - if (!filp) - return -EBADF; - - /* freed in group_rem */ - reg = kmalloc(sizeof(*reg) + data_len, 0); - if (!reg) - return -ENOMEM; - - reg->kr_fp = filp; - reg->kr_uid = uid; - memcpy(reg->kr_data, data, data_len); - - down_write(&kg_sem); - if (!kkuc_groups[group].next) - INIT_LIST_HEAD(&kkuc_groups[group]); - list_add(®->kr_chain, &kkuc_groups[group]); - up_write(&kg_sem); - - CDEBUG(D_KUC, "Added uid=%d fp=%p to group %d\n", uid, filp, group); - - return 0; -} -EXPORT_SYMBOL(libcfs_kkuc_group_add); - -int libcfs_kkuc_group_rem(int uid, unsigned int group) -{ - struct kkuc_reg *reg, *next; - - if (!kkuc_groups[group].next) - return 0; - - if (!uid) { - /* Broadcast a shutdown message */ - struct kuc_hdr lh; - - lh.kuc_magic = KUC_MAGIC; - lh.kuc_transport = KUC_TRANSPORT_GENERIC; - lh.kuc_msgtype = KUC_MSG_SHUTDOWN; - lh.kuc_msglen = sizeof(lh); - libcfs_kkuc_group_put(group, &lh); - } - - down_write(&kg_sem); - list_for_each_entry_safe(reg, next, &kkuc_groups[group], kr_chain) { - if (!uid || (uid == reg->kr_uid)) { - list_del(®->kr_chain); - CDEBUG(D_KUC, "Removed uid=%d fp=%p from group %d\n", - reg->kr_uid, reg->kr_fp, group); - if (reg->kr_fp) - fput(reg->kr_fp); - kfree(reg); - } - } - up_write(&kg_sem); - - return 0; -} -EXPORT_SYMBOL(libcfs_kkuc_group_rem); - -int libcfs_kkuc_group_put(unsigned int group, void *payload) -{ - struct kkuc_reg *reg; - int rc = 0; - int 
one_success = 0; - - down_write(&kg_sem); - list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { - if (reg->kr_fp) { - rc = libcfs_kkuc_msg_put(reg->kr_fp, payload); - if (!rc) { - one_success = 1; - } else if (rc == -EPIPE) { - fput(reg->kr_fp); - reg->kr_fp = NULL; - } - } - } - up_write(&kg_sem); - - /* - * don't return an error if the message has been delivered - * at least to one agent - */ - if (one_success) - rc = 0; - - return rc; -} -EXPORT_SYMBOL(libcfs_kkuc_group_put); - -/** - * Calls a callback function for each link of the given kuc group. - * @param group the group to call the function on. - * @param cb_func the function to be called. - * @param cb_arg extra argument to be passed to the callback function. - */ -int libcfs_kkuc_group_foreach(unsigned int group, libcfs_kkuc_cb_t cb_func, - void *cb_arg) -{ - struct kkuc_reg *reg; - int rc = 0; - - if (group > KUC_GRP_MAX) { - CDEBUG(D_WARNING, "Kernelcomm: bad group %d\n", group); - return -EINVAL; - } - - /* no link for this group */ - if (!kkuc_groups[group].next) - return 0; - - down_read(&kg_sem); - list_for_each_entry(reg, &kkuc_groups[group], kr_chain) { - if (reg->kr_fp) - rc = cb_func(reg->kr_data, cb_arg); - } - up_read(&kg_sem); - - return rc; -} -EXPORT_SYMBOL(libcfs_kkuc_group_foreach); diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c deleted file mode 100644 index 74c99ee216bb42c6435d79673c6b68a2417a65e5..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/linkea.c +++ /dev/null @@ -1,249 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2013, 2014, Intel Corporation. - * Use is subject to license terms. 
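libcfs_kkuc_msg_put above is the classic "write the whole buffer" loop, with the leading kuc_hdr carrying a magic and the total message length so the reader can frame messages on the pipe. The same loop over a plain file descriptor, as a standalone sketch (msg_hdr is a stand-in, not the real struct kuc_hdr layout):

    #include <errno.h>
    #include <stdint.h>
    #include <unistd.h>

    /* Stand-in for struct kuc_hdr: magic + transport + type + total length. */
    struct msg_hdr {
            uint16_t magic;
            uint16_t transport;
            uint16_t msgtype;
            uint16_t msglen;        /* header + payload, in bytes */
    };

    /* Write all of buf, retrying on short writes, as the kernel code
     * does with kernel_write(). */
    static int write_full(int fd, const void *buf, size_t count)
    {
            const char *p = buf;

            while (count > 0) {
                    ssize_t n = write(fd, p, count);

                    if (n < 0) {
                            if (errno == EINTR)
                                    continue;
                            return -errno;
                    }
                    p += n;
                    count -= n;
            }
            return 0;
    }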
- * - * Author: Di Wang - */ - -#include -#include -#include - -int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) -{ - buf->lb_buf = kzalloc(PAGE_SIZE, GFP_NOFS); - if (!buf->lb_buf) - return -ENOMEM; - buf->lb_len = PAGE_SIZE; - ldata->ld_buf = buf; - ldata->ld_leh = ldata->ld_buf->lb_buf; - ldata->ld_leh->leh_magic = LINK_EA_MAGIC; - ldata->ld_leh->leh_len = sizeof(struct link_ea_header); - ldata->ld_leh->leh_reccount = 0; - ldata->ld_leh->leh_overflow_time = 0; - ldata->ld_leh->leh_padding = 0; - return 0; -} -EXPORT_SYMBOL(linkea_data_new); - -int linkea_init(struct linkea_data *ldata) -{ - struct link_ea_header *leh; - - LASSERT(ldata->ld_buf); - leh = ldata->ld_buf->lb_buf; - if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { - leh->leh_magic = LINK_EA_MAGIC; - leh->leh_reccount = __swab32(leh->leh_reccount); - leh->leh_len = __swab64(leh->leh_len); - leh->leh_overflow_time = __swab32(leh->leh_overflow_time); - leh->leh_padding = __swab32(leh->leh_padding); - /* individual entries are swabbed by linkea_entry_unpack() */ - } - - if (leh->leh_magic != LINK_EA_MAGIC) - return -EINVAL; - - if (leh->leh_reccount == 0 && leh->leh_overflow_time == 0) - return -ENODATA; - - ldata->ld_leh = leh; - return 0; -} -EXPORT_SYMBOL(linkea_init); - -int linkea_init_with_rec(struct linkea_data *ldata) -{ - int rc; - - rc = linkea_init(ldata); - if (!rc && ldata->ld_leh->leh_reccount == 0) - rc = -ENODATA; - - return rc; -} -EXPORT_SYMBOL(linkea_init_with_rec); - -/** - * Pack a link_ea_entry. - * All elements are stored as chars to avoid alignment issues. - * Numbers are always big-endian - * \retval record length - */ -int linkea_entry_pack(struct link_ea_entry *lee, const struct lu_name *lname, - const struct lu_fid *pfid) -{ - struct lu_fid tmpfid; - int reclen; - - tmpfid = *pfid; - if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) - tmpfid.f_ver = ~0; - fid_cpu_to_be(&tmpfid, &tmpfid); - memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); - memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); - reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; - - lee->lee_reclen[0] = (reclen >> 8) & 0xff; - lee->lee_reclen[1] = reclen & 0xff; - return reclen; -} -EXPORT_SYMBOL(linkea_entry_pack); - -void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, - struct lu_name *lname, struct lu_fid *pfid) -{ - LASSERT(lee); - - *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; - memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); - fid_be_to_cpu(pfid, pfid); - if (lname) { - lname->ln_name = lee->lee_name; - lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); - } -} -EXPORT_SYMBOL(linkea_entry_unpack); - -/** - * Add a record to the end of link ea buf - **/ -int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, - const struct lu_fid *pfid) -{ - struct link_ea_header *leh = ldata->ld_leh; - int reclen; - - LASSERT(leh); - - if (!lname || !pfid) - return -EINVAL; - - reclen = lname->ln_namelen + sizeof(struct link_ea_entry); - if (unlikely(leh->leh_len + reclen > MAX_LINKEA_SIZE)) { - /* - * Use 32-bits to save the overflow time, although it will - * shrink the ktime_get_real_seconds() returned 64-bits value - * to 32-bits value, it is still quite large and can be used - * for about 140 years. That is enough. 
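linkea_entry_pack above stores the record length as two separate bytes, big-endian, exactly so that on-disk entries carry no alignment requirements; linkea_entry_unpack reverses it. The encoding in isolation (helper names are mine):

    #include <stdint.h>

    static void reclen_pack(unsigned char out[2], uint16_t reclen)
    {
            out[0] = (reclen >> 8) & 0xff;  /* high byte first: big-endian */
            out[1] = reclen & 0xff;
    }

    static uint16_t reclen_unpack(const unsigned char in[2])
    {
            return ((uint16_t)in[0] << 8) | in[1];
    }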
- */ - leh->leh_overflow_time = ktime_get_real_seconds(); - if (unlikely(leh->leh_overflow_time == 0)) - leh->leh_overflow_time++; - - CDEBUG(D_INODE, "No enough space to hold linkea entry '" DFID ": %.*s' at %u\n", - PFID(pfid), lname->ln_namelen, - lname->ln_name, leh->leh_overflow_time); - return 0; - } - - if (leh->leh_len + reclen > ldata->ld_buf->lb_len) { - /* Note: this never happens as MAX_LINKEA_SIZE is 4096, while - * the initial allocation is PAGE_SIZE. - */ - void *b = krealloc(ldata->ld_buf->lb_buf, leh->leh_len + reclen, GFP_NOFS); - if (!b) - return -ENOMEM; - - ldata->ld_buf->lb_len = leh->leh_len + reclen; - leh = ldata->ld_leh = ldata->ld_buf->lb_buf = b; - } - - ldata->ld_lee = ldata->ld_buf->lb_buf + leh->leh_len; - ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); - leh->leh_len += ldata->ld_reclen; - leh->leh_reccount++; - CDEBUG(D_INODE, "New link_ea name '" DFID ":%.*s' is added\n", - PFID(pfid), lname->ln_namelen, lname->ln_name); - return 0; -} -EXPORT_SYMBOL(linkea_add_buf); - -/** Del the current record from the link ea buf */ -void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) -{ - LASSERT(ldata->ld_leh && ldata->ld_lee); - LASSERT(ldata->ld_leh->leh_reccount > 0); - - ldata->ld_leh->leh_reccount--; - ldata->ld_leh->leh_len -= ldata->ld_reclen; - memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, - (char *)ldata->ld_leh + ldata->ld_leh->leh_len - - (char *)ldata->ld_lee); - CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", - lname->ln_namelen, lname->ln_name); - - if ((char *)ldata->ld_lee >= ((char *)ldata->ld_leh + - ldata->ld_leh->leh_len)) - ldata->ld_lee = NULL; -} -EXPORT_SYMBOL(linkea_del_buf); - -/** - * Check if such a link exists in linkEA. - * - * \param ldata link data the search to be done on - * \param lname name in the parent's directory entry pointing to this object - * \param pfid parent fid the link to be found for - * - * \retval 0 success - * \retval -ENOENT link does not exist - * \retval -ve on error - */ -int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, - const struct lu_fid *pfid) -{ - struct lu_name tmpname; - struct lu_fid tmpfid; - int count; - - LASSERT(ldata->ld_leh); - - /* link #0, if leh_reccount == 0 we skip the loop and return -ENOENT */ - if (likely(ldata->ld_leh->leh_reccount > 0)) - ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); - - for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { - linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, - &tmpname, &tmpfid); - if (tmpname.ln_namelen == lname->ln_namelen && - lu_fid_eq(&tmpfid, pfid) && - (strncmp(tmpname.ln_name, lname->ln_name, - tmpname.ln_namelen) == 0)) - break; - ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + - ldata->ld_reclen); - } - - if (count == ldata->ld_leh->leh_reccount) { - CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", - lname->ln_namelen, lname->ln_name); - ldata->ld_lee = NULL; - ldata->ld_reclen = 0; - return -ENOENT; - } - return 0; -} -EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c deleted file mode 100644 index 9c800580053b85819ff607e30f7fc133a336568a..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c +++ /dev/null @@ -1,514 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT 
NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/linux/linux-module.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#define OBD_MAX_IOCTL_BUFFER 8192 - -static int obd_ioctl_is_invalid(struct obd_ioctl_data *data) -{ - if (data->ioc_len > BIT(30)) { - CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen1 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen1 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen2 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen3 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inllen4 > BIT(30)) { - CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); - return 1; - } - - if (data->ioc_inlbuf1 && data->ioc_inllen1 == 0) { - CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf2 && data->ioc_inllen2 == 0) { - CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf3 && data->ioc_inllen3 == 0) { - CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_inlbuf4 && data->ioc_inllen4 == 0) { - CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf1 && data->ioc_plen1 == 0) { - CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); - return 1; - } - - if (data->ioc_pbuf2 && data->ioc_plen2 == 0) { - CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); - return 1; - } - - if (!data->ioc_pbuf1 && data->ioc_plen1 != 0) { - CERROR("OBD ioctl: plen1 set but NULL pointer\n"); - return 1; - } - - if (!data->ioc_pbuf2 && data->ioc_plen2 != 0) { - CERROR("OBD ioctl: plen2 set but NULL pointer\n"); - return 1; - } - - if (obd_ioctl_packlen(data) > data->ioc_len) { - CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", - obd_ioctl_packlen(data), data->ioc_len); - return 1; - } - - return 0; -} - -/* buffer MUST be at least the size of obd_ioctl_hdr */ -int obd_ioctl_getdata(char **buf, int *len, void __user *arg) -{ - struct obd_ioctl_hdr hdr; - struct obd_ioctl_data 
*data; - int err; - int offset = 0; - - if (copy_from_user(&hdr, arg, sizeof(hdr))) - return -EFAULT; - - if (hdr.ioc_version != OBD_IOCTL_VERSION) { - CERROR("Version mismatch kernel (%x) vs application (%x)\n", - OBD_IOCTL_VERSION, hdr.ioc_version); - return -EINVAL; - } - - if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { - CERROR("User buffer len %d exceeds %d max buffer\n", - hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); - return -EINVAL; - } - - if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { - CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); - return -EINVAL; - } - - /* When there are lots of processes calling vmalloc on multi-core - * system, the high lock contention will hurt performance badly, - * obdfilter-survey is an example, which relies on ioctl. So we'd - * better avoid vmalloc on ioctl path. LU-66 - */ - *buf = kvzalloc(hdr.ioc_len, GFP_KERNEL); - if (!*buf) { - CERROR("Cannot allocate control buffer of len %d\n", - hdr.ioc_len); - return -EINVAL; - } - *len = hdr.ioc_len; - data = (struct obd_ioctl_data *)*buf; - - if (copy_from_user(*buf, arg, hdr.ioc_len)) { - err = -EFAULT; - goto free_buf; - } - if (hdr.ioc_len != data->ioc_len) { - err = -EINVAL; - goto free_buf; - } - - if (obd_ioctl_is_invalid(data)) { - CERROR("ioctl not correctly formatted\n"); - err = -EINVAL; - goto free_buf; - } - - if (data->ioc_inllen1) { - data->ioc_inlbuf1 = &data->ioc_bulk[0]; - offset += cfs_size_round(data->ioc_inllen1); - } - - if (data->ioc_inllen2) { - data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen2); - } - - if (data->ioc_inllen3) { - data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; - offset += cfs_size_round(data->ioc_inllen3); - } - - if (data->ioc_inllen4) - data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; - - return 0; - -free_buf: - kvfree(*buf); - return err; -} -EXPORT_SYMBOL(obd_ioctl_getdata); - -/* opening /dev/obd */ -static int obd_class_open(struct inode *inode, struct file *file) -{ - try_module_get(THIS_MODULE); - return 0; -} - -/* closing /dev/obd */ -static int obd_class_release(struct inode *inode, struct file *file) -{ - module_put(THIS_MODULE); - return 0; -} - -/* to control /dev/obd */ -static long obd_class_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) -{ - int err = 0; - - /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ - if (!capable(CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) - return err = -EACCES; - if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore all tty ioctls */ - return err = -ENOTTY; - - err = class_handle_ioctl(cmd, (unsigned long)arg); - - return err; -} - -/* declare character device */ -static const struct file_operations obd_psdev_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ - .open = obd_class_open, /* open */ - .release = obd_class_release, /* release */ -}; - -/* modules setup */ -struct miscdevice obd_psdev = { - .minor = MISC_DYNAMIC_MINOR, - .name = OBD_DEV_NAME, - .fops = &obd_psdev_fops, -}; - -static ssize_t version_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", LUSTRE_VERSION_STRING); -} - -static ssize_t pinger_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%s\n", "on"); -} - -static ssize_t -health_check_show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - bool healthy = true; - int i; - size_t len = 0; - - if (libcfs_catastrophe) - return sprintf(buf, "LBUG\n"); - - 
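obd_ioctl_getdata above trusts nothing from userspace until the fixed-size header is vetted: version match first, then an upper bound on ioc_len, then a lower bound, and only after the full copy does it cross-check that ioc_len still matches the embedded length. The same ordering as a self-contained sketch (struct ioctl_hdr and hdr_sanity are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    #define MAX_IOCTL_BUF 8192      /* mirrors OBD_MAX_IOCTL_BUFFER */

    struct ioctl_hdr {
            uint32_t ioc_len;
            uint32_t ioc_version;
    };

    /* Returns 0 if it is safe to allocate ioc_len bytes and copy the
     * full request in; any failure is rejected before allocation. */
    static int hdr_sanity(const struct ioctl_hdr *hdr, uint32_t version,
                          size_t min_len)
    {
            if (hdr->ioc_version != version)
                    return -1;      /* kernel/userspace mismatch */
            if (hdr->ioc_len > MAX_IOCTL_BUF)
                    return -1;      /* refuse oversized requests */
            if (hdr->ioc_len < min_len)
                    return -1;      /* too small to hold the struct */
            return 0;
    }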
read_lock(&obd_dev_lock); - for (i = 0; i < class_devno_max(); i++) { - struct obd_device *obd; - - obd = class_num2obd(i); - if (!obd || !obd->obd_attached || !obd->obd_set_up) - continue; - - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - continue; - - class_incref(obd, __func__, current); - read_unlock(&obd_dev_lock); - - if (obd_health_check(NULL, obd)) - healthy = false; - class_decref(obd, __func__, current); - read_lock(&obd_dev_lock); - } - read_unlock(&obd_dev_lock); - - if (healthy) - len = sprintf(buf, "healthy\n"); - else - len = sprintf(buf, "NOT HEALTHY\n"); - - return len; -} - -static ssize_t jobid_var_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_var); -} - -static ssize_t jobid_var_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, - size_t count) -{ - if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) - return -EINVAL; - - memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); - - memcpy(obd_jobid_var, buffer, count); - - /* Trim the trailing '\n' if any */ - if (obd_jobid_var[count - 1] == '\n') - obd_jobid_var[count - 1] = 0; - - return count; -} - -static ssize_t jobid_name_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%s\n", obd_jobid_node); -} - -static ssize_t jobid_name_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, - size_t count) -{ - if (!count || count > LUSTRE_JOBID_SIZE) - return -EINVAL; - - memcpy(obd_jobid_node, buffer, count); - - obd_jobid_node[count] = 0; - - /* Trim the trailing '\n' if any */ - if (obd_jobid_node[count - 1] == '\n') - obd_jobid_node[count - 1] = 0; - - return count; -} - -/* Root for /sys/kernel/debug/lustre */ -struct dentry *debugfs_lustre_root; -EXPORT_SYMBOL_GPL(debugfs_lustre_root); - -LUSTRE_RO_ATTR(version); -LUSTRE_RO_ATTR(pinger); -LUSTRE_RO_ATTR(health_check); -LUSTRE_RW_ATTR(jobid_var); -LUSTRE_RW_ATTR(jobid_name); - -static struct attribute *lustre_attrs[] = { - &lustre_attr_version.attr, - &lustre_attr_pinger.attr, - &lustre_attr_health_check.attr, - &lustre_attr_jobid_name.attr, - &lustre_attr_jobid_var.attr, - NULL, -}; - -static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) -{ - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static void obd_device_list_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - ++*pos; - if (*pos >= class_devno_max()) - return NULL; - - return pos; -} - -static int obd_device_list_seq_show(struct seq_file *p, void *v) -{ - loff_t index = *(loff_t *)v; - struct obd_device *obd = class_num2obd((int)index); - char *status; - - if (!obd) - return 0; - - LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); - if (obd->obd_stopping) - status = "ST"; - else if (obd->obd_inactive) - status = "IN"; - else if (obd->obd_set_up) - status = "UP"; - else if (obd->obd_attached) - status = "AT"; - else - status = "--"; - - seq_printf(p, "%3d %s %s %s %s %d\n", - (int)index, status, obd->obd_type->typ_name, - obd->obd_name, obd->obd_uuid.uuid, - atomic_read(&obd->obd_refcount)); - return 0; -} - -static const struct seq_operations obd_device_list_sops = { - .start = obd_device_list_seq_start, - .stop = obd_device_list_seq_stop, - .next = obd_device_list_seq_next, - .show = obd_device_list_seq_show, -}; - -static int obd_device_list_open(struct inode *inode, struct file *file) -{ - struct seq_file 
*seq; - int rc = seq_open(file, &obd_device_list_sops); - - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - - return 0; -} - -static const struct file_operations obd_device_list_fops = { - .owner = THIS_MODULE, - .open = obd_device_list_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -struct kobject *lustre_kobj; -EXPORT_SYMBOL_GPL(lustre_kobj); - -static const struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -int class_procfs_init(void) -{ - int rc = -ENOMEM; - - lustre_kobj = kobject_create_and_add("lustre", fs_kobj); - if (!lustre_kobj) - goto out; - - /* Create the files associated with this kobject */ - rc = sysfs_create_group(lustre_kobj, &lustre_attr_group); - if (rc) { - kobject_put(lustre_kobj); - goto out; - } - - debugfs_lustre_root = debugfs_create_dir("lustre", NULL); - - debugfs_create_file("devices", 0444, debugfs_lustre_root, NULL, - &obd_device_list_fops); -out: - return rc; -} - -int class_procfs_clean(void) -{ - debugfs_remove_recursive(debugfs_lustre_root); - - debugfs_lustre_root = NULL; - - sysfs_remove_group(lustre_kobj, &lustre_attr_group); - kobject_put(lustre_kobj); - - return 0; -} diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c deleted file mode 100644 index e5e8687784ee70650ce7a50e801712a32457a5d6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
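The jobid store handlers just above show the usual sysfs store contract: the incoming buffer is not NUL-terminated, count is authoritative, and a trailing newline from "echo" has to be trimmed. That pattern in isolation (store_string is a made-up helper):

    #include <string.h>
    #include <sys/types.h>

    static ssize_t store_string(char *dst, size_t dst_size,
                                const char *buf, size_t count)
    {
            if (count == 0 || count >= dst_size)
                    return -1;              /* -EINVAL in the kernel version */
            memcpy(dst, buf, count);
            dst[count] = '\0';
            if (dst[count - 1] == '\n')     /* trim "echo foo" newline */
                    dst[count - 1] = '\0';
            return (ssize_t)count;
    }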
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -struct static_lustre_uintvalue_attr { - struct { - struct attribute attr; - ssize_t (*show)(struct kobject *kobj, struct attribute *attr, - char *buf); - ssize_t (*store)(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len); - } u; - int *value; -}; - -static ssize_t static_uintvalue_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - - return sprintf(buf, "%d\n", *lattr->value); -} - -static ssize_t static_uintvalue_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, size_t count) -{ - struct static_lustre_uintvalue_attr *lattr = (void *)attr; - int rc; - unsigned int val; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - *lattr->value = val; - - return count; -} - -#define LUSTRE_STATIC_UINT_ATTR(name, value) \ -static struct static_lustre_uintvalue_attr lustre_sattr_##name = \ - {__ATTR(name, 0644, \ - static_uintvalue_show, \ - static_uintvalue_store),\ - value } - -LUSTRE_STATIC_UINT_ATTR(timeout, &obd_timeout); - -static ssize_t max_dirty_mb_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - return sprintf(buf, "%lu\n", - obd_max_dirty_pages / (1 << (20 - PAGE_SHIFT))); -} - -static ssize_t max_dirty_mb_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - val *= 1 << (20 - PAGE_SHIFT); /* convert to pages */ - - if (val > ((totalram_pages / 10) * 9)) { - /* Somebody wants to assign too much memory to dirty pages */ - return -EINVAL; - } - - if (val < 4 << (20 - PAGE_SHIFT)) { - /* Less than 4 Mb for dirty cache is also bad */ - return -EINVAL; - } - - obd_max_dirty_pages = val; - - return count; -} -LUSTRE_RW_ATTR(max_dirty_mb); - -LUSTRE_STATIC_UINT_ATTR(debug_peer_on_timeout, &obd_debug_peer_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_timeout, &obd_dump_on_timeout); -LUSTRE_STATIC_UINT_ATTR(dump_on_eviction, &obd_dump_on_eviction); -LUSTRE_STATIC_UINT_ATTR(at_min, &at_min); -LUSTRE_STATIC_UINT_ATTR(at_max, &at_max); -LUSTRE_STATIC_UINT_ATTR(at_extra, &at_extra); -LUSTRE_STATIC_UINT_ATTR(at_early_margin, &at_early_margin); -LUSTRE_STATIC_UINT_ATTR(at_history, &at_history); - -static struct attribute *lustre_attrs[] = { - &lustre_sattr_timeout.u.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_sattr_debug_peer_on_timeout.u.attr, - &lustre_sattr_dump_on_timeout.u.attr, - &lustre_sattr_dump_on_eviction.u.attr, - &lustre_sattr_at_min.u.attr, - &lustre_sattr_at_max.u.attr, - &lustre_sattr_at_extra.u.attr, - &lustre_sattr_at_early_margin.u.attr, - &lustre_sattr_at_history.u.attr, - NULL, -}; - -static const struct attribute_group lustre_attr_group = { - .attrs = lustre_attrs, -}; - -int obd_sysctl_init(void) -{ - return sysfs_create_group(lustre_kobj, &lustre_attr_group); -} diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c deleted file mode 100644 index bba84eae1e19ea397c23596d9366a692004dbd98..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/llog.c +++ /dev/null @@ -1,524 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
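max_dirty_mb_store above converts megabytes to pages with a shift (1 MiB is 1 << (20 - PAGE_SHIFT) pages) and rejects settings above nine tenths of RAM or below 4 MiB. The same arithmetic, assuming 4 KiB pages:

    /* Sketch of the bounds check, with PAGE_SHIFT fixed at 12 (4 KiB). */
    #define SHIFT_MB_TO_PAGES (20 - 12)

    static int set_max_dirty_pages(unsigned long mb, unsigned long total_pages,
                                   unsigned long *out_pages)
    {
            unsigned long pages = mb << SHIFT_MB_TO_PAGES;

            if (pages > total_pages / 10 * 9)
                    return -1;      /* leave headroom for everything else */
            if (pages < 4UL << SHIFT_MB_TO_PAGES)
                    return -1;      /* under 4 MiB of dirty cache is useless */
            *out_pages = pages;
            return 0;
    }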
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/llog.c - * - * OST<->MDS recovery logging infrastructure. - * Invariants in implementation: - * - we do not share logs among different OST<->MDS connections, so that - * if an OST or MDS fails it need only look at log(s) relevant to itself - * - * Author: Andreas Dilger - * Author: Alex Zhuravlev - * Author: Mikhail Pershin - */ - -#define DEBUG_SUBSYSTEM S_LOG - -#include -#include -#include -#include -#include "llog_internal.h" - -/* - * Allocate a new log or catalog handle - * Used inside llog_open(). - */ -static struct llog_handle *llog_alloc_handle(void) -{ - struct llog_handle *loghandle; - - loghandle = kzalloc(sizeof(*loghandle), GFP_NOFS); - if (!loghandle) - return NULL; - - init_rwsem(&loghandle->lgh_lock); - spin_lock_init(&loghandle->lgh_hdr_lock); - INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); - atomic_set(&loghandle->lgh_refcount, 1); - - return loghandle; -} - -/* - * Free llog handle and header data if exists. 
Used in llog_close() only - */ -static void llog_free_handle(struct llog_handle *loghandle) -{ - /* failed llog_init_handle */ - if (!loghandle->lgh_hdr) - goto out; - - if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) - LASSERT(list_empty(&loghandle->u.phd.phd_entry)); - else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) - LASSERT(list_empty(&loghandle->u.chd.chd_head)); - kvfree(loghandle->lgh_hdr); -out: - kfree(loghandle); -} - -void llog_handle_get(struct llog_handle *loghandle) -{ - atomic_inc(&loghandle->lgh_refcount); -} - -void llog_handle_put(struct llog_handle *loghandle) -{ - LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); - if (atomic_dec_and_test(&loghandle->lgh_refcount)) - llog_free_handle(loghandle); -} - -static int llog_read_header(const struct lu_env *env, - struct llog_handle *handle, - struct obd_uuid *uuid) -{ - struct llog_operations *lop; - int rc; - - rc = llog_handle2ops(handle, &lop); - if (rc) - return rc; - - if (!lop->lop_read_header) - return -EOPNOTSUPP; - - rc = lop->lop_read_header(env, handle); - if (rc == LLOG_EEMPTY) { - struct llog_log_hdr *llh = handle->lgh_hdr; - size_t len; - - /* lrh_len should be initialized in llog_init_handle */ - handle->lgh_last_idx = 0; /* header is record with index 0 */ - llh->llh_count = 1; /* for the header record */ - llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; - LASSERT(handle->lgh_ctxt->loc_chunk_size >= LLOG_MIN_CHUNK_SIZE); - llh->llh_hdr.lrh_len = handle->lgh_ctxt->loc_chunk_size; - llh->llh_hdr.lrh_index = 0; - llh->llh_timestamp = ktime_get_real_seconds(); - if (uuid) - memcpy(&llh->llh_tgtuuid, uuid, - sizeof(llh->llh_tgtuuid)); - llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); - /* - * Since update llog header might also call this function, - * let's reset the bitmap to 0 here - */ - len = llh->llh_hdr.lrh_len - llh->llh_bitmap_offset; - memset(LLOG_HDR_BITMAP(llh), 0, len - sizeof(llh->llh_tail)); - ext2_set_bit(0, LLOG_HDR_BITMAP(llh)); - LLOG_HDR_TAIL(llh)->lrt_len = llh->llh_hdr.lrh_len; - LLOG_HDR_TAIL(llh)->lrt_index = llh->llh_hdr.lrh_index; - rc = 0; - } - return rc; -} - -int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, - int flags, struct obd_uuid *uuid) -{ - int chunk_size = handle->lgh_ctxt->loc_chunk_size; - enum llog_flag fmt = flags & LLOG_F_EXT_MASK; - struct llog_log_hdr *llh; - int rc; - - LASSERT(!handle->lgh_hdr); - - LASSERT(chunk_size >= LLOG_MIN_CHUNK_SIZE); - llh = kvzalloc(sizeof(*llh), GFP_KERNEL); - if (!llh) - return -ENOMEM; - handle->lgh_hdr = llh; - handle->lgh_hdr_size = chunk_size; - /* first assign flags to use llog_client_ops */ - llh->llh_flags = flags; - rc = llog_read_header(env, handle, uuid); - if (rc == 0) { - if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && - flags & LLOG_F_IS_CAT) || - (llh->llh_flags & LLOG_F_IS_CAT && - flags & LLOG_F_IS_PLAIN))) { - CERROR("%s: llog type is %s but initializing %s\n", - handle->lgh_ctxt->loc_obd->obd_name, - llh->llh_flags & LLOG_F_IS_CAT ? - "catalog" : "plain", - flags & LLOG_F_IS_CAT ? 
"catalog" : "plain"); - rc = -EINVAL; - goto out; - } else if (llh->llh_flags & - (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { - /* - * it is possible to open llog without specifying llog - * type so it is taken from llh_flags - */ - flags = llh->llh_flags; - } else { - /* for some reason the llh_flags has no type set */ - CERROR("llog type is not specified!\n"); - rc = -EINVAL; - goto out; - } - if (unlikely(uuid && - !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { - CERROR("%s: llog uuid mismatch: %s/%s\n", - handle->lgh_ctxt->loc_obd->obd_name, - (char *)uuid->uuid, - (char *)llh->llh_tgtuuid.uuid); - rc = -EEXIST; - goto out; - } - } - if (flags & LLOG_F_IS_CAT) { - LASSERT(list_empty(&handle->u.chd.chd_head)); - INIT_LIST_HEAD(&handle->u.chd.chd_head); - llh->llh_size = sizeof(struct llog_logid_rec); - llh->llh_flags |= LLOG_F_IS_FIXSIZE; - } else if (!(flags & LLOG_F_IS_PLAIN)) { - CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", - handle->lgh_ctxt->loc_obd->obd_name, - flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); - rc = -EINVAL; - } - llh->llh_flags |= fmt; -out: - if (rc) { - kvfree(llh); - handle->lgh_hdr = NULL; - } - return rc; -} -EXPORT_SYMBOL(llog_init_handle); - -static int llog_process_thread(void *arg) -{ - struct llog_process_info *lpi = arg; - struct llog_handle *loghandle = lpi->lpi_loghandle; - struct llog_log_hdr *llh = loghandle->lgh_hdr; - struct llog_process_cat_data *cd = lpi->lpi_catdata; - char *buf; - u64 cur_offset, tmp_offset; - int chunk_size; - int rc = 0, index = 1, last_index; - int saved_index = 0; - int last_called_index = 0; - - if (!llh) - return -EINVAL; - - cur_offset = llh->llh_hdr.lrh_len; - chunk_size = llh->llh_hdr.lrh_len; - /* expect chunk_size to be power of two */ - LASSERT(is_power_of_2(chunk_size)); - - buf = kvzalloc(chunk_size, GFP_NOFS); - if (!buf) { - lpi->lpi_rc = -ENOMEM; - return 0; - } - - if (cd) { - last_called_index = cd->lpcd_first_idx; - index = cd->lpcd_first_idx + 1; - } - if (cd && cd->lpcd_last_idx) - last_index = cd->lpcd_last_idx; - else - last_index = LLOG_HDR_BITMAP_SIZE(llh) - 1; - - while (rc == 0) { - unsigned int buf_offset = 0; - struct llog_rec_hdr *rec; - bool partial_chunk; - off_t chunk_offset; - - /* skip records not set in bitmap */ - while (index <= last_index && - !ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) - ++index; - - if (index > last_index) - break; - - CDEBUG(D_OTHER, "index: %d last_index %d\n", - index, last_index); -repeat: - /* get the buf with our target record; avoid old garbage */ - memset(buf, 0, chunk_size); - rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, - index, &cur_offset, buf, chunk_size); - if (rc) - goto out; - - /* - * NB: after llog_next_block() call the cur_offset is the - * offset of the next block after read one. - * The absolute offset of the current chunk is calculated - * from cur_offset value and stored in chunk_offset variable. - */ - tmp_offset = cur_offset; - if (do_div(tmp_offset, chunk_size)) { - partial_chunk = true; - chunk_offset = cur_offset & ~(chunk_size - 1); - } else { - partial_chunk = false; - chunk_offset = cur_offset - chunk_size; - } - - /* NB: when rec->lrh_len is accessed it is already swabbed - * since it is used at the "end" of the loop and the rec - * swabbing is done at the beginning of the loop. 
- */ - for (rec = (struct llog_rec_hdr *)(buf + buf_offset); - (char *)rec < buf + chunk_size; - rec = llog_rec_hdr_next(rec)) { - CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", - rec, rec->lrh_type); - - if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) - lustre_swab_llog_rec(rec); - - CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", - rec->lrh_type, rec->lrh_index); - - /* - * for partial chunk the end of it is zeroed, check - * for index 0 to distinguish it. - */ - if (partial_chunk && !rec->lrh_index) { - /* concurrent llog_add() might add new records - * while llog_processing, check this is not - * the case and re-read the current chunk - * otherwise. - */ - if (index > loghandle->lgh_last_idx) { - rc = 0; - goto out; - } - CDEBUG(D_OTHER, "Re-read last llog buffer for new records, index %u, last %u\n", - index, loghandle->lgh_last_idx); - /* save offset inside buffer for the re-read */ - buf_offset = (char *)rec - (char *)buf; - cur_offset = chunk_offset; - goto repeat; - } - - if (!rec->lrh_len || rec->lrh_len > chunk_size) { - CWARN("invalid length %d in llog record for index %d/%d\n", - rec->lrh_len, - rec->lrh_index, index); - rc = -EINVAL; - goto out; - } - - if (rec->lrh_index < index) { - CDEBUG(D_OTHER, "skipping lrh_index %d\n", - rec->lrh_index); - continue; - } - - if (rec->lrh_index != index) { - CERROR("%s: Invalid record: index %u but expected %u\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - rec->lrh_index, index); - rc = -ERANGE; - goto out; - } - - CDEBUG(D_OTHER, - "lrh_index: %d lrh_len: %d (%d remains)\n", - rec->lrh_index, rec->lrh_len, - (int)(buf + chunk_size - (char *)rec)); - - loghandle->lgh_cur_idx = rec->lrh_index; - loghandle->lgh_cur_offset = (char *)rec - (char *)buf + - chunk_offset; - - /* if set, process the callback on this record */ - if (ext2_test_bit(index, LLOG_HDR_BITMAP(llh))) { - rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, - lpi->lpi_cbdata); - last_called_index = index; - if (rc) - goto out; - } - - /* exit if the last index is reached */ - if (index >= last_index) { - rc = 0; - goto out; - } - index++; - } - } - -out: - if (cd) - cd->lpcd_last_idx = last_called_index; - - kvfree(buf); - lpi->lpi_rc = rc; - return 0; -} - -static int llog_process_thread_daemonize(void *arg) -{ - struct llog_process_info *lpi = arg; - struct lu_env env; - int rc; - - unshare_fs_struct(); - - /* client env has no keys, tags is just 0 */ - rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); - if (rc) - goto out; - lpi->lpi_env = &env; - - rc = llog_process_thread(arg); - - lu_env_fini(&env); -out: - complete(&lpi->lpi_completion); - return rc; -} - -int llog_process_or_fork(const struct lu_env *env, - struct llog_handle *loghandle, - llog_cb_t cb, void *data, void *catdata, bool fork) -{ - struct llog_process_info *lpi; - int rc; - - lpi = kzalloc(sizeof(*lpi), GFP_NOFS); - if (!lpi) - return -ENOMEM; - lpi->lpi_loghandle = loghandle; - lpi->lpi_cb = cb; - lpi->lpi_cbdata = data; - lpi->lpi_catdata = catdata; - - if (fork) { - struct task_struct *task; - - /* The new thread can't use parent env, - * init the new one in llog_process_thread_daemonize. 
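llog_process_or_fork runs the record loop either inline, borrowing the caller's env, or on a kthread with a fresh env and a completion to join on. Stripped to its shape, with pthreads standing in for kthread_run() and the completion:

    #include <pthread.h>
    #include <stdbool.h>

    struct work {
            int (*fn)(void *);
            void *arg;
            int rc;
    };

    static void *work_trampoline(void *p)
    {
            struct work *w = p;

            w->rc = w->fn(w->arg);
            return NULL;
    }

    static int run_or_fork(int (*fn)(void *), void *arg, bool fork_it)
    {
            struct work w = { .fn = fn, .arg = arg, .rc = 0 };
            pthread_t t;

            if (!fork_it)
                    return fn(arg);         /* inline, caller's context */

            if (pthread_create(&t, NULL, work_trampoline, &w) != 0)
                    return -1;
            pthread_join(t, NULL);          /* wait_for_completion() analogue */
            return w.rc;
    }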
- */ - lpi->lpi_env = NULL; - init_completion(&lpi->lpi_completion); - task = kthread_run(llog_process_thread_daemonize, lpi, - "llog_process_thread"); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("%s: cannot start thread: rc = %d\n", - loghandle->lgh_ctxt->loc_obd->obd_name, rc); - goto out_lpi; - } - wait_for_completion(&lpi->lpi_completion); - } else { - lpi->lpi_env = env; - llog_process_thread(lpi); - } - rc = lpi->lpi_rc; -out_lpi: - kfree(lpi); - return rc; -} -EXPORT_SYMBOL(llog_process_or_fork); - -int llog_process(const struct lu_env *env, struct llog_handle *loghandle, - llog_cb_t cb, void *data, void *catdata) -{ - return llog_process_or_fork(env, loghandle, cb, data, catdata, true); -} -EXPORT_SYMBOL(llog_process); - -int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, - struct llog_handle **lgh, struct llog_logid *logid, - char *name, enum llog_open_param open_param) -{ - const struct cred *old_cred = NULL; - int rc; - - LASSERT(ctxt); - LASSERT(ctxt->loc_logops); - - if (!ctxt->loc_logops->lop_open) { - *lgh = NULL; - return -EOPNOTSUPP; - } - - *lgh = llog_alloc_handle(); - if (!*lgh) - return -ENOMEM; - (*lgh)->lgh_ctxt = ctxt; - (*lgh)->lgh_logops = ctxt->loc_logops; - - if (cap_raised(current_cap(), CAP_SYS_RESOURCE)) { - struct cred *cred = prepare_creds(); - - if (cred) { - cap_raise(cred->cap_effective, CAP_SYS_RESOURCE); - old_cred = override_creds(cred); - } - } - rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); - if (old_cred) - revert_creds(old_cred); - - if (rc) { - llog_free_handle(*lgh); - *lgh = NULL; - } - return rc; -} -EXPORT_SYMBOL(llog_open); - -int llog_close(const struct lu_env *env, struct llog_handle *loghandle) -{ - struct llog_operations *lop; - int rc; - - rc = llog_handle2ops(loghandle, &lop); - if (rc) - goto out; - if (!lop->lop_close) { - rc = -EOPNOTSUPP; - goto out; - } - rc = lop->lop_close(env, loghandle); -out: - llog_handle_put(loghandle); - return rc; -} -EXPORT_SYMBOL(llog_close); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c deleted file mode 100644 index d9c63adff206401d1b6a950ddfef91b3e00690a4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/llog_cat.c +++ /dev/null @@ -1,236 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/obdclass/llog_cat.c - * - * OST<->MDS recovery logging infrastructure. - * - * Invariants in implementation: - * - we do not share logs among different OST<->MDS connections, so that - * if an OST or MDS fails it need only look at log(s) relevant to itself - * - * Author: Andreas Dilger - * Author: Alexey Zhuravlev - * Author: Mikhail Pershin - */ - -#define DEBUG_SUBSYSTEM S_LOG - -#include - -#include "llog_internal.h" - -/* Open an existent log handle and add it to the open list. - * This log handle will be closed when all of the records in it are removed. - * - * Assumes caller has already pushed us into the kernel context and is locking. - * We return a lock on the handle to ensure nobody yanks it from us. - * - * This takes extra reference on llog_handle via llog_handle_get() and require - * this reference to be put by caller using llog_handle_put() - */ -static int llog_cat_id2handle(const struct lu_env *env, - struct llog_handle *cathandle, - struct llog_handle **res, - struct llog_logid *logid) -{ - struct llog_handle *loghandle; - enum llog_flag fmt; - int rc = 0; - - if (!cathandle) - return -EBADF; - - fmt = cathandle->lgh_hdr->llh_flags & LLOG_F_EXT_MASK; - down_write(&cathandle->lgh_lock); - list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, - u.phd.phd_entry) { - struct llog_logid *cgl = &loghandle->lgh_id; - - if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && - ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { - if (cgl->lgl_ogen != logid->lgl_ogen) { - CERROR("%s: log " DOSTID " generation %x != %x\n", - loghandle->lgh_ctxt->loc_obd->obd_name, - POSTID(&logid->lgl_oi), cgl->lgl_ogen, - logid->lgl_ogen); - continue; - } - loghandle->u.phd.phd_cat_handle = cathandle; - up_write(&cathandle->lgh_lock); - rc = 0; - goto out; - } - } - up_write(&cathandle->lgh_lock); - - rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, - LLOG_OPEN_EXISTS); - if (rc < 0) { - CERROR("%s: error opening log id " DOSTID ":%x: rc = %d\n", - cathandle->lgh_ctxt->loc_obd->obd_name, - POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); - return rc; - } - - rc = llog_init_handle(env, loghandle, fmt | LLOG_F_IS_PLAIN, NULL); - if (rc < 0) { - llog_close(env, loghandle); - loghandle = NULL; - return rc; - } - - down_write(&cathandle->lgh_lock); - list_add_tail(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); - up_write(&cathandle->lgh_lock); - - loghandle->u.phd.phd_cat_handle = cathandle; - loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; - loghandle->u.phd.phd_cookie.lgc_index = - loghandle->lgh_hdr->llh_cat_idx; -out: - llog_handle_get(loghandle); - *res = loghandle; - return 0; -} - -int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) -{ - struct llog_handle *loghandle, *n; - - list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, - u.phd.phd_entry) { - /* unlink open-not-created llogs */ - list_del_init(&loghandle->u.phd.phd_entry); - llog_close(env, loghandle); - } - /* if handle was stored in ctxt, remove it too */ - if (cathandle->lgh_ctxt->loc_handle == cathandle) - cathandle->lgh_ctxt->loc_handle = NULL; - return llog_close(env, cathandle); -} -EXPORT_SYMBOL(llog_cat_close); - -static int llog_cat_process_cb(const struct lu_env *env, - struct llog_handle *cat_llh, - struct llog_rec_hdr *rec, void *data) -{ - struct llog_process_data *d = data; - struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; - struct llog_handle *llh; - int rc; - - if (rec->lrh_type != LLOG_LOGID_MAGIC) { - 
CERROR("invalid record in catalog\n"); - return -EINVAL; - } - CDEBUG(D_HA, "processing log " DOSTID ":%x at index %u of catalog " - DOSTID "\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, - rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi)); - - rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); - if (rc) { - CERROR("%s: cannot find handle for llog " DOSTID ": %d\n", - cat_llh->lgh_ctxt->loc_obd->obd_name, - POSTID(&lir->lid_id.lgl_oi), rc); - return rc; - } - - if (rec->lrh_index < d->lpd_startcat) - /* Skip processing of the logs until startcat */ - rc = 0; - else if (d->lpd_startidx > 0) { - struct llog_process_cat_data cd; - - cd.lpcd_first_idx = d->lpd_startidx; - cd.lpcd_last_idx = 0; - rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, - &cd, false); - /* Continue processing the next log from idx 0 */ - d->lpd_startidx = 0; - } else { - rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, - NULL, false); - } - - llog_handle_put(llh); - - return rc; -} - -static int llog_cat_process_or_fork(const struct lu_env *env, - struct llog_handle *cat_llh, - llog_cb_t cb, void *data, int startcat, - int startidx, bool fork) -{ - struct llog_process_data d; - struct llog_log_hdr *llh = cat_llh->lgh_hdr; - int rc; - - LASSERT(llh->llh_flags & LLOG_F_IS_CAT); - d.lpd_data = data; - d.lpd_cb = cb; - d.lpd_startcat = startcat; - d.lpd_startidx = startidx; - - if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { - struct llog_process_cat_data cd; - - CWARN("catlog " DOSTID " crosses index zero\n", - POSTID(&cat_llh->lgh_id.lgl_oi)); - - cd.lpcd_first_idx = llh->llh_cat_idx; - cd.lpcd_last_idx = 0; - rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, - &d, &cd, fork); - if (rc != 0) - return rc; - - cd.lpcd_first_idx = 0; - cd.lpcd_last_idx = cat_llh->lgh_last_idx; - rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, - &d, &cd, fork); - } else { - rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, - &d, NULL, fork); - } - - return rc; -} - -int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, - llog_cb_t cb, void *data, int startcat, int startidx) -{ - return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat, - startidx, false); -} -EXPORT_SYMBOL(llog_cat_process); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h deleted file mode 100644 index 4991d4e589dcbd46bc1eaa55456a7f7a61eec0dc..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/llog_internal.h +++ /dev/null @@ -1,79 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. 
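llog_cat_process_or_fork above copes with a catalog whose index has wrapped past zero by processing in two passes: from the first valid index to the end of the log, then from index 0 up to the last written index. The control flow reduced to integers (process_wrapped is illustrative):

    /* Visit every valid index of a circular log whose first valid
     * index may sit above the last written one. */
    static void process_wrapped(unsigned int first, unsigned int last,
                                unsigned int max_idx,
                                void (*cb)(unsigned int))
    {
            unsigned int i;

            if (first > last) {             /* wrapped: two passes */
                    for (i = first; i <= max_idx; i++)
                            cb(i);
                    first = 0;
            }
            for (i = first; i <= last; i++)
                    cb(i);
    }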
All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef __LLOG_INTERNAL_H__ -#define __LLOG_INTERNAL_H__ - -#include - -struct llog_process_info { - struct llog_handle *lpi_loghandle; - llog_cb_t lpi_cb; - void *lpi_cbdata; - void *lpi_catdata; - int lpi_rc; - struct completion lpi_completion; - const struct lu_env *lpi_env; - -}; - -struct llog_thread_info { - struct lu_attr lgi_attr; - struct lu_fid lgi_fid; - struct lu_buf lgi_buf; - loff_t lgi_off; - struct llog_rec_hdr lgi_lrh; - struct llog_rec_tail lgi_tail; -}; - -extern struct lu_context_key llog_thread_key; - -int llog_info_init(void); -void llog_info_fini(void); - -void llog_handle_get(struct llog_handle *loghandle); -void llog_handle_put(struct llog_handle *loghandle); -int class_config_dump_handler(const struct lu_env *env, - struct llog_handle *handle, - struct llog_rec_hdr *rec, void *data); -int llog_process_or_fork(const struct lu_env *env, - struct llog_handle *loghandle, - llog_cb_t cb, void *data, void *catdata, bool fork); -int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle, - struct llog_handle *loghandle, int index); - -static inline struct llog_rec_hdr *llog_rec_hdr_next(struct llog_rec_hdr *rec) -{ - return (struct llog_rec_hdr *)((char *)rec + rec->lrh_len); -} -#endif diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c deleted file mode 100644 index 26aea114a29bdd41c192463189f58dc755f84e14..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/llog_obd.c +++ /dev/null @@ -1,225 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */ - -#define DEBUG_SUBSYSTEM S_LOG - -#include -#include -#include "llog_internal.h" - -/* helper functions for calling the llog obd methods */ -static struct llog_ctxt *llog_new_ctxt(struct obd_device *obd) -{ - struct llog_ctxt *ctxt; - - ctxt = kzalloc(sizeof(*ctxt), GFP_NOFS); - if (!ctxt) - return NULL; - - ctxt->loc_obd = obd; - atomic_set(&ctxt->loc_refcount, 1); - - return ctxt; -} - -static void llog_ctxt_destroy(struct llog_ctxt *ctxt) -{ - if (ctxt->loc_exp) { - class_export_put(ctxt->loc_exp); - ctxt->loc_exp = NULL; - } - if (ctxt->loc_imp) { - class_import_put(ctxt->loc_imp); - ctxt->loc_imp = NULL; - } - kfree(ctxt); -} - -int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) -{ - struct obd_llog_group *olg = ctxt->loc_olg; - struct obd_device *obd; - int rc = 0; - - spin_lock(&olg->olg_lock); - if (!atomic_dec_and_test(&ctxt->loc_refcount)) { - spin_unlock(&olg->olg_lock); - return rc; - } - olg->olg_ctxts[ctxt->loc_idx] = NULL; - spin_unlock(&olg->olg_lock); - - obd = ctxt->loc_obd; - spin_lock(&obd->obd_dev_lock); - /* sync with llog ctxt user thread */ - spin_unlock(&obd->obd_dev_lock); - - /* obd->obd_starting is needed for the case of cleanup - * in error case while obd is starting up. - */ - LASSERTF(obd->obd_starting == 1 || - obd->obd_stopping == 1 || obd->obd_set_up == 0, - "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, - !!obd->obd_stopping, !!obd->obd_set_up); - - /* cleanup the llog ctxt here */ - if (CTXTP(ctxt, cleanup)) - rc = CTXTP(ctxt, cleanup)(env, ctxt); - - llog_ctxt_destroy(ctxt); - wake_up(&olg->olg_waitq); - return rc; -} -EXPORT_SYMBOL(__llog_ctxt_put); - -int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) -{ - struct obd_llog_group *olg; - int rc, idx; - - olg = ctxt->loc_olg; - LASSERT(olg); - LASSERT(olg != LP_POISON); - - idx = ctxt->loc_idx; - - /* - * Banlance the ctxt get when calling llog_cleanup() - */ - LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); - LASSERT(atomic_read(&ctxt->loc_refcount) > 1); - llog_ctxt_put(ctxt); - - /* - * Try to free the ctxt. - */ - rc = __llog_ctxt_put(env, ctxt); - if (rc) - CERROR("Error %d while cleaning up ctxt %p\n", - rc, ctxt); - - l_wait_event_abortable(olg->olg_waitq, - llog_group_ctxt_null(olg, idx)); - - return rc; -} -EXPORT_SYMBOL(llog_cleanup); - -int llog_setup(const struct lu_env *env, struct obd_device *obd, - struct obd_llog_group *olg, int index, - struct obd_device *disk_obd, struct llog_operations *op) -{ - struct llog_ctxt *ctxt; - int rc = 0; - - if (index < 0 || index >= LLOG_MAX_CTXTS) - return -EINVAL; - - LASSERT(olg); - - ctxt = llog_new_ctxt(obd); - if (!ctxt) - return -ENOMEM; - - ctxt->loc_obd = obd; - ctxt->loc_olg = olg; - ctxt->loc_idx = index; - ctxt->loc_logops = op; - mutex_init(&ctxt->loc_mutex); - ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); - ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; - ctxt->loc_chunk_size = LLOG_MIN_CHUNK_SIZE; - - rc = llog_group_set_ctxt(olg, ctxt, index); - if (rc) { - llog_ctxt_destroy(ctxt); - if (rc == -EEXIST) { - ctxt = llog_group_get_ctxt(olg, index); - if (ctxt) { - /* - * mds_lov_update_desc() might call here multiple - * times. So if the llog is already set up then - * don't to do it again. 
- */ - CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", - obd->obd_name, index); - LASSERT(ctxt->loc_olg == olg); - LASSERT(ctxt->loc_obd == obd); - LASSERT(ctxt->loc_exp == disk_obd->obd_self_export); - LASSERT(ctxt->loc_logops == op); - llog_ctxt_put(ctxt); - } - rc = 0; - } - return rc; - } - - if (op->lop_setup) { - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) - rc = -EOPNOTSUPP; - else - rc = op->lop_setup(env, obd, olg, index, disk_obd); - } - - if (rc) { - CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", - obd->obd_name, index, op->lop_setup, rc); - llog_group_clear_ctxt(olg, index); - llog_ctxt_destroy(ctxt); - } else { - CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", - obd->obd_name, index); - ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; - } - - return rc; -} -EXPORT_SYMBOL(llog_setup); - -/* context key constructor/destructor: llog_key_init, llog_key_fini */ -LU_KEY_INIT_FINI(llog, struct llog_thread_info); -/* context key: llog_thread_key */ -LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); -LU_KEY_INIT_GENERIC(llog); - -int llog_info_init(void) -{ - llog_key_init_generic(&llog_thread_key, NULL); - lu_context_key_register(&llog_thread_key); - return 0; -} - -void llog_info_fini(void) -{ - lu_context_key_degister(&llog_thread_key); -} diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c deleted file mode 100644 index b431c3408fe470dc442ff9c9582d6757594b4e92..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/llog_swab.c +++ /dev/null @@ -1,412 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/llog_swab.c - * - * Swabbing of llog datatypes (from disk or over the wire). 
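- * Each helper byte-swaps every fixed-width field of one datatype
- * in place using the __swabXXs() primitives; callers are expected
- * to invoke them only when the on-disk or sender byte order
- * differs from the host's.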
- * - * Author: jacob berkman - */ - -#define DEBUG_SUBSYSTEM S_LOG - -#include -#include - -static void print_llogd_body(struct llogd_body *d) -{ - CDEBUG(D_OTHER, "llogd body: %p\n", d); - CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: " DOSTID "\n", - POSTID(&d->lgd_logid.lgl_oi)); - CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); - CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); - CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); - CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); - CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); - CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); - CDEBUG(D_OTHER, "\tlgd_cur_offset: %#llx\n", d->lgd_cur_offset); -} - -void lustre_swab_lu_fid(struct lu_fid *fid) -{ - __swab64s(&fid->f_seq); - __swab32s(&fid->f_oid); - __swab32s(&fid->f_ver); -} -EXPORT_SYMBOL(lustre_swab_lu_fid); - -void lustre_swab_ost_id(struct ost_id *oid) -{ - if (fid_seq_is_mdt0(oid->oi.oi_seq)) { - __swab64s(&oid->oi.oi_id); - __swab64s(&oid->oi.oi_seq); - } else { - lustre_swab_lu_fid(&oid->oi_fid); - } -} -EXPORT_SYMBOL(lustre_swab_ost_id); - -static void lustre_swab_llog_id(struct llog_logid *log_id) -{ - __swab64s(&log_id->lgl_oi.oi.oi_id); - __swab64s(&log_id->lgl_oi.oi.oi_seq); - __swab32s(&log_id->lgl_ogen); -} - -void lustre_swab_llogd_body(struct llogd_body *d) -{ - print_llogd_body(d); - lustre_swab_llog_id(&d->lgd_logid); - __swab32s(&d->lgd_ctxt_idx); - __swab32s(&d->lgd_llh_flags); - __swab32s(&d->lgd_index); - __swab32s(&d->lgd_saved_index); - __swab32s(&d->lgd_len); - __swab64s(&d->lgd_cur_offset); - print_llogd_body(d); -} -EXPORT_SYMBOL(lustre_swab_llogd_body); - -void lustre_swab_llogd_conn_body(struct llogd_conn_body *d) -{ - __swab64s(&d->lgdc_gen.mnt_cnt); - __swab64s(&d->lgdc_gen.conn_cnt); - lustre_swab_llog_id(&d->lgdc_logid); - __swab32s(&d->lgdc_ctxt_idx); -} -EXPORT_SYMBOL(lustre_swab_llogd_conn_body); - -static void lustre_swab_ll_fid(struct ll_fid *fid) -{ - __swab64s(&fid->id); - __swab32s(&fid->generation); - __swab32s(&fid->f_type); -} - -void lustre_swab_lu_seq_range(struct lu_seq_range *range) -{ - __swab64s(&range->lsr_start); - __swab64s(&range->lsr_end); - __swab32s(&range->lsr_index); - __swab32s(&range->lsr_flags); -} -EXPORT_SYMBOL(lustre_swab_lu_seq_range); - -void lustre_swab_llog_rec(struct llog_rec_hdr *rec) -{ - struct llog_rec_tail *tail = NULL; - - __swab32s(&rec->lrh_len); - __swab32s(&rec->lrh_index); - __swab32s(&rec->lrh_type); - __swab32s(&rec->lrh_id); - - switch (rec->lrh_type) { - case OST_SZ_REC: - { - struct llog_size_change_rec *lsc = - (struct llog_size_change_rec *)rec; - - lustre_swab_ll_fid(&lsc->lsc_fid); - __swab32s(&lsc->lsc_ioepoch); - tail = &lsc->lsc_tail; - break; - } - case MDS_UNLINK_REC: - { - struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; - - __swab64s(&lur->lur_oid); - __swab32s(&lur->lur_oseq); - __swab32s(&lur->lur_count); - tail = &lur->lur_tail; - break; - } - case MDS_UNLINK64_REC: - { - struct llog_unlink64_rec *lur = - (struct llog_unlink64_rec *)rec; - - lustre_swab_lu_fid(&lur->lur_fid); - __swab32s(&lur->lur_count); - tail = &lur->lur_tail; - break; - } - case CHANGELOG_REC: - { - struct llog_changelog_rec *cr = - (struct llog_changelog_rec *)rec; - - __swab16s(&cr->cr.cr_namelen); - __swab16s(&cr->cr.cr_flags); - __swab32s(&cr->cr.cr_type); - __swab64s(&cr->cr.cr_index); - __swab64s(&cr->cr.cr_prev); - __swab64s(&cr->cr.cr_time); - lustre_swab_lu_fid(&cr->cr.cr_tfid); - lustre_swab_lu_fid(&cr->cr.cr_pfid); - if (cr->cr.cr_flags & 
CLF_RENAME) { - struct changelog_ext_rename *rnm = - changelog_rec_rename(&cr->cr); - - lustre_swab_lu_fid(&rnm->cr_sfid); - lustre_swab_lu_fid(&rnm->cr_spfid); - } - /* - * Because the tail follows a variable-length structure we need - * to compute its location at runtime - */ - tail = (struct llog_rec_tail *)((char *)&cr->cr + - changelog_rec_size(&cr->cr) + - cr->cr.cr_namelen); - break; - } - - case CHANGELOG_USER_REC: - { - struct llog_changelog_user_rec *cur = - (struct llog_changelog_user_rec *)rec; - - __swab32s(&cur->cur_id); - __swab64s(&cur->cur_endrec); - tail = &cur->cur_tail; - break; - } - - case HSM_AGENT_REC: { - struct llog_agent_req_rec *arr = - (struct llog_agent_req_rec *)rec; - - __swab32s(&arr->arr_hai.hai_len); - __swab32s(&arr->arr_hai.hai_action); - lustre_swab_lu_fid(&arr->arr_hai.hai_fid); - lustre_swab_lu_fid(&arr->arr_hai.hai_dfid); - __swab64s(&arr->arr_hai.hai_cookie); - __swab64s(&arr->arr_hai.hai_extent.offset); - __swab64s(&arr->arr_hai.hai_extent.length); - __swab64s(&arr->arr_hai.hai_gid); - /* no swabing for opaque data */ - /* hai_data[0]; */ - break; - } - - case MDS_SETATTR64_REC: - { - struct llog_setattr64_rec *lsr = - (struct llog_setattr64_rec *)rec; - - lustre_swab_ost_id(&lsr->lsr_oi); - __swab32s(&lsr->lsr_uid); - __swab32s(&lsr->lsr_uid_h); - __swab32s(&lsr->lsr_gid); - __swab32s(&lsr->lsr_gid_h); - __swab64s(&lsr->lsr_valid); - tail = &lsr->lsr_tail; - break; - } - case OBD_CFG_REC: - /* these are swabbed as they are consumed */ - break; - case LLOG_HDR_MAGIC: - { - struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; - - __swab64s(&llh->llh_timestamp); - __swab32s(&llh->llh_count); - __swab32s(&llh->llh_bitmap_offset); - __swab32s(&llh->llh_flags); - __swab32s(&llh->llh_size); - __swab32s(&llh->llh_cat_idx); - tail = LLOG_HDR_TAIL(llh); - break; - } - case LLOG_LOGID_MAGIC: - { - struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; - - lustre_swab_llog_id(&lid->lid_id); - tail = &lid->lid_tail; - break; - } - case LLOG_GEN_REC: - { - struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec; - - __swab64s(&lgr->lgr_gen.mnt_cnt); - __swab64s(&lgr->lgr_gen.conn_cnt); - tail = &lgr->lgr_tail; - break; - } - case LLOG_PAD_MAGIC: - break; - default: - CERROR("Unknown llog rec type %#x swabbing rec %p\n", - rec->lrh_type, rec); - } - - if (tail) { - __swab32s(&tail->lrt_len); - __swab32s(&tail->lrt_index); - } -} -EXPORT_SYMBOL(lustre_swab_llog_rec); - -static void print_llog_hdr(struct llog_log_hdr *h) -{ - CDEBUG(D_OTHER, "llog header: %p\n", h); - CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); - CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); - CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); - CDEBUG(D_OTHER, "\tllh_timestamp: %#llx\n", h->llh_timestamp); - CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); - CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); - CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); - CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); - CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); - CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", - LLOG_HDR_TAIL(h)->lrt_index); - CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", - LLOG_HDR_TAIL(h)->lrt_len); -} - -void lustre_swab_llog_hdr(struct llog_log_hdr *h) -{ - print_llog_hdr(h); - - lustre_swab_llog_rec(&h->llh_hdr); - - print_llog_hdr(h); -} -EXPORT_SYMBOL(lustre_swab_llog_hdr); - -static void print_lustre_cfg(struct lustre_cfg *lcfg) -{ - int i; - - if (!(libcfs_debug & D_OTHER)) /* 
don't loop on nothing */ - return; - CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); - CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); - - CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); - CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); - CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); - CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); - - CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount); - if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) - for (i = 0; i < lcfg->lcfg_bufcount; i++) - CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", - i, lcfg->lcfg_buflens[i]); -} - -void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) -{ - int i; - - __swab32s(&lcfg->lcfg_version); - - if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { - CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", - lcfg->lcfg_version, LUSTRE_CFG_VERSION); - return; - } - - __swab32s(&lcfg->lcfg_command); - __swab32s(&lcfg->lcfg_num); - __swab32s(&lcfg->lcfg_flags); - __swab64s(&lcfg->lcfg_nid); - __swab32s(&lcfg->lcfg_bufcount); - for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) - __swab32s(&lcfg->lcfg_buflens[i]); - - print_lustre_cfg(lcfg); -} - -/* used only for compatibility with old on-disk cfg_marker data */ -struct cfg_marker32 { - __u32 cm_step; - __u32 cm_flags; - __u32 cm_vers; - __u32 padding; - __u32 cm_createtime; - __u32 cm_canceltime; - char cm_tgtname[MTI_NAME_MAXLEN]; - char cm_comment[MTI_NAME_MAXLEN]; -}; - -#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ - (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) - -void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) -{ - struct cfg_marker32 *cm32 = (struct cfg_marker32 *)marker; - - if (swab) { - __swab32s(&marker->cm_step); - __swab32s(&marker->cm_flags); - __swab32s(&marker->cm_vers); - } - if (size == sizeof(*cm32)) { - __u32 createtime, canceltime; - /* There was a problem with the original declaration of - * cfg_marker on 32-bit systems because it used time_t as - * a wire protocol structure, and didn't verify this in - * wirecheck. We now have to convert the offsets of the - * later fields in order to work on 32- and 64-bit systems. - * - * Fortunately, the cm_comment field has no functional use - * so can be sacrificed when converting the timestamp size. - * - * Overwrite fields from the end first, so they are not - * clobbered, and use memmove() instead of memcpy() because - * the source and target buffers overlap. 
bug 16771 - */ - createtime = cm32->cm_createtime; - canceltime = cm32->cm_canceltime; - memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); - marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; - memmove(marker->cm_tgtname, cm32->cm_tgtname, - sizeof(marker->cm_tgtname)); - if (swab) { - __swab32s(&createtime); - __swab32s(&canceltime); - } - marker->cm_createtime = createtime; - marker->cm_canceltime = canceltime; - CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) for target %s, converting\n", - marker->cm_tgtname); - } else if (swab) { - __swab64s(&marker->cm_createtime); - __swab64s(&marker->cm_canceltime); - } -} diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c deleted file mode 100644 index 85f09aff6e835ffbfb1769910028237463c483b7..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/lprocfs_counters.c +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2013, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/lprocfs_counters.c - * - * Lustre lprocfs counter routines - * - * Author: Andreas Dilger - */ - -#include -#include -#include -#include - -void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, long amount) -{ - struct lprocfs_counter *percpu_cntr; - struct lprocfs_counter_header *header; - int smp_id; - unsigned long flags = 0; - - if (!stats) - return; - - LASSERTF(0 <= idx && idx < stats->ls_num, - "idx %d, ls_num %hu\n", idx, stats->ls_num); - - /* With per-client stats, statistics are allocated only for - * single CPU area, so the smp_id should be 0 always. - */ - smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); - if (smp_id < 0) - return; - - header = &stats->ls_cnt_header[idx]; - percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); - percpu_cntr->lc_count++; - - if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { - /* - * lprocfs_counter_add() can be called in interrupt context, - * as memory allocation could trigger memory shrinker call - * ldlm_pool_shrink(), which calls lprocfs_counter_add(). - * LU-1727. 
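- * In that case the amount is accumulated in the separate lc_sum_irq
- * counter, which lprocfs_read_helper() folds back into the reported
- * sum for IRQ-safe stats.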
- * - */ - if (in_interrupt() && - (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) - percpu_cntr->lc_sum_irq += amount; - else - percpu_cntr->lc_sum += amount; - - if (header->lc_config & LPROCFS_CNTR_STDDEV) - percpu_cntr->lc_sumsquare += (__s64)amount * amount; - if (amount < percpu_cntr->lc_min) - percpu_cntr->lc_min = amount; - if (amount > percpu_cntr->lc_max) - percpu_cntr->lc_max = amount; - } - lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); -} -EXPORT_SYMBOL(lprocfs_counter_add); - -void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, long amount) -{ - struct lprocfs_counter *percpu_cntr; - struct lprocfs_counter_header *header; - int smp_id; - unsigned long flags = 0; - - if (!stats) - return; - - LASSERTF(0 <= idx && idx < stats->ls_num, - "idx %d, ls_num %hu\n", idx, stats->ls_num); - - /* With per-client stats, statistics are allocated only for - * single CPU area, so the smp_id should be 0 always. - */ - smp_id = lprocfs_stats_lock(stats, LPROCFS_GET_SMP_ID, &flags); - if (smp_id < 0) - return; - - header = &stats->ls_cnt_header[idx]; - percpu_cntr = lprocfs_stats_counter_get(stats, smp_id, idx); - if (header->lc_config & LPROCFS_CNTR_AVGMINMAX) { - /* - * Sometimes we use RCU callbacks to free memory which calls - * lprocfs_counter_sub(), and RCU callbacks may execute in - * softirq context - right now that's the only case we're in - * softirq context here, use separate counter for that. - * bz20650. - * - */ - if (in_interrupt() && - (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) - percpu_cntr->lc_sum_irq -= amount; - else - percpu_cntr->lc_sum -= amount; - } - lprocfs_stats_unlock(stats, LPROCFS_GET_SMP_ID, &flags); -} -EXPORT_SYMBOL(lprocfs_counter_sub); diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c deleted file mode 100644 index bdbe6f52031a3eb37c9eca12251215287d035233..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c +++ /dev/null @@ -1,1698 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/obdclass/lprocfs_status.c - * - * Author: Hariharan Thantry - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include - -static const char * const obd_connect_names[] = { - "read_only", - "lov_index", - "connect_from_mds", - "write_grant", - "server_lock", - "version", - "request_portal", - "acl", - "xattr", - "create_on_write", - "truncate_lock", - "initial_transno", - "inode_bit_locks", - "join_file(obsolete)", - "getattr_by_fid", - "no_oh_for_devices", - "remote_client", - "remote_client_by_force", - "max_byte_per_rpc", - "64bit_qdata", - "mds_capability", - "oss_capability", - "early_lock_cancel", - "som", - "adaptive_timeouts", - "lru_resize", - "mds_mds_connection", - "real_conn", - "change_qunit_size", - "alt_checksum_algorithm", - "fid_is_enabled", - "version_recovery", - "pools", - "grant_shrink", - "skip_orphan", - "large_ea", - "full20", - "layout_lock", - "64bithash", - "object_max_bytes", - "imp_recov", - "jobstats", - "umask", - "einprogress", - "grant_param", - "flock_owner", - "lvb_type", - "nanoseconds_times", - "lightweight_conn", - "short_io", - "pingless", - "flock_deadlock", - "disp_stripe", - "open_by_fid", - "lfsck", - "unknown", - "unlink_close", - "multi_mod_rpcs", - "dir_stripe", - "subtree", - "lock_ahead", - "bulk_mbits", - "compact_obdo", - "second_flags", - NULL -}; - -int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep) -{ - __u64 mask = 1; - int i, ret = 0; - - for (i = 0; obd_connect_names[i]; i++, mask <<= 1) { - if (flags & mask) - ret += snprintf(page + ret, count - ret, "%s%s", - ret ? sep : "", obd_connect_names[i]); - } - if (flags & ~(mask - 1)) - ret += snprintf(page + ret, count - ret, - "%sunknown flags %#llx", - ret ? sep : "", flags & ~(mask - 1)); - return ret; -} -EXPORT_SYMBOL(obd_connect_flags2str); - -static void obd_connect_data_seqprint(struct seq_file *m, - struct obd_connect_data *ocd) -{ - u64 flags; - - LASSERT(ocd); - flags = ocd->ocd_connect_flags; - - seq_printf(m, " connect_data:\n" - " flags: %llx\n" - " instance: %u\n", - ocd->ocd_connect_flags, - ocd->ocd_instance); - if (flags & OBD_CONNECT_VERSION) - seq_printf(m, " target_version: %u.%u.%u.%u\n", - OBD_OCD_VERSION_MAJOR(ocd->ocd_version), - OBD_OCD_VERSION_MINOR(ocd->ocd_version), - OBD_OCD_VERSION_PATCH(ocd->ocd_version), - OBD_OCD_VERSION_FIX(ocd->ocd_version)); - if (flags & OBD_CONNECT_MDS) - seq_printf(m, " mdt_index: %d\n", ocd->ocd_group); - if (flags & OBD_CONNECT_GRANT) - seq_printf(m, " initial_grant: %d\n", ocd->ocd_grant); - if (flags & OBD_CONNECT_INDEX) - seq_printf(m, " target_index: %u\n", ocd->ocd_index); - if (flags & OBD_CONNECT_BRW_SIZE) - seq_printf(m, " max_brw_size: %d\n", ocd->ocd_brw_size); - if (flags & OBD_CONNECT_IBITS) - seq_printf(m, " ibits_known: %llx\n", - ocd->ocd_ibits_known); - if (flags & OBD_CONNECT_GRANT_PARAM) - seq_printf(m, " grant_block_size: %d\n" - " grant_inode_size: %d\n" - " grant_extent_overhead: %d\n", - ocd->ocd_blocksize, - ocd->ocd_inodespace, - ocd->ocd_grant_extent); - if (flags & OBD_CONNECT_TRANSNO) - seq_printf(m, " first_transno: %llx\n", - ocd->ocd_transno); - if (flags & OBD_CONNECT_CKSUM) - seq_printf(m, " cksum_types: %#x\n", - ocd->ocd_cksum_types); - if (flags & OBD_CONNECT_MAX_EASIZE) - seq_printf(m, " max_easize: %d\n", ocd->ocd_max_easize); - if (flags & OBD_CONNECT_MAXBYTES) - seq_printf(m, " max_object_bytes: %llx\n", - ocd->ocd_maxbytes); - if (flags & OBD_CONNECT_MULTIMODRPCS) - seq_printf(m, " max_mod_rpcs: %hu\n", - 
ocd->ocd_maxmodrpcs); -} - -int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, - int mult) -{ - long decimal_val, frac_val; - int prtn; - - if (count < 10) - return -EINVAL; - - decimal_val = val / mult; - prtn = snprintf(buffer, count, "%ld", decimal_val); - frac_val = val % mult; - - if (prtn < (count - 4) && frac_val > 0) { - long temp_frac; - int i, temp_mult = 1, frac_bits = 0; - - temp_frac = frac_val * 10; - buffer[prtn++] = '.'; - while (frac_bits < 2 && (temp_frac / mult) < 1) { - /* only reserved 2 bits fraction */ - buffer[prtn++] = '0'; - temp_frac *= 10; - frac_bits++; - } - /* - * Need to think these cases : - * 1. #echo x.00 > /sys/xxx output result : x - * 2. #echo x.0x > /sys/xxx output result : x.0x - * 3. #echo x.x0 > /sys/xxx output result : x.x - * 4. #echo x.xx > /sys/xxx output result : x.xx - * Only reserved 2 bits fraction. - */ - for (i = 0; i < (5 - prtn); i++) - temp_mult *= 10; - - frac_bits = min((int)count - prtn, 3 - frac_bits); - prtn += snprintf(buffer + prtn, frac_bits, "%ld", - frac_val * temp_mult / mult); - - prtn--; - while (buffer[prtn] < '1' || buffer[prtn] > '9') { - prtn--; - if (buffer[prtn] == '.') { - prtn--; - break; - } - } - prtn++; - } - buffer[prtn++] = '\n'; - return prtn; -} -EXPORT_SYMBOL(lprocfs_read_frac_helper); - -int lprocfs_write_frac_helper(const char __user *buffer, unsigned long count, - int *val, int mult) -{ - char kernbuf[20], *end, *pbuf; - - if (count > (sizeof(kernbuf) - 1)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - - kernbuf[count] = '\0'; - pbuf = kernbuf; - if (*pbuf == '-') { - mult = -mult; - pbuf++; - } - - *val = (int)simple_strtoul(pbuf, &end, 10) * mult; - if (pbuf == end) - return -EINVAL; - - if (end && *end == '.') { - int temp_val, pow = 1; - int i; - - pbuf = end + 1; - if (strlen(pbuf) > 5) - pbuf[5] = '\0'; /*only allow 5bits fractional*/ - - temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; - - if (pbuf < end) { - for (i = 0; i < (end - pbuf); i++) - pow *= 10; - - *val += temp_val / pow; - } - } - return 0; -} -EXPORT_SYMBOL(lprocfs_write_frac_helper); - -static int lprocfs_no_percpu_stats; -module_param(lprocfs_no_percpu_stats, int, 0644); -MODULE_PARM_DESC(lprocfs_no_percpu_stats, "Do not alloc percpu data for lprocfs stats"); - -#define MAX_STRING_SIZE 128 - -int lprocfs_single_release(struct inode *inode, struct file *file) -{ - return single_release(inode, file); -} -EXPORT_SYMBOL(lprocfs_single_release); - -int lprocfs_seq_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} -EXPORT_SYMBOL(lprocfs_seq_release); - -/* lprocfs API calls */ - -static const struct file_operations lprocfs_generic_fops = { }; - -void ldebugfs_add_vars(struct dentry *parent, struct lprocfs_vars *list, - void *data) -{ - if (IS_ERR_OR_NULL(parent) || IS_ERR_OR_NULL(list)) - return; - - while (list->name) { - umode_t mode = 0; - - if (list->proc_mode != 0000) { - mode = list->proc_mode; - } else if (list->fops) { - if (list->fops->read) - mode = 0444; - if (list->fops->write) - mode |= 0200; - } - debugfs_create_file(list->name, mode, parent, - list->data ?: data, - list->fops ?: &lprocfs_generic_fops); - list++; - } - return; -} -EXPORT_SYMBOL_GPL(ldebugfs_add_vars); - -/* Generic callbacks */ -static ssize_t uuid_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%s\n", obd->obd_uuid.uuid); -} 
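-/* LUSTRE_RO_ATTR(name) declares the matching read-only sysfs attribute,
- * lustre_attr_<name>, dispatched through lustre_sysfs_ops further below.
- */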
-LUSTRE_RO_ATTR(uuid); - -static ssize_t blocksize_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%u\n", osfs.os_bsize); - - return rc; -} -LUSTRE_RO_ATTR(blocksize); - -static ssize_t kbytestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_blocks; - - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytestotal); - -static ssize_t kbytesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bfree; - - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytesfree); - -static ssize_t kbytesavail_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) { - __u32 blk_size = osfs.os_bsize >> 10; - __u64 result = osfs.os_bavail; - - while (blk_size >>= 1) - result <<= 1; - - return sprintf(buf, "%llu\n", result); - } - - return rc; -} -LUSTRE_RO_ATTR(kbytesavail); - -static ssize_t filestotal_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%llu\n", osfs.os_files); - - return rc; -} -LUSTRE_RO_ATTR(filestotal); - -static ssize_t filesfree_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct obd_statfs osfs; - int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, - get_jiffies_64() - OBD_STATFS_CACHE_SECONDS * HZ, - OBD_STATFS_NODELAY); - if (!rc) - return sprintf(buf, "%llu\n", osfs.os_ffree); - - return rc; -} -LUSTRE_RO_ATTR(filesfree); - -int lprocfs_rd_server_uuid(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct obd_import *imp; - char *imp_state_name = NULL; - int rc; - - LASSERT(obd); - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - imp = obd->u.cli.cl_import; - imp_state_name = ptlrpc_import_state_name(imp->imp_state); - seq_printf(m, "%s\t%s%s\n", - obd2cli_tgt(obd), imp_state_name, - imp->imp_deactive ? 
"\tDEACTIVATED" : ""); - - up_read(&obd->u.cli.cl_sem); - - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_server_uuid); - -int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct ptlrpc_connection *conn; - int rc; - - LASSERT(obd); - - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - conn = obd->u.cli.cl_import->imp_connection; - if (conn && obd->u.cli.cl_import) - seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); - else - seq_puts(m, "\n"); - - up_read(&obd->u.cli.cl_sem); - - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_conn_uuid); - -/** - * Lock statistics structure for access, possibly only on this CPU. - * - * The statistics struct may be allocated with per-CPU structures for - * efficient concurrent update (usually only on server-wide stats), or - * as a single global struct (e.g. for per-client or per-job statistics), - * so the required locking depends on the type of structure allocated. - * - * For per-CPU statistics, pin the thread to the current cpuid so that - * will only access the statistics for that CPU. If the stats structure - * for the current CPU has not been allocated (or previously freed), - * allocate it now. The per-CPU statistics do not need locking since - * the thread is pinned to the CPU during update. - * - * For global statistics, lock the stats structure to prevent concurrent update. - * - * \param[in] stats statistics structure to lock - * \param[in] opc type of operation: - * LPROCFS_GET_SMP_ID: "lock" and return current CPU index - * for incrementing statistics for that CPU - * LPROCFS_GET_NUM_CPU: "lock" and return number of used - * CPU indices to iterate over all indices - * \param[out] flags CPU interrupt saved state for IRQ-safe locking - * - * \retval cpuid of current thread or number of allocated structs - * \retval negative on error (only for opc LPROCFS_GET_SMP_ID + per-CPU stats) - */ -int lprocfs_stats_lock(struct lprocfs_stats *stats, - enum lprocfs_stats_lock_ops opc, - unsigned long *flags) -{ - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_lock_irqsave(&stats->ls_lock, *flags); - else - spin_lock(&stats->ls_lock); - return opc == LPROCFS_GET_NUM_CPU ? 1 : 0; - } - - switch (opc) { - case LPROCFS_GET_SMP_ID: { - unsigned int cpuid = get_cpu(); - - if (unlikely(!stats->ls_percpu[cpuid])) { - int rc = lprocfs_stats_alloc_one(stats, cpuid); - - if (rc < 0) { - put_cpu(); - return rc; - } - } - return cpuid; - } - case LPROCFS_GET_NUM_CPU: - return stats->ls_biggest_alloc_num; - default: - LBUG(); - } -} - -/** - * Unlock statistics structure after access. - * - * Unlock the lock acquired via lprocfs_stats_lock() for global statistics, - * or unpin this thread from the current cpuid for per-CPU statistics. - * - * This function must be called using the same arguments as used when calling - * lprocfs_stats_lock() so that the correct operation can be performed. 
- * - * \param[in] stats statistics structure to unlock - * \param[in] opc type of operation (current cpuid or number of structs) - * \param[in] flags CPU interrupt saved state for IRQ-safe locking - */ -void lprocfs_stats_unlock(struct lprocfs_stats *stats, - enum lprocfs_stats_lock_ops opc, - unsigned long *flags) -{ - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_unlock_irqrestore(&stats->ls_lock, *flags); - else - spin_unlock(&stats->ls_lock); - } else if (opc == LPROCFS_GET_SMP_ID) { - put_cpu(); - } -} - -/** add up per-cpu counters */ -void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, - struct lprocfs_counter *cnt) -{ - unsigned int num_entry; - struct lprocfs_counter *percpu_cntr; - int i; - unsigned long flags = 0; - - memset(cnt, 0, sizeof(*cnt)); - - if (!stats) { - /* set count to 1 to avoid divide-by-zero errs in callers */ - cnt->lc_count = 1; - return; - } - - cnt->lc_min = LC_MIN_INIT; - - num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); - - for (i = 0; i < num_entry; i++) { - if (!stats->ls_percpu[i]) - continue; - percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); - - cnt->lc_count += percpu_cntr->lc_count; - cnt->lc_sum += percpu_cntr->lc_sum; - if (percpu_cntr->lc_min < cnt->lc_min) - cnt->lc_min = percpu_cntr->lc_min; - if (percpu_cntr->lc_max > cnt->lc_max) - cnt->lc_max = percpu_cntr->lc_max; - cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; - } - - lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); -} - -/** - * Append a space separated list of current set flags to str. - */ -#define flag2str(flag, first) \ - do { \ - if (imp->imp_##flag) \ - seq_printf(m, "%s" #flag, first ? "" : ", "); \ - } while (0) -static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m) -{ - bool first = true; - - if (imp->imp_obd->obd_no_recov) { - seq_puts(m, "no_recov"); - first = false; - } - - flag2str(invalid, first); - first = false; - flag2str(deactive, first); - flag2str(replayable, first); - flag2str(pingable, first); - return 0; -} - -#undef flags2str - -static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep) -{ - __u64 mask = 1; - int i; - bool first = true; - - for (i = 0; obd_connect_names[i]; i++, mask <<= 1) { - if (flags & mask) { - seq_printf(m, "%s%s", - first ? sep : "", obd_connect_names[i]); - first = false; - } - } - if (flags & ~(mask - 1)) - seq_printf(m, "%sunknown flags %#llx", - first ? 
sep : "", flags & ~(mask - 1)); -} - -int lprocfs_rd_import(struct seq_file *m, void *data) -{ - char nidstr[LNET_NIDSTR_SIZE]; - struct lprocfs_counter ret; - struct lprocfs_counter_header *header; - struct obd_device *obd = data; - struct obd_import *imp; - struct obd_import_conn *conn; - struct obd_connect_data *ocd; - int j; - int k; - int rw = 0; - int rc; - - LASSERT(obd); - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - imp = obd->u.cli.cl_import; - ocd = &imp->imp_connect_data; - - seq_printf(m, "import:\n" - " name: %s\n" - " target: %s\n" - " state: %s\n" - " instance: %u\n" - " connect_flags: [ ", - obd->obd_name, - obd2cli_tgt(obd), - ptlrpc_import_state_name(imp->imp_state), - imp->imp_connect_data.ocd_instance); - obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, - ", "); - seq_puts(m, " ]\n"); - obd_connect_data_seqprint(m, ocd); - seq_puts(m, " import_flags: [ "); - obd_import_flags2str(imp, m); - - seq_puts(m, - " ]\n" - " connection:\n" - " failover_nids: [ "); - spin_lock(&imp->imp_lock); - j = 0; - list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { - libcfs_nid2str_r(conn->oic_conn->c_peer.nid, - nidstr, sizeof(nidstr)); - seq_printf(m, "%s%s", j ? ", " : "", nidstr); - j++; - } - if (imp->imp_connection) - libcfs_nid2str_r(imp->imp_connection->c_peer.nid, - nidstr, sizeof(nidstr)); - else - strncpy(nidstr, "", sizeof(nidstr)); - seq_printf(m, - " ]\n" - " current_connection: %s\n" - " connection_attempts: %u\n" - " generation: %u\n" - " in-progress_invalidations: %u\n", - nidstr, - imp->imp_conn_cnt, - imp->imp_generation, - atomic_read(&imp->imp_inval_count)); - spin_unlock(&imp->imp_lock); - - if (!obd->obd_svc_stats) - goto out_climp; - - header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; - lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); - if (ret.lc_count != 0) { - /* first argument to do_div MUST be __u64 */ - __u64 sum = ret.lc_sum; - - do_div(sum, ret.lc_count); - ret.lc_sum = sum; - } else { - ret.lc_sum = 0; - } - seq_printf(m, - " rpcs:\n" - " inflight: %u\n" - " unregistering: %u\n" - " timeouts: %u\n" - " avg_waittime: %llu %s\n", - atomic_read(&imp->imp_inflight), - atomic_read(&imp->imp_unregistering), - atomic_read(&imp->imp_timeouts), - ret.lc_sum, header->lc_units); - - k = 0; - for (j = 0; j < IMP_AT_MAX_PORTALS; j++) { - if (imp->imp_at.iat_portal[j] == 0) - break; - k = max_t(unsigned int, k, - at_get(&imp->imp_at.iat_service_estimate[j])); - } - seq_printf(m, - " service_estimates:\n" - " services: %u sec\n" - " network: %u sec\n", - k, - at_get(&imp->imp_at.iat_net_latency)); - - seq_printf(m, - " transactions:\n" - " last_replay: %llu\n" - " peer_committed: %llu\n" - " last_checked: %llu\n", - imp->imp_last_replay_transno, - imp->imp_peer_committed_transno, - imp->imp_last_transno_checked); - - /* avg data rates */ - for (rw = 0; rw <= 1; rw++) { - lprocfs_stats_collect(obd->obd_svc_stats, - PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, - &ret); - if (ret.lc_sum > 0 && ret.lc_count > 0) { - /* first argument to do_div MUST be __u64 */ - __u64 sum = ret.lc_sum; - - do_div(sum, ret.lc_count); - ret.lc_sum = sum; - seq_printf(m, - " %s_data_averages:\n" - " bytes_per_rpc: %llu\n", - rw ? 
"write" : "read", - ret.lc_sum); - } - k = (int)ret.lc_sum; - j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; - header = &obd->obd_svc_stats->ls_cnt_header[j]; - lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); - if (ret.lc_sum > 0 && ret.lc_count != 0) { - /* first argument to do_div MUST be __u64 */ - __u64 sum = ret.lc_sum; - - do_div(sum, ret.lc_count); - ret.lc_sum = sum; - seq_printf(m, - " %s_per_rpc: %llu\n", - header->lc_units, ret.lc_sum); - j = (int)ret.lc_sum; - if (j > 0) - seq_printf(m, - " MB_per_sec: %u.%.02u\n", - k / j, (100 * k / j) % 100); - } - } - -out_climp: - up_read(&obd->u.cli.cl_sem); - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_import); - -int lprocfs_rd_state(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct obd_import *imp; - int j, k, rc; - - LASSERT(obd); - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - imp = obd->u.cli.cl_import; - - seq_printf(m, "current_state: %s\n", - ptlrpc_import_state_name(imp->imp_state)); - seq_puts(m, "state_history:\n"); - k = imp->imp_state_hist_idx; - for (j = 0; j < IMP_STATE_HIST_LEN; j++) { - struct import_state_hist *ish = - &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; - if (ish->ish_state == 0) - continue; - seq_printf(m, " - [ %lld, %s ]\n", (s64)ish->ish_time, - ptlrpc_import_state_name(ish->ish_state)); - } - - up_read(&obd->u.cli.cl_sem); - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_state); - -int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) -{ - int i; - - for (i = 0; i < AT_BINS; i++) - seq_printf(m, "%3u ", at->at_hist[i]); - seq_puts(m, "\n"); - return 0; -} -EXPORT_SYMBOL(lprocfs_at_hist_helper); - -/* See also ptlrpc_lprocfs_rd_timeouts */ -int lprocfs_rd_timeouts(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - struct obd_import *imp; - unsigned int cur, worst; - time64_t now, worstt; - struct dhms ts; - int i, rc; - - LASSERT(obd); - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - imp = obd->u.cli.cl_import; - - now = ktime_get_real_seconds(); - - /* Some network health info for kicks */ - s2dhms(&ts, now - imp->imp_last_reply_time); - seq_printf(m, "%-10s : %lld, " DHMS_FMT " ago\n", - "last reply", (s64)imp->imp_last_reply_time, DHMS_VARS(&ts)); - - cur = at_get(&imp->imp_at.iat_net_latency); - worst = imp->imp_at.iat_net_latency.at_worst_ever; - worstt = imp->imp_at.iat_net_latency.at_worst_time; - s2dhms(&ts, now - worstt); - seq_printf(m, "%-10s : cur %3u worst %3u (at %lld, " DHMS_FMT " ago) ", - "network", cur, worst, (s64)worstt, DHMS_VARS(&ts)); - lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); - - for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { - if (imp->imp_at.iat_portal[i] == 0) - break; - cur = at_get(&imp->imp_at.iat_service_estimate[i]); - worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; - worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; - s2dhms(&ts, now - worstt); - seq_printf(m, "portal %-2d : cur %3u worst %3u (at %lld, " - DHMS_FMT " ago) ", imp->imp_at.iat_portal[i], - cur, worst, (s64)worstt, DHMS_VARS(&ts)); - lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); - } - - up_read(&obd->u.cli.cl_sem); - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_timeouts); - -int lprocfs_rd_connect_flags(struct seq_file *m, void *data) -{ - struct obd_device *obd = data; - __u64 flags; - int rc; - - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; - seq_printf(m, "flags=%#llx\n", flags); - 
obd_connect_seq_flags2str(m, flags, "\n"); - seq_puts(m, "\n"); - up_read(&obd->u.cli.cl_sem); - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_connect_flags); - -static struct attribute *obd_def_attrs[] = { - &lustre_attr_blocksize.attr, - &lustre_attr_kbytestotal.attr, - &lustre_attr_kbytesfree.attr, - &lustre_attr_kbytesavail.attr, - &lustre_attr_filestotal.attr, - &lustre_attr_filesfree.attr, - &lustre_attr_uuid.attr, - NULL, -}; - -static void obd_sysfs_release(struct kobject *kobj) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - complete(&obd->obd_kobj_unregister); -} - -static struct kobj_type obd_ktype = { - .default_attrs = obd_def_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = obd_sysfs_release, -}; - -int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list, - const struct attribute_group *attrs) -{ - int rc = 0; - - init_completion(&obd->obd_kobj_unregister); - rc = kobject_init_and_add(&obd->obd_kobj, &obd_ktype, - obd->obd_type->typ_kobj, - "%s", obd->obd_name); - if (rc) - return rc; - - if (attrs) { - rc = sysfs_create_group(&obd->obd_kobj, attrs); - if (rc) { - kobject_put(&obd->obd_kobj); - return rc; - } - } - - obd->obd_debugfs_entry = debugfs_create_dir(obd->obd_name, - obd->obd_type->typ_debugfs_entry); - ldebugfs_add_vars(obd->obd_debugfs_entry, list, obd); - - return rc; -} -EXPORT_SYMBOL_GPL(lprocfs_obd_setup); - -int lprocfs_obd_cleanup(struct obd_device *obd) -{ - if (!obd) - return -EINVAL; - - debugfs_remove_recursive(obd->obd_debugfs_entry); - - kobject_put(&obd->obd_kobj); - wait_for_completion(&obd->obd_kobj_unregister); - - return 0; -} -EXPORT_SYMBOL_GPL(lprocfs_obd_cleanup); - -int lprocfs_stats_alloc_one(struct lprocfs_stats *stats, unsigned int cpuid) -{ - struct lprocfs_counter *cntr; - unsigned int percpusize; - int rc = -ENOMEM; - unsigned long flags = 0; - int i; - - LASSERT(!stats->ls_percpu[cpuid]); - LASSERT((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0); - - percpusize = lprocfs_stats_counter_size(stats); - stats->ls_percpu[cpuid] = kzalloc(percpusize, GFP_ATOMIC); - if (stats->ls_percpu[cpuid]) { - rc = 0; - if (unlikely(stats->ls_biggest_alloc_num <= cpuid)) { - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_lock_irqsave(&stats->ls_lock, flags); - else - spin_lock(&stats->ls_lock); - if (stats->ls_biggest_alloc_num <= cpuid) - stats->ls_biggest_alloc_num = cpuid + 1; - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - spin_unlock_irqrestore(&stats->ls_lock, flags); - else - spin_unlock(&stats->ls_lock); - } - /* initialize the ls_percpu[cpuid] non-zero counter */ - for (i = 0; i < stats->ls_num; ++i) { - cntr = lprocfs_stats_counter_get(stats, cpuid, i); - cntr->lc_min = LC_MIN_INIT; - } - } - return rc; -} - -struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, - enum lprocfs_stats_flags flags) -{ - struct lprocfs_stats *stats; - unsigned int num_entry; - unsigned int percpusize = 0; - int i; - - if (num == 0) - return NULL; - - if (lprocfs_no_percpu_stats != 0) - flags |= LPROCFS_STATS_FLAG_NOPERCPU; - - if (flags & LPROCFS_STATS_FLAG_NOPERCPU) - num_entry = 1; - else - num_entry = num_possible_cpus(); - - /* alloc percpu pointers for all possible cpu slots */ - stats = kvzalloc(offsetof(typeof(*stats), ls_percpu[num_entry]), - GFP_KERNEL); - if (!stats) - return NULL; - - stats->ls_num = num; - stats->ls_flags = flags; - spin_lock_init(&stats->ls_lock); - - /* alloc num of counter headers */ - stats->ls_cnt_header = kvmalloc_array(stats->ls_num, - sizeof(struct 
lprocfs_counter_header), - GFP_KERNEL | __GFP_ZERO); - if (!stats->ls_cnt_header) - goto fail; - - if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { - /* contains only one set counters */ - percpusize = lprocfs_stats_counter_size(stats); - stats->ls_percpu[0] = kzalloc(percpusize, GFP_ATOMIC); - if (!stats->ls_percpu[0]) - goto fail; - stats->ls_biggest_alloc_num = 1; - } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { - /* alloc all percpu data */ - for (i = 0; i < num_entry; ++i) - if (lprocfs_stats_alloc_one(stats, i) < 0) - goto fail; - } - - return stats; - -fail: - lprocfs_free_stats(&stats); - return NULL; -} -EXPORT_SYMBOL(lprocfs_alloc_stats); - -void lprocfs_free_stats(struct lprocfs_stats **statsh) -{ - struct lprocfs_stats *stats = *statsh; - unsigned int num_entry; - unsigned int percpusize; - unsigned int i; - - if (!stats || stats->ls_num == 0) - return; - *statsh = NULL; - - if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) - num_entry = 1; - else - num_entry = num_possible_cpus(); - - percpusize = lprocfs_stats_counter_size(stats); - for (i = 0; i < num_entry; i++) - kfree(stats->ls_percpu[i]); - kvfree(stats->ls_cnt_header); - kvfree(stats); -} -EXPORT_SYMBOL(lprocfs_free_stats); - -__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, - enum lprocfs_fields_flags field) -{ - unsigned int i; - unsigned int num_cpu; - unsigned long flags = 0; - __u64 ret = 0; - - LASSERT(stats); - - num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); - for (i = 0; i < num_cpu; i++) { - if (!stats->ls_percpu[i]) - continue; - ret += lprocfs_read_helper( - lprocfs_stats_counter_get(stats, i, idx), - &stats->ls_cnt_header[idx], stats->ls_flags, - field); - } - lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); - return ret; -} -EXPORT_SYMBOL(lprocfs_stats_collector); - -void lprocfs_clear_stats(struct lprocfs_stats *stats) -{ - struct lprocfs_counter *percpu_cntr; - int i; - int j; - unsigned int num_entry; - unsigned long flags = 0; - - num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); - - for (i = 0; i < num_entry; i++) { - if (!stats->ls_percpu[i]) - continue; - for (j = 0; j < stats->ls_num; j++) { - percpu_cntr = lprocfs_stats_counter_get(stats, i, j); - percpu_cntr->lc_count = 0; - percpu_cntr->lc_min = LC_MIN_INIT; - percpu_cntr->lc_max = 0; - percpu_cntr->lc_sumsquare = 0; - percpu_cntr->lc_sum = 0; - if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) - percpu_cntr->lc_sum_irq = 0; - } - } - - lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); -} -EXPORT_SYMBOL(lprocfs_clear_stats); - -static ssize_t lprocfs_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct lprocfs_stats *stats = seq->private; - - lprocfs_clear_stats(stats); - - return len; -} - -static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) -{ - struct lprocfs_stats *stats = p->private; - - return (*pos < stats->ls_num) ? 
pos : NULL; -} - -static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) -{ -} - -static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) -{ - (*pos)++; - return lprocfs_stats_seq_start(p, pos); -} - -/* seq file export of one lprocfs counter */ -static int lprocfs_stats_seq_show(struct seq_file *p, void *v) -{ - struct lprocfs_stats *stats = p->private; - struct lprocfs_counter_header *hdr; - struct lprocfs_counter ctr; - int idx = *(loff_t *)v; - - if (idx == 0) { - struct timespec64 now; - - ktime_get_real_ts64(&now); - seq_printf(p, "%-25s %llu.%9lu secs.usecs\n", - "snapshot_time", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - } - - hdr = &stats->ls_cnt_header[idx]; - lprocfs_stats_collect(stats, idx, &ctr); - - if (ctr.lc_count != 0) { - seq_printf(p, "%-25s %lld samples [%s]", - hdr->lc_name, ctr.lc_count, hdr->lc_units); - - if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && - (ctr.lc_count > 0)) { - seq_printf(p, " %lld %lld %lld", - ctr.lc_min, ctr.lc_max, ctr.lc_sum); - if (hdr->lc_config & LPROCFS_CNTR_STDDEV) - seq_printf(p, " %lld", ctr.lc_sumsquare); - } - seq_putc(p, '\n'); - } - - return 0; -} - -static const struct seq_operations lprocfs_stats_seq_sops = { - .start = lprocfs_stats_seq_start, - .stop = lprocfs_stats_seq_stop, - .next = lprocfs_stats_seq_next, - .show = lprocfs_stats_seq_show, -}; - -static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) -{ - struct seq_file *seq; - int rc; - - rc = seq_open(file, &lprocfs_stats_seq_sops); - if (rc) - return rc; - - seq = file->private_data; - seq->private = inode->i_private; - - return 0; -} - -const struct file_operations lprocfs_stats_seq_fops = { - .owner = THIS_MODULE, - .open = lprocfs_stats_seq_open, - .read = seq_read, - .write = lprocfs_stats_seq_write, - .llseek = seq_lseek, - .release = lprocfs_seq_release, -}; -EXPORT_SYMBOL_GPL(lprocfs_stats_seq_fops); - -void lprocfs_counter_init(struct lprocfs_stats *stats, int index, - unsigned int conf, const char *name, - const char *units) -{ - struct lprocfs_counter_header *header; - struct lprocfs_counter *percpu_cntr; - unsigned long flags = 0; - unsigned int i; - unsigned int num_cpu; - - header = &stats->ls_cnt_header[index]; - LASSERTF(header, "Failed to allocate stats header:[%d]%s/%s\n", - index, name, units); - - header->lc_config = conf; - header->lc_name = name; - header->lc_units = units; - - num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); - for (i = 0; i < num_cpu; ++i) { - if (!stats->ls_percpu[i]) - continue; - percpu_cntr = lprocfs_stats_counter_get(stats, i, index); - percpu_cntr->lc_count = 0; - percpu_cntr->lc_min = LC_MIN_INIT; - percpu_cntr->lc_max = 0; - percpu_cntr->lc_sumsquare = 0; - percpu_cntr->lc_sum = 0; - if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) - percpu_cntr->lc_sum_irq = 0; - } - lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); -} -EXPORT_SYMBOL(lprocfs_counter_init); - -int lprocfs_exp_cleanup(struct obd_export *exp) -{ - return 0; -} -EXPORT_SYMBOL(lprocfs_exp_cleanup); - -__s64 lprocfs_read_helper(struct lprocfs_counter *lc, - struct lprocfs_counter_header *header, - enum lprocfs_stats_flags flags, - enum lprocfs_fields_flags field) -{ - __s64 ret = 0; - - if (!lc || !header) - return 0; - - switch (field) { - case LPROCFS_FIELDS_FLAGS_CONFIG: - ret = header->lc_config; - break; - case LPROCFS_FIELDS_FLAGS_SUM: - ret = lc->lc_sum; - if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) - ret += lc->lc_sum_irq; - break; - case 
LPROCFS_FIELDS_FLAGS_MIN:
- ret = lc->lc_min;
- break;
- case LPROCFS_FIELDS_FLAGS_MAX:
- ret = lc->lc_max;
- break;
- case LPROCFS_FIELDS_FLAGS_AVG:
- ret = (lc->lc_max - lc->lc_min) / 2;
- break;
- case LPROCFS_FIELDS_FLAGS_SUMSQUARE:
- ret = lc->lc_sumsquare;
- break;
- case LPROCFS_FIELDS_FLAGS_COUNT:
- ret = lc->lc_count;
- break;
- default:
- break;
- }
-
- return ret;
-}
-EXPORT_SYMBOL(lprocfs_read_helper);
-
-int lprocfs_write_helper(const char __user *buffer, unsigned long count,
- int *val)
-{
- return lprocfs_write_frac_helper(buffer, count, val, 1);
-}
-EXPORT_SYMBOL(lprocfs_write_helper);
-
-int lprocfs_write_u64_helper(const char __user *buffer, unsigned long count,
- __u64 *val)
-{
- return lprocfs_write_frac_u64_helper(buffer, count, val, 1);
-}
-EXPORT_SYMBOL(lprocfs_write_u64_helper);
-
-int lprocfs_write_frac_u64_helper(const char __user *buffer,
- unsigned long count, __u64 *val, int mult)
-{
- char kernbuf[22], *end, *pbuf;
- __u64 whole, frac = 0, units;
- unsigned int frac_d = 1;
- int sign = 1;
-
- if (count > (sizeof(kernbuf) - 1))
- return -EINVAL;
-
- if (copy_from_user(kernbuf, buffer, count))
- return -EFAULT;
-
- kernbuf[count] = '\0';
- pbuf = kernbuf;
- if (*pbuf == '-') {
- sign = -1;
- pbuf++;
- }
-
- whole = simple_strtoull(pbuf, &end, 10);
- if (pbuf == end)
- return -EINVAL;
-
- if (*end == '.') {
- int i;
-
- pbuf = end + 1;
-
- /* need to limit frac_d to a __u32 */
- if (strlen(pbuf) > 10)
- pbuf[10] = '\0';
-
- frac = simple_strtoull(pbuf, &end, 10);
- /* count decimal places */
- for (i = 0; i < (end - pbuf); i++)
- frac_d *= 10;
- }
-
- units = 1;
- if (end) {
- switch (tolower(*end)) {
- case 'p':
- units <<= 10;
- /* fall through */
- case 't':
- units <<= 10;
- /* fall through */
- case 'g':
- units <<= 10;
- /* fall through */
- case 'm':
- units <<= 10;
- /* fall through */
- case 'k':
- units <<= 10;
- }
- }
- /* Specified units override the multiplier */
- if (units > 1)
- mult = units;
-
- frac *= mult;
- do_div(frac, frac_d);
- *val = sign * (whole * mult + frac);
- return 0;
-}
-EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
-
-static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
-{
- size_t l2;
-
- l2 = strlen(s2);
- if (!l2)
- return (char *)s1;
- while (len >= l2) {
- len--;
- if (!memcmp(s1, s2, l2))
- return (char *)s1;
- s1++;
- }
- return NULL;
-}
-
-/**
- * Find the string \a name in the input \a buffer, and return a pointer to the
- * value immediately following \a name, reducing \a count appropriately.
- * If \a name is not found the original \a buffer is returned.
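- *
- * For example, searching buffer "size 1048576 mode 0" for name "size"
- * returns a pointer to "1048576 ..." and sets \a count to 7, the
- * length of the value string.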
- */ -char *lprocfs_find_named_value(const char *buffer, const char *name, - size_t *count) -{ - char *val; - size_t buflen = *count; - - /* there is no strnstr() in rhel5 and ubuntu kernels */ - val = lprocfs_strnstr(buffer, name, buflen); - if (!val) - return (char *)buffer; - - val += strlen(name); /* skip prefix */ - while (val < buffer + buflen && isspace(*val)) /* skip separator */ - val++; - - *count = 0; - while (val < buffer + buflen && isalnum(*val)) { - ++*count; - ++val; - } - - return val - *count; -} -EXPORT_SYMBOL(lprocfs_find_named_value); - -void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) -{ - if (value >= OBD_HIST_MAX) - value = OBD_HIST_MAX - 1; - - spin_lock(&oh->oh_lock); - oh->oh_buckets[value]++; - spin_unlock(&oh->oh_lock); -} -EXPORT_SYMBOL(lprocfs_oh_tally); - -void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) -{ - unsigned int val = 0; - - if (likely(value != 0)) - val = min(fls(value - 1), OBD_HIST_MAX); - - lprocfs_oh_tally(oh, val); -} -EXPORT_SYMBOL(lprocfs_oh_tally_log2); - -unsigned long lprocfs_oh_sum(struct obd_histogram *oh) -{ - unsigned long ret = 0; - int i; - - for (i = 0; i < OBD_HIST_MAX; i++) - ret += oh->oh_buckets[i]; - return ret; -} -EXPORT_SYMBOL(lprocfs_oh_sum); - -void lprocfs_oh_clear(struct obd_histogram *oh) -{ - spin_lock(&oh->oh_lock); - memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); - spin_unlock(&oh->oh_lock); -} -EXPORT_SYMBOL(lprocfs_oh_clear); - -int lprocfs_wr_root_squash(const char __user *buffer, unsigned long count, - struct root_squash_info *squash, char *name) -{ - char kernbuf[64], *tmp, *errmsg; - unsigned long uid, gid; - int rc; - - if (count >= sizeof(kernbuf)) { - errmsg = "string too long"; - rc = -EINVAL; - goto failed_noprint; - } - if (copy_from_user(kernbuf, buffer, count)) { - errmsg = "bad address"; - rc = -EFAULT; - goto failed_noprint; - } - kernbuf[count] = '\0'; - - /* look for uid gid separator */ - tmp = strchr(kernbuf, ':'); - if (!tmp) { - errmsg = "needs uid:gid format"; - rc = -EINVAL; - goto failed; - } - *tmp = '\0'; - tmp++; - - /* parse uid */ - if (kstrtoul(kernbuf, 0, &uid) != 0) { - errmsg = "bad uid"; - rc = -EINVAL; - goto failed; - } - /* parse gid */ - if (kstrtoul(tmp, 0, &gid) != 0) { - errmsg = "bad gid"; - rc = -EINVAL; - goto failed; - } - - squash->rsi_uid = uid; - squash->rsi_gid = gid; - - LCONSOLE_INFO("%s: root_squash is set to %u:%u\n", - name, squash->rsi_uid, squash->rsi_gid); - return count; - -failed: - if (tmp) { - tmp--; - *tmp = ':'; - } - CWARN("%s: failed to set root_squash to \"%s\", %s, rc = %d\n", - name, kernbuf, errmsg, rc); - return rc; -failed_noprint: - CWARN("%s: failed to set root_squash due to %s, rc = %d\n", - name, errmsg, rc); - return rc; -} -EXPORT_SYMBOL(lprocfs_wr_root_squash); - -int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, - struct root_squash_info *squash, char *name) -{ - char *kernbuf = NULL, *errmsg; - struct list_head tmp; - int len = count; - int rc; - - if (count > 4096) { - errmsg = "string too long"; - rc = -EINVAL; - goto failed; - } - - kernbuf = kzalloc(count + 1, GFP_NOFS); - if (!kernbuf) { - errmsg = "no memory"; - rc = -ENOMEM; - goto failed; - } - - if (copy_from_user(kernbuf, buffer, count)) { - errmsg = "bad address"; - rc = -EFAULT; - goto failed; - } - kernbuf[count] = '\0'; - - if (count > 0 && kernbuf[count - 1] == '\n') - len = count - 1; - - if ((len == 4 && !strncmp(kernbuf, "NONE", len)) || - (len == 5 && !strncmp(kernbuf, "clear", len))) { - /* 
empty string is special case */ - down_write(&squash->rsi_sem); - if (!list_empty(&squash->rsi_nosquash_nids)) - cfs_free_nidlist(&squash->rsi_nosquash_nids); - up_write(&squash->rsi_sem); - LCONSOLE_INFO("%s: nosquash_nids is cleared\n", name); - kfree(kernbuf); - return count; - } - - INIT_LIST_HEAD(&tmp); - if (cfs_parse_nidlist(kernbuf, count, &tmp) <= 0) { - errmsg = "can't parse"; - rc = -EINVAL; - goto failed; - } - LCONSOLE_INFO("%s: nosquash_nids set to %s\n", - name, kernbuf); - kfree(kernbuf); - kernbuf = NULL; - - down_write(&squash->rsi_sem); - if (!list_empty(&squash->rsi_nosquash_nids)) - cfs_free_nidlist(&squash->rsi_nosquash_nids); - list_splice(&tmp, &squash->rsi_nosquash_nids); - up_write(&squash->rsi_sem); - - return count; - -failed: - if (kernbuf) { - CWARN("%s: failed to set nosquash_nids to \"%s\", %s rc = %d\n", - name, kernbuf, errmsg, rc); - kfree(kernbuf); - kernbuf = NULL; - } else { - CWARN("%s: failed to set nosquash_nids due to %s rc = %d\n", - name, errmsg, rc); - } - return rc; -} -EXPORT_SYMBOL(lprocfs_wr_nosquash_nids); - -static ssize_t lustre_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); - - return a->show ? a->show(kobj, attr, buf) : 0; -} - -static ssize_t lustre_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct lustre_attr *a = container_of(attr, struct lustre_attr, attr); - - return a->store ? a->store(kobj, attr, buf, len) : len; -} - -const struct sysfs_ops lustre_sysfs_ops = { - .show = lustre_attr_show, - .store = lustre_attr_store, -}; -EXPORT_SYMBOL_GPL(lustre_sysfs_ops); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c deleted file mode 100644 index aa9d74e087f42599df110f7c275723b76e8a27a1..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/lu_object.c +++ /dev/null @@ -1,2056 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/lu_object.c - * - * Lustre Object. 
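
The lustre_attr_show()/lustre_attr_store() pair earlier in this hunk exists purely to bridge types: sysfs hands back only the bare struct attribute it was registered with, and container_of() recovers the enclosing lustre_attr to reach the real callback. A standalone sketch of that dispatch with deliberately simplified stand-in types (the genuine kernel signatures also carry the kobject):

#include <stddef.h>
#include <stdio.h>
#include <sys/types.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute {
	const char *name;
};

struct lustre_attr {			/* trimmed stand-in */
	struct attribute attr;
	ssize_t (*show)(struct attribute *attr, char *buf);
};

static ssize_t hello_show(struct attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", attr->name);
}

static struct lustre_attr hello = {
	.attr = { .name = "hello" },
	.show = hello_show,
};

/* What sysfs would call: only the embedded struct attribute comes back. */
static ssize_t attr_show(struct attribute *attr, char *buf)
{
	struct lustre_attr *a = container_of(attr, struct lustre_attr, attr);

	return a->show ? a->show(attr, buf) : 0;
}

int main(void)
{
	char buf[64];

	attr_show(&hello.attr, buf);
	fputs(buf, stdout);
	return 0;
}

Returning 0 (or len, for stores) when no callback is wired up matches the originals' tolerance of write-only or read-only attributes.
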
- * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include - -/* hash_long() */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct lu_site_bkt_data { - /** - * LRU list, updated on each access to object. Protected by - * bucket lock of lu_site::ls_obj_hash. - * - * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are - * moved to the lu_site::ls_lru.prev (this is due to the non-existence - * of list_for_each_entry_safe_reverse()). - */ - struct list_head lsb_lru; - /** - * Wait-queue signaled when an object in this site is ultimately - * destroyed (lu_object_free()). It is used by lu_object_find() to - * wait before re-trying when object in the process of destruction is - * found in the hash table. - * - * \see htable_lookup(). - */ - wait_queue_head_t lsb_marche_funebre; -}; - -enum { - LU_CACHE_PERCENT_MAX = 50, - LU_CACHE_PERCENT_DEFAULT = 20 -}; - -#define LU_CACHE_NR_MAX_ADJUST 512 -#define LU_CACHE_NR_UNLIMITED -1 -#define LU_CACHE_NR_DEFAULT LU_CACHE_NR_UNLIMITED -#define LU_CACHE_NR_LDISKFS_LIMIT LU_CACHE_NR_UNLIMITED -#define LU_CACHE_NR_ZFS_LIMIT 256 - -#define LU_SITE_BITS_MIN 12 -#define LU_SITE_BITS_MAX 24 -#define LU_SITE_BITS_MAX_CL 19 -/** - * total 256 buckets, we don't want too many buckets because: - * - consume too much memory - * - avoid unbalanced LRU list - */ -#define LU_SITE_BKT_BITS 8 - -static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; -module_param(lu_cache_percent, int, 0644); -MODULE_PARM_DESC(lu_cache_percent, "Percentage of memory to be used as lu_object cache"); - -static long lu_cache_nr = LU_CACHE_NR_DEFAULT; -module_param(lu_cache_nr, long, 0644); -MODULE_PARM_DESC(lu_cache_nr, "Maximum number of objects in lu_object cache"); - -static void lu_object_free(const struct lu_env *env, struct lu_object *o); -static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx); - -wait_queue_head_t * -lu_site_wq_from_fid(struct lu_site *site, struct lu_fid *fid) -{ - struct cfs_hash_bd bd; - struct lu_site_bkt_data *bkt; - - cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); - bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); - return &bkt->lsb_marche_funebre; -} -EXPORT_SYMBOL(lu_site_wq_from_fid); - -/** - * Decrease reference counter on object. If last reference is freed, return - * object to the cache, unless lu_object_is_dying(o) holds. In the latter - * case, free object immediately. - */ -void lu_object_put(const struct lu_env *env, struct lu_object *o) -{ - struct lu_site_bkt_data *bkt; - struct lu_object_header *top; - struct lu_site *site; - struct lu_object *orig; - struct cfs_hash_bd bd; - const struct lu_fid *fid; - - top = o->lo_header; - site = o->lo_dev->ld_site; - orig = o; - - /* - * till we have full fids-on-OST implemented anonymous objects - * are possible in OSP. such an object isn't listed in the site - * so we should not remove it from the site. 
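
The body of lu_object_put() that follows implements a cache-friendly release: when the last reference drops on a healthy object it is parked on the per-bucket LRU rather than freed, so a later lookup can revive it for free; only objects already marked dying are destroyed immediately. The policy in miniature, with hypothetical trimmed-down types and the bucket lock elided:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int ref;		/* stands in for atomic loh_ref */
	bool dying;
	struct obj *lru_next;	/* stands in for the loh_lru linkage */
};

struct bucket {
	struct obj *lru;	/* the real code queues at the LRU tail;
				 * a single list suffices for the sketch */
};

/* Runs under the real code's bucket lock (cfs_hash_bd_dec_and_lock). */
static void obj_put(struct bucket *bkt, struct obj *o)
{
	if (--o->ref > 0)
		return;

	if (!o->dying) {	/* cacheable: keep the memory, park on LRU */
		o->lru_next = bkt->lru;
		bkt->lru = o;
		return;
	}
	free(o);		/* dying: the last put really destroys it */
}

int main(void)
{
	struct bucket bkt = { NULL };
	struct obj *o = calloc(1, sizeof(*o));

	o->ref = 1;
	obj_put(&bkt, o);
	printf("%s\n", bkt.lru == o ? "parked on lru" : "freed");
	return 0;
}
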
- */ - fid = lu_object_fid(o); - if (fid_is_zero(fid)) { - LASSERT(!top->loh_hash.next && !top->loh_hash.pprev); - LASSERT(list_empty(&top->loh_lru)); - if (!atomic_dec_and_test(&top->loh_ref)) - return; - list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { - if (o->lo_ops->loo_object_release) - o->lo_ops->loo_object_release(env, o); - } - lu_object_free(env, orig); - return; - } - - cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); - bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); - - if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { - if (lu_object_is_dying(top)) { - /* - * somebody may be waiting for this, currently only - * used for cl_object, see cl_object_put_last(). - */ - wake_up_all(&bkt->lsb_marche_funebre); - } - return; - } - - /* - * When last reference is released, iterate over object - * layers, and notify them that object is no longer busy. - */ - list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { - if (o->lo_ops->loo_object_release) - o->lo_ops->loo_object_release(env, o); - } - - if (!lu_object_is_dying(top)) { - LASSERT(list_empty(&top->loh_lru)); - list_add_tail(&top->loh_lru, &bkt->lsb_lru); - percpu_counter_inc(&site->ls_lru_len_counter); - CDEBUG(D_INODE, "Add %p to site lru. hash: %p, bkt: %p\n", - o, site->ls_obj_hash, bkt); - cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); - return; - } - - /* - * If object is dying (will not be cached), then removed it - * from hash table and LRU. - * - * This is done with hash table and LRU lists locked. As the only - * way to acquire first reference to previously unreferenced - * object is through hash-table lookup (lu_object_find()), - * or LRU scanning (lu_site_purge()), that are done under hash-table - * and LRU lock, no race with concurrent object lookup is possible - * and we can safely destroy object below. - */ - if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) - cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); - cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); - /* - * Object was already removed from hash and lru above, can - * kill it. - */ - lu_object_free(env, orig); -} -EXPORT_SYMBOL(lu_object_put); - -/** - * Kill the object and take it out of LRU cache. - * Currently used by client code for layout change. - */ -void lu_object_unhash(const struct lu_env *env, struct lu_object *o) -{ - struct lu_object_header *top; - - top = o->lo_header; - set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); - if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { - struct lu_site *site = o->lo_dev->ld_site; - struct cfs_hash *obj_hash = site->ls_obj_hash; - struct cfs_hash_bd bd; - - cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); - if (!list_empty(&top->loh_lru)) { - struct lu_site_bkt_data *bkt; - - list_del_init(&top->loh_lru); - bkt = cfs_hash_bd_extra_get(obj_hash, &bd); - percpu_counter_dec(&site->ls_lru_len_counter); - } - cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); - cfs_hash_bd_unlock(obj_hash, &bd, 1); - } -} -EXPORT_SYMBOL(lu_object_unhash); - -/** - * Allocate new object. - * - * This follows object creation protocol, described in the comment within - * struct lu_device_operations definition. 
- */ -static struct lu_object *lu_object_alloc(const struct lu_env *env, - struct lu_device *dev, - const struct lu_fid *f, - const struct lu_object_conf *conf) -{ - struct lu_object *scan; - struct lu_object *top; - struct list_head *layers; - unsigned int init_mask = 0; - unsigned int init_flag; - int clean; - int result; - - /* - * Create top-level object slice. This will also create - * lu_object_header. - */ - top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); - if (!top) - return ERR_PTR(-ENOMEM); - if (IS_ERR(top)) - return top; - /* - * This is the only place where object fid is assigned. It's constant - * after this point. - */ - top->lo_header->loh_fid = *f; - layers = &top->lo_header->loh_layers; - - do { - /* - * Call ->loo_object_init() repeatedly, until no more new - * object slices are created. - */ - clean = 1; - init_flag = 1; - list_for_each_entry(scan, layers, lo_linkage) { - if (init_mask & init_flag) - goto next; - clean = 0; - scan->lo_header = top->lo_header; - result = scan->lo_ops->loo_object_init(env, scan, conf); - if (result != 0) { - lu_object_free(env, top); - return ERR_PTR(result); - } - init_mask |= init_flag; -next: - init_flag <<= 1; - } - } while (!clean); - - list_for_each_entry_reverse(scan, layers, lo_linkage) { - if (scan->lo_ops->loo_object_start) { - result = scan->lo_ops->loo_object_start(env, scan); - if (result != 0) { - lu_object_free(env, top); - return ERR_PTR(result); - } - } - } - - lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); - return top; -} - -/** - * Free an object. - */ -static void lu_object_free(const struct lu_env *env, struct lu_object *o) -{ - wait_queue_head_t *wq; - struct lu_site *site; - struct lu_object *scan; - struct list_head *layers; - struct list_head splice; - - site = o->lo_dev->ld_site; - layers = &o->lo_header->loh_layers; - wq = lu_site_wq_from_fid(site, &o->lo_header->loh_fid); - /* - * First call ->loo_object_delete() method to release all resources. - */ - list_for_each_entry_reverse(scan, layers, lo_linkage) { - if (scan->lo_ops->loo_object_delete) - scan->lo_ops->loo_object_delete(env, scan); - } - - /* - * Then, splice object layers into stand-alone list, and call - * ->loo_object_free() on all layers to free memory. Splice is - * necessary, because lu_object_header is freed together with the - * top-level slice. - */ - INIT_LIST_HEAD(&splice); - list_splice_init(layers, &splice); - while (!list_empty(&splice)) { - /* - * Free layers in bottom-to-top order, so that object header - * lives as long as possible and ->loo_object_free() methods - * can look at its contents. - */ - o = container_of(splice.prev, struct lu_object, lo_linkage); - list_del_init(&o->lo_linkage); - o->lo_ops->loo_object_free(env, o); - } - - if (waitqueue_active(wq)) - wake_up_all(wq); -} - -/** - * Free \a nr objects from the cold end of the site LRU list. 
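
lu_object_alloc() above cannot walk the layer list just once, because every ->loo_object_init() may append fresh slices below the one being initialized; the init_mask bitmap marks slices already done, and the loop repeats until a pass creates nothing new. The fixed-point loop in isolation, with a hypothetical layer type and the error unwinding omitted:

#include <stdbool.h>
#include <stddef.h>

struct layer {
	struct layer *next;
	int (*init)(struct layer *);	/* may extend the list */
};

static int init_all_layers(struct layer *head)
{
	unsigned int init_mask = 0, flag;
	struct layer *l;
	bool clean;

	do {
		clean = true;
		flag = 1;
		for (l = head; l; l = l->next, flag <<= 1) {
			if (init_mask & flag)
				continue;	/* this slice is done */
			clean = false;
			if (l->init(l) != 0)
				return -1;	/* caller frees the stack */
			init_mask |= flag;
		}
	} while (!clean);	/* repeat until a pass adds nothing */
	return 0;
}

static int noop_init(struct layer *l) { (void)l; return 0; }

int main(void)
{
	struct layer bottom = { NULL, noop_init };
	struct layer top = { &bottom, noop_init };

	return init_all_layers(&top);
}

lu_object_free() then tears the stack down in the opposite, bottom-to-top order, so the header outlives every slice that may still inspect it.
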
- * if canblock is false, then don't block awaiting for another - * instance of lu_site_purge() to complete - */ -int lu_site_purge_objects(const struct lu_env *env, struct lu_site *s, - int nr, bool canblock) -{ - struct lu_object_header *h; - struct lu_object_header *temp; - struct lu_site_bkt_data *bkt; - struct cfs_hash_bd bd; - struct cfs_hash_bd bd2; - struct list_head dispose; - int did_sth; - unsigned int start = 0; - int count; - int bnr; - unsigned int i; - - if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) - return 0; - - INIT_LIST_HEAD(&dispose); - /* - * Under LRU list lock, scan LRU list and move unreferenced objects to - * the dispose list, removing them from LRU and hash table. - */ - if (nr != ~0) - start = s->ls_purge_start; - bnr = (nr == ~0) ? -1 : nr / (int)CFS_HASH_NBKT(s->ls_obj_hash) + 1; - again: - /* - * It doesn't make any sense to make purge threads parallel, that can - * only bring troubles to us. See LU-5331. - */ - if (canblock) - mutex_lock(&s->ls_purge_mutex); - else if (!mutex_trylock(&s->ls_purge_mutex)) - goto out; - - did_sth = 0; - cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { - if (i < start) - continue; - count = bnr; - cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); - bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); - - list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { - LASSERT(atomic_read(&h->loh_ref) == 0); - - cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); - LASSERT(bd.bd_bucket == bd2.bd_bucket); - - cfs_hash_bd_del_locked(s->ls_obj_hash, - &bd2, &h->loh_hash); - list_move(&h->loh_lru, &dispose); - percpu_counter_dec(&s->ls_lru_len_counter); - if (did_sth == 0) - did_sth = 1; - - if (nr != ~0 && --nr == 0) - break; - - if (count > 0 && --count == 0) - break; - } - cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); - cond_resched(); - /* - * Free everything on the dispose list. This is safe against - * races due to the reasons described in lu_object_put(). - */ - while (!list_empty(&dispose)) { - h = container_of(dispose.next, - struct lu_object_header, loh_lru); - list_del_init(&h->loh_lru); - lu_object_free(env, lu_object_top(h)); - lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); - } - - if (nr == 0) - break; - } - mutex_unlock(&s->ls_purge_mutex); - - if (nr != 0 && did_sth && start != 0) { - start = 0; /* restart from the first bucket */ - goto again; - } - /* race on s->ls_purge_start, but nobody cares */ - s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); -out: - return nr; -} -EXPORT_SYMBOL(lu_site_purge_objects); - -/* - * Object printing. - * - * Code below has to jump through certain loops to output object description - * into libcfs_debug_msg-based log. The problem is that lu_object_print() - * composes object description from strings that are parts of _lines_ of - * output (i.e., strings that are not terminated by newline). This doesn't fit - * very well into libcfs_debug_msg() interface that assumes that each message - * supplied to it is a self-contained output line. - * - * To work around this, strings are collected in a temporary buffer - * (implemented as a value of lu_cdebug_key key), until terminating newline - * character is detected. - * - */ - -enum { - /** - * Maximal line size. - * - * XXX overflow is not handled correctly. - */ - LU_CDEBUG_LINE = 512 -}; - -struct lu_cdebug_data { - /** - * Temporary buffer. 
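
The purge loop that follows is careful never to call lu_object_free() while a bucket is locked: victims are unlinked onto a private dispose list with only pointer moves under the lock, and the expensive frees run after cfs_hash_bd_unlock(). The idiom reduced to a sketch:

#include <stdlib.h>

struct node {
	struct node *next;
};

struct lru {
	struct node *head;	/* guarded by the bucket lock */
};

static void purge(struct lru *l, int nr)
{
	struct node *dispose = NULL, *n;

	/* --- bucket lock held: cheap unlinks only --- */
	while (nr-- > 0 && l->head) {
		n = l->head;
		l->head = n->next;
		n->next = dispose;
		dispose = n;
	}
	/* --- bucket lock dropped: now do the slow frees --- */
	while (dispose) {
		n = dispose;
		dispose = n->next;
		free(n);
	}
}

int main(void)
{
	struct lru l = { NULL };
	struct node *n;

	for (int i = 0; i < 3; i++) {
		n = malloc(sizeof(*n));
		n->next = l.head;
		l.head = n;
	}
	purge(&l, 2);
	return l.head != NULL ? 0 : 1;
}
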
- */ - char lck_area[LU_CDEBUG_LINE]; -}; - -/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ -LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); - -/** - * Key, holding temporary buffer. This key is registered very early by - * lu_global_init(). - */ -static struct lu_context_key lu_global_key = { - .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | - LCT_MG_THREAD | LCT_CL_THREAD | LCT_LOCAL, - .lct_init = lu_global_key_init, - .lct_fini = lu_global_key_fini -}; - -/** - * Printer function emitting messages through libcfs_debug_msg(). - */ -int lu_cdebug_printer(const struct lu_env *env, - void *cookie, const char *format, ...) -{ - struct libcfs_debug_msg_data *msgdata = cookie; - struct lu_cdebug_data *key; - int used; - int complete; - va_list args; - - va_start(args, format); - - key = lu_context_key_get(&env->le_ctx, &lu_global_key); - - used = strlen(key->lck_area); - complete = format[strlen(format) - 1] == '\n'; - /* - * Append new chunk to the buffer. - */ - vsnprintf(key->lck_area + used, - ARRAY_SIZE(key->lck_area) - used, format, args); - if (complete) { - if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) - libcfs_debug_msg(msgdata, "%s\n", key->lck_area); - key->lck_area[0] = 0; - } - va_end(args); - return 0; -} -EXPORT_SYMBOL(lu_cdebug_printer); - -/** - * Print object header. - */ -void lu_object_header_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, - const struct lu_object_header *hdr) -{ - (*printer)(env, cookie, "header@%p[%#lx, %d, " DFID "%s%s%s]", - hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), - PFID(&hdr->loh_fid), - hlist_unhashed(&hdr->loh_hash) ? "" : " hash", - list_empty((struct list_head *)&hdr->loh_lru) ? \ - "" : " lru", - hdr->loh_attr & LOHA_EXISTS ? " exist":""); -} -EXPORT_SYMBOL(lu_object_header_print); - -/** - * Print human readable representation of the \a o to the \a printer. - */ -void lu_object_print(const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct lu_object *o) -{ - static const char ruler[] = "........................................"; - struct lu_object_header *top; - int depth = 4; - - top = o->lo_header; - lu_object_header_print(env, cookie, printer, top); - (*printer)(env, cookie, "{\n"); - - list_for_each_entry(o, &top->loh_layers, lo_linkage) { - /* - * print `.' \a depth times followed by type name and address - */ - (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, - o->lo_dev->ld_type->ldt_name, o); - - if (o->lo_ops->loo_object_print) - (*o->lo_ops->loo_object_print)(env, cookie, printer, o); - - (*printer)(env, cookie, "\n"); - } - - (*printer)(env, cookie, "} header@%p\n", top); -} -EXPORT_SYMBOL(lu_object_print); - -/* - * NOTE: htable_lookup() is called with the relevant - * hash bucket locked, but might drop and re-acquire the lock. - */ -static struct lu_object *htable_lookup(struct lu_site *s, - struct cfs_hash_bd *bd, - const struct lu_fid *f, - __u64 *version) -{ - struct lu_site_bkt_data *bkt; - struct lu_object_header *h; - struct hlist_node *hnode; - u64 ver = cfs_hash_bd_version_get(bd); - - if (*version == ver) - return ERR_PTR(-ENOENT); - - *version = ver; - bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); - /* cfs_hash_bd_peek_locked is a somehow "internal" function - * of cfs_hash, it doesn't add refcount on object. 
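
lu_cdebug_printer() above adapts a line-oriented logging API to callers that emit fragments: chunks accumulate in a per-context buffer and nothing is flushed until a format string ends in '\n'. The accumulation logic as a standalone sketch, with a single static buffer standing in for the per-context lck_area:

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

#define LINE_LEN 512	/* plays the role of LU_CDEBUG_LINE */

static char area[LINE_LEN];

static void emit(const char *fmt, ...)
{
	size_t used = strlen(area);
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(area + used, sizeof(area) - used, fmt, ap);
	va_end(ap);

	if (fmt[strlen(fmt) - 1] == '\n') {	/* complete line: flush */
		fputs(area, stdout);
		area[0] = '\0';
	}
}

int main(void)
{
	emit("header@%p[", (void *)0);
	emit("%d", 42);
	emit("]\n");	/* only now does anything reach stdout */
	return 0;
}
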
- */ - hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); - if (!hnode) { - lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); - return ERR_PTR(-ENOENT); - } - - h = container_of(hnode, struct lu_object_header, loh_hash); - cfs_hash_get(s->ls_obj_hash, hnode); - lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); - if (!list_empty(&h->loh_lru)) { - list_del_init(&h->loh_lru); - percpu_counter_dec(&s->ls_lru_len_counter); - } - return lu_object_top(h); -} - -/** - * Search cache for an object with the fid \a f. If such object is found, - * return it. Otherwise, create new object, insert it into cache and return - * it. In any case, additional reference is acquired on the returned object. - */ -static struct lu_object *lu_object_find(const struct lu_env *env, - struct lu_device *dev, - const struct lu_fid *f, - const struct lu_object_conf *conf) -{ - return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); -} - -/* - * Limit the lu_object cache to a maximum of lu_cache_nr objects. Because - * the calculation for the number of objects to reclaim is not covered by - * a lock the maximum number of objects is capped by LU_CACHE_MAX_ADJUST. - * This ensures that many concurrent threads will not accidentally purge - * the entire cache. - */ -static void lu_object_limit(const struct lu_env *env, struct lu_device *dev) -{ - __u64 size, nr; - - if (lu_cache_nr == LU_CACHE_NR_UNLIMITED) - return; - - size = cfs_hash_size_get(dev->ld_site->ls_obj_hash); - nr = (__u64)lu_cache_nr; - if (size <= nr) - return; - - lu_site_purge_objects(env, dev->ld_site, - min_t(__u64, size - nr, LU_CACHE_NR_MAX_ADJUST), - false); -} - -/** - * Core logic of lu_object_find*() functions. - * - * Much like lu_object_find(), but top level device of object is specifically - * \a dev rather than top level device of the site. This interface allows - * objects of different "stacking" to be created within the same site. - */ -struct lu_object *lu_object_find_at(const struct lu_env *env, - struct lu_device *dev, - const struct lu_fid *f, - const struct lu_object_conf *conf) -{ - struct lu_object *o; - struct lu_object *shadow; - struct lu_site *s; - struct cfs_hash *hs; - struct cfs_hash_bd bd; - __u64 version = 0; - - /* - * This uses standard index maintenance protocol: - * - * - search index under lock, and return object if found; - * - otherwise, unlock index, allocate new object; - * - lock index and search again; - * - if nothing is found (usual case), insert newly created - * object into index; - * - otherwise (race: other thread inserted object), free - * object just allocated. - * - unlock index; - * - return object. - * - * For "LOC_F_NEW" case, we are sure the object is new established. - * It is unnecessary to perform lookup-alloc-lookup-insert, instead, - * just alloc and insert directly. - * - */ - s = dev->ld_site; - hs = s->ls_obj_hash; - - cfs_hash_bd_get(hs, f, &bd); - if (!(conf && conf->loc_flags & LOC_F_NEW)) { - cfs_hash_bd_lock(hs, &bd, 1); - o = htable_lookup(s, &bd, f, &version); - cfs_hash_bd_unlock(hs, &bd, 1); - - if (!IS_ERR(o) || PTR_ERR(o) != -ENOENT) - return o; - } - /* - * Allocate new object. This may result in rather complicated - * operations, including fld queries, inode loading, etc. 
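
lu_object_find_at(), whose body follows, is a textbook optimistic index protocol: search under the lock; on a miss, drop the lock and do the expensive allocation; then re-search and either insert the new object or, if a racing thread won, free ours and take the "shadow". A compact, runnable sketch of the same protocol over a hypothetical one-bucket table (build with -pthread):

#include <pthread.h>
#include <stdlib.h>

struct obj {
	long fid;
	struct obj *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *table;	/* hypothetical one-bucket hash table */

static struct obj *lookup_locked(long fid)
{
	for (struct obj *o = table; o; o = o->next)
		if (o->fid == fid)
			return o;
	return NULL;
}

static struct obj *obj_find(long fid)
{
	struct obj *o, *shadow;

	pthread_mutex_lock(&table_lock);
	o = lookup_locked(fid);
	pthread_mutex_unlock(&table_lock);
	if (o)
		return o;		/* fast path: already cached */

	o = malloc(sizeof(*o));		/* slow path runs unlocked */
	if (!o)
		return NULL;
	o->fid = fid;

	pthread_mutex_lock(&table_lock);
	shadow = lookup_locked(fid);	/* re-check: did a racer insert? */
	if (!shadow) {
		o->next = table;
		table = o;
	}
	pthread_mutex_unlock(&table_lock);

	if (shadow) {			/* lost the race: keep theirs */
		free(o);
		return shadow;
	}
	return o;
}

int main(void)
{
	return obj_find(7) == obj_find(7) ? 0 : 1;
}

The LOC_F_NEW case in the original skips the first lookup entirely, because the caller guarantees the object cannot exist yet.
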
- */ - o = lu_object_alloc(env, dev, f, conf); - if (IS_ERR(o)) - return o; - - LASSERT(lu_fid_eq(lu_object_fid(o), f)); - - cfs_hash_bd_lock(hs, &bd, 1); - - if (conf && conf->loc_flags & LOC_F_NEW) - shadow = ERR_PTR(-ENOENT); - else - shadow = htable_lookup(s, &bd, f, &version); - if (likely(PTR_ERR(shadow) == -ENOENT)) { - cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); - cfs_hash_bd_unlock(hs, &bd, 1); - - lu_object_limit(env, dev); - - return o; - } - - lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); - cfs_hash_bd_unlock(hs, &bd, 1); - lu_object_free(env, o); - return shadow; -} -EXPORT_SYMBOL(lu_object_find_at); - -/** - * Find object with given fid, and return its slice belonging to given device. - */ -struct lu_object *lu_object_find_slice(const struct lu_env *env, - struct lu_device *dev, - const struct lu_fid *f, - const struct lu_object_conf *conf) -{ - struct lu_object *top; - struct lu_object *obj; - - top = lu_object_find(env, dev, f, conf); - if (IS_ERR(top)) - return top; - - obj = lu_object_locate(top->lo_header, dev->ld_type); - if (unlikely(!obj)) { - lu_object_put(env, top); - obj = ERR_PTR(-ENOENT); - } - - return obj; -} -EXPORT_SYMBOL(lu_object_find_slice); - -/** - * Global list of all device types. - */ -static LIST_HEAD(lu_device_types); - -int lu_device_type_init(struct lu_device_type *ldt) -{ - int result = 0; - - atomic_set(&ldt->ldt_device_nr, 0); - INIT_LIST_HEAD(&ldt->ldt_linkage); - if (ldt->ldt_ops->ldto_init) - result = ldt->ldt_ops->ldto_init(ldt); - - if (!result) { - spin_lock(&obd_types_lock); - list_add(&ldt->ldt_linkage, &lu_device_types); - spin_unlock(&obd_types_lock); - } - - return result; -} -EXPORT_SYMBOL(lu_device_type_init); - -void lu_device_type_fini(struct lu_device_type *ldt) -{ - spin_lock(&obd_types_lock); - list_del_init(&ldt->ldt_linkage); - spin_unlock(&obd_types_lock); - if (ldt->ldt_ops->ldto_fini) - ldt->ldt_ops->ldto_fini(ldt); -} -EXPORT_SYMBOL(lu_device_type_fini); - -/** - * Global list of all sites on this node - */ -static LIST_HEAD(lu_sites); -static DECLARE_RWSEM(lu_sites_guard); - -/** - * Global environment used by site shrinker. - */ -static struct lu_env lu_shrink_env; - -struct lu_site_print_arg { - struct lu_env *lsp_env; - void *lsp_cookie; - lu_printer_t lsp_printer; -}; - -static int -lu_site_obj_print(struct cfs_hash *hs, struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *data) -{ - struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; - struct lu_object_header *h; - - h = hlist_entry(hnode, struct lu_object_header, loh_hash); - if (!list_empty(&h->loh_layers)) { - const struct lu_object *o; - - o = lu_object_top(h); - lu_object_print(arg->lsp_env, arg->lsp_cookie, - arg->lsp_printer, o); - } else { - lu_object_header_print(arg->lsp_env, arg->lsp_cookie, - arg->lsp_printer, h); - } - return 0; -} - -/** - * Print all objects in \a s. - */ -void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, - lu_printer_t printer) -{ - struct lu_site_print_arg arg = { - .lsp_env = (struct lu_env *)env, - .lsp_cookie = cookie, - .lsp_printer = printer, - }; - - cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); -} -EXPORT_SYMBOL(lu_site_print); - -/** - * Return desired hash table order. 
- */ -static unsigned long lu_htable_order(struct lu_device *top) -{ - unsigned long bits_max = LU_SITE_BITS_MAX; - unsigned long cache_size; - unsigned long bits; - - if (!strcmp(top->ld_type->ldt_name, LUSTRE_VVP_NAME)) - bits_max = LU_SITE_BITS_MAX_CL; - - /* - * Calculate hash table size, assuming that we want reasonable - * performance when 20% of total memory is occupied by cache of - * lu_objects. - * - * Size of lu_object is (arbitrary) taken as 1K (together with inode). - */ - cache_size = totalram_pages; - -#if BITS_PER_LONG == 32 - /* limit hashtable size for lowmem systems to low RAM */ - if (cache_size > 1 << (30 - PAGE_SHIFT)) - cache_size = 1 << (30 - PAGE_SHIFT) * 3 / 4; -#endif - - /* clear off unreasonable cache setting. */ - if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { - CWARN("obdclass: invalid lu_cache_percent: %u, it must be in the range of (0, %u]. Will use default value: %u.\n", - lu_cache_percent, LU_CACHE_PERCENT_MAX, - LU_CACHE_PERCENT_DEFAULT); - - lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; - } - cache_size = cache_size / 100 * lu_cache_percent * - (PAGE_SIZE / 1024); - - for (bits = 1; (1 << bits) < cache_size; ++bits) - ; - return clamp_t(typeof(bits), bits, LU_SITE_BITS_MIN, bits_max); -} - -static unsigned int lu_obj_hop_hash(struct cfs_hash *hs, - const void *key, unsigned int mask) -{ - struct lu_fid *fid = (struct lu_fid *)key; - __u32 hash; - - hash = fid_flatten32(fid); - hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ - hash = hash_long(hash, hs->hs_bkt_bits); - - /* give me another random factor */ - hash -= hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); - - hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; - hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); - - return hash & mask; -} - -static void *lu_obj_hop_object(struct hlist_node *hnode) -{ - return hlist_entry(hnode, struct lu_object_header, loh_hash); -} - -static void *lu_obj_hop_key(struct hlist_node *hnode) -{ - struct lu_object_header *h; - - h = hlist_entry(hnode, struct lu_object_header, loh_hash); - return &h->loh_fid; -} - -static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) -{ - struct lu_object_header *h; - - h = hlist_entry(hnode, struct lu_object_header, loh_hash); - return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); -} - -static void lu_obj_hop_get(struct cfs_hash *hs, struct hlist_node *hnode) -{ - struct lu_object_header *h; - - h = hlist_entry(hnode, struct lu_object_header, loh_hash); - atomic_inc(&h->loh_ref); -} - -static void lu_obj_hop_put_locked(struct cfs_hash *hs, struct hlist_node *hnode) -{ - LBUG(); /* we should never called it */ -} - -static struct cfs_hash_ops lu_site_hash_ops = { - .hs_hash = lu_obj_hop_hash, - .hs_key = lu_obj_hop_key, - .hs_keycmp = lu_obj_hop_keycmp, - .hs_object = lu_obj_hop_object, - .hs_get = lu_obj_hop_get, - .hs_put_locked = lu_obj_hop_put_locked, -}; - -static void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) -{ - spin_lock(&s->ls_ld_lock); - if (list_empty(&d->ld_linkage)) - list_add(&d->ld_linkage, &s->ls_ld_linkage); - spin_unlock(&s->ls_ld_lock); -} - -/** - * Initialize site \a s, with \a d as the top level device. 
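
lu_htable_order(), which follows, turns a memory budget into a hash-table order: estimate how many ~1KB lu_objects fit in lu_cache_percent of RAM, then take the smallest power of two that covers the estimate, clamped to the site limits. The sizing arithmetic alone:

#include <stdio.h>

#define BITS_MIN 12	/* LU_SITE_BITS_MIN */
#define BITS_MAX 24	/* LU_SITE_BITS_MAX */

/* Smallest 'bits' with (1 << bits) >= nobj, clamped to the site limits. */
static unsigned long htable_order(unsigned long nobj)
{
	unsigned long bits;

	for (bits = 1; (1UL << bits) < nobj; bits++)
		;
	if (bits < BITS_MIN)
		bits = BITS_MIN;
	if (bits > BITS_MAX)
		bits = BITS_MAX;
	return bits;
}

int main(void)
{
	/* e.g. 16 GiB of RAM, 20% budget, ~1 KiB per object */
	unsigned long ram_kb = 16UL * 1024 * 1024;
	unsigned long nobj = ram_kb / 100 * 20;

	printf("%lu bits\n", htable_order(nobj));	/* 22 bits */
	return 0;
}

On 32-bit systems the original additionally caps the budget to roughly 1 GiB of lowmem before doing this computation.
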
- */ -int lu_site_init(struct lu_site *s, struct lu_device *top) -{ - struct lu_site_bkt_data *bkt; - struct cfs_hash_bd bd; - unsigned long bits; - unsigned long i; - char name[16]; - int rc; - - memset(s, 0, sizeof(*s)); - mutex_init(&s->ls_purge_mutex); - - rc = percpu_counter_init(&s->ls_lru_len_counter, 0, GFP_NOFS); - if (rc) - return -ENOMEM; - - snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name); - for (bits = lu_htable_order(top); bits >= LU_SITE_BITS_MIN; bits--) { - s->ls_obj_hash = cfs_hash_create(name, bits, bits, - bits - LU_SITE_BKT_BITS, - sizeof(*bkt), 0, 0, - &lu_site_hash_ops, - CFS_HASH_SPIN_BKTLOCK | - CFS_HASH_NO_ITEMREF | - CFS_HASH_DEPTH | - CFS_HASH_ASSERT_EMPTY | - CFS_HASH_COUNTER); - if (s->ls_obj_hash) - break; - } - - if (!s->ls_obj_hash) { - CERROR("failed to create lu_site hash with bits: %lu\n", bits); - return -ENOMEM; - } - - cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { - bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); - INIT_LIST_HEAD(&bkt->lsb_lru); - init_waitqueue_head(&bkt->lsb_marche_funebre); - } - - s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); - if (!s->ls_stats) { - cfs_hash_putref(s->ls_obj_hash); - s->ls_obj_hash = NULL; - return -ENOMEM; - } - - lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, - 0, "created", "created"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, - 0, "cache_hit", "cache_hit"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, - 0, "cache_miss", "cache_miss"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, - 0, "cache_race", "cache_race"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, - 0, "cache_death_race", "cache_death_race"); - lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, - 0, "lru_purged", "lru_purged"); - - INIT_LIST_HEAD(&s->ls_linkage); - s->ls_top_dev = top; - top->ld_site = s; - lu_device_get(top); - lu_ref_add(&top->ld_reference, "site-top", s); - - INIT_LIST_HEAD(&s->ls_ld_linkage); - spin_lock_init(&s->ls_ld_lock); - - lu_dev_add_linkage(s, top); - - return 0; -} -EXPORT_SYMBOL(lu_site_init); - -/** - * Finalize \a s and release its resources. - */ -void lu_site_fini(struct lu_site *s) -{ - down_write(&lu_sites_guard); - list_del_init(&s->ls_linkage); - up_write(&lu_sites_guard); - - percpu_counter_destroy(&s->ls_lru_len_counter); - - if (s->ls_obj_hash) { - cfs_hash_putref(s->ls_obj_hash); - s->ls_obj_hash = NULL; - } - - if (s->ls_top_dev) { - s->ls_top_dev->ld_site = NULL; - lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); - lu_device_put(s->ls_top_dev); - s->ls_top_dev = NULL; - } - - if (s->ls_stats) - lprocfs_free_stats(&s->ls_stats); -} -EXPORT_SYMBOL(lu_site_fini); - -/** - * Called when initialization of stack for this site is completed. - */ -int lu_site_init_finish(struct lu_site *s) -{ - int result; - - down_write(&lu_sites_guard); - result = lu_context_refill(&lu_shrink_env.le_ctx); - if (result == 0) - list_add(&s->ls_linkage, &lu_sites); - up_write(&lu_sites_guard); - return result; -} -EXPORT_SYMBOL(lu_site_init_finish); - -/** - * Acquire additional reference on device \a d - */ -void lu_device_get(struct lu_device *d) -{ - atomic_inc(&d->ld_ref); -} -EXPORT_SYMBOL(lu_device_get); - -/** - * Release reference on device \a d. - */ -void lu_device_put(struct lu_device *d) -{ - LASSERT(atomic_read(&d->ld_ref) > 0); - atomic_dec(&d->ld_ref); -} -EXPORT_SYMBOL(lu_device_put); - -/** - * Initialize device \a d of type \a t. 
- */ -int lu_device_init(struct lu_device *d, struct lu_device_type *t) -{ - if (atomic_inc_return(&t->ldt_device_nr) == 1 && - t->ldt_ops->ldto_start) - t->ldt_ops->ldto_start(t); - - memset(d, 0, sizeof(*d)); - atomic_set(&d->ld_ref, 0); - d->ld_type = t; - lu_ref_init(&d->ld_reference); - INIT_LIST_HEAD(&d->ld_linkage); - return 0; -} -EXPORT_SYMBOL(lu_device_init); - -/** - * Finalize device \a d. - */ -void lu_device_fini(struct lu_device *d) -{ - struct lu_device_type *t = d->ld_type; - - if (d->ld_obd) { - d->ld_obd->obd_lu_dev = NULL; - d->ld_obd = NULL; - } - - lu_ref_fini(&d->ld_reference); - LASSERTF(atomic_read(&d->ld_ref) == 0, - "Refcount is %u\n", atomic_read(&d->ld_ref)); - LASSERT(atomic_read(&t->ldt_device_nr) > 0); - - if (atomic_dec_and_test(&t->ldt_device_nr) && - t->ldt_ops->ldto_stop) - t->ldt_ops->ldto_stop(t); -} -EXPORT_SYMBOL(lu_device_fini); - -/** - * Initialize object \a o that is part of compound object \a h and was created - * by device \a d. - */ -int lu_object_init(struct lu_object *o, struct lu_object_header *h, - struct lu_device *d) -{ - memset(o, 0, sizeof(*o)); - o->lo_header = h; - o->lo_dev = d; - lu_device_get(d); - lu_ref_add_at(&d->ld_reference, &o->lo_dev_ref, "lu_object", o); - INIT_LIST_HEAD(&o->lo_linkage); - - return 0; -} -EXPORT_SYMBOL(lu_object_init); - -/** - * Finalize object and release its resources. - */ -void lu_object_fini(struct lu_object *o) -{ - struct lu_device *dev = o->lo_dev; - - LASSERT(list_empty(&o->lo_linkage)); - - if (dev) { - lu_ref_del_at(&dev->ld_reference, &o->lo_dev_ref, - "lu_object", o); - lu_device_put(dev); - o->lo_dev = NULL; - } -} -EXPORT_SYMBOL(lu_object_fini); - -/** - * Add object \a o as first layer of compound object \a h - * - * This is typically called by the ->ldo_object_alloc() method of top-level - * device. - */ -void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) -{ - list_move(&o->lo_linkage, &h->loh_layers); -} -EXPORT_SYMBOL(lu_object_add_top); - -/** - * Add object \a o as a layer of compound object, going after \a before. - * - * This is typically called by the ->ldo_object_alloc() method of \a - * before->lo_dev. - */ -void lu_object_add(struct lu_object *before, struct lu_object *o) -{ - list_move(&o->lo_linkage, &before->lo_linkage); -} -EXPORT_SYMBOL(lu_object_add); - -/** - * Initialize compound object. - */ -int lu_object_header_init(struct lu_object_header *h) -{ - memset(h, 0, sizeof(*h)); - atomic_set(&h->loh_ref, 1); - INIT_HLIST_NODE(&h->loh_hash); - INIT_LIST_HEAD(&h->loh_lru); - INIT_LIST_HEAD(&h->loh_layers); - lu_ref_init(&h->loh_reference); - return 0; -} -EXPORT_SYMBOL(lu_object_header_init); - -/** - * Finalize compound object. - */ -void lu_object_header_fini(struct lu_object_header *h) -{ - LASSERT(list_empty(&h->loh_layers)); - LASSERT(list_empty(&h->loh_lru)); - LASSERT(hlist_unhashed(&h->loh_hash)); - lu_ref_fini(&h->loh_reference); -} -EXPORT_SYMBOL(lu_object_header_fini); - -/** - * Given a compound object, find its slice, corresponding to the device type - * \a dtype. - */ -struct lu_object *lu_object_locate(struct lu_object_header *h, - const struct lu_device_type *dtype) -{ - struct lu_object *o; - - list_for_each_entry(o, &h->loh_layers, lo_linkage) { - if (o->lo_dev->ld_type == dtype) - return o; - } - return NULL; -} -EXPORT_SYMBOL(lu_object_locate); - -/** - * Finalize and free devices in the device stack. 
- * - * Finalize device stack by purging object cache, and calling - * lu_device_type_operations::ldto_device_fini() and - * lu_device_type_operations::ldto_device_free() on all devices in the stack. - */ -void lu_stack_fini(const struct lu_env *env, struct lu_device *top) -{ - struct lu_site *site = top->ld_site; - struct lu_device *scan; - struct lu_device *next; - - lu_site_purge(env, site, ~0); - for (scan = top; scan; scan = next) { - next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); - lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); - lu_device_put(scan); - } - - /* purge again. */ - lu_site_purge(env, site, ~0); - - for (scan = top; scan; scan = next) { - const struct lu_device_type *ldt = scan->ld_type; - struct obd_type *type; - - next = ldt->ldt_ops->ldto_device_free(env, scan); - type = ldt->ldt_obd_type; - if (type) { - type->typ_refcnt--; - class_put_type(type); - } - } -} - -enum { - /** - * Maximal number of tld slots. - */ - LU_CONTEXT_KEY_NR = 40 -}; - -static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; - -static DEFINE_RWLOCK(lu_keys_guard); -static atomic_t lu_key_initing_cnt = ATOMIC_INIT(0); - -/** - * Global counter incremented whenever key is registered, unregistered, - * revived or quiesced. This is used to void unnecessary calls to - * lu_context_refill(). No locking is provided, as initialization and shutdown - * are supposed to be externally serialized. - */ -static unsigned int key_set_version; - -/** - * Register new key. - */ -int lu_context_key_register(struct lu_context_key *key) -{ - int result; - unsigned int i; - - LASSERT(key->lct_init); - LASSERT(key->lct_fini); - LASSERT(key->lct_tags != 0); - - result = -ENFILE; - write_lock(&lu_keys_guard); - for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { - if (!lu_keys[i]) { - key->lct_index = i; - atomic_set(&key->lct_used, 1); - lu_keys[i] = key; - lu_ref_init(&key->lct_reference); - result = 0; - ++key_set_version; - break; - } - } - write_unlock(&lu_keys_guard); - return result; -} -EXPORT_SYMBOL(lu_context_key_register); - -static void key_fini(struct lu_context *ctx, int index) -{ - if (ctx->lc_value && ctx->lc_value[index]) { - struct lu_context_key *key; - - key = lu_keys[index]; - LASSERT(atomic_read(&key->lct_used) > 1); - - key->lct_fini(ctx, key, ctx->lc_value[index]); - lu_ref_del(&key->lct_reference, "ctx", ctx); - atomic_dec(&key->lct_used); - - if ((ctx->lc_tags & LCT_NOREF) == 0) - module_put(key->lct_owner); - ctx->lc_value[index] = NULL; - } -} - -/** - * Deregister key. - */ -void lu_context_key_degister(struct lu_context_key *key) -{ - LASSERT(atomic_read(&key->lct_used) >= 1); - LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); - - lu_context_key_quiesce(key); - - write_lock(&lu_keys_guard); - ++key_set_version; - key_fini(&lu_shrink_env.le_ctx, key->lct_index); - - /** - * Wait until all transient contexts referencing this key have - * run lu_context_key::lct_fini() method. 
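
lu_context_key_register() above allocates key indices from the fixed lu_keys[] slot array under a writer lock; the index then names the key's value slot in every context's lc_value[] array, which is what makes lu_context_key_get() a plain array load. Slot allocation in miniature (hypothetical trimmed type, locking elided):

#include <stddef.h>
#include <stdio.h>

#define KEY_NR 40	/* LU_CONTEXT_KEY_NR */

struct ctx_key {
	int index;
};

static struct ctx_key *keys[KEY_NR];	/* stands in for lu_keys[] */

/* The real code does this scan under lu_keys_guard. */
static int key_register(struct ctx_key *key)
{
	for (int i = 0; i < KEY_NR; i++) {
		if (!keys[i]) {
			key->index = i;
			keys[i] = key;
			return 0;
		}
	}
	return -1;	/* -ENFILE in the original */
}

int main(void)
{
	struct ctx_key a, b;

	key_register(&a);
	key_register(&b);
	printf("%d %d\n", a.index, b.index);	/* 0 1 */
	return 0;
}
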
- */ - while (atomic_read(&key->lct_used) > 1) { - write_unlock(&lu_keys_guard); - CDEBUG(D_INFO, "%s: \"%s\" %p, %d\n", - __func__, module_name(key->lct_owner), - key, atomic_read(&key->lct_used)); - schedule(); - write_lock(&lu_keys_guard); - } - if (lu_keys[key->lct_index]) { - lu_keys[key->lct_index] = NULL; - lu_ref_fini(&key->lct_reference); - } - write_unlock(&lu_keys_guard); - - LASSERTF(atomic_read(&key->lct_used) == 1, - "key has instances: %d\n", - atomic_read(&key->lct_used)); -} -EXPORT_SYMBOL(lu_context_key_degister); - -/** - * Register a number of keys. This has to be called after all keys have been - * initialized by a call to LU_CONTEXT_KEY_INIT(). - */ -int lu_context_key_register_many(struct lu_context_key *k, ...) -{ - struct lu_context_key *key = k; - va_list args; - int result; - - va_start(args, k); - do { - result = lu_context_key_register(key); - if (result) - break; - key = va_arg(args, struct lu_context_key *); - } while (key); - va_end(args); - - if (result != 0) { - va_start(args, k); - while (k != key) { - lu_context_key_degister(k); - k = va_arg(args, struct lu_context_key *); - } - va_end(args); - } - - return result; -} -EXPORT_SYMBOL(lu_context_key_register_many); - -/** - * De-register a number of keys. This is a dual to - * lu_context_key_register_many(). - */ -void lu_context_key_degister_many(struct lu_context_key *k, ...) -{ - va_list args; - - va_start(args, k); - do { - lu_context_key_degister(k); - k = va_arg(args, struct lu_context_key*); - } while (k); - va_end(args); -} -EXPORT_SYMBOL(lu_context_key_degister_many); - -/** - * Revive a number of keys. - */ -void lu_context_key_revive_many(struct lu_context_key *k, ...) -{ - va_list args; - - va_start(args, k); - do { - lu_context_key_revive(k); - k = va_arg(args, struct lu_context_key*); - } while (k); - va_end(args); -} -EXPORT_SYMBOL(lu_context_key_revive_many); - -/** - * Quiescent a number of keys. - */ -void lu_context_key_quiesce_many(struct lu_context_key *k, ...) -{ - va_list args; - - va_start(args, k); - do { - lu_context_key_quiesce(k); - k = va_arg(args, struct lu_context_key*); - } while (k); - va_end(args); -} -EXPORT_SYMBOL(lu_context_key_quiesce_many); - -/** - * Return value associated with key \a key in context \a ctx. - */ -void *lu_context_key_get(const struct lu_context *ctx, - const struct lu_context_key *key) -{ - LINVRNT(ctx->lc_state == LCS_ENTERED); - LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); - LASSERT(lu_keys[key->lct_index] == key); - return ctx->lc_value[key->lct_index]; -} -EXPORT_SYMBOL(lu_context_key_get); - -/** - * List of remembered contexts. XXX document me. - */ -static LIST_HEAD(lu_context_remembered); - -/** - * Destroy \a key in all remembered contexts. This is used to destroy key - * values in "shared" contexts (like service threads), when a module owning - * the key is about to be unloaded. - */ -void lu_context_key_quiesce(struct lu_context_key *key) -{ - struct lu_context *ctx; - - if (!(key->lct_tags & LCT_QUIESCENT)) { - /* - * XXX memory barrier has to go here. - */ - write_lock(&lu_keys_guard); - key->lct_tags |= LCT_QUIESCENT; - - /** - * Wait until all lu_context_key::lct_init() methods - * have completed. 
- */ - while (atomic_read(&lu_key_initing_cnt) > 0) { - write_unlock(&lu_keys_guard); - CDEBUG(D_INFO, "%s: \"%s\" %p, %d (%d)\n", - __func__, - module_name(key->lct_owner), - key, atomic_read(&key->lct_used), - atomic_read(&lu_key_initing_cnt)); - schedule(); - write_lock(&lu_keys_guard); - } - - list_for_each_entry(ctx, &lu_context_remembered, lc_remember) - key_fini(ctx, key->lct_index); - - ++key_set_version; - write_unlock(&lu_keys_guard); - } -} - -void lu_context_key_revive(struct lu_context_key *key) -{ - write_lock(&lu_keys_guard); - key->lct_tags &= ~LCT_QUIESCENT; - ++key_set_version; - write_unlock(&lu_keys_guard); -} - -static void keys_fini(struct lu_context *ctx) -{ - unsigned int i; - - if (!ctx->lc_value) - return; - - for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) - key_fini(ctx, i); - - kfree(ctx->lc_value); - ctx->lc_value = NULL; -} - -static int keys_fill(struct lu_context *ctx) -{ - unsigned int pre_version; - unsigned int i; - - /* - * A serialisation with lu_context_key_quiesce() is needed, but some - * "key->lct_init()" are calling kernel memory allocation routine and - * can't be called while holding a spin_lock. - * "lu_keys_guard" is held while incrementing "lu_key_initing_cnt" - * to ensure the start of the serialisation. - * An atomic_t variable is still used, in order not to reacquire the - * lock when decrementing the counter. - */ - read_lock(&lu_keys_guard); - atomic_inc(&lu_key_initing_cnt); - pre_version = key_set_version; - read_unlock(&lu_keys_guard); - -refill: - LINVRNT(ctx->lc_value); - for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { - struct lu_context_key *key; - - key = lu_keys[i]; - if (!ctx->lc_value[i] && key && - (key->lct_tags & ctx->lc_tags) && - /* - * Don't create values for a LCT_QUIESCENT key, as this - * will pin module owning a key. - */ - !(key->lct_tags & LCT_QUIESCENT)) { - void *value; - - LINVRNT(key->lct_init); - LINVRNT(key->lct_index == i); - - if (!(ctx->lc_tags & LCT_NOREF) && - !try_module_get(key->lct_owner)) { - /* module is unloading, skip this key */ - continue; - } - - value = key->lct_init(ctx, key); - if (unlikely(IS_ERR(value))) { - atomic_dec(&lu_key_initing_cnt); - return PTR_ERR(value); - } - - lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); - atomic_inc(&key->lct_used); - /* - * This is the only place in the code, where an - * element of ctx->lc_value[] array is set to non-NULL - * value. - */ - ctx->lc_value[i] = value; - if (key->lct_exit) - ctx->lc_tags |= LCT_HAS_EXIT; - } - } - - read_lock(&lu_keys_guard); - if (pre_version != key_set_version) { - pre_version = key_set_version; - read_unlock(&lu_keys_guard); - goto refill; - } - ctx->lc_version = key_set_version; - atomic_dec(&lu_key_initing_cnt); - read_unlock(&lu_keys_guard); - return 0; -} - -static int keys_init(struct lu_context *ctx) -{ - ctx->lc_value = kcalloc(ARRAY_SIZE(lu_keys), sizeof(ctx->lc_value[0]), - GFP_NOFS); - if (likely(ctx->lc_value)) - return keys_fill(ctx); - - return -ENOMEM; -} - -/** - * Initialize context data-structure. Create values for all keys. 
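
keys_fill(), further down, cannot hold lu_keys_guard across ->lct_init(), which may sleep in memory allocation, so it snapshots key_set_version, works unlocked, and simply redoes the whole fill if the version moved underneath it. The retry skeleton, with fill_values() as a hypothetical stand-in for the per-key loop:

#include <stdbool.h>

static unsigned int key_set_version;	/* bumped on register/quiesce/revive */

static bool fill_values(void)
{
	/* stand-in for walking lu_keys[] and calling ->lct_init();
	 * may sleep, so no spinlock can be held here */
	return true;
}

static int refill(void)
{
	unsigned int pre;

	do {
		pre = key_set_version;	/* snapshot before the slow work */
		if (!fill_values())
			return -1;
	} while (pre != key_set_version);	/* raced with a change: redo */
	return 0;
}

int main(void)
{
	return refill();
}
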
- */ -int lu_context_init(struct lu_context *ctx, __u32 tags) -{ - int rc; - - memset(ctx, 0, sizeof(*ctx)); - ctx->lc_state = LCS_INITIALIZED; - ctx->lc_tags = tags; - if (tags & LCT_REMEMBER) { - write_lock(&lu_keys_guard); - list_add(&ctx->lc_remember, &lu_context_remembered); - write_unlock(&lu_keys_guard); - } else { - INIT_LIST_HEAD(&ctx->lc_remember); - } - - rc = keys_init(ctx); - if (rc != 0) - lu_context_fini(ctx); - - return rc; -} -EXPORT_SYMBOL(lu_context_init); - -/** - * Finalize context data-structure. Destroy key values. - */ -void lu_context_fini(struct lu_context *ctx) -{ - LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); - ctx->lc_state = LCS_FINALIZED; - - if ((ctx->lc_tags & LCT_REMEMBER) == 0) { - LASSERT(list_empty(&ctx->lc_remember)); - keys_fini(ctx); - - } else { /* could race with key degister */ - write_lock(&lu_keys_guard); - keys_fini(ctx); - list_del_init(&ctx->lc_remember); - write_unlock(&lu_keys_guard); - } -} -EXPORT_SYMBOL(lu_context_fini); - -/** - * Called before entering context. - */ -void lu_context_enter(struct lu_context *ctx) -{ - LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); - ctx->lc_state = LCS_ENTERED; -} -EXPORT_SYMBOL(lu_context_enter); - -/** - * Called after exiting from \a ctx - */ -void lu_context_exit(struct lu_context *ctx) -{ - unsigned int i; - - LINVRNT(ctx->lc_state == LCS_ENTERED); - ctx->lc_state = LCS_LEFT; - if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value) { - /* could race with key quiescency */ - if (ctx->lc_tags & LCT_REMEMBER) - read_lock(&lu_keys_guard); - - for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { - if (ctx->lc_value[i]) { - struct lu_context_key *key; - - key = lu_keys[i]; - if (key->lct_exit) - key->lct_exit(ctx, - key, ctx->lc_value[i]); - } - } - - if (ctx->lc_tags & LCT_REMEMBER) - read_unlock(&lu_keys_guard); - } -} -EXPORT_SYMBOL(lu_context_exit); - -/** - * Allocate for context all missing keys that were registered after context - * creation. key_set_version is only changed in rare cases when modules - * are loaded and removed. - */ -int lu_context_refill(struct lu_context *ctx) -{ - read_lock(&lu_keys_guard); - if (likely(ctx->lc_version == key_set_version)) { - read_unlock(&lu_keys_guard); - return 0; - } - - read_unlock(&lu_keys_guard); - return keys_fill(ctx); -} - -/** - * lu_ctx_tags/lu_ses_tags will be updated if there are new types of - * obd being added. Currently, this is only used on client side, specifically - * for echo device client, for other stack (like ptlrpc threads), context are - * predefined when the lu_device type are registered, during the module probe - * phase. 
- */ -__u32 lu_context_tags_default; -__u32 lu_session_tags_default; - -int lu_env_init(struct lu_env *env, __u32 tags) -{ - int result; - - env->le_ses = NULL; - result = lu_context_init(&env->le_ctx, tags); - if (likely(result == 0)) - lu_context_enter(&env->le_ctx); - return result; -} -EXPORT_SYMBOL(lu_env_init); - -void lu_env_fini(struct lu_env *env) -{ - lu_context_exit(&env->le_ctx); - lu_context_fini(&env->le_ctx); - env->le_ses = NULL; -} -EXPORT_SYMBOL(lu_env_fini); - -int lu_env_refill(struct lu_env *env) -{ - int result; - - result = lu_context_refill(&env->le_ctx); - if (result == 0 && env->le_ses) - result = lu_context_refill(env->le_ses); - return result; -} -EXPORT_SYMBOL(lu_env_refill); - -struct lu_site_stats { - unsigned int lss_populated; - unsigned int lss_max_search; - unsigned int lss_total; - unsigned int lss_busy; -}; - -static void lu_site_stats_get(const struct lu_site *s, - struct lu_site_stats *stats, int populated) -{ - struct cfs_hash *hs = s->ls_obj_hash; - struct cfs_hash_bd bd; - unsigned int i; - /* - * percpu_counter_sum_positive() won't accept a const pointer - * as it does modify the struct by taking a spinlock - */ - struct lu_site *s2 = (struct lu_site *)s; - - stats->lss_busy += cfs_hash_size_get(hs) - - percpu_counter_sum_positive(&s2->ls_lru_len_counter); - cfs_hash_for_each_bucket(hs, &bd, i) { - struct hlist_head *hhead; - - cfs_hash_bd_lock(hs, &bd, 1); - stats->lss_total += cfs_hash_bd_count_get(&bd); - stats->lss_max_search = max((int)stats->lss_max_search, - cfs_hash_bd_depmax_get(&bd)); - if (!populated) { - cfs_hash_bd_unlock(hs, &bd, 1); - continue; - } - - cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { - if (!hlist_empty(hhead)) - stats->lss_populated++; - } - cfs_hash_bd_unlock(hs, &bd, 1); - } -} - -/* - * lu_cache_shrink_count() returns an approximate number of cached objects - * that can be freed by shrink_slab(). A counter, which tracks the - * number of items in the site's lru, is maintained in a percpu_counter - * for each site. The percpu values are incremented and decremented as - * objects are added or removed from the lru. The percpu values are summed - * and saved whenever a percpu value exceeds a threshold. Thus the saved, - * summed value at any given time may not accurately reflect the current - * lru length. But this value is sufficiently accurate for the needs of - * a shrinker. - * - * Using a per cpu counter is a compromise solution to concurrent access: - * lu_object_put() can update the counter without locking the site and - * lu_cache_shrink_count can sum the counters without locking each - * ls_obj_hash bucket. 
- */ -static unsigned long lu_cache_shrink_count(struct shrinker *sk, - struct shrink_control *sc) -{ - struct lu_site *s; - struct lu_site *tmp; - unsigned long cached = 0; - - if (!(sc->gfp_mask & __GFP_FS)) - return 0; - - down_read(&lu_sites_guard); - list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) - cached += percpu_counter_read_positive(&s->ls_lru_len_counter); - up_read(&lu_sites_guard); - - cached = (cached / 100) * sysctl_vfs_cache_pressure; - CDEBUG(D_INODE, "%ld objects cached, cache pressure %d\n", - cached, sysctl_vfs_cache_pressure); - - return cached; -} - -static unsigned long lu_cache_shrink_scan(struct shrinker *sk, - struct shrink_control *sc) -{ - struct lu_site *s; - struct lu_site *tmp; - unsigned long remain = sc->nr_to_scan, freed = 0; - LIST_HEAD(splice); - - if (!(sc->gfp_mask & __GFP_FS)) - /* We must not take the lu_sites_guard lock when - * __GFP_FS is *not* set because of the deadlock - * possibility detailed above. Additionally, - * since we cannot determine the number of - * objects in the cache without taking this - * lock, we're in a particularly tough spot. As - * a result, we'll just lie and say our cache is - * empty. This _should_ be ok, as we can't - * reclaim objects when __GFP_FS is *not* set - * anyways. - */ - return SHRINK_STOP; - - down_write(&lu_sites_guard); - list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { - freed = lu_site_purge(&lu_shrink_env, s, remain); - remain -= freed; - /* - * Move just shrunk site to the tail of site list to - * assure shrinking fairness. - */ - list_move_tail(&s->ls_linkage, &splice); - } - list_splice(&splice, lu_sites.prev); - up_write(&lu_sites_guard); - - return sc->nr_to_scan - remain; -} - -/** - * Debugging printer function using printk(). - */ -static struct shrinker lu_site_shrinker = { - .count_objects = lu_cache_shrink_count, - .scan_objects = lu_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - -/** - * Initialization of global lu_* data. - */ -int lu_global_init(void) -{ - int result; - - CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); - - result = lu_ref_global_init(); - if (result != 0) - return result; - - LU_CONTEXT_KEY_INIT(&lu_global_key); - result = lu_context_key_register(&lu_global_key); - if (result != 0) { - lu_ref_global_fini(); - return result; - } - - /* - * At this level, we don't know what tags are needed, so allocate them - * conservatively. This should not be too bad, because this - * environment is global. - */ - down_write(&lu_sites_guard); - result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); - up_write(&lu_sites_guard); - if (result != 0) { - lu_context_key_degister(&lu_global_key); - lu_ref_global_fini(); - return result; - } - - /* - * seeks estimation: 3 seeks to read a record from oi, one to read - * inode, one for ea. Unfortunately setting this high value results in - * lu_object/inode cache consuming all the memory. - */ - result = register_shrinker(&lu_site_shrinker); - if (result != 0) { - /* Order explained in lu_global_fini(). */ - lu_context_key_degister(&lu_global_key); - - down_write(&lu_sites_guard); - lu_env_fini(&lu_shrink_env); - up_write(&lu_sites_guard); - - lu_ref_global_fini(); - return result; - } - - return 0; -} - -/** - * Dual to lu_global_init(). - */ -void lu_global_fini(void) -{ - unregister_shrinker(&lu_site_shrinker); - lu_context_key_degister(&lu_global_key); - - /* - * Tear shrinker environment down _after_ de-registering - * lu_global_key, because the latter has a value in the former. 
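
The shrinker above splits its work the way the shrinker API expects: count_objects() returns a cheap, approximate figure (a percpu-counter read scaled by vfs_cache_pressure, no site locks), while scan_objects() does the real purging and rotates each shrunk site to the tail of the list so pressure is spread fairly. A toy model of that division of labor, with the site list reduced to a static array:

#include <stdio.h>

struct site {
	unsigned long cached;	/* approximate lru length */
};

static struct site sites[2] = { { 100 }, { 40 } };

/* Cheap estimate: sum the per-site counters, no exact locking. */
static unsigned long shrink_count(void)
{
	unsigned long n = 0;

	for (int i = 0; i < 2; i++)
		n += sites[i].cached;
	return n;
}

/* Real work: spread the reclaim target across the sites in turn. */
static unsigned long shrink_scan(unsigned long nr_to_scan)
{
	unsigned long remain = nr_to_scan;

	for (int i = 0; i < 2 && remain; i++) {
		unsigned long freed = sites[i].cached < remain ?
				      sites[i].cached : remain;

		sites[i].cached -= freed;	/* stands in for lu_site_purge() */
		remain -= freed;
	}
	return nr_to_scan - remain;
}

int main(void)
{
	unsigned long cached = shrink_count();
	unsigned long freed = shrink_scan(120);

	printf("%lu cached, %lu freed\n", cached, freed);
	return 0;
}
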
- */ - down_write(&lu_sites_guard); - lu_env_fini(&lu_shrink_env); - up_write(&lu_sites_guard); - - lu_ref_global_fini(); -} - -static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) -{ - struct lprocfs_counter ret; - - lprocfs_stats_collect(stats, idx, &ret); - return (__u32)ret.lc_count; -} - -/** - * Output site statistical counters into a buffer. Suitable for - * lprocfs_rd_*()-style functions. - */ -int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) -{ - struct lu_site_stats stats; - - memset(&stats, 0, sizeof(stats)); - lu_site_stats_get(s, &stats, 1); - - seq_printf(m, "%d/%d %d/%ld %d %d %d %d %d %d %d\n", - stats.lss_busy, - stats.lss_total, - stats.lss_populated, - CFS_HASH_NHLIST(s->ls_obj_hash), - stats.lss_max_search, - ls_stats_read(s->ls_stats, LU_SS_CREATED), - ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), - ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), - ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), - ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), - ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); - return 0; -} -EXPORT_SYMBOL(lu_site_stats_print); - -/** - * Helper function to initialize a number of kmem slab caches at once. - */ -int lu_kmem_init(struct lu_kmem_descr *caches) -{ - int result; - struct lu_kmem_descr *iter = caches; - - for (result = 0; iter->ckd_cache; ++iter) { - *iter->ckd_cache = kmem_cache_create(iter->ckd_name, - iter->ckd_size, - 0, 0, NULL); - if (!*iter->ckd_cache) { - result = -ENOMEM; - /* free all previously allocated caches */ - lu_kmem_fini(caches); - break; - } - } - return result; -} -EXPORT_SYMBOL(lu_kmem_init); - -/** - * Helper function to finalize a number of kmem slab cached at once. Dual to - * lu_kmem_init(). - */ -void lu_kmem_fini(struct lu_kmem_descr *caches) -{ - for (; caches->ckd_cache; ++caches) { - kmem_cache_destroy(*caches->ckd_cache); - *caches->ckd_cache = NULL; - } -} -EXPORT_SYMBOL(lu_kmem_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c deleted file mode 100644 index f67cb89ea0babf5a58f8ef507c9775da7b1c6e5e..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/lu_ref.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/lu_ref.c - * - * Lustre reference. 
- * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c deleted file mode 100644 index cdc8dc10690d279dc22bd305cbd8cc7af1c79085..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c +++ /dev/null @@ -1,241 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/lustre_handles.c - * - * Author: Phil Schwan - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include - -static __u64 handle_base; -#define HANDLE_INCR 7 -static spinlock_t handle_base_lock; - -static struct handle_bucket { - spinlock_t lock; - struct list_head head; -} *handle_hash; - -#define HANDLE_HASH_SIZE (1 << 16) -#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) - -/* - * Generate a unique 64bit cookie (hash) for a handle and insert it into - * global (per-node) hash-table. - */ -void class_handle_hash(struct portals_handle *h, - struct portals_handle_ops *ops) -{ - struct handle_bucket *bucket; - - LASSERT(h); - LASSERT(list_empty(&h->h_link)); - - /* - * This is fast, but simplistic cookie generation algorithm, it will - * need a re-do at some point in the future for security. - */ - spin_lock(&handle_base_lock); - handle_base += HANDLE_INCR; - - if (unlikely(handle_base == 0)) { - /* - * Cookie of zero is "dangerous", because in many places it's - * assumed that 0 means "unassigned" handle, not bound to any - * object. 
- */ - CWARN("The universe has been exhausted: cookie wrap-around.\n"); - handle_base += HANDLE_INCR; - } - h->h_cookie = handle_base; - spin_unlock(&handle_base_lock); - - h->h_ops = ops; - spin_lock_init(&h->h_lock); - - bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; - spin_lock(&bucket->lock); - list_add_rcu(&h->h_link, &bucket->head); - h->h_in = 1; - spin_unlock(&bucket->lock); - - CDEBUG(D_INFO, "added object %p with handle %#llx to hash\n", - h, h->h_cookie); -} -EXPORT_SYMBOL(class_handle_hash); - -static void class_handle_unhash_nolock(struct portals_handle *h) -{ - if (list_empty(&h->h_link)) { - CERROR("removing an already-removed handle (%#llx)\n", - h->h_cookie); - return; - } - - CDEBUG(D_INFO, "removing object %p with handle %#llx from hash\n", - h, h->h_cookie); - - spin_lock(&h->h_lock); - if (h->h_in == 0) { - spin_unlock(&h->h_lock); - return; - } - h->h_in = 0; - spin_unlock(&h->h_lock); - list_del_rcu(&h->h_link); -} - -void class_handle_unhash(struct portals_handle *h) -{ - struct handle_bucket *bucket; - - bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); - - spin_lock(&bucket->lock); - class_handle_unhash_nolock(h); - spin_unlock(&bucket->lock); -} -EXPORT_SYMBOL(class_handle_unhash); - -void *class_handle2object(__u64 cookie, const void *owner) -{ - struct handle_bucket *bucket; - struct portals_handle *h; - void *retval = NULL; - - LASSERT(handle_hash); - - /* Be careful when you want to change this code. See the - * rcu_read_lock() definition on top this file. - jxiong - */ - bucket = handle_hash + (cookie & HANDLE_HASH_MASK); - - rcu_read_lock(); - list_for_each_entry_rcu(h, &bucket->head, h_link) { - if (h->h_cookie != cookie || h->h_owner != owner) - continue; - - spin_lock(&h->h_lock); - if (likely(h->h_in != 0)) { - h->h_ops->hop_addref(h); - retval = h; - } - spin_unlock(&h->h_lock); - break; - } - rcu_read_unlock(); - - return retval; -} -EXPORT_SYMBOL(class_handle2object); - -void class_handle_free_cb(struct rcu_head *rcu) -{ - struct portals_handle *h; - void *ptr; - - h = container_of(rcu, struct portals_handle, h_rcu); - ptr = (void *)(unsigned long)h->h_cookie; - - if (h->h_ops->hop_free) - h->h_ops->hop_free(ptr, h->h_size); - else - kfree(ptr); -} -EXPORT_SYMBOL(class_handle_free_cb); - -int class_handle_init(void) -{ - struct handle_bucket *bucket; - - LASSERT(!handle_hash); - - handle_hash = kvzalloc(sizeof(*bucket) * HANDLE_HASH_SIZE, - GFP_KERNEL); - if (!handle_hash) - return -ENOMEM; - - spin_lock_init(&handle_base_lock); - for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; - bucket--) { - INIT_LIST_HEAD(&bucket->head); - spin_lock_init(&bucket->lock); - } - - get_random_bytes(&handle_base, sizeof(handle_base)); - LASSERT(handle_base != 0ULL); - - return 0; -} - -static int cleanup_all_handles(void) -{ - int rc; - int i; - - for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { - struct portals_handle *h; - - spin_lock(&handle_hash[i].lock); - list_for_each_entry_rcu(h, &handle_hash[i].head, h_link) { - CERROR("force clean handle %#llx addr %p ops %p\n", - h->h_cookie, h, h->h_ops); - - class_handle_unhash_nolock(h); - rc++; - } - spin_unlock(&handle_hash[i].lock); - } - - return rc; -} - -void class_handle_cleanup(void) -{ - int count; - - LASSERT(handle_hash); - - count = cleanup_all_handles(); - - kvfree(handle_hash); - handle_hash = NULL; - - if (count != 0) - CERROR("handle_count at cleanup: %d\n", count); -} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c 
b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c deleted file mode 100644 index e286a266542316d531443364e157407c2d15c4b6..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c +++ /dev/null @@ -1,214 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include -#include -#include -#include - -#define NIDS_MAX 32 - -struct uuid_nid_data { - struct list_head un_list; - struct obd_uuid un_uuid; - int un_nid_count; - lnet_nid_t un_nids[NIDS_MAX]; -}; - -/* FIXME: This should probably become more elegant than a global linked list */ -static struct list_head g_uuid_list; -static spinlock_t g_uuid_lock; - -void class_init_uuidlist(void) -{ - INIT_LIST_HEAD(&g_uuid_list); - spin_lock_init(&g_uuid_lock); -} - -void class_exit_uuidlist(void) -{ - /* delete all */ - class_del_uuid(NULL); -} - -int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index) -{ - struct uuid_nid_data *data; - struct obd_uuid tmp; - int rc = -ENOENT; - - obd_str2uuid(&tmp, uuid); - spin_lock(&g_uuid_lock); - list_for_each_entry(data, &g_uuid_list, un_list) { - if (obd_uuid_equals(&data->un_uuid, &tmp)) { - if (index >= data->un_nid_count) - break; - - rc = 0; - *peer_nid = data->un_nids[index]; - break; - } - } - spin_unlock(&g_uuid_lock); - return rc; -} -EXPORT_SYMBOL(lustre_uuid_to_peer); - -/* Add a nid to a niduuid. Multiple nids can be added to a single uuid; - * LNET will choose the best one. 
- */ -int class_add_uuid(const char *uuid, __u64 nid) -{ - struct uuid_nid_data *data, *entry; - int found = 0; - - LASSERT(nid != 0); /* valid newconfig NID is never zero */ - - if (strlen(uuid) > UUID_MAX - 1) - return -EOVERFLOW; - - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) - return -ENOMEM; - - obd_str2uuid(&data->un_uuid, uuid); - data->un_nids[0] = nid; - data->un_nid_count = 1; - - spin_lock(&g_uuid_lock); - list_for_each_entry(entry, &g_uuid_list, un_list) { - if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) { - int i; - - found = 1; - for (i = 0; i < entry->un_nid_count; i++) - if (nid == entry->un_nids[i]) - break; - - if (i == entry->un_nid_count) { - LASSERT(entry->un_nid_count < NIDS_MAX); - entry->un_nids[entry->un_nid_count++] = nid; - } - break; - } - } - if (!found) - list_add(&data->un_list, &g_uuid_list); - spin_unlock(&g_uuid_lock); - - if (found) { - CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid, - libcfs_nid2str(nid), entry->un_nid_count); - kfree(data); - } else { - CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid)); - } - return 0; -} - -/* Delete the nids for one uuid if specified, otherwise delete all */ -int class_del_uuid(const char *uuid) -{ - LIST_HEAD(deathrow); - struct uuid_nid_data *data; - struct uuid_nid_data *temp; - - spin_lock(&g_uuid_lock); - if (uuid) { - struct obd_uuid tmp; - - obd_str2uuid(&tmp, uuid); - list_for_each_entry(data, &g_uuid_list, un_list) { - if (obd_uuid_equals(&data->un_uuid, &tmp)) { - list_move(&data->un_list, &deathrow); - break; - } - } - } else { - list_splice_init(&g_uuid_list, &deathrow); - } - spin_unlock(&g_uuid_lock); - - if (uuid && list_empty(&deathrow)) { - CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid); - return -EINVAL; - } - - list_for_each_entry_safe(data, temp, &deathrow, un_list) { - list_del(&data->un_list); - - CDEBUG(D_INFO, "del uuid %s %s/%d\n", - obd_uuid2str(&data->un_uuid), - libcfs_nid2str(data->un_nids[0]), - data->un_nid_count); - - kfree(data); - } - - return 0; -} - -/* check if @nid exists in nid list of @uuid */ -int class_check_uuid(struct obd_uuid *uuid, __u64 nid) -{ - struct uuid_nid_data *entry; - int found = 0; - - CDEBUG(D_INFO, "check if uuid %s has %s.\n", - obd_uuid2str(uuid), libcfs_nid2str(nid)); - - spin_lock(&g_uuid_lock); - list_for_each_entry(entry, &g_uuid_list, un_list) { - int i; - - if (!obd_uuid_equals(&entry->un_uuid, uuid)) - continue; - - /* found the uuid, check if it has @nid */ - for (i = 0; i < entry->un_nid_count; i++) { - if (entry->un_nids[i] == nid) { - found = 1; - break; - } - } - break; - } - spin_unlock(&g_uuid_lock); - return found; -} -EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c deleted file mode 100644 index ffc1814398a5e68104c95ece9a19086d9792cd4f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/obd_config.c +++ /dev/null @@ -1,1538 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
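class_add_uuid() above allocates its candidate entry before taking g_uuid_lock and frees it again if the uuid turns out to be listed already, so no allocation ever happens under the spinlock. The same shape in plain, runnable C with a pthread mutex standing in for the spinlock (illustrative only):

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct entry {
        struct entry *next;
        char key[40];
};

static struct entry *head;
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Add key if absent: allocate before taking the lock, discard on a hit. */
static int add_key(const char *key)
{
        struct entry *e, *fresh;
        int found = 0;

        fresh = calloc(1, sizeof(*fresh));
        if (!fresh)
                return -1;
        strncpy(fresh->key, key, sizeof(fresh->key) - 1);

        pthread_mutex_lock(&list_lock);
        for (e = head; e; e = e->next)
                if (strcmp(e->key, key) == 0) {
                        found = 1;
                        break;
                }
        if (!found) {
                fresh->next = head;
                head = fresh;
        }
        pthread_mutex_unlock(&list_lock);

        if (found)
                free(fresh);    /* key already known; drop the candidate */
        return 0;
}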
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/obd_config.c - * - * Config API - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include - -#include -#include -#include -#include -#include -#include - -#include "llog_internal.h" - -/* - * uuid<->export lustre hash operations - */ -/* - * NOTE: It is impossible to find an export that is in failed - * state with this function - */ -static int -uuid_keycmp(struct rhashtable_compare_arg *arg, const void *obj) -{ - const struct obd_uuid *uuid = arg->key; - const struct obd_export *exp = obj; - - if (obd_uuid_equals(uuid, &exp->exp_client_uuid) && - !exp->exp_failed) - return 0; - return -ESRCH; -} - -static void -uuid_export_exit(void *vexport, void *data) -{ - struct obd_export *exp = vexport; - - class_export_put(exp); -} - -static const struct rhashtable_params uuid_hash_params = { - .key_len = sizeof(struct obd_uuid), - .key_offset = offsetof(struct obd_export, exp_client_uuid), - .head_offset = offsetof(struct obd_export, exp_uuid_hash), - .obj_cmpfn = uuid_keycmp, - .automatic_shrinking = true, -}; - -int obd_uuid_add(struct obd_device *obd, struct obd_export *export) -{ - int rc; - - rc = rhashtable_lookup_insert_fast(&obd->obd_uuid_hash, - &export->exp_uuid_hash, - uuid_hash_params); - if (rc == 0) - class_export_get(export); - else if (rc == -EEXIST) - rc = -EALREADY; - else - /* map obscure error codes to -ENOMEM */ - rc = -ENOMEM; - return rc; -} - -void obd_uuid_del(struct obd_device *obd, struct obd_export *export) -{ - int rc; - - rc = rhashtable_remove_fast(&obd->obd_uuid_hash, - &export->exp_uuid_hash, - uuid_hash_params); - - if (rc == 0) - class_export_put(export); -} - -/*********** string parsing utils *********/ - -/* returns 0 if we find this key in the buffer, else 1 */ -int class_find_param(char *buf, char *key, char **valp) -{ - char *ptr; - - if (!buf) - return 1; - - ptr = strstr(buf, key); - if (!ptr) - return 1; - - if (valp) - *valp = ptr + strlen(key); - - return 0; -} -EXPORT_SYMBOL(class_find_param); - -/* returns 0 if this is the first key in the buffer, else 1. - * valp points to first char after key. 
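The uuid-to-export table above rides on the kernel's resizable hashtable (rhashtable); stripped of the export specifics and the refcounting, the wiring reduces to roughly this sketch (struct demo_obj and the demo_* helpers are invented for illustration):

#include <linux/rhashtable.h>
#include <linux/slab.h>

struct demo_obj {
        u32 key;
        struct rhash_head node;
};

static const struct rhashtable_params demo_params = {
        .key_len             = sizeof(u32),
        .key_offset          = offsetof(struct demo_obj, key),
        .head_offset         = offsetof(struct demo_obj, node),
        .automatic_shrinking = true,
};

static struct rhashtable demo_ht;

static int demo_table_init(void)
{
        return rhashtable_init(&demo_ht, &demo_params);
}

static int demo_add(struct demo_obj *obj)
{
        /* -EEXIST if an object with the same key is already present. */
        return rhashtable_lookup_insert_fast(&demo_ht, &obj->node,
                                             demo_params);
}

static struct demo_obj *demo_find(u32 key)
{
        return rhashtable_lookup_fast(&demo_ht, &key, demo_params);
}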
- */ -static int class_match_param(char *buf, const char *key, char **valp) -{ - if (!buf) - return 1; - - if (memcmp(buf, key, strlen(key)) != 0) - return 1; - - if (valp) - *valp = buf + strlen(key); - - return 0; -} - -static int parse_nid(char *buf, void *value, int quiet) -{ - lnet_nid_t *nid = value; - - *nid = libcfs_str2nid(buf); - if (*nid != LNET_NID_ANY) - return 0; - - if (!quiet) - LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); - return -EINVAL; -} - -static int parse_net(char *buf, void *value) -{ - __u32 *net = value; - - *net = libcfs_str2net(buf); - CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); - return 0; -} - -enum { - CLASS_PARSE_NID = 1, - CLASS_PARSE_NET, -}; - -/* 0 is good nid, - * 1 not found - * < 0 error - * endh is set to next separator - */ -static int class_parse_value(char *buf, int opc, void *value, char **endh, - int quiet) -{ - char *endp; - char tmp; - int rc = 0; - - if (!buf) - return 1; - while (*buf == ',' || *buf == ':') - buf++; - if (*buf == ' ' || *buf == '/' || *buf == '\0') - return 1; - - /* nid separators or end of nids */ - endp = strpbrk(buf, ",: /"); - if (!endp) - endp = buf + strlen(buf); - - tmp = *endp; - *endp = '\0'; - switch (opc) { - default: - LBUG(); - case CLASS_PARSE_NID: - rc = parse_nid(buf, value, quiet); - break; - case CLASS_PARSE_NET: - rc = parse_net(buf, value); - break; - } - *endp = tmp; - if (rc != 0) - return rc; - if (endh) - *endh = endp; - return 0; -} - -int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) -{ - return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); -} -EXPORT_SYMBOL(class_parse_nid); - -int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) -{ - return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); -} -EXPORT_SYMBOL(class_parse_nid_quiet); - -char *lustre_cfg_string(struct lustre_cfg *lcfg, u32 index) -{ - char *s; - - if (!lcfg->lcfg_buflens[index]) - return NULL; - - s = lustre_cfg_buf(lcfg, index); - if (!s) - return NULL; - - /* - * make sure it's NULL terminated, even if this kills a char - * of data. Try to use the padding first though. - */ - if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { - size_t last = ALIGN(lcfg->lcfg_buflens[index], 8) - 1; - char lost; - - /* Use the smaller value */ - if (last > lcfg->lcfg_buflens[index]) - last = lcfg->lcfg_buflens[index]; - - lost = s[last]; - s[last] = '\0'; - if (lost != '\0') { - CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", - index, s, lost); - } - } - return s; -} -EXPORT_SYMBOL(lustre_cfg_string); - -/********************** class fns **********************/ - -/** - * Create a new obd device and set the type, name and uuid. If successful, - * the new device can be accessed by either name or uuid. 
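class_parse_value() above tokenizes nid lists by skipping ',' and ':' separators, cutting the token at the next of ',', ':', ' ' or '/' with strpbrk(), parsing it, and restoring the clobbered byte. A runnable userspace rendition of the same walk (the sample mount string is illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[] = "192.168.0.1@tcp,192.168.0.2@tcp:10.0.0.1@tcp:/lustre";
        char *p = line;

        while (*p) {
                char *end, saved;

                while (*p == ',' || *p == ':')
                        p++;
                if (*p == ' ' || *p == '/' || *p == '\0')
                        break;          /* end of the nid list */

                end = strpbrk(p, ",: /");
                if (!end)
                        end = p + strlen(p);

                saved = *end;
                *end = '\0';
                printf("token: %s\n", p);       /* parse_nid() would run here */
                *end = saved;
                p = end;
        }
        return 0;
}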
- */ -static int class_attach(struct lustre_cfg *lcfg) -{ - struct obd_device *obd = NULL; - char *typename, *name, *uuid; - int rc, len; - - if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { - CERROR("No type passed!\n"); - return -EINVAL; - } - typename = lustre_cfg_string(lcfg, 1); - - if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { - CERROR("No name passed!\n"); - return -EINVAL; - } - name = lustre_cfg_string(lcfg, 0); - - if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { - CERROR("No UUID passed!\n"); - return -EINVAL; - } - uuid = lustre_cfg_string(lcfg, 2); - - CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", - typename, name, uuid); - - obd = class_newdev(typename, name); - if (IS_ERR(obd)) { - /* Already exists or out of obds */ - rc = PTR_ERR(obd); - obd = NULL; - CERROR("Cannot create device %s of type %s : %d\n", - name, typename, rc); - goto out; - } - LASSERTF(obd, "Cannot get obd device %s of type %s\n", - name, typename); - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, - "obd %p obd_magic %08X != %08X\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, - "%p obd_name %s != %s\n", obd, obd->obd_name, name); - - rwlock_init(&obd->obd_pool_lock); - obd->obd_pool_limit = 0; - obd->obd_pool_slv = 0; - - INIT_LIST_HEAD(&obd->obd_exports); - INIT_LIST_HEAD(&obd->obd_unlinked_exports); - INIT_LIST_HEAD(&obd->obd_delayed_exports); - spin_lock_init(&obd->obd_nid_lock); - spin_lock_init(&obd->obd_dev_lock); - mutex_init(&obd->obd_dev_mutex); - spin_lock_init(&obd->obd_osfs_lock); - /* obd->obd_osfs_age must be set to a value in the distant - * past to guarantee a fresh statfs is fetched on mount. - */ - obd->obd_osfs_age = get_jiffies_64() - 1000 * HZ; - - /* XXX belongs in setup not attach */ - init_rwsem(&obd->obd_observer_link_sem); - /* recovery data */ - init_waitqueue_head(&obd->obd_evict_inprogress_waitq); - - llog_group_init(&obd->obd_olg); - - obd->obd_conn_inprogress = 0; - - len = strlen(uuid); - if (len >= sizeof(obd->obd_uuid)) { - CERROR("uuid must be < %d bytes long\n", - (int)sizeof(obd->obd_uuid)); - rc = -EINVAL; - goto out; - } - memcpy(obd->obd_uuid.uuid, uuid, len); - - /* Detach drops this */ - spin_lock(&obd->obd_dev_lock); - atomic_set(&obd->obd_refcount, 1); - spin_unlock(&obd->obd_dev_lock); - lu_ref_init(&obd->obd_reference); - lu_ref_add(&obd->obd_reference, "attach", obd); - - obd->obd_attached = 1; - CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", - obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); - return 0; - out: - if (obd) - class_release_dev(obd); - - return rc; -} - -/** Create hashes, self-export, and call type-specific setup. - * Setup is effectively the "start this obd" call. - */ -static int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - int err = 0; - struct obd_export *exp; - - LASSERT(obd); - LASSERTF(obd == class_num2obd(obd->obd_minor), - "obd %p != obd_devs[%d] %p\n", - obd, obd->obd_minor, class_num2obd(obd->obd_minor)); - LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, - "obd %p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); - - /* have we attached a type to this device? */ - if (!obd->obd_attached) { - CERROR("Device %d not attached\n", obd->obd_minor); - return -ENODEV; - } - - if (obd->obd_set_up) { - CERROR("Device %d already setup (type %s)\n", - obd->obd_minor, obd->obd_type->typ_name); - return -EEXIST; - } - - /* is someone else setting us up right now? 
(attach inits spinlock) */ - spin_lock(&obd->obd_dev_lock); - if (obd->obd_starting) { - spin_unlock(&obd->obd_dev_lock); - CERROR("Device %d setup in progress (type %s)\n", - obd->obd_minor, obd->obd_type->typ_name); - return -EEXIST; - } - /* just leave this on forever. I can't use obd_set_up here because - * other fns check that status, and we're not actually set up yet. - */ - obd->obd_starting = 1; - spin_unlock(&obd->obd_dev_lock); - - /* create an uuid-export lustre hash */ - err = rhashtable_init(&obd->obd_uuid_hash, &uuid_hash_params); - - if (err) - goto err_hash; - - exp = class_new_export(obd, &obd->obd_uuid); - if (IS_ERR(exp)) { - err = PTR_ERR(exp); - goto err_new; - } - - obd->obd_self_export = exp; - class_export_put(exp); - - err = obd_setup(obd, lcfg); - if (err) - goto err_exp; - - obd->obd_set_up = 1; - - spin_lock(&obd->obd_dev_lock); - /* cleanup drops this */ - class_incref(obd, "setup", obd); - spin_unlock(&obd->obd_dev_lock); - - CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", - obd->obd_name, obd->obd_uuid.uuid); - - return 0; -err_exp: - if (obd->obd_self_export) { - class_unlink_export(obd->obd_self_export); - obd->obd_self_export = NULL; - } -err_new: - rhashtable_destroy(&obd->obd_uuid_hash); -err_hash: - obd->obd_starting = 0; - CERROR("setup %s failed (%d)\n", obd->obd_name, err); - return err; -} - -/** We have finished using this obd and are ready to destroy it. - * There can be no more references to this obd. - */ -static int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - if (obd->obd_set_up) { - CERROR("OBD device %d still set up\n", obd->obd_minor); - return -EBUSY; - } - - spin_lock(&obd->obd_dev_lock); - if (!obd->obd_attached) { - spin_unlock(&obd->obd_dev_lock); - CERROR("OBD device %d not attached\n", obd->obd_minor); - return -ENODEV; - } - obd->obd_attached = 0; - spin_unlock(&obd->obd_dev_lock); - - CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", - obd->obd_name, obd->obd_uuid.uuid); - - class_decref(obd, "attach", obd); - return 0; -} - -/** Start shutting down the obd. There may be in-progress ops when - * this is called. We tell them to start shutting down with a call - * to class_disconnect_exports(). - */ -static int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - int err = 0; - char *flag; - - OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); - - if (!obd->obd_set_up) { - CERROR("Device %d not setup\n", obd->obd_minor); - return -ENODEV; - } - - spin_lock(&obd->obd_dev_lock); - if (obd->obd_stopping) { - spin_unlock(&obd->obd_dev_lock); - CERROR("OBD %d already stopping\n", obd->obd_minor); - return -ENODEV; - } - /* Leave this on forever */ - obd->obd_stopping = 1; - spin_unlock(&obd->obd_dev_lock); - - while (obd->obd_conn_inprogress > 0) - cond_resched(); - smp_rmb(); - - if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { - for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) - switch (*flag) { - case 'F': - obd->obd_force = 1; - break; - case 'A': - LCONSOLE_WARN("Failing over %s\n", - obd->obd_name); - obd->obd_fail = 1; - obd->obd_no_transno = 1; - obd->obd_no_recov = 1; - if (OBP(obd, iocontrol)) { - obd_iocontrol(OBD_IOC_SYNC, - obd->obd_self_export, - 0, NULL, NULL); - } - break; - default: - CERROR("Unrecognised flag '%c'\n", *flag); - } - } - - LASSERT(obd->obd_self_export); - - /* Precleanup, we must make sure all exports get destroyed. 
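class_setup() above is a textbook goto-unwind: each stage that can fail jumps to a label that tears down exactly the stages already completed, in reverse order (obd_setup failure unlinks the self-export, which in turn leads to destroying the uuid hash). The pattern in miniature, runnable in userspace (step_*/undo_* are placeholders):

#include <stdio.h>

static int step_one(void)   { puts("one up");   return 0; }
static int step_two(void)   { puts("two up");   return 0; }
static int step_three(void) { puts("three up"); return -1; } /* force unwind */
static void undo_two(void)  { puts("two down"); }
static void undo_one(void)  { puts("one down"); }

static int demo_setup(void)
{
        int err;

        err = step_one();               /* e.g. hash table creation */
        if (err)
                goto out;
        err = step_two();               /* e.g. self-export creation */
        if (err)
                goto err_one;
        err = step_three();             /* e.g. type-specific setup */
        if (err)
                goto err_two;
        return 0;

err_two:
        undo_two();     /* torn down in strict reverse order */
err_one:
        undo_one();
out:
        return err;
}

int main(void)
{
        return demo_setup() ? 1 : 0;
}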
*/ - err = obd_precleanup(obd); - if (err) - CERROR("Precleanup %s returned %d\n", - obd->obd_name, err); - - /* destroy an uuid-export hash body */ - rhashtable_free_and_destroy(&obd->obd_uuid_hash, uuid_export_exit, NULL); - - class_decref(obd, "setup", obd); - obd->obd_set_up = 0; - - return 0; -} - -struct obd_device *class_incref(struct obd_device *obd, - const char *scope, const void *source) -{ - lu_ref_add_atomic(&obd->obd_reference, scope, source); - atomic_inc(&obd->obd_refcount); - CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, - atomic_read(&obd->obd_refcount)); - - return obd; -} -EXPORT_SYMBOL(class_incref); - -void class_decref(struct obd_device *obd, const char *scope, const void *source) -{ - int err; - int refs; - - spin_lock(&obd->obd_dev_lock); - atomic_dec(&obd->obd_refcount); - refs = atomic_read(&obd->obd_refcount); - spin_unlock(&obd->obd_dev_lock); - lu_ref_del(&obd->obd_reference, scope, source); - - CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); - - if ((refs == 1) && obd->obd_stopping) { - /* All exports have been destroyed; there should - * be no more in-progress ops by this point. - */ - - spin_lock(&obd->obd_self_export->exp_lock); - obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); - spin_unlock(&obd->obd_self_export->exp_lock); - - /* note that we'll recurse into class_decref again */ - class_unlink_export(obd->obd_self_export); - return; - } - - if (refs == 0) { - CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", - obd->obd_name, obd->obd_uuid.uuid); - LASSERT(!obd->obd_attached); - if (obd->obd_stopping) { - /* If we're not stopping, we were never set up */ - err = obd_cleanup(obd); - if (err) - CERROR("Cleanup %s returned %d\n", - obd->obd_name, err); - } - class_release_dev(obd); - } -} -EXPORT_SYMBOL(class_decref); - -/** Add a failover nid location. - * Client obd types contact server obd types using this nid list. - */ -static int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct obd_import *imp; - struct obd_uuid uuid; - int rc; - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || - LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { - CERROR("invalid conn_uuid\n"); - return -EINVAL; - } - if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && - strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && - strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && - strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && - strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { - CERROR("can't add connection on non-client dev\n"); - return -EINVAL; - } - - imp = obd->u.cli.cl_import; - if (!imp) { - CERROR("try to add conn on immature client dev\n"); - return -EINVAL; - } - - obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); - rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); - - return rc; -} - -/** Remove a failover nid location. 
- */ -static int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct obd_import *imp; - struct obd_uuid uuid; - int rc; - - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || - LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { - CERROR("invalid conn_uuid\n"); - return -EINVAL; - } - if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && - strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { - CERROR("can't del connection on non-client dev\n"); - return -EINVAL; - } - - imp = obd->u.cli.cl_import; - if (!imp) { - CERROR("try to del conn on immature client dev\n"); - return -EINVAL; - } - - obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); - rc = obd_del_conn(imp, &uuid); - - return rc; -} - -static LIST_HEAD(lustre_profile_list); -static DEFINE_SPINLOCK(lustre_profile_list_lock); - -struct lustre_profile *class_get_profile(const char *prof) -{ - struct lustre_profile *lprof; - - spin_lock(&lustre_profile_list_lock); - list_for_each_entry(lprof, &lustre_profile_list, lp_list) { - if (!strcmp(lprof->lp_profile, prof)) { - lprof->lp_refs++; - spin_unlock(&lustre_profile_list_lock); - return lprof; - } - } - spin_unlock(&lustre_profile_list_lock); - return NULL; -} -EXPORT_SYMBOL(class_get_profile); - -/** Create a named "profile". - * This defines the mdc and osc names to use for a client. - * This also is used to define the lov to be used by a mdt. - */ -static int class_add_profile(int proflen, char *prof, int osclen, char *osc, - int mdclen, char *mdc) -{ - struct lustre_profile *lprof; - int err = 0; - - CDEBUG(D_CONFIG, "Add profile %s\n", prof); - - lprof = kzalloc(sizeof(*lprof), GFP_NOFS); - if (!lprof) - return -ENOMEM; - INIT_LIST_HEAD(&lprof->lp_list); - - LASSERT(proflen == (strlen(prof) + 1)); - lprof->lp_profile = kmemdup(prof, proflen, GFP_NOFS); - if (!lprof->lp_profile) { - err = -ENOMEM; - goto free_lprof; - } - - LASSERT(osclen == (strlen(osc) + 1)); - lprof->lp_dt = kmemdup(osc, osclen, GFP_NOFS); - if (!lprof->lp_dt) { - err = -ENOMEM; - goto free_lp_profile; - } - - if (mdclen > 0) { - LASSERT(mdclen == (strlen(mdc) + 1)); - lprof->lp_md = kmemdup(mdc, mdclen, GFP_NOFS); - if (!lprof->lp_md) { - err = -ENOMEM; - goto free_lp_dt; - } - } - - spin_lock(&lustre_profile_list_lock); - lprof->lp_refs = 1; - lprof->lp_list_deleted = false; - list_add(&lprof->lp_list, &lustre_profile_list); - spin_unlock(&lustre_profile_list_lock); - return err; - -free_lp_dt: - kfree(lprof->lp_dt); -free_lp_profile: - kfree(lprof->lp_profile); -free_lprof: - kfree(lprof); - return err; -} - -void class_del_profile(const char *prof) -{ - struct lustre_profile *lprof; - - CDEBUG(D_CONFIG, "Del profile %s\n", prof); - - lprof = class_get_profile(prof); - if (lprof) { - spin_lock(&lustre_profile_list_lock); - /* because get profile increments the ref counter */ - lprof->lp_refs--; - list_del(&lprof->lp_list); - lprof->lp_list_deleted = true; - spin_unlock(&lustre_profile_list_lock); - - class_put_profile(lprof); - } -} -EXPORT_SYMBOL(class_del_profile); - -void class_put_profile(struct lustre_profile *lprof) -{ - spin_lock(&lustre_profile_list_lock); - if (--lprof->lp_refs > 0) { - LASSERT(lprof->lp_refs > 0); - spin_unlock(&lustre_profile_list_lock); - return; - } - spin_unlock(&lustre_profile_list_lock); - - /* confirm not a negative number */ - LASSERT(!lprof->lp_refs); - - /* - * At least one class_del_profile/profiles must be called - * on the target profile or lustre_profile_list will corrupt - */ - LASSERT(lprof->lp_list_deleted); - kfree(lprof->lp_profile); - 
kfree(lprof->lp_dt); - kfree(lprof->lp_md); - kfree(lprof); -} -EXPORT_SYMBOL(class_put_profile); - -/* COMPAT_146 */ -void class_del_profiles(void) -{ - struct lustre_profile *lprof, *n; - - spin_lock(&lustre_profile_list_lock); - list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { - list_del(&lprof->lp_list); - lprof->lp_list_deleted = true; - spin_unlock(&lustre_profile_list_lock); - - class_put_profile(lprof); - - spin_lock(&lustre_profile_list_lock); - } - spin_unlock(&lustre_profile_list_lock); -} -EXPORT_SYMBOL(class_del_profiles); - -static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) -{ - if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) - at_min = val; - else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) - at_max = val; - else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) - at_extra = val; - else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) - at_early_margin = val; - else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) - at_history = val; - else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) - strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), - JOBSTATS_JOBID_VAR_MAX_LEN + 1); - else - return -EINVAL; - - CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); - return 0; -} - -/* We can't call ll_process_config or lquota_process_config directly because - * it lives in a module that must be loaded after this one. - */ -static int (*client_process_config)(struct lustre_cfg *lcfg); -static int (*quota_process_config)(struct lustre_cfg *lcfg); - -void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) -{ - client_process_config = cpc; -} -EXPORT_SYMBOL(lustre_register_client_process_config); - -static int process_param2_config(struct lustre_cfg *lcfg) -{ - char *param = lustre_cfg_string(lcfg, 1); - char *upcall = lustre_cfg_string(lcfg, 2); - char *argv[] = { - [0] = "/usr/sbin/lctl", - [1] = "set_param", - [2] = param, - [3] = NULL - }; - ktime_t start; - ktime_t end; - int rc; - - /* Add upcall processing here. Now only lctl is supported */ - if (strcmp(upcall, LCTL_UPCALL) != 0) { - CERROR("Unsupported upcall %s\n", upcall); - return -EINVAL; - } - - start = ktime_get(); - rc = call_usermodehelper(argv[0], argv, NULL, UMH_WAIT_PROC); - end = ktime_get(); - - if (rc < 0) { - CERROR( - "lctl: error invoking upcall %s %s %s: rc = %d; time %ldus\n", - argv[0], argv[1], argv[2], rc, - (long)ktime_us_delta(end, start)); - } else { - CDEBUG(D_HA, "lctl: invoked upcall %s %s %s, time %ldus\n", - argv[0], argv[1], argv[2], - (long)ktime_us_delta(end, start)); - rc = 0; - } - - return rc; -} - -/** Process configuration commands given in lustre_cfg form. - * These may come from direct calls (e.g. class_manual_cleanup) - * or processing the config llog, or ioctl from lctl. 
- */ -int class_process_config(struct lustre_cfg *lcfg) -{ - struct obd_device *obd; - int err; - - LASSERT(lcfg && !IS_ERR(lcfg)); - CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); - - /* Commands that don't need a device */ - switch (lcfg->lcfg_command) { - case LCFG_ATTACH: { - err = class_attach(lcfg); - goto out; - } - case LCFG_ADD_UUID: { - CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid %#llx (%s)\n", - lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid, - libcfs_nid2str(lcfg->lcfg_nid)); - - err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); - goto out; - } - case LCFG_DEL_UUID: { - CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", - (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) - ? "" : lustre_cfg_string(lcfg, 1)); - - err = class_del_uuid(lustre_cfg_string(lcfg, 1)); - goto out; - } - case LCFG_MOUNTOPT: { - CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", - lustre_cfg_string(lcfg, 1), - lustre_cfg_string(lcfg, 2), - lustre_cfg_string(lcfg, 3)); - /* set these mount options somewhere, so ll_fill_super - * can find them. - */ - err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), - lustre_cfg_string(lcfg, 1), - LUSTRE_CFG_BUFLEN(lcfg, 2), - lustre_cfg_string(lcfg, 2), - LUSTRE_CFG_BUFLEN(lcfg, 3), - lustre_cfg_string(lcfg, 3)); - goto out; - } - case LCFG_DEL_MOUNTOPT: { - CDEBUG(D_IOCTL, "mountopt: profile %s\n", - lustre_cfg_string(lcfg, 1)); - class_del_profile(lustre_cfg_string(lcfg, 1)); - err = 0; - goto out; - } - case LCFG_SET_TIMEOUT: { - CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", - obd_timeout, lcfg->lcfg_num); - obd_timeout = max(lcfg->lcfg_num, 1U); - obd_timeout_set = 1; - err = 0; - goto out; - } - case LCFG_SET_LDLM_TIMEOUT: { - /* ldlm_timeout is not used on the client */ - err = 0; - goto out; - } - case LCFG_SET_UPCALL: { - LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); - /* COMPAT_146 Don't fail on old configs */ - err = 0; - goto out; - } - case LCFG_MARKER: { - struct cfg_marker *marker; - - marker = lustre_cfg_buf(lcfg, 1); - CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, - marker->cm_flags, marker->cm_tgtname, marker->cm_comment); - err = 0; - goto out; - } - case LCFG_PARAM: { - char *tmp; - /* llite has no obd */ - if ((class_match_param(lustre_cfg_string(lcfg, 1), - PARAM_LLITE, NULL) == 0) && - client_process_config) { - err = (*client_process_config)(lcfg); - goto out; - } else if ((class_match_param(lustre_cfg_string(lcfg, 1), - PARAM_SYS, &tmp) == 0)) { - /* Global param settings */ - err = class_set_global(tmp, lcfg->lcfg_num, lcfg); - /* - * Client or server should not fail to mount if - * it hits an unknown configuration parameter. 
- */ - if (err != 0) - CWARN("Ignoring unknown param %s\n", tmp); - - err = 0; - goto out; - } else if ((class_match_param(lustre_cfg_string(lcfg, 1), - PARAM_QUOTA, &tmp) == 0) && - quota_process_config) { - err = (*quota_process_config)(lcfg); - goto out; - } - - break; - } - case LCFG_SET_PARAM: { - err = process_param2_config(lcfg); - goto out; - } - } - /* Commands that require a device */ - obd = class_name2obd(lustre_cfg_string(lcfg, 0)); - if (!obd) { - if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) - CERROR("this lcfg command requires a device name\n"); - else - CERROR("no device for: %s\n", - lustre_cfg_string(lcfg, 0)); - - err = -EINVAL; - goto out; - } - - switch (lcfg->lcfg_command) { - case LCFG_SETUP: { - err = class_setup(obd, lcfg); - goto out; - } - case LCFG_DETACH: { - err = class_detach(obd, lcfg); - err = 0; - goto out; - } - case LCFG_CLEANUP: { - err = class_cleanup(obd, lcfg); - err = 0; - goto out; - } - case LCFG_ADD_CONN: { - err = class_add_conn(obd, lcfg); - err = 0; - goto out; - } - case LCFG_DEL_CONN: { - err = class_del_conn(obd, lcfg); - err = 0; - goto out; - } - case LCFG_POOL_NEW: { - err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); - err = 0; - goto out; - } - case LCFG_POOL_ADD: { - err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), - lustre_cfg_string(lcfg, 3)); - err = 0; - goto out; - } - case LCFG_POOL_REM: { - err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), - lustre_cfg_string(lcfg, 3)); - err = 0; - goto out; - } - case LCFG_POOL_DEL: { - err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); - err = 0; - goto out; - } - default: { - err = obd_process_config(obd, sizeof(*lcfg), lcfg); - goto out; - } - } -out: - if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { - CWARN("Ignoring error %d on optional command %#x\n", err, - lcfg->lcfg_command); - err = 0; - } - return err; -} -EXPORT_SYMBOL(class_process_config); - -int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, - struct lustre_cfg *lcfg, void *data) -{ - struct lprocfs_vars *var; - struct file fakefile; - struct seq_file fake_seqfile; - char *key, *sval; - int i, keylen, vallen; - int matched = 0, j = 0; - int rc = 0; - int skip = 0; - - if (lcfg->lcfg_command != LCFG_PARAM) { - CERROR("Unknown command: %d\n", lcfg->lcfg_command); - return -EINVAL; - } - - /* fake a seq file so that var->fops->write can work... */ - fakefile.private_data = &fake_seqfile; - fake_seqfile.private = data; - /* e.g. 
tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt - * or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar - * or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 - */ - for (i = 1; i < lcfg->lcfg_bufcount; i++) { - key = lustre_cfg_buf(lcfg, i); - /* Strip off prefix */ - if (class_match_param(key, prefix, &key)) { - /* - * If the prefix doesn't match, return error so we - * can pass it down the stack - */ - return -ENOSYS; - } - sval = strchr(key, '='); - if (!sval || (*(sval + 1) == 0)) { - CERROR("Can't parse param %s (missing '=')\n", key); - /* rc = -EINVAL; continue parsing other params */ - continue; - } - keylen = sval - key; - sval++; - vallen = strlen(sval); - matched = 0; - j = 0; - /* Search proc entries */ - while (lvars[j].name) { - var = &lvars[j]; - if (!class_match_param(key, var->name, NULL) && - keylen == strlen(var->name)) { - matched++; - rc = -EROFS; - if (var->fops && var->fops->write) { - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - rc = var->fops->write(&fakefile, - (const char __user *)sval, - vallen, NULL); - set_fs(oldfs); - } - break; - } - j++; - } - if (!matched) { - CERROR("%.*s: %s unknown param %s\n", - (int)strlen(prefix) - 1, prefix, - (char *)lustre_cfg_string(lcfg, 0), key); - /* rc = -EINVAL; continue parsing other params */ - skip++; - } else if (rc < 0) { - CERROR("%s: error writing proc entry '%s': rc = %d\n", - prefix, var->name, rc); - rc = 0; - } else { - CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n", - lustre_cfg_string(lcfg, 0), - (int)strlen(prefix) - 1, prefix, - (int)(sval - key - 1), key, sval); - } - } - - if (rc > 0) - rc = 0; - if (!rc && skip) - rc = skip; - return rc; -} -EXPORT_SYMBOL(class_process_proc_param); - -/** Parse a configuration llog, doing various manipulations on them - * for various reasons, (modifications for compatibility, skip obsolete - * records, change uuids, etc), then class_process_config() resulting - * net records. 
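The fake file/seq_file plus set_fs(KERNEL_DS) dance in class_process_proc_param() above was the old idiom for feeding a kernel buffer to a ->write() handler that expects a __user pointer; modern kernels have removed set_fs() entirely. The idiom in isolation (a sketch of the pre-5.10 pattern only, not production code; the NULL ppos mirrors the var->fops->write() call above):

#include <linux/fs.h>
#include <linux/uaccess.h>

/* Widen the address limit so a handler written for __user buffers will
 * accept a kernel pointer, then restore it.
 */
static ssize_t demo_kernel_write(struct file *file, const char *kbuf,
                                 size_t len)
{
        mm_segment_t oldfs = get_fs();
        ssize_t rc;

        set_fs(KERNEL_DS);
        rc = file->f_op->write(file, (const char __user *)kbuf, len, NULL);
        set_fs(oldfs);

        return rc;
}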
- */ -int class_config_llog_handler(const struct lu_env *env, - struct llog_handle *handle, - struct llog_rec_hdr *rec, void *data) -{ - struct config_llog_instance *clli = data; - int cfg_len = rec->lrh_len; - char *cfg_buf = (char *)(rec + 1); - int rc = 0; - - switch (rec->lrh_type) { - case OBD_CFG_REC: { - struct lustre_cfg *lcfg, *lcfg_new; - struct lustre_cfg_bufs bufs; - char *inst_name = NULL; - int inst_len = 0; - size_t lcfg_len; - int swab = 0; - - lcfg = (struct lustre_cfg *)cfg_buf; - if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { - lustre_swab_lustre_cfg(lcfg); - swab = 1; - } - - rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); - if (rc) - goto out; - - /* Figure out config state info */ - if (lcfg->lcfg_command == LCFG_MARKER) { - struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); - - lustre_swab_cfg_marker(marker, swab, - LUSTRE_CFG_BUFLEN(lcfg, 1)); - CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", - clli->cfg_flags, marker->cm_flags); - if (marker->cm_flags & CM_START) { - /* all previous flags off */ - clli->cfg_flags = CFG_F_MARKER; - if (marker->cm_flags & CM_SKIP) { - clli->cfg_flags |= CFG_F_SKIP; - CDEBUG(D_CONFIG, "SKIP #%d\n", - marker->cm_step); - } else if ((marker->cm_flags & CM_EXCLUDE) || - (clli->cfg_sb && - lustre_check_exclusion(clli->cfg_sb, - marker->cm_tgtname))) { - clli->cfg_flags |= CFG_F_EXCLUDE; - CDEBUG(D_CONFIG, "EXCLUDE %d\n", - marker->cm_step); - } - } else if (marker->cm_flags & CM_END) { - clli->cfg_flags = 0; - } - } - /* A config command without a start marker before it is - * illegal (post 146) - */ - if (!(clli->cfg_flags & CFG_F_COMPAT146) && - !(clli->cfg_flags & CFG_F_MARKER) && - (lcfg->lcfg_command != LCFG_MARKER)) { - CWARN("Config not inside markers, ignoring! (inst: %p, uuid: %s, flags: %#x)\n", - clli->cfg_instance, - clli->cfg_uuid.uuid, clli->cfg_flags); - clli->cfg_flags |= CFG_F_SKIP; - } - if (clli->cfg_flags & CFG_F_SKIP) { - CDEBUG(D_CONFIG, "skipping %#x\n", - clli->cfg_flags); - rc = 0; - /* No processing! */ - break; - } - - /* - * For interoperability between 1.8 and 2.0, - * rename "mds" obd device type to "mdt". 
- */ - { - char *typename = lustre_cfg_string(lcfg, 1); - char *index = lustre_cfg_string(lcfg, 2); - - if ((lcfg->lcfg_command == LCFG_ATTACH && typename && - strcmp(typename, "mds") == 0)) { - CWARN("For 1.8 interoperability, rename obd type from mds to mdt\n"); - typename[2] = 't'; - } - if ((lcfg->lcfg_command == LCFG_SETUP && index && - strcmp(index, "type") == 0)) { - CDEBUG(D_INFO, "For 1.8 interoperability, set this index to '0'\n"); - index[0] = '0'; - index[1] = 0; - } - } - - if (clli->cfg_flags & CFG_F_EXCLUDE) { - CDEBUG(D_CONFIG, "cmd: %x marked EXCLUDED\n", - lcfg->lcfg_command); - if (lcfg->lcfg_command == LCFG_LOV_ADD_OBD) - /* Add inactive instead */ - lcfg->lcfg_command = LCFG_LOV_ADD_INA; - } - - lustre_cfg_bufs_init(&bufs, lcfg); - - if (clli && clli->cfg_instance && - LUSTRE_CFG_BUFLEN(lcfg, 0) > 0) { - inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + - sizeof(clli->cfg_instance) * 2 + 4; - inst_name = kasprintf(GFP_NOFS, "%s-%p", - lustre_cfg_string(lcfg, 0), - clli->cfg_instance); - if (!inst_name) { - rc = -ENOMEM; - goto out; - } - lustre_cfg_bufs_set_string(&bufs, 0, inst_name); - CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", - lcfg->lcfg_command, inst_name); - } - - /* we override the llog's uuid for clients, to insure they - * are unique - */ - if (clli && clli->cfg_instance && - lcfg->lcfg_command == LCFG_ATTACH) { - lustre_cfg_bufs_set_string(&bufs, 2, - clli->cfg_uuid.uuid); - } - /* - * sptlrpc config record, we expect 2 data segments: - * [0]: fs_name/target_name, - * [1]: rule string - * moving them to index [1] and [2], and insert MGC's - * obdname at index [0]. - */ - if (clli && !clli->cfg_instance && - lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { - lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], - bufs.lcfg_buflen[1]); - lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], - bufs.lcfg_buflen[0]); - lustre_cfg_bufs_set_string(&bufs, 0, - clli->cfg_obdname); - } - - lcfg_len = lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen); - lcfg_new = kzalloc(lcfg_len, GFP_NOFS); - if (!lcfg_new) { - rc = -ENOMEM; - goto out; - } - - lustre_cfg_init(lcfg_new, lcfg->lcfg_command, &bufs); - lcfg_new->lcfg_num = lcfg->lcfg_num; - lcfg_new->lcfg_flags = lcfg->lcfg_flags; - - /* XXX Hack to try to remain binary compatible with - * pre-newconfig logs - */ - if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? 
*/ - (lcfg->lcfg_nid >> 32) == 0) { - __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); - - lcfg_new->lcfg_nid = - LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); - CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", - lcfg->lcfg_nal, addr, - libcfs_nid2str(lcfg_new->lcfg_nid)); - } else { - lcfg_new->lcfg_nid = lcfg->lcfg_nid; - } - - lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ - - rc = class_process_config(lcfg_new); - kfree(lcfg_new); - kfree(inst_name); - break; - } - default: - CERROR("Unknown llog record type %#x encountered\n", - rec->lrh_type); - break; - } -out: - if (rc) { - CERROR("%s: cfg command failed: rc = %d\n", - handle->lgh_ctxt->loc_obd->obd_name, rc); - class_config_dump_handler(NULL, handle, rec, data); - } - return rc; -} -EXPORT_SYMBOL(class_config_llog_handler); - -int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, - char *name, struct config_llog_instance *cfg) -{ - struct llog_process_cat_data cd = {0, 0}; - struct llog_handle *llh; - llog_cb_t callback; - int rc; - - CDEBUG(D_INFO, "looking up llog %s\n", name); - rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); - if (rc) - return rc; - - rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); - if (rc) - goto parse_out; - - /* continue processing from where we last stopped to end-of-log */ - if (cfg) { - cd.lpcd_first_idx = cfg->cfg_last_idx; - callback = cfg->cfg_callback; - LASSERT(callback); - } else { - callback = class_config_llog_handler; - } - - cd.lpcd_last_idx = 0; - - rc = llog_process(env, llh, callback, cfg, &cd); - - CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, - cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); - if (cfg) - cfg->cfg_last_idx = cd.lpcd_last_idx; - -parse_out: - llog_close(env, llh); - return rc; -} -EXPORT_SYMBOL(class_config_parse_llog); - -/** - * parse config record and output dump in supplied buffer. 
- * This is separated from class_config_dump_handler() to use - * for ioctl needs as well - */ -static int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, - int size) -{ - struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); - char *ptr = buf; - char *end = buf + size; - int rc = 0; - - LASSERT(rec->lrh_type == OBD_CFG_REC); - rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); - if (rc < 0) - return rc; - - ptr += snprintf(ptr, end - ptr, "cmd=%05x ", lcfg->lcfg_command); - if (lcfg->lcfg_flags) - ptr += snprintf(ptr, end - ptr, "flags=%#08x ", - lcfg->lcfg_flags); - - if (lcfg->lcfg_num) - ptr += snprintf(ptr, end - ptr, "num=%#08x ", lcfg->lcfg_num); - - if (lcfg->lcfg_nid) { - char nidstr[LNET_NIDSTR_SIZE]; - - libcfs_nid2str_r(lcfg->lcfg_nid, nidstr, sizeof(nidstr)); - ptr += snprintf(ptr, end - ptr, "nid=%s(%#llx)\n ", - nidstr, lcfg->lcfg_nid); - } - - if (lcfg->lcfg_command == LCFG_MARKER) { - struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); - - ptr += snprintf(ptr, end - ptr, "marker=%d(%#x)%s '%s'", - marker->cm_step, marker->cm_flags, - marker->cm_tgtname, marker->cm_comment); - } else { - int i; - - for (i = 0; i < lcfg->lcfg_bufcount; i++) { - ptr += snprintf(ptr, end - ptr, "%d:%s ", i, - lustre_cfg_string(lcfg, i)); - } - } - ptr += snprintf(ptr, end - ptr, "\n"); - /* return consumed bytes */ - rc = ptr - buf; - return rc; -} - -int class_config_dump_handler(const struct lu_env *env, - struct llog_handle *handle, - struct llog_rec_hdr *rec, void *data) -{ - char *outstr; - int rc = 0; - - outstr = kzalloc(256, GFP_NOFS); - if (!outstr) - return -ENOMEM; - - if (rec->lrh_type == OBD_CFG_REC) { - class_config_parse_rec(rec, outstr, 256); - LCONSOLE(D_WARNING, " %s", outstr); - } else { - LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); - rc = -EINVAL; - } - - kfree(outstr); - return rc; -} - -/** Call class_cleanup and class_detach. - * "Manual" only in the sense that we're faking lcfg commands. - */ -int class_manual_cleanup(struct obd_device *obd) -{ - char flags[3] = ""; - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs bufs; - int rc; - - if (!obd) { - CERROR("empty cleanup\n"); - return -EALREADY; - } - - if (obd->obd_force) - strcat(flags, "F"); - if (obd->obd_fail) - strcat(flags, "A"); - - CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", - obd->obd_name, flags); - - lustre_cfg_bufs_reset(&bufs, obd->obd_name); - lustre_cfg_bufs_set_string(&bufs, 1, flags); - lcfg = kzalloc(lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen), - GFP_NOFS); - if (!lcfg) - return -ENOMEM; - lustre_cfg_init(lcfg, LCFG_CLEANUP, &bufs); - - rc = class_process_config(lcfg); - if (rc) { - CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); - goto out; - } - - /* the lcfg is almost the same for both ops */ - lcfg->lcfg_command = LCFG_DETACH; - rc = class_process_config(lcfg); - if (rc) - CERROR("detach failed %d: %s\n", rc, obd->obd_name); -out: - kfree(lcfg); - return rc; -} -EXPORT_SYMBOL(class_manual_cleanup); diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c deleted file mode 100644 index 06c38fdef7ba75f063554074b6fea422ff120b54..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/obd_mount.c +++ /dev/null @@ -1,1245 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
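One caveat worth knowing about the buffer filling in class_config_parse_rec(): C's snprintf() returns the length the output *would* have had, so "ptr += snprintf(ptr, end - ptr, ...)" can push ptr past end on truncation, after which end - ptr underflows. The kernel's scnprintf() caps the return value at what was actually stored. A runnable userspace sketch of the safer append:

#include <stdarg.h>
#include <stdio.h>

/* Userspace equivalent of the kernel's scnprintf(): never returns more
 * than the space actually consumed.
 */
static int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list args;
        int n;

        if (size == 0)
                return 0;
        va_start(args, fmt);
        n = vsnprintf(buf, size, fmt, args);
        va_end(args);
        if (n < 0)
                return 0;
        return n >= (int)size ? (int)size - 1 : n;
}

int main(void)
{
        char buf[16], *p = buf, *end = buf + sizeof(buf);

        p += scnprintf(p, end - p, "cmd=%05x ", 0xcf003);
        p += scnprintf(p, end - p, "flags=%#08x ", 0x1);  /* truncates safely */
        printf("%s\n", buf);
        return 0;
}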
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/obd_mount.c - * - * Client mount routines - * - * Author: Nathan Rutman - */ - -#define DEBUG_SUBSYSTEM S_CLASS -#define D_MOUNT (D_SUPER | D_CONFIG/*|D_WARNING */) -#define PRINT_CMD CDEBUG - -#include -#include -#include -#include -#include -#include -#include -#include - -static DEFINE_SPINLOCK(client_lock); -static struct module *client_mod; -static int (*client_fill_super)(struct super_block *sb); -static void (*kill_super_cb)(struct super_block *sb); - -/**************** config llog ********************/ - -/** Get a config log from the MGS and process it. - * This func is called for both clients and servers. - * Continue to process new statements appended to the logs - * (whenever the config lock is revoked) until lustre_end_log - * is called. - * @param sb The superblock is used by the MGC to write to the local copy of - * the config log - * @param logname The name of the llog to replicate from the MGS - * @param cfg Since the same mgc may be used to follow multiple config logs - * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for - * this log, and is added to the mgc's list of logs to follow. - */ -int lustre_process_log(struct super_block *sb, char *logname, - struct config_llog_instance *cfg) -{ - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs *bufs; - struct lustre_sb_info *lsi = s2lsi(sb); - struct obd_device *mgc = lsi->lsi_mgc; - int rc; - - LASSERT(mgc); - LASSERT(cfg); - - bufs = kzalloc(sizeof(*bufs), GFP_NOFS); - if (!bufs) - return -ENOMEM; - - /* mgc_process_config */ - lustre_cfg_bufs_reset(bufs, mgc->obd_name); - lustre_cfg_bufs_set_string(bufs, 1, logname); - lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); - lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); - lcfg = kzalloc(lustre_cfg_len(bufs->lcfg_bufcount, bufs->lcfg_buflen), - GFP_NOFS); - if (!lcfg) { - rc = -ENOMEM; - goto out; - } - lustre_cfg_init(lcfg, LCFG_LOG_START, bufs); - - rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); - kfree(lcfg); -out: - kfree(bufs); - - if (rc == -EINVAL) - LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' failed from the MGS (%d). Make sure this client and the MGS are running compatible versions of Lustre.\n", - mgc->obd_name, logname, rc); - - else if (rc) - LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' failed (%d). This may be the result of communication errors between this node and the MGS, a bad configuration, or other errors. 
See the syslog for more information.\n", - mgc->obd_name, logname, - rc); - - /* class_obd_list(); */ - return rc; -} -EXPORT_SYMBOL(lustre_process_log); - -/* Stop watching this config log for updates */ -int lustre_end_log(struct super_block *sb, char *logname, - struct config_llog_instance *cfg) -{ - struct lustre_cfg *lcfg; - struct lustre_cfg_bufs bufs; - struct lustre_sb_info *lsi = s2lsi(sb); - struct obd_device *mgc = lsi->lsi_mgc; - int rc; - - if (!mgc) - return -ENOENT; - - /* mgc_process_config */ - lustre_cfg_bufs_reset(&bufs, mgc->obd_name); - lustre_cfg_bufs_set_string(&bufs, 1, logname); - if (cfg) - lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); - lcfg = kzalloc(lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen), - GFP_NOFS); - if (!lcfg) - return -ENOMEM; - lustre_cfg_init(lcfg, LCFG_LOG_END, &bufs); - - rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); - kfree(lcfg); - return rc; -} -EXPORT_SYMBOL(lustre_end_log); - -/**************** obd start *******************/ - -/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from - * lctl (and do for echo cli/srv. - */ -static int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, - char *s1, char *s2, char *s3, char *s4) -{ - struct lustre_cfg_bufs bufs; - struct lustre_cfg *lcfg = NULL; - int rc; - - CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, - cmd, s1, s2, s3, s4); - - lustre_cfg_bufs_reset(&bufs, cfgname); - if (s1) - lustre_cfg_bufs_set_string(&bufs, 1, s1); - if (s2) - lustre_cfg_bufs_set_string(&bufs, 2, s2); - if (s3) - lustre_cfg_bufs_set_string(&bufs, 3, s3); - if (s4) - lustre_cfg_bufs_set_string(&bufs, 4, s4); - - lcfg = kzalloc(lustre_cfg_len(bufs.lcfg_bufcount, bufs.lcfg_buflen), - GFP_NOFS); - if (!lcfg) - return -ENOMEM; - lustre_cfg_init(lcfg, cmd, &bufs); - lcfg->lcfg_nid = nid; - rc = class_process_config(lcfg); - kfree(lcfg); - return rc; -} - -/** Call class_attach and class_setup. These methods in turn call - * obd type-specific methods. 
- */ -static int lustre_start_simple(char *obdname, char *type, char *uuid, - char *s1, char *s2, char *s3, char *s4) -{ - int rc; - - CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); - - rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL); - if (rc) { - CERROR("%s attach error %d\n", obdname, rc); - return rc; - } - rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); - if (rc) { - CERROR("%s setup error %d\n", obdname, rc); - do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL); - } - return rc; -} - -static DEFINE_MUTEX(mgc_start_lock); - -/** Set up a mgc obd to process startup logs - * - * \param sb [in] super block of the mgc obd - * - * \retval 0 success, otherwise error code - */ -int lustre_start_mgc(struct super_block *sb) -{ - struct obd_connect_data *data = NULL; - struct lustre_sb_info *lsi = s2lsi(sb); - struct obd_device *obd; - struct obd_export *exp; - struct obd_uuid *uuid; - class_uuid_t uuidc; - lnet_nid_t nid; - char nidstr[LNET_NIDSTR_SIZE]; - char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; - char *ptr; - int rc = 0, i = 0, j; - - LASSERT(lsi->lsi_lmd); - - /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ - ptr = lsi->lsi_lmd->lmd_dev; - if (class_parse_nid(ptr, &nid, &ptr) == 0) - i++; - if (i == 0) { - CERROR("No valid MGS nids found.\n"); - return -EINVAL; - } - - mutex_lock(&mgc_start_lock); - - libcfs_nid2str_r(nid, nidstr, sizeof(nidstr)); - mgcname = kasprintf(GFP_NOFS, - "%s%s", LUSTRE_MGC_OBDNAME, nidstr); - niduuid = kasprintf(GFP_NOFS, "%s_%x", mgcname, 0); - if (!mgcname || !niduuid) { - rc = -ENOMEM; - goto out_free; - } - - mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; - - data = kzalloc(sizeof(*data), GFP_NOFS); - if (!data) { - rc = -ENOMEM; - goto out_free; - } - - obd = class_name2obd(mgcname); - if (obd && !obd->obd_stopping) { - int recov_bk; - - rc = obd_set_info_async(NULL, obd->obd_self_export, - strlen(KEY_MGSSEC), KEY_MGSSEC, - strlen(mgssec), mgssec, NULL); - if (rc) - goto out_free; - - /* Re-using an existing MGC */ - atomic_inc(&obd->u.cli.cl_mgc_refcount); - - /* IR compatibility check, only for clients */ - if (lmd_is_client(lsi->lsi_lmd)) { - int has_ir; - int vallen = sizeof(*data); - __u32 *flags = &lsi->lsi_lmd->lmd_flags; - - rc = obd_get_info(NULL, obd->obd_self_export, - strlen(KEY_CONN_DATA), KEY_CONN_DATA, - &vallen, data); - LASSERT(rc == 0); - has_ir = OCD_HAS_FLAG(data, IMP_RECOV); - if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { - /* LMD_FLG_NOIR is for test purpose only */ - LCONSOLE_WARN( - "Trying to mount a client with IR setting not compatible with current mgc. Force to use current mgc setting that is IR %s.\n", - has_ir ? "enabled" : "disabled"); - if (has_ir) - *flags &= ~LMD_FLG_NOIR; - else - *flags |= LMD_FLG_NOIR; - } - } - - recov_bk = 0; - - /* Try all connections, but only once (again). - * We don't want to block another target from starting - * (using its local copy of the log), but we do want to connect - * if at all possible. 
- */ - recov_bk++; - CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname, - recov_bk); - rc = obd_set_info_async(NULL, obd->obd_self_export, - sizeof(KEY_INIT_RECOV_BACKUP), - KEY_INIT_RECOV_BACKUP, - sizeof(recov_bk), &recov_bk, NULL); - rc = 0; - goto out; - } - - CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); - - /* Add the primary nids for the MGS */ - i = 0; - /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ - ptr = lsi->lsi_lmd->lmd_dev; - while (class_parse_nid(ptr, &nid, &ptr) == 0) { - rc = do_lcfg(mgcname, nid, - LCFG_ADD_UUID, niduuid, NULL, NULL, NULL); - if (!rc) - i++; - /* Stop at the first failover nid */ - if (*ptr == ':') - break; - } - if (i == 0) { - CERROR("No valid MGS nids found.\n"); - rc = -EINVAL; - goto out_free; - } - lsi->lsi_lmd->lmd_mgs_failnodes = 1; - - /* Random uuid for MGC allows easier reconnects */ - uuid = kzalloc(sizeof(*uuid), GFP_NOFS); - if (!uuid) { - rc = -ENOMEM; - goto out_free; - } - - ll_generate_random_uuid(uuidc); - class_uuid_unparse(uuidc, uuid); - - /* Start the MGC */ - rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, - (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, - niduuid, NULL, NULL); - kfree(uuid); - if (rc) - goto out_free; - - /* Add any failover MGS nids */ - i = 1; - while (ptr && ((*ptr == ':' || - class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { - /* New failover node */ - sprintf(niduuid, "%s_%x", mgcname, i); - j = 0; - while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { - rc = do_lcfg(mgcname, nid, LCFG_ADD_UUID, niduuid, - NULL, NULL, NULL); - if (!rc) - ++j; - if (*ptr == ':') - break; - } - if (j > 0) { - rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, - niduuid, NULL, NULL, NULL); - if (!rc) - i++; - } else { - /* at ":/fsname" */ - break; - } - } - lsi->lsi_lmd->lmd_mgs_failnodes = i; - - obd = class_name2obd(mgcname); - if (!obd) { - CERROR("Can't find mgcobd %s\n", mgcname); - rc = -ENOTCONN; - goto out_free; - } - - rc = obd_set_info_async(NULL, obd->obd_self_export, - strlen(KEY_MGSSEC), KEY_MGSSEC, - strlen(mgssec), mgssec, NULL); - if (rc) - goto out_free; - - /* Keep a refcount of servers/clients who started with "mount", - * so we know when we can get rid of the mgc. - */ - atomic_set(&obd->u.cli.cl_mgc_refcount, 1); - - /* We connect to the MGS at setup, and don't disconnect until cleanup */ - data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | - OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | - OBD_CONNECT_LVB_TYPE | OBD_CONNECT_BULK_MBITS; - -#if OBD_OCD_VERSION(3, 0, 53, 0) > LUSTRE_VERSION_CODE - data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; -#endif - - if (lmd_is_client(lsi->lsi_lmd) && - lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) - data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; - data->ocd_version = LUSTRE_VERSION_CODE; - rc = obd_connect(NULL, &exp, obd, &obd->obd_uuid, data, NULL); - if (rc) { - CERROR("connect failed %d\n", rc); - goto out; - } - - obd->u.cli.cl_mgc_mgsexp = exp; - -out: - /* Keep the mgc info in the sb. Note that many lsi's can point - * to the same mgc. 
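- *
- * That sharing is what cl_mgc_refcount (taken further up) accounts
- * for: the first mount creates the MGC, and every later mount against
- * the same MGS just takes another reference to the same obd.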
- */ - lsi->lsi_mgc = obd; -out_free: - mutex_unlock(&mgc_start_lock); - - kfree(data); - kfree(mgcname); - kfree(niduuid); - return rc; -} - -static int lustre_stop_mgc(struct super_block *sb) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - struct obd_device *obd; - char *niduuid = NULL, *ptr = NULL; - int i, rc = 0, len = 0; - - if (!lsi) - return -ENOENT; - obd = lsi->lsi_mgc; - if (!obd) - return -ENOENT; - lsi->lsi_mgc = NULL; - - mutex_lock(&mgc_start_lock); - LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); - if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { - /* This is not fatal, every client that stops - * will call in here. - */ - CDEBUG(D_MOUNT, "mgc still has %d references.\n", - atomic_read(&obd->u.cli.cl_mgc_refcount)); - rc = -EBUSY; - goto out; - } - - /* The MGC has no recoverable data in any case. - * force shutdown set in umount_begin - */ - obd->obd_no_recov = 1; - - if (obd->u.cli.cl_mgc_mgsexp) { - /* An error is not fatal, if we are unable to send the - * disconnect mgs ping evictor cleans up the export - */ - rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); - if (rc) - CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); - } - - /* Save the obdname for cleaning the nid uuids, which are obdname_XX */ - len = strlen(obd->obd_name) + 6; - niduuid = kzalloc(len, GFP_NOFS); - if (niduuid) { - strcpy(niduuid, obd->obd_name); - ptr = niduuid + strlen(niduuid); - } - - rc = class_manual_cleanup(obd); - if (rc) - goto out; - - /* Clean the nid uuids */ - if (!niduuid) { - rc = -ENOMEM; - goto out; - } - - for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { - sprintf(ptr, "_%x", i); - rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, - niduuid, NULL, NULL, NULL); - if (rc) - CERROR("del MDC UUID %s failed: rc = %d\n", - niduuid, rc); - } -out: - kfree(niduuid); - - /* class_import_put will get rid of the additional connections */ - mutex_unlock(&mgc_start_lock); - return rc; -} - -/***************** lustre superblock **************/ - -static struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) -{ - struct lustre_sb_info *lsi; - - lsi = kzalloc(sizeof(*lsi), GFP_NOFS); - if (!lsi) - return NULL; - lsi->lsi_lmd = kzalloc(sizeof(*lsi->lsi_lmd), GFP_NOFS); - if (!lsi->lsi_lmd) { - kfree(lsi); - return NULL; - } - - lsi->lsi_lmd->lmd_exclude_count = 0; - lsi->lsi_lmd->lmd_recovery_time_soft = 0; - lsi->lsi_lmd->lmd_recovery_time_hard = 0; - s2lsi_nocast(sb) = lsi; - /* we take 1 extra ref for our setup */ - atomic_set(&lsi->lsi_mounts, 1); - - /* Default umount style */ - lsi->lsi_flags = LSI_UMOUNT_FAILOVER; - - return lsi; -} - -static int lustre_free_lsi(struct super_block *sb) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - - CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); - - /* someone didn't call server_put_mount. */ - LASSERT(atomic_read(&lsi->lsi_mounts) == 0); - - if (lsi->lsi_lmd) { - kfree(lsi->lsi_lmd->lmd_dev); - kfree(lsi->lsi_lmd->lmd_profile); - kfree(lsi->lsi_lmd->lmd_mgssec); - kfree(lsi->lsi_lmd->lmd_opts); - if (lsi->lsi_lmd->lmd_exclude_count) - kfree(lsi->lsi_lmd->lmd_exclude); - kfree(lsi->lsi_lmd->lmd_mgs); - kfree(lsi->lsi_lmd->lmd_osd_type); - kfree(lsi->lsi_lmd->lmd_params); - - kfree(lsi->lsi_lmd); - } - - LASSERT(!lsi->lsi_llsbi); - kfree(lsi); - s2lsi_nocast(sb) = NULL; - - return 0; -} - -/* The lsi has one reference for every server that is using the disk - - * e.g. 
MDT, MGS, and potentially MGC - */ -static int lustre_put_lsi(struct super_block *sb) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - - CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); - if (atomic_dec_and_test(&lsi->lsi_mounts)) { - lustre_free_lsi(sb); - return 1; - } - return 0; -} - -/*** SERVER NAME *** - * - * FSNAME is between 1 and 8 characters (inclusive). - * Excluded characters are '/' and ':' - * SEPARATOR is either ':' or '-' - * TYPE: "OST", "MDT", etc. - * INDEX: Hex representation of the index - */ - -/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). - * @param [in] svname server name including type and index - * @param [out] fsname Buffer to copy filesystem name prefix into. - * Must have at least 'strlen(fsname) + 1' chars. - * @param [out] endptr if endptr isn't NULL it is set to end of fsname - * rc < 0 on error - */ -static int server_name2fsname(const char *svname, char *fsname, - const char **endptr) -{ - const char *dash; - - dash = svname + strnlen(svname, 8); /* max fsname length is 8 */ - for (; dash > svname && *dash != '-' && *dash != ':'; dash--) - ; - if (dash == svname) - return -EINVAL; - - if (fsname) { - strncpy(fsname, svname, dash - svname); - fsname[dash - svname] = '\0'; - } - - if (endptr) - *endptr = dash; - - return 0; -} - -/* Get the index from the obd name. - * rc = server type, or - * rc < 0 on error - * if endptr isn't NULL it is set to end of name - */ -static int server_name2index(const char *svname, __u32 *idx, - const char **endptr) -{ - unsigned long index; - int rc; - const char *dash; - - /* We use server_name2fsname() just for parsing */ - rc = server_name2fsname(svname, NULL, &dash); - if (rc != 0) - return rc; - - dash++; - - if (strncmp(dash, "MDT", 3) == 0) - rc = LDD_F_SV_TYPE_MDT; - else if (strncmp(dash, "OST", 3) == 0) - rc = LDD_F_SV_TYPE_OST; - else - return -EINVAL; - - dash += 3; - - if (strncmp(dash, "all", 3) == 0) { - if (endptr) - *endptr = dash + 3; - return rc | LDD_F_SV_ALL; - } - - index = simple_strtoul(dash, (char **)endptr, 16); - if (idx) - *idx = index; - - /* Account for -mdc after index that is possible when specifying mdt */ - if (endptr && strncmp(LUSTRE_MDC_NAME, *endptr + 1, - sizeof(LUSTRE_MDC_NAME) - 1) == 0) - *endptr += sizeof(LUSTRE_MDC_NAME); - - return rc; -} - -/*************** mount common between server and client ***************/ - -/* Common umount */ -int lustre_common_put_super(struct super_block *sb) -{ - int rc; - - CDEBUG(D_MOUNT, "dropping sb %p\n", sb); - - /* Drop a ref to the MGC */ - rc = lustre_stop_mgc(sb); - if (rc && (rc != -ENOENT)) { - if (rc != -EBUSY) { - CERROR("Can't stop MGC: %d\n", rc); - return rc; - } - /* BUSY just means that there's some other obd that - * needs the mgc. Let him clean it up. 
- */ - CDEBUG(D_MOUNT, "MGC still in use\n"); - } - /* Drop a ref to the mounted disk */ - lustre_put_lsi(sb); - return rc; -} -EXPORT_SYMBOL(lustre_common_put_super); - -static void lmd_print(struct lustre_mount_data *lmd) -{ - int i; - - PRINT_CMD(D_MOUNT, " mount data:\n"); - if (lmd_is_client(lmd)) - PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); - PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); - PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); - - if (lmd->lmd_opts) - PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); - - if (lmd->lmd_recovery_time_soft) - PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", - lmd->lmd_recovery_time_soft); - - if (lmd->lmd_recovery_time_hard) - PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", - lmd->lmd_recovery_time_hard); - - for (i = 0; i < lmd->lmd_exclude_count; i++) { - PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, - lmd->lmd_exclude[i]); - } -} - -/* Is this server on the exclusion list */ -int lustre_check_exclusion(struct super_block *sb, char *svname) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - struct lustre_mount_data *lmd = lsi->lsi_lmd; - __u32 index; - int i, rc; - - rc = server_name2index(svname, &index, NULL); - if (rc != LDD_F_SV_TYPE_OST) - /* Only exclude OSTs */ - return 0; - - CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, - index, lmd->lmd_exclude_count, lmd->lmd_dev); - - for (i = 0; i < lmd->lmd_exclude_count; i++) { - if (index == lmd->lmd_exclude[i]) { - CWARN("Excluding %s (on exclusion list)\n", svname); - return 1; - } - } - return 0; -} - -/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ -static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) -{ - const char *s1 = ptr, *s2; - __u32 index = 0, *exclude_list; - int rc = 0, devmax; - - /* The shortest an ost name can be is 8 chars: -OST0000. - * We don't actually know the fsname at this time, so in fact - * a user could specify any fsname. 
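- *
- * The 8-char minimum is what makes the devmax estimate below safe:
- * e.g. "=lustre-OST0001:lustre-OST0002" is 30 characters, giving
- * devmax = 30 / 8 + 1 = 4 slots for at most two real entries.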
- */ - devmax = strlen(ptr) / 8 + 1; - - /* temp storage until we figure out how many we have */ - exclude_list = kcalloc(devmax, sizeof(index), GFP_NOFS); - if (!exclude_list) - return -ENOMEM; - - /* we enter this fn pointing at the '=' */ - while (*s1 && *s1 != ' ' && *s1 != ',') { - s1++; - rc = server_name2index(s1, &index, &s2); - if (rc < 0) { - CERROR("Can't parse server name '%s': rc = %d\n", - s1, rc); - break; - } - if (rc == LDD_F_SV_TYPE_OST) - exclude_list[lmd->lmd_exclude_count++] = index; - else - CDEBUG(D_MOUNT, "ignoring exclude %.*s: type = %#x\n", - (uint)(s2 - s1), s1, rc); - s1 = s2; - /* now we are pointing at ':' (next exclude) - * or ',' (end of excludes) - */ - if (lmd->lmd_exclude_count >= devmax) - break; - } - if (rc >= 0) /* non-err */ - rc = 0; - - if (lmd->lmd_exclude_count) { - /* permanent, freed in lustre_free_lsi */ - lmd->lmd_exclude = kcalloc(lmd->lmd_exclude_count, - sizeof(index), GFP_NOFS); - if (lmd->lmd_exclude) { - memcpy(lmd->lmd_exclude, exclude_list, - sizeof(index) * lmd->lmd_exclude_count); - } else { - rc = -ENOMEM; - lmd->lmd_exclude_count = 0; - } - } - kfree(exclude_list); - return rc; -} - -static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) -{ - char *tail; - int length; - - kfree(lmd->lmd_mgssec); - lmd->lmd_mgssec = NULL; - - tail = strchr(ptr, ','); - if (!tail) - length = strlen(ptr); - else - length = tail - ptr; - - lmd->lmd_mgssec = kzalloc(length + 1, GFP_NOFS); - if (!lmd->lmd_mgssec) - return -ENOMEM; - - memcpy(lmd->lmd_mgssec, ptr, length); - lmd->lmd_mgssec[length] = '\0'; - return 0; -} - -static int lmd_parse_string(char **handle, char *ptr) -{ - char *tail; - int length; - - if (!handle || !ptr) - return -EINVAL; - - kfree(*handle); - *handle = NULL; - - tail = strchr(ptr, ','); - if (!tail) - length = strlen(ptr); - else - length = tail - ptr; - - *handle = kzalloc(length + 1, GFP_NOFS); - if (!*handle) - return -ENOMEM; - - memcpy(*handle, ptr, length); - (*handle)[length] = '\0'; - - return 0; -} - -/* Collect multiple values for mgsnid specifiers */ -static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) -{ - lnet_nid_t nid; - char *tail = *ptr; - char *mgsnid; - int length; - int oldlen = 0; - - /* Find end of nidlist */ - while (class_parse_nid_quiet(tail, &nid, &tail) == 0) - ; - length = tail - *ptr; - if (length == 0) { - LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); - return -EINVAL; - } - - if (lmd->lmd_mgs) - oldlen = strlen(lmd->lmd_mgs) + 1; - - mgsnid = kzalloc(oldlen + length + 1, GFP_NOFS); - if (!mgsnid) - return -ENOMEM; - - if (lmd->lmd_mgs) { - /* Multiple mgsnid= are taken to mean failover locations */ - memcpy(mgsnid, lmd->lmd_mgs, oldlen); - mgsnid[oldlen - 1] = ':'; - kfree(lmd->lmd_mgs); - } - memcpy(mgsnid + oldlen, *ptr, length); - mgsnid[oldlen + length] = '\0'; - lmd->lmd_mgs = mgsnid; - *ptr = tail; - - return 0; -} - -/** Parse mount line options - * e.g. 
mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre - * dev is passed as device=uml1:/lustre by mount.lustre - */ -static int lmd_parse(char *options, struct lustre_mount_data *lmd) -{ - char *s1, *s2, *devname = NULL; - struct lustre_mount_data *raw = (struct lustre_mount_data *)options; - int rc = 0; - - LASSERT(lmd); - if (!options) { - LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that /sbin/mount.lustre is installed.\n"); - return -EINVAL; - } - - /* Options should be a string - try to detect old lmd data */ - if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { - LCONSOLE_ERROR_MSG(0x163, "You're using an old version of /sbin/mount.lustre. Please install version %s\n", - LUSTRE_VERSION_STRING); - return -EINVAL; - } - lmd->lmd_magic = LMD_MAGIC; - - lmd->lmd_params = kzalloc(LMD_PARAMS_MAXLEN, GFP_NOFS); - if (!lmd->lmd_params) - return -ENOMEM; - lmd->lmd_params[0] = '\0'; - - /* Set default flags here */ - - s1 = options; - while (*s1) { - int clear = 0; - int time_min = OBD_RECOVERY_TIME_MIN; - char *s3; - - /* Skip whitespace and extra commas */ - while (*s1 == ' ' || *s1 == ',') - s1++; - s3 = s1; - - /* Client options are parsed in ll_options: eg. flock, - * user_xattr, acl - */ - - /* Parse non-ldiskfs options here. Rather than modifying - * ldiskfs, we just zero these out here - */ - if (strncmp(s1, "abort_recov", 11) == 0) { - lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; - clear++; - } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { - lmd->lmd_recovery_time_soft = max_t(int, - simple_strtoul(s1 + 19, NULL, 10), time_min); - clear++; - } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { - lmd->lmd_recovery_time_hard = max_t(int, - simple_strtoul(s1 + 19, NULL, 10), time_min); - clear++; - } else if (strncmp(s1, "noir", 4) == 0) { - lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ - clear++; - } else if (strncmp(s1, "nosvc", 5) == 0) { - lmd->lmd_flags |= LMD_FLG_NOSVC; - clear++; - } else if (strncmp(s1, "nomgs", 5) == 0) { - lmd->lmd_flags |= LMD_FLG_NOMGS; - clear++; - } else if (strncmp(s1, "noscrub", 7) == 0) { - lmd->lmd_flags |= LMD_FLG_NOSCRUB; - clear++; - } else if (strncmp(s1, PARAM_MGSNODE, - sizeof(PARAM_MGSNODE) - 1) == 0) { - s2 = s1 + sizeof(PARAM_MGSNODE) - 1; - /* Assume the next mount opt is the first - * invalid nid we get to. 
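- *
- * E.g. in "mgsnode=192.168.0.1@tcp:192.168.0.2@tcp,flock" the nid
- * parser consumes both nids and stops at "flock", which is then left
- * for the normal option handling.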
- */ - rc = lmd_parse_mgs(lmd, &s2); - if (rc) - goto invalid; - clear++; - } else if (strncmp(s1, "writeconf", 9) == 0) { - lmd->lmd_flags |= LMD_FLG_WRITECONF; - clear++; - } else if (strncmp(s1, "update", 6) == 0) { - lmd->lmd_flags |= LMD_FLG_UPDATE; - clear++; - } else if (strncmp(s1, "virgin", 6) == 0) { - lmd->lmd_flags |= LMD_FLG_VIRGIN; - clear++; - } else if (strncmp(s1, "noprimnode", 10) == 0) { - lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; - clear++; - } else if (strncmp(s1, "mgssec=", 7) == 0) { - rc = lmd_parse_mgssec(lmd, s1 + 7); - if (rc) - goto invalid; - s3 = s2; - clear++; - /* ost exclusion list */ - } else if (strncmp(s1, "exclude=", 8) == 0) { - rc = lmd_make_exclusion(lmd, s1 + 7); - if (rc) - goto invalid; - clear++; - } else if (strncmp(s1, "mgs", 3) == 0) { - /* We are an MGS */ - lmd->lmd_flags |= LMD_FLG_MGS; - clear++; - } else if (strncmp(s1, "svname=", 7) == 0) { - rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); - if (rc) - goto invalid; - clear++; - } else if (strncmp(s1, "param=", 6) == 0) { - size_t length, params_length; - char *tail = strchr(s1 + 6, ','); - - if (!tail) { - length = strlen(s1); - } else { - lnet_nid_t nid; - char *param_str = tail + 1; - int supplementary = 1; - - while (!class_parse_nid_quiet(param_str, &nid, - ¶m_str)) { - supplementary = 0; - } - length = param_str - s1 - supplementary; - } - length -= 6; - params_length = strlen(lmd->lmd_params); - if (params_length + length + 1 >= LMD_PARAMS_MAXLEN) - return -E2BIG; - strncat(lmd->lmd_params, s1 + 6, length); - lmd->lmd_params[params_length + length] = '\0'; - strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN); - s3 = s1 + 6 + length; - clear++; - } else if (strncmp(s1, "osd=", 4) == 0) { - rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); - if (rc) - goto invalid; - clear++; - } - /* Linux 2.4 doesn't pass the device, so we stuck it at the - * end of the options. - */ - else if (strncmp(s1, "device=", 7) == 0) { - devname = s1 + 7; - /* terminate options right before device. device - * must be the last one. - */ - *s1 = '\0'; - break; - } - - /* Find next opt */ - s2 = strchr(s1, ','); - if (!s2) { - if (clear) - *s1 = '\0'; - break; - } - s2++; - if (clear) - memmove(s1, s2, strlen(s2) + 1); - else - s1 = s2; - } - - if (!devname) { - LCONSOLE_ERROR_MSG(0x164, "Can't find the device name (need mount option 'device=...')\n"); - goto invalid; - } - - s1 = strstr(devname, ":/"); - if (s1) { - ++s1; - lmd->lmd_flags |= LMD_FLG_CLIENT; - /* Remove leading /s from fsname */ - while (*++s1 == '/') - ; - /* Freed in lustre_free_lsi */ - lmd->lmd_profile = kasprintf(GFP_NOFS, "%s-client", s1); - if (!lmd->lmd_profile) - return -ENOMEM; - } - - /* Freed in lustre_free_lsi */ - lmd->lmd_dev = kzalloc(strlen(devname) + 1, GFP_NOFS); - if (!lmd->lmd_dev) - return -ENOMEM; - strcpy(lmd->lmd_dev, devname); - - /* Save mount options */ - s1 = options + strlen(options) - 1; - while (s1 >= options && (*s1 == ',' || *s1 == ' ')) - *s1-- = 0; - if (*options != 0) { - /* Freed in lustre_free_lsi */ - lmd->lmd_opts = kzalloc(strlen(options) + 1, GFP_NOFS); - if (!lmd->lmd_opts) - return -ENOMEM; - strcpy(lmd->lmd_opts, options); - } - - lmd_print(lmd); - lmd->lmd_magic = LMD_MAGIC; - - return rc; - -invalid: - CERROR("Bad mount options %s\n", options); - return -EINVAL; -} - -/** This is the entry point for the mount call into Lustre. - * This is called when a server or client is mounted, - * and this is where we start setting things up. - * @param data Mount options (e.g. 
-o flock,abort_recov) - */ -static int lustre_fill_super(struct super_block *sb, void *lmd2_data, int silent) -{ - struct lustre_mount_data *lmd; - struct lustre_sb_info *lsi; - int rc; - - CDEBUG(D_MOUNT | D_VFSTRACE, "VFS Op: sb %p\n", sb); - - lsi = lustre_init_lsi(sb); - if (!lsi) - return -ENOMEM; - lmd = lsi->lsi_lmd; - - /* - * Disable lockdep during mount, because mount locking patterns are - * `special'. - */ - lockdep_off(); - - /* - * LU-639: the obd cleanup of last mount may not finish yet, wait here. - */ - obd_zombie_barrier(); - - /* Figure out the lmd from the mount options */ - if (lmd_parse(lmd2_data, lmd)) { - lustre_put_lsi(sb); - rc = -EINVAL; - goto out; - } - - if (lmd_is_client(lmd)) { - bool have_client = false; - CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); - if (!client_fill_super) - request_module("lustre"); - spin_lock(&client_lock); - if (client_fill_super && try_module_get(client_mod)) - have_client = true; - spin_unlock(&client_lock); - if (!have_client) { - LCONSOLE_ERROR_MSG(0x165, "Nothing registered for client mount! Is the 'lustre' module loaded?\n"); - lustre_put_lsi(sb); - rc = -ENODEV; - } else { - rc = lustre_start_mgc(sb); - if (rc) { - lustre_common_put_super(sb); - goto out; - } - /* Connect and start */ - /* (should always be ll_fill_super) */ - rc = (*client_fill_super)(sb); - /* c_f_s will call lustre_common_put_super on failure, otherwise - * c_f_s will have taken another reference to the module */ - module_put(client_mod); - } - } else { - CERROR("This is client-side-only module, cannot handle server mount.\n"); - rc = -EINVAL; - } - - /* If error happens in fill_super() call, @lsi will be killed there. - * This is why we do not put it here. - */ - goto out; -out: - if (rc) { - CERROR("Unable to mount %s (%d)\n", - s2lsi(sb) ? lmd->lmd_dev : "", rc); - } else { - CDEBUG(D_SUPER, "Mount %s complete\n", - lmd->lmd_dev); - } - lockdep_on(); - return rc; -} - -/* We can't call ll_fill_super by name because it lives in a module that - * must be loaded after this one. 
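- *
- * Instead the client module hands its entry points to this one at load
- * time through the hook below, along the lines of (the callback names
- * are illustrative):
- *
- *   lustre_register_super_ops(THIS_MODULE, ll_fill_super, ll_kill_super);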
- */ -void lustre_register_super_ops(struct module *mod, - int (*cfs)(struct super_block *sb), - void (*ksc)(struct super_block *sb)) -{ - spin_lock(&client_lock); - client_mod = mod; - client_fill_super = cfs; - kill_super_cb = ksc; - spin_unlock(&client_lock); -} -EXPORT_SYMBOL(lustre_register_super_ops); - -/***************** FS registration ******************/ -static struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, - const char *devname, void *data) -{ - return mount_nodev(fs_type, flags, data, lustre_fill_super); -} - -static void lustre_kill_super(struct super_block *sb) -{ - struct lustre_sb_info *lsi = s2lsi(sb); - - if (kill_super_cb && lsi) - (*kill_super_cb)(sb); - - kill_anon_super(sb); -} - -/** Register the "lustre" fs type - */ -static struct file_system_type lustre_fs_type = { - .owner = THIS_MODULE, - .name = "lustre", - .mount = lustre_mount, - .kill_sb = lustre_kill_super, - .fs_flags = FS_RENAME_DOES_D_MOVE, -}; -MODULE_ALIAS_FS("lustre"); - -int lustre_register_fs(void) -{ - return register_filesystem(&lustre_fs_type); -} - -int lustre_unregister_fs(void) -{ - return unregister_filesystem(&lustre_fs_type); -} diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c deleted file mode 100644 index c4503bc365910c1ebb38ee494677d2c064b0fce4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/obdo.c +++ /dev/null @@ -1,181 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/obdo.c - * - * Object Devices Class Driver - * These are the only exported functions, they provide some generic - * infrastructure for managing object devices - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include - -void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) -{ - dst->o_parent_oid = fid_oid(parent); - dst->o_parent_seq = fid_seq(parent); - dst->o_parent_ver = fid_ver(parent); - dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; -} -EXPORT_SYMBOL(obdo_set_parent_fid); - -/* WARNING: the file systems must take care not to tinker with - * attributes they don't manage (such as blocks). 
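- *
- * A caller that only owns the timestamps would therefore pass a narrow
- * valid mask, e.g. (illustrative call):
- *
- *   obdo_from_inode(&oa, inode,
- *                   OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
- *
- * which leaves o_size, o_blocks and the ownership fields untouched.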
- */ -void obdo_from_inode(struct obdo *dst, struct inode *src, u32 valid) -{ - u32 newvalid = 0; - - if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) - CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", - valid, LTIME_S(src->i_mtime), - LTIME_S(src->i_ctime)); - - if (valid & OBD_MD_FLATIME) { - dst->o_atime = LTIME_S(src->i_atime); - newvalid |= OBD_MD_FLATIME; - } - if (valid & OBD_MD_FLMTIME) { - dst->o_mtime = LTIME_S(src->i_mtime); - newvalid |= OBD_MD_FLMTIME; - } - if (valid & OBD_MD_FLCTIME) { - dst->o_ctime = LTIME_S(src->i_ctime); - newvalid |= OBD_MD_FLCTIME; - } - if (valid & OBD_MD_FLSIZE) { - dst->o_size = i_size_read(src); - newvalid |= OBD_MD_FLSIZE; - } - if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ - dst->o_blocks = src->i_blocks; - newvalid |= OBD_MD_FLBLOCKS; - } - if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ - dst->o_blksize = 1 << src->i_blkbits; - newvalid |= OBD_MD_FLBLKSZ; - } - if (valid & OBD_MD_FLTYPE) { - dst->o_mode = (dst->o_mode & S_IALLUGO) | - (src->i_mode & S_IFMT); - newvalid |= OBD_MD_FLTYPE; - } - if (valid & OBD_MD_FLMODE) { - dst->o_mode = (dst->o_mode & S_IFMT) | - (src->i_mode & S_IALLUGO); - newvalid |= OBD_MD_FLMODE; - } - if (valid & OBD_MD_FLUID) { - dst->o_uid = from_kuid(&init_user_ns, src->i_uid); - newvalid |= OBD_MD_FLUID; - } - if (valid & OBD_MD_FLGID) { - dst->o_gid = from_kgid(&init_user_ns, src->i_gid); - newvalid |= OBD_MD_FLGID; - } - if (valid & OBD_MD_FLFLAGS) { - dst->o_flags = src->i_flags; - newvalid |= OBD_MD_FLFLAGS; - } - dst->o_valid |= newvalid; -} -EXPORT_SYMBOL(obdo_from_inode); - -void obdo_to_ioobj(const struct obdo *oa, struct obd_ioobj *ioobj) -{ - ioobj->ioo_oid = oa->o_oi; - if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) - ostid_set_seq_mdt0(&ioobj->ioo_oid); - - /* Since 2.4 this does not contain o_mode in the low 16 bits. 
- * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs - */ - ioobj->ioo_max_brw = 0; -} -EXPORT_SYMBOL(obdo_to_ioobj); - -/** - * Create an obdo to send over the wire - */ -void lustre_set_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *wobdo, const struct obdo *lobdo) -{ - *wobdo = *lobdo; - wobdo->o_flags &= ~OBD_FL_LOCAL_MASK; - if (!ocd) - return; - - if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && - fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { - /* - * Currently OBD_FL_OSTID will only be used when 2.4 echo - * client communicate with pre-2.4 server - */ - wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); - wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); - } -} -EXPORT_SYMBOL(lustre_set_wire_obdo); - -/** - * Create a local obdo from a wire based odbo - */ -void lustre_get_wire_obdo(const struct obd_connect_data *ocd, - struct obdo *lobdo, const struct obdo *wobdo) -{ - u32 local_flags = 0; - - if (lobdo->o_valid & OBD_MD_FLFLAGS) - local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK; - - *lobdo = *wobdo; - if (local_flags) { - lobdo->o_valid |= OBD_MD_FLFLAGS; - lobdo->o_flags &= ~OBD_FL_LOCAL_MASK; - lobdo->o_flags |= local_flags; - } - if (!ocd) - return; - - if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && - fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { - /* see above */ - lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; - lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; - lobdo->o_oi.oi_fid.f_ver = 0; - } -} -EXPORT_SYMBOL(lustre_get_wire_obdo); diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c deleted file mode 100644 index 355e888885f4b54d2f3d5ce635ec37caec8f7ac2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/obdclass/statfs_pack.c - * - * (Un)packing of OST/MDS requests - * - * Author: Andreas Dilger - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include - -void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) -{ - memset(sfs, 0, sizeof(*sfs)); - sfs->f_type = osfs->os_type; - sfs->f_blocks = osfs->os_blocks; - sfs->f_bfree = osfs->os_bfree; - sfs->f_bavail = osfs->os_bavail; - sfs->f_files = osfs->os_files; - sfs->f_ffree = osfs->os_ffree; - sfs->f_bsize = osfs->os_bsize; - sfs->f_namelen = osfs->os_namelen; -} -EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c deleted file mode 100644 index ec8c6dc5c9a73a59ceaf20e07d972f6512c543a4..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdclass/uuid.c +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdclass/uuid.c - * - * Public include file for the UUID library - */ - -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include - -void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) -{ - sprintf(out->uuid, "%pU", uu); -} -EXPORT_SYMBOL(class_uuid_unparse); diff --git a/drivers/staging/lustre/lustre/obdecho/Makefile b/drivers/staging/lustre/lustre/obdecho/Makefile deleted file mode 100644 index 6be66fbab87215eb536d9ca7672b6f726bee1e11..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdecho/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += obdecho.o -obdecho-y := echo_client.o diff --git a/drivers/staging/lustre/lustre/obdecho/echo_client.c b/drivers/staging/lustre/lustre/obdecho/echo_client.c deleted file mode 100644 index b692e76e71089d3bef2db5d6d835ec3a86ab7b7f..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdecho/echo_client.c +++ /dev/null @@ -1,1729 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_ECHO - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "echo_internal.h" - -/** \defgroup echo_client Echo Client - * @{ - */ - -struct echo_device { - struct cl_device ed_cl; - struct echo_client_obd *ed_ec; - - struct cl_site ed_site_myself; - struct lu_site *ed_site; - struct lu_device *ed_next; -}; - -struct echo_object { - struct cl_object eo_cl; - struct cl_object_header eo_hdr; - - struct echo_device *eo_dev; - struct list_head eo_obj_chain; - struct lov_oinfo *eo_oinfo; - atomic_t eo_npages; - int eo_deleted; -}; - -struct echo_object_conf { - struct cl_object_conf eoc_cl; - struct lov_oinfo **eoc_oinfo; -}; - -struct echo_page { - struct cl_page_slice ep_cl; - struct mutex ep_lock; -}; - -struct echo_lock { - struct cl_lock_slice el_cl; - struct list_head el_chain; - struct echo_object *el_object; - __u64 el_cookie; - atomic_t el_refcount; -}; - -static int echo_client_setup(const struct lu_env *env, - struct obd_device *obddev, - struct lustre_cfg *lcfg); -static int echo_client_cleanup(struct obd_device *obddev); - -/** \defgroup echo_helpers Helper functions - * @{ - */ -static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) -{ - return container_of_safe(dev, struct echo_device, ed_cl); -} - -static inline struct cl_device *echo_dev2cl(struct echo_device *d) -{ - return &d->ed_cl; -} - -static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) -{ - return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); -} - -static inline struct cl_object *echo_obj2cl(struct echo_object *eco) -{ - return &eco->eo_cl; -} - -static inline struct echo_object *cl2echo_obj(const struct cl_object *o) -{ - return container_of(o, struct echo_object, eo_cl); -} - -static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) -{ - return container_of(s, struct echo_page, ep_cl); -} - -static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) -{ - return container_of(s, struct echo_lock, el_cl); -} - -static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) -{ - return ecl->el_cl.cls_lock; -} - -static struct lu_context_key echo_thread_key; -static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) -{ - struct echo_thread_info *info; - - info = lu_context_key_get(&env->le_ctx, &echo_thread_key); - LASSERT(info); - return info; -} - -static inline -struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) -{ - return container_of(c, struct echo_object_conf, eoc_cl); -} - -/** @} echo_helpers */ -static int cl_echo_object_put(struct echo_object *eco); -static int cl_echo_object_brw(struct 
echo_object *eco, int rw, u64 offset, - struct page **pages, int npages, int async); - -struct echo_thread_info { - struct echo_object_conf eti_conf; - struct lustre_md eti_md; - - struct cl_2queue eti_queue; - struct cl_io eti_io; - struct cl_lock eti_lock; - struct lu_fid eti_fid; - struct lu_fid eti_fid2; -}; - -/* No session used right now */ -struct echo_session_info { - unsigned long dummy; -}; - -static struct kmem_cache *echo_lock_kmem; -static struct kmem_cache *echo_object_kmem; -static struct kmem_cache *echo_thread_kmem; -static struct kmem_cache *echo_session_kmem; - -static struct lu_kmem_descr echo_caches[] = { - { - .ckd_cache = &echo_lock_kmem, - .ckd_name = "echo_lock_kmem", - .ckd_size = sizeof(struct echo_lock) - }, - { - .ckd_cache = &echo_object_kmem, - .ckd_name = "echo_object_kmem", - .ckd_size = sizeof(struct echo_object) - }, - { - .ckd_cache = &echo_thread_kmem, - .ckd_name = "echo_thread_kmem", - .ckd_size = sizeof(struct echo_thread_info) - }, - { - .ckd_cache = &echo_session_kmem, - .ckd_name = "echo_session_kmem", - .ckd_size = sizeof(struct echo_session_info) - }, - { - .ckd_cache = NULL - } -}; - -/** \defgroup echo_page Page operations - * - * Echo page operations. - * - * @{ - */ -static int echo_page_own(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io, int nonblock) -{ - struct echo_page *ep = cl2echo_page(slice); - - if (!nonblock) - mutex_lock(&ep->ep_lock); - else if (!mutex_trylock(&ep->ep_lock)) - return -EAGAIN; - return 0; -} - -static void echo_page_disown(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io) -{ - struct echo_page *ep = cl2echo_page(slice); - - LASSERT(mutex_is_locked(&ep->ep_lock)); - mutex_unlock(&ep->ep_lock); -} - -static void echo_page_discard(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - cl_page_delete(env, slice->cpl_page); -} - -static int echo_page_is_vmlocked(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - if (mutex_is_locked(&cl2echo_page(slice)->ep_lock)) - return -EBUSY; - return -ENODATA; -} - -static void echo_page_completion(const struct lu_env *env, - const struct cl_page_slice *slice, - int ioret) -{ - LASSERT(slice->cpl_page->cp_sync_io); -} - -static void echo_page_fini(const struct lu_env *env, - struct cl_page_slice *slice) -{ - struct echo_object *eco = cl2echo_obj(slice->cpl_obj); - - atomic_dec(&eco->eo_npages); - put_page(slice->cpl_page->cp_vmpage); -} - -static int echo_page_prep(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *unused) -{ - return 0; -} - -static int echo_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct echo_page *ep = cl2echo_page(slice); - - (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME "-page@%p %d vm@%p\n", - ep, mutex_is_locked(&ep->ep_lock), - slice->cpl_page->cp_vmpage); - return 0; -} - -static const struct cl_page_operations echo_page_ops = { - .cpo_own = echo_page_own, - .cpo_disown = echo_page_disown, - .cpo_discard = echo_page_discard, - .cpo_fini = echo_page_fini, - .cpo_print = echo_page_print, - .cpo_is_vmlocked = echo_page_is_vmlocked, - .io = { - [CRT_READ] = { - .cpo_prep = echo_page_prep, - .cpo_completion = echo_page_completion, - }, - [CRT_WRITE] = { - .cpo_prep = echo_page_prep, - .cpo_completion = echo_page_completion, - } - } -}; - -/** @} echo_page */ - -/** \defgroup echo_lock Locking - * - * echo lock operations - * - * @{ - */ 
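-/* A brief summary of lock lifetime in this file: cl_echo_enqueue0()
- * requests a cl_lock and threads the embedded echo_lock onto
- * ec->ec_locks under a fresh cookie, cl_echo_cancel0() looks that
- * cookie up again and drops the reference, and echo_lock_fini() frees
- * the slice once the cl_lock itself dies.
- */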
-static void echo_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct echo_lock *ecl = cl2echo_lock(slice); - - LASSERT(list_empty(&ecl->el_chain)); - kmem_cache_free(echo_lock_kmem, ecl); -} - -static const struct cl_lock_operations echo_lock_ops = { - .clo_fini = echo_lock_fini, -}; - -/** @} echo_lock */ - -/** \defgroup echo_cl_ops cl_object operations - * - * operations for cl_object - * - * @{ - */ -static int echo_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct echo_page *ep = cl_object_page_slice(obj, page); - struct echo_object *eco = cl2echo_obj(obj); - - get_page(page->cp_vmpage); - mutex_init(&ep->ep_lock); - cl_page_slice_add(page, &ep->ep_cl, obj, index, &echo_page_ops); - atomic_inc(&eco->eo_npages); - return 0; -} - -static int echo_io_init(const struct lu_env *env, struct cl_object *obj, - struct cl_io *io) -{ - return 0; -} - -static int echo_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *unused) -{ - struct echo_lock *el; - - el = kmem_cache_zalloc(echo_lock_kmem, GFP_NOFS); - if (el) { - cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); - el->el_object = cl2echo_obj(obj); - INIT_LIST_HEAD(&el->el_chain); - atomic_set(&el->el_refcount, 0); - } - return !el ? -ENOMEM : 0; -} - -static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, - const struct cl_object_conf *conf) -{ - return 0; -} - -static const struct cl_object_operations echo_cl_obj_ops = { - .coo_page_init = echo_page_init, - .coo_lock_init = echo_lock_init, - .coo_io_init = echo_io_init, - .coo_conf_set = echo_conf_set -}; - -/** @} echo_cl_ops */ - -/** \defgroup echo_lu_ops lu_object operations - * - * operations for echo lu object. - * - * @{ - */ -static int echo_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); - struct echo_client_obd *ec = ed->ed_ec; - struct echo_object *eco = cl2echo_obj(lu2cl(obj)); - const struct cl_object_conf *cconf; - struct echo_object_conf *econf; - - if (ed->ed_next) { - struct lu_object *below; - struct lu_device *under; - - under = ed->ed_next; - below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, - under); - if (!below) - return -ENOMEM; - lu_object_add(obj, below); - } - - cconf = lu2cl_conf(conf); - econf = cl2echo_conf(cconf); - - LASSERT(econf->eoc_oinfo); - /* - * Transfer the oinfo pointer to eco that it won't be - * freed. 
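- * In other words eco takes ownership: the conf's pointer is NULLed
- * just below, so the kfree(oinfo) on the exit path of
- * cl_echo_object_find() becomes a harmless kfree(NULL).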
- */ - eco->eo_oinfo = *econf->eoc_oinfo; - *econf->eoc_oinfo = NULL; - - eco->eo_dev = ed; - atomic_set(&eco->eo_npages, 0); - cl_object_page_init(lu2cl(obj), sizeof(struct echo_page)); - - spin_lock(&ec->ec_lock); - list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); - spin_unlock(&ec->ec_lock); - - return 0; -} - -static void echo_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct echo_object *eco = cl2echo_obj(lu2cl(obj)); - struct echo_client_obd *ec = eco->eo_dev->ed_ec; - - LASSERT(atomic_read(&eco->eo_npages) == 0); - - spin_lock(&ec->ec_lock); - list_del_init(&eco->eo_obj_chain); - spin_unlock(&ec->ec_lock); - - lu_object_fini(obj); - lu_object_header_fini(obj->lo_header); - - kfree(eco->eo_oinfo); - kmem_cache_free(echo_object_kmem, eco); -} - -static int echo_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *o) -{ - struct echo_object *obj = cl2echo_obj(lu2cl(o)); - - return (*p)(env, cookie, "echoclient-object@%p", obj); -} - -static const struct lu_object_operations echo_lu_obj_ops = { - .loo_object_init = echo_object_init, - .loo_object_delete = NULL, - .loo_object_release = NULL, - .loo_object_free = echo_object_free, - .loo_object_print = echo_object_print, - .loo_object_invariant = NULL -}; - -/** @} echo_lu_ops */ - -/** \defgroup echo_lu_dev_ops lu_device operations - * - * Operations for echo lu device. - * - * @{ - */ -static struct lu_object *echo_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev) -{ - struct echo_object *eco; - struct lu_object *obj = NULL; - - /* we're the top dev. */ - LASSERT(!hdr); - eco = kmem_cache_zalloc(echo_object_kmem, GFP_NOFS); - if (eco) { - struct cl_object_header *hdr = &eco->eo_hdr; - - obj = &echo_obj2cl(eco)->co_lu; - cl_object_header_init(hdr); - hdr->coh_page_bufsize = cfs_size_round(sizeof(struct cl_page)); - - lu_object_init(obj, &hdr->coh_lu, dev); - lu_object_add_top(&hdr->coh_lu, obj); - - eco->eo_cl.co_ops = &echo_cl_obj_ops; - obj->lo_ops = &echo_lu_obj_ops; - } - return obj; -} - -static const struct lu_device_operations echo_device_lu_ops = { - .ldo_object_alloc = echo_object_alloc, -}; - -/** @} echo_lu_dev_ops */ - -/** \defgroup echo_init Setup and teardown - * - * Init and fini functions for echo client. 
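- * echo_device_alloc() below strings these together: cl_device_init(),
- * then echo_site_init(), then echo_client_setup(), with the out_*
- * labels unwinding in reverse order on failure.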
- * - * @{ - */ -static int echo_site_init(const struct lu_env *env, struct echo_device *ed) -{ - struct cl_site *site = &ed->ed_site_myself; - int rc; - - /* initialize site */ - rc = cl_site_init(site, &ed->ed_cl); - if (rc) { - CERROR("Cannot initialize site for echo client(%d)\n", rc); - return rc; - } - - rc = lu_site_init_finish(&site->cs_lu); - if (rc) { - cl_site_fini(site); - return rc; - } - - ed->ed_site = &site->cs_lu; - return 0; -} - -static void echo_site_fini(const struct lu_env *env, struct echo_device *ed) -{ - if (ed->ed_site) { - lu_site_fini(ed->ed_site); - ed->ed_site = NULL; - } -} - -static void *echo_thread_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct echo_thread_info *info; - - info = kmem_cache_zalloc(echo_thread_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void echo_thread_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct echo_thread_info *info = data; - - kmem_cache_free(echo_thread_kmem, info); -} - -static struct lu_context_key echo_thread_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = echo_thread_key_init, - .lct_fini = echo_thread_key_fini, -}; - -static void *echo_session_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct echo_session_info *session; - - session = kmem_cache_zalloc(echo_session_kmem, GFP_NOFS); - if (!session) - session = ERR_PTR(-ENOMEM); - return session; -} - -static void echo_session_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct echo_session_info *session = data; - - kmem_cache_free(echo_session_kmem, session); -} - -static struct lu_context_key echo_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = echo_session_key_init, - .lct_fini = echo_session_key_fini, -}; - -LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key); - -static struct lu_device *echo_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct lu_device *next; - struct echo_device *ed; - struct cl_device *cd; - struct obd_device *obd = NULL; /* to keep compiler happy */ - struct obd_device *tgt; - const char *tgt_type_name; - int rc, err; - - ed = kzalloc(sizeof(*ed), GFP_NOFS); - if (!ed) { - rc = -ENOMEM; - goto out; - } - - cd = &ed->ed_cl; - rc = cl_device_init(cd, t); - if (rc) - goto out_free; - - cd->cd_lu_dev.ld_ops = &echo_device_lu_ops; - - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd); - LASSERT(env); - - tgt = class_name2obd(lustre_cfg_string(cfg, 1)); - if (!tgt) { - CERROR("Can not find tgt device %s\n", - lustre_cfg_string(cfg, 1)); - rc = -ENODEV; - goto out_device_fini; - } - - next = tgt->obd_lu_dev; - if (!strcmp(tgt->obd_type->typ_name, LUSTRE_MDT_NAME)) { - CERROR("echo MDT client must be run on server\n"); - rc = -EOPNOTSUPP; - goto out_device_fini; - } - - rc = echo_site_init(env, ed); - if (rc) - goto out_device_fini; - - rc = echo_client_setup(env, obd, cfg); - if (rc) - goto out_site_fini; - - ed->ed_ec = &obd->u.echo_client; - - /* if echo client is to be stacked upon ost device, the next is - * NULL since ost is not a clio device so far - */ - if (next && !lu_device_is_cl(next)) - next = NULL; - - tgt_type_name = tgt->obd_type->typ_name; - if (next) { - if (next->ld_site) { - rc = -EBUSY; - goto out_cleanup; - } - - next->ld_site = ed->ed_site; - rc = next->ld_type->ldt_ops->ldto_device_init(env, next, - next->ld_type->ldt_name, - NULL); - if (rc) - goto out_cleanup; - - 
} else {
-		LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
-	}
-
-	ed->ed_next = next;
-	return &cd->cd_lu_dev;
-
-out_cleanup:
-	err = echo_client_cleanup(obd);
-	if (err)
-		CERROR("Cleanup obd device %s error(%d)\n",
-		       obd->obd_name, err);
-out_site_fini:
-	echo_site_fini(env, ed);
-out_device_fini:
-	cl_device_fini(&ed->ed_cl);
-out_free:
-	kfree(ed);
-out:
-	return ERR_PTR(rc);
-}
-
-static int echo_device_init(const struct lu_env *env, struct lu_device *d,
-			    const char *name, struct lu_device *next)
-{
-	LBUG();
-	return 0;
-}
-
-static struct lu_device *echo_device_fini(const struct lu_env *env,
-					  struct lu_device *d)
-{
-	struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
-	struct lu_device *next = ed->ed_next;
-
-	while (next)
-		next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
-	return NULL;
-}
-
-static void echo_lock_release(const struct lu_env *env,
-			      struct echo_lock *ecl,
-			      int still_used)
-{
-	struct cl_lock *clk = echo_lock2cl(ecl);
-
-	cl_lock_release(env, clk);
-}
-
-static struct lu_device *echo_device_free(const struct lu_env *env,
-					  struct lu_device *d)
-{
-	struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
-	struct echo_client_obd *ec = ed->ed_ec;
-	struct echo_object *eco;
-	struct lu_device *next = ed->ed_next;
-
-	CDEBUG(D_INFO, "echo device:%p is going to be freed, next = %p\n",
-	       ed, next);
-
-	lu_site_purge(env, ed->ed_site, -1);
-
-	/* Check whether any objects are still alive.
-	 * There shouldn't be any left, because the lu_site_purge() above
-	 * should have cleaned up all cached objects; most likely the echo
-	 * device is still being accessed in parallel.
-	 */
-	spin_lock(&ec->ec_lock);
-	list_for_each_entry(eco, &ec->ec_objects, eo_obj_chain)
-		eco->eo_deleted = 1;
-	spin_unlock(&ec->ec_lock);
-
-	/* purge again */
-	lu_site_purge(env, ed->ed_site, -1);
-
-	CDEBUG(D_INFO,
-	       "Waiting for the reference of echo object to be dropped\n");
-
-	/* Wait for the last reference to be dropped. 
*/ - spin_lock(&ec->ec_lock); - while (!list_empty(&ec->ec_objects)) { - spin_unlock(&ec->ec_lock); - CERROR("echo_client still has objects at cleanup time, wait for 1 second\n"); - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(HZ); - lu_site_purge(env, ed->ed_site, -1); - spin_lock(&ec->ec_lock); - } - spin_unlock(&ec->ec_lock); - - LASSERT(list_empty(&ec->ec_locks)); - - CDEBUG(D_INFO, "No object exists, exiting...\n"); - - echo_client_cleanup(d->ld_obd); - - while (next) - next = next->ld_type->ldt_ops->ldto_device_free(env, next); - - LASSERT(ed->ed_site == d->ld_site); - echo_site_fini(env, ed); - cl_device_fini(&ed->ed_cl); - kfree(ed); - - cl_env_cache_purge(~0); - - return NULL; -} - -static const struct lu_device_type_operations echo_device_type_ops = { - .ldto_init = echo_type_init, - .ldto_fini = echo_type_fini, - - .ldto_start = echo_type_start, - .ldto_stop = echo_type_stop, - - .ldto_device_alloc = echo_device_alloc, - .ldto_device_free = echo_device_free, - .ldto_device_init = echo_device_init, - .ldto_device_fini = echo_device_fini -}; - -static struct lu_device_type echo_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_ECHO_CLIENT_NAME, - .ldt_ops = &echo_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD, -}; - -/** @} echo_init */ - -/** \defgroup echo_exports Exported operations - * - * exporting functions to echo client - * - * @{ - */ - -/* Interfaces to echo client obd device */ -static struct echo_object * -cl_echo_object_find(struct echo_device *d, const struct ost_id *oi) -{ - struct lu_env *env; - struct echo_thread_info *info; - struct echo_object_conf *conf; - struct lov_oinfo *oinfo = NULL; - struct echo_object *eco; - struct cl_object *obj; - struct lu_fid *fid; - u16 refcheck; - int rc; - - LASSERTF(ostid_id(oi), DOSTID "\n", POSTID(oi)); - LASSERTF(ostid_seq(oi) == FID_SEQ_ECHO, DOSTID "\n", POSTID(oi)); - - /* Never return an object if the obd is to be freed. */ - if (echo_dev2cl(d)->cd_lu_dev.ld_obd->obd_stopping) - return ERR_PTR(-ENODEV); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return (void *)env; - - info = echo_env_info(env); - conf = &info->eti_conf; - if (d->ed_next) { - oinfo = kzalloc(sizeof(*oinfo), GFP_NOFS); - if (!oinfo) { - eco = ERR_PTR(-ENOMEM); - goto out; - } - - oinfo->loi_oi = *oi; - conf->eoc_cl.u.coc_oinfo = oinfo; - } - - /* - * If echo_object_init() is successful then ownership of oinfo - * is transferred to the object. - */ - conf->eoc_oinfo = &oinfo; - - fid = &info->eti_fid; - rc = ostid_to_fid(fid, (struct ost_id *)oi, 0); - if (rc != 0) { - eco = ERR_PTR(rc); - goto out; - } - - /* In the function below, .hs_keycmp resolves to - * lu_obj_hop_keycmp() - */ - /* coverity[overrun-buffer-val] */ - obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); - if (IS_ERR(obj)) { - eco = (void *)obj; - goto out; - } - - eco = cl2echo_obj(obj); - if (eco->eo_deleted) { - cl_object_put(env, obj); - eco = ERR_PTR(-EAGAIN); - } - -out: - kfree(oinfo); - cl_env_put(env, &refcheck); - return eco; -} - -static int cl_echo_object_put(struct echo_object *eco) -{ - struct lu_env *env; - struct cl_object *obj = echo_obj2cl(eco); - u16 refcheck; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - /* an external function to kill an object? 
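- * Setting LU_OBJECT_HEARD_BANSHEE below is the closest thing: it marks
- * the header so the object is freed once the last reference is gone,
- * rather than being parked in the site cache for reuse.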
*/ - if (eco->eo_deleted) { - struct lu_object_header *loh = obj->co_lu.lo_header; - - LASSERT(&eco->eo_hdr == luh2coh(loh)); - set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); - } - - cl_object_put(env, obj); - cl_env_put(env, &refcheck); - return 0; -} - -static int cl_echo_enqueue0(struct lu_env *env, struct echo_object *eco, - u64 start, u64 end, int mode, - __u64 *cookie, __u32 enqflags) -{ - struct cl_io *io; - struct cl_lock *lck; - struct cl_object *obj; - struct cl_lock_descr *descr; - struct echo_thread_info *info; - int rc = -ENOMEM; - - info = echo_env_info(env); - io = &info->eti_io; - lck = &info->eti_lock; - obj = echo_obj2cl(eco); - - memset(lck, 0, sizeof(*lck)); - descr = &lck->cll_descr; - descr->cld_obj = obj; - descr->cld_start = cl_index(obj, start); - descr->cld_end = cl_index(obj, end); - descr->cld_mode = mode == LCK_PW ? CLM_WRITE : CLM_READ; - descr->cld_enq_flags = enqflags; - io->ci_obj = obj; - - rc = cl_lock_request(env, io, lck); - if (rc == 0) { - struct echo_client_obd *ec = eco->eo_dev->ed_ec; - struct echo_lock *el; - - el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); - spin_lock(&ec->ec_lock); - if (list_empty(&el->el_chain)) { - list_add(&el->el_chain, &ec->ec_locks); - el->el_cookie = ++ec->ec_unique; - } - atomic_inc(&el->el_refcount); - *cookie = el->el_cookie; - spin_unlock(&ec->ec_lock); - } - return rc; -} - -static int cl_echo_cancel0(struct lu_env *env, struct echo_device *ed, - __u64 cookie) -{ - struct echo_client_obd *ec = ed->ed_ec; - struct echo_lock *ecl = NULL; - struct list_head *el; - int found = 0, still_used = 0; - - spin_lock(&ec->ec_lock); - list_for_each(el, &ec->ec_locks) { - ecl = list_entry(el, struct echo_lock, el_chain); - CDEBUG(D_INFO, "ecl: %p, cookie: %#llx\n", ecl, ecl->el_cookie); - found = (ecl->el_cookie == cookie); - if (found) { - if (atomic_dec_and_test(&ecl->el_refcount)) - list_del_init(&ecl->el_chain); - else - still_used = 1; - break; - } - } - spin_unlock(&ec->ec_lock); - - if (!found) - return -ENOENT; - - echo_lock_release(env, ecl, still_used); - return 0; -} - -static void echo_commit_callback(const struct lu_env *env, struct cl_io *io, - struct cl_page *page) -{ - struct echo_thread_info *info; - struct cl_2queue *queue; - - info = echo_env_info(env); - LASSERT(io == &info->eti_io); - - queue = &info->eti_queue; - cl_page_list_add(&queue->c2_qout, page); -} - -static int cl_echo_object_brw(struct echo_object *eco, int rw, u64 offset, - struct page **pages, int npages, int async) -{ - struct lu_env *env; - struct echo_thread_info *info; - struct cl_object *obj = echo_obj2cl(eco); - struct echo_device *ed = eco->eo_dev; - struct cl_2queue *queue; - struct cl_io *io; - struct cl_page *clp; - struct lustre_handle lh = { 0 }; - size_t page_size = cl_page_size(obj); - u16 refcheck; - int rc; - int i; - - LASSERT((offset & ~PAGE_MASK) == 0); - LASSERT(ed->ed_next); - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - info = echo_env_info(env); - io = &info->eti_io; - queue = &info->eti_queue; - - cl_2queue_init(queue); - - io->ci_ignore_layout = 1; - rc = cl_io_init(env, io, CIT_MISC, obj); - if (rc < 0) - goto out; - LASSERT(rc == 0); - - rc = cl_echo_enqueue0(env, eco, offset, - offset + npages * PAGE_SIZE - 1, - rw == READ ? 
LCK_PR : LCK_PW, &lh.cookie, - CEF_NEVER); - if (rc < 0) - goto error_lock; - - for (i = 0; i < npages; i++) { - LASSERT(pages[i]); - clp = cl_page_find(env, obj, cl_index(obj, offset), - pages[i], CPT_TRANSIENT); - if (IS_ERR(clp)) { - rc = PTR_ERR(clp); - break; - } - LASSERT(clp->cp_type == CPT_TRANSIENT); - - rc = cl_page_own(env, io, clp); - if (rc) { - LASSERT(clp->cp_state == CPS_FREEING); - cl_page_put(env, clp); - break; - } - /* - * Add a page to the incoming page list of 2-queue. - */ - cl_page_list_add(&queue->c2_qin, clp); - - /* drop the reference count for cl_page_find, so that the page - * will be freed in cl_2queue_fini. - */ - cl_page_put(env, clp); - cl_page_clip(env, clp, 0, page_size); - - offset += page_size; - } - - if (rc == 0) { - enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; - - async = async && (typ == CRT_WRITE); - if (async) - rc = cl_io_commit_async(env, io, &queue->c2_qin, - 0, PAGE_SIZE, - echo_commit_callback); - else - rc = cl_io_submit_sync(env, io, typ, queue, 0); - CDEBUG(D_INFO, "echo_client %s write returns %d\n", - async ? "async" : "sync", rc); - } - - cl_echo_cancel0(env, ed, lh.cookie); -error_lock: - cl_2queue_discard(env, io, queue); - cl_2queue_disown(env, io, queue); - cl_2queue_fini(env, queue); - cl_io_fini(env, io); -out: - cl_env_put(env, &refcheck); - return rc; -} - -/** @} echo_exports */ - -static u64 last_object_id; - -static int echo_create_object(const struct lu_env *env, struct echo_device *ed, - struct obdo *oa) -{ - struct echo_object *eco; - struct echo_client_obd *ec = ed->ed_ec; - int rc; - int created = 0; - - if (!(oa->o_valid & OBD_MD_FLID) || - !(oa->o_valid & OBD_MD_FLGROUP) || - !fid_seq_is_echo(ostid_seq(&oa->o_oi))) { - CERROR("invalid oid " DOSTID "\n", POSTID(&oa->o_oi)); - return -EINVAL; - } - - if (!ostid_id(&oa->o_oi)) { - rc = ostid_set_id(&oa->o_oi, ++last_object_id); - if (rc) - goto failed; - } - - rc = obd_create(env, ec->ec_exp, oa); - if (rc != 0) { - CERROR("Cannot create objects: rc = %d\n", rc); - goto failed; - } - created = 1; - - oa->o_valid |= OBD_MD_FLID; - - eco = cl_echo_object_find(ed, &oa->o_oi); - if (IS_ERR(eco)) { - rc = PTR_ERR(eco); - goto failed; - } - cl_echo_object_put(eco); - - CDEBUG(D_INFO, "oa oid " DOSTID "\n", POSTID(&oa->o_oi)); - - failed: - if (created && rc) - obd_destroy(env, ec->ec_exp, oa); - if (rc) - CERROR("create object failed with: rc = %d\n", rc); - return rc; -} - -static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, - struct obdo *oa) -{ - struct echo_object *eco; - int rc; - - if (!(oa->o_valid & OBD_MD_FLID) || !(oa->o_valid & OBD_MD_FLGROUP) || - !ostid_id(&oa->o_oi)) { - CERROR("invalid oid " DOSTID "\n", POSTID(&oa->o_oi)); - return -EINVAL; - } - - rc = 0; - eco = cl_echo_object_find(ed, &oa->o_oi); - if (!IS_ERR(eco)) - *ecop = eco; - else - rc = PTR_ERR(eco); - return rc; -} - -static void echo_put_object(struct echo_object *eco) -{ - int rc; - - rc = cl_echo_object_put(eco); - if (rc) - CERROR("%s: echo client drop an object failed: rc = %d\n", - eco->eo_dev->ed_ec->ec_exp->exp_obd->obd_name, rc); -} - -static void -echo_client_page_debug_setup(struct page *page, int rw, u64 id, - u64 offset, u64 count) -{ - char *addr; - u64 stripe_off; - u64 stripe_id; - int delta; - - /* no partial pages on the client */ - LASSERT(count == PAGE_SIZE); - - addr = kmap(page); - - for (delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { - if (rw == OBD_BRW_WRITE) { - stripe_off = offset + delta; - stripe_id = id; - } else { - 
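-			/* reads are pre-filled with a recognizable poison
-			 * value; the BRW read is expected to overwrite it
-			 * before the page is checked
-			 */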
stripe_off = 0xdeadbeef00c0ffeeULL; - stripe_id = 0xdeadbeef00c0ffeeULL; - } - block_debug_setup(addr + delta, OBD_ECHO_BLOCK_SIZE, - stripe_off, stripe_id); - } - - kunmap(page); -} - -static int echo_client_page_debug_check(struct page *page, u64 id, - u64 offset, u64 count) -{ - u64 stripe_off; - u64 stripe_id; - char *addr; - int delta; - int rc; - int rc2; - - /* no partial pages on the client */ - LASSERT(count == PAGE_SIZE); - - addr = kmap(page); - - for (rc = delta = 0; delta < PAGE_SIZE; delta += OBD_ECHO_BLOCK_SIZE) { - stripe_off = offset + delta; - stripe_id = id; - - rc2 = block_debug_check("test_brw", - addr + delta, OBD_ECHO_BLOCK_SIZE, - stripe_off, stripe_id); - if (rc2 != 0) { - CERROR("Error in echo object %#llx\n", id); - rc = rc2; - } - } - - kunmap(page); - return rc; -} - -static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, - struct echo_object *eco, u64 offset, - u64 count, int async) -{ - u32 npages; - struct brw_page *pga; - struct brw_page *pgp; - struct page **pages; - u64 off; - int i; - int rc; - int verify; - gfp_t gfp_mask; - int brw_flags = 0; - - verify = (ostid_id(&oa->o_oi) != ECHO_PERSISTENT_OBJID && - (oa->o_valid & OBD_MD_FLFLAGS) != 0 && - (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); - - gfp_mask = ((ostid_id(&oa->o_oi) & 2) == 0) ? GFP_KERNEL : GFP_HIGHUSER; - - LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); - - if (count <= 0 || - (count & (~PAGE_MASK)) != 0) - return -EINVAL; - - /* XXX think again with misaligned I/O */ - npages = count >> PAGE_SHIFT; - - if (rw == OBD_BRW_WRITE) - brw_flags = OBD_BRW_ASYNC; - - pga = kcalloc(npages, sizeof(*pga), GFP_NOFS); - if (!pga) - return -ENOMEM; - - pages = kcalloc(npages, sizeof(*pages), GFP_NOFS); - if (!pages) { - kfree(pga); - return -ENOMEM; - } - - for (i = 0, pgp = pga, off = offset; - i < npages; - i++, pgp++, off += PAGE_SIZE) { - LASSERT(!pgp->pg); /* for cleanup */ - - rc = -ENOMEM; - pgp->pg = alloc_page(gfp_mask); - if (!pgp->pg) - goto out; - - pages[i] = pgp->pg; - pgp->count = PAGE_SIZE; - pgp->off = off; - pgp->flag = brw_flags; - - if (verify) - echo_client_page_debug_setup(pgp->pg, rw, - ostid_id(&oa->o_oi), off, - pgp->count); - } - - /* brw mode can only be used at client */ - LASSERT(ed->ed_next); - rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); - - out: - if (rc != 0 || rw != OBD_BRW_READ) - verify = 0; - - for (i = 0, pgp = pga; i < npages; i++, pgp++) { - if (!pgp->pg) - continue; - - if (verify) { - int vrc; - - vrc = echo_client_page_debug_check(pgp->pg, - ostid_id(&oa->o_oi), - pgp->off, pgp->count); - if (vrc != 0 && rc == 0) - rc = vrc; - } - __free_page(pgp->pg); - } - kfree(pga); - kfree(pages); - return rc; -} - -static int echo_client_prep_commit(const struct lu_env *env, - struct obd_export *exp, int rw, - struct obdo *oa, struct echo_object *eco, - u64 offset, u64 count, - u64 batch, int async) -{ - struct obd_ioobj ioo; - struct niobuf_local *lnb; - struct niobuf_remote rnb; - u64 off; - u64 npages, tot_pages; - int i, ret = 0, brw_flags = 0; - - if (count <= 0 || (count & (~PAGE_MASK)) != 0) - return -EINVAL; - - npages = batch >> PAGE_SHIFT; - tot_pages = count >> PAGE_SHIFT; - - lnb = kcalloc(npages, sizeof(struct niobuf_local), GFP_NOFS); - if (!lnb) { - ret = -ENOMEM; - goto out; - } - - if (rw == OBD_BRW_WRITE && async) - brw_flags |= OBD_BRW_ASYNC; - - obdo_to_ioobj(oa, &ioo); - - off = offset; - - for (; tot_pages > 0; tot_pages -= npages) { - int lpages; - - if (tot_pages < npages) - npages = tot_pages; - - 
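-		/* e.g. count = 8 MiB with batch = 3 MiB gives three
-		 * prep/commit cycles of 768, 768 and 512 pages (with
-		 * 4 KiB pages), the trailing cycle trimmed just above
-		 */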
rnb.rnb_offset = off; - rnb.rnb_len = npages * PAGE_SIZE; - rnb.rnb_flags = brw_flags; - ioo.ioo_bufcnt = 1; - off += npages * PAGE_SIZE; - - lpages = npages; - ret = obd_preprw(env, rw, exp, oa, 1, &ioo, &rnb, &lpages, lnb); - if (ret != 0) - goto out; - - for (i = 0; i < lpages; i++) { - struct page *page = lnb[i].lnb_page; - - /* read past eof? */ - if (!page && lnb[i].lnb_rc == 0) - continue; - - if (async) - lnb[i].lnb_flags |= OBD_BRW_ASYNC; - - if (ostid_id(&oa->o_oi) == ECHO_PERSISTENT_OBJID || - (oa->o_valid & OBD_MD_FLFLAGS) == 0 || - (oa->o_flags & OBD_FL_DEBUG_CHECK) == 0) - continue; - - if (rw == OBD_BRW_WRITE) - echo_client_page_debug_setup(page, rw, - ostid_id(&oa->o_oi), - lnb[i].lnb_file_offset, - lnb[i].lnb_len); - else - echo_client_page_debug_check(page, - ostid_id(&oa->o_oi), - lnb[i].lnb_file_offset, - lnb[i].lnb_len); - } - - ret = obd_commitrw(env, rw, exp, oa, 1, &ioo, &rnb, npages, lnb, - ret); - if (ret != 0) - goto out; - - /* Reuse env context. */ - lu_context_exit((struct lu_context *)&env->le_ctx); - lu_context_enter((struct lu_context *)&env->le_ctx); - } - -out: - kfree(lnb); - return ret; -} - -static int echo_client_brw_ioctl(const struct lu_env *env, int rw, - struct obd_export *exp, - struct obd_ioctl_data *data) -{ - struct obd_device *obd = class_exp2obd(exp); - struct echo_device *ed = obd2echo_dev(obd); - struct echo_client_obd *ec = ed->ed_ec; - struct obdo *oa = &data->ioc_obdo1; - struct echo_object *eco; - int rc; - int async = 1; - long test_mode; - - LASSERT(oa->o_valid & OBD_MD_FLGROUP); - - rc = echo_get_object(&eco, ed, oa); - if (rc) - return rc; - - oa->o_valid &= ~OBD_MD_FLHANDLE; - - /* OFD/obdfilter works only via prep/commit */ - test_mode = (long)data->ioc_pbuf1; - if (test_mode == 1) - async = 0; - - if (!ed->ed_next && test_mode != 3) { - test_mode = 3; - data->ioc_plen1 = data->ioc_count; - } - - /* Truncate batch size to maximum */ - if (data->ioc_plen1 > PTLRPC_MAX_BRW_SIZE) - data->ioc_plen1 = PTLRPC_MAX_BRW_SIZE; - - switch (test_mode) { - case 1: - /* fall through */ - case 2: - rc = echo_client_kbrw(ed, rw, oa, eco, data->ioc_offset, - data->ioc_count, async); - break; - case 3: - rc = echo_client_prep_commit(env, ec->ec_exp, rw, oa, eco, - data->ioc_offset, data->ioc_count, - data->ioc_plen1, async); - break; - default: - rc = -EINVAL; - } - echo_put_object(eco); - return rc; -} - -static int -echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, - void *karg, void __user *uarg) -{ - struct obd_device *obd = exp->exp_obd; - struct echo_device *ed = obd2echo_dev(obd); - struct echo_client_obd *ec = ed->ed_ec; - struct echo_object *eco; - struct obd_ioctl_data *data = karg; - struct lu_env *env; - struct obdo *oa; - struct lu_fid fid; - int rw = OBD_BRW_READ; - int rc = 0; - - oa = &data->ioc_obdo1; - if (!(oa->o_valid & OBD_MD_FLGROUP)) { - oa->o_valid |= OBD_MD_FLGROUP; - ostid_set_seq_echo(&oa->o_oi); - } - - /* This FID is unpacked just for validation at this point */ - rc = ostid_to_fid(&fid, &oa->o_oi, 0); - if (rc < 0) - return rc; - - env = kzalloc(sizeof(*env), GFP_NOFS); - if (!env) - return -ENOMEM; - - rc = lu_env_init(env, LCT_DT_THREAD); - if (rc) { - rc = -ENOMEM; - goto out; - } - - switch (cmd) { - case OBD_IOC_CREATE: /* may create echo object */ - if (!capable(CAP_SYS_ADMIN)) { - rc = -EPERM; - goto out; - } - - rc = echo_create_object(env, ed, oa); - goto out; - - case OBD_IOC_DESTROY: - if (!capable(CAP_SYS_ADMIN)) { - rc = -EPERM; - goto out; - } - - rc = echo_get_object(&eco, ed, 
oa); - if (rc == 0) { - rc = obd_destroy(env, ec->ec_exp, oa); - if (rc == 0) - eco->eo_deleted = 1; - echo_put_object(eco); - } - goto out; - - case OBD_IOC_GETATTR: - rc = echo_get_object(&eco, ed, oa); - if (rc == 0) { - rc = obd_getattr(env, ec->ec_exp, oa); - echo_put_object(eco); - } - goto out; - - case OBD_IOC_SETATTR: - if (!capable(CAP_SYS_ADMIN)) { - rc = -EPERM; - goto out; - } - - rc = echo_get_object(&eco, ed, oa); - if (rc == 0) { - rc = obd_setattr(env, ec->ec_exp, oa); - echo_put_object(eco); - } - goto out; - - case OBD_IOC_BRW_WRITE: - if (!capable(CAP_SYS_ADMIN)) { - rc = -EPERM; - goto out; - } - - rw = OBD_BRW_WRITE; - /* fall through */ - case OBD_IOC_BRW_READ: - rc = echo_client_brw_ioctl(env, rw, exp, data); - goto out; - - default: - CERROR("echo_ioctl(): unrecognised ioctl %#x\n", cmd); - rc = -ENOTTY; - goto out; - } - -out: - lu_env_fini(env); - kfree(env); - - return rc; -} - -static int echo_client_setup(const struct lu_env *env, - struct obd_device *obddev, struct lustre_cfg *lcfg) -{ - struct echo_client_obd *ec = &obddev->u.echo_client; - struct obd_device *tgt; - struct obd_uuid echo_uuid = { "ECHO_UUID" }; - struct obd_connect_data *ocd = NULL; - int rc; - - if (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) < 1) { - CERROR("requires a TARGET OBD name\n"); - return -EINVAL; - } - - tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); - if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { - CERROR("device not attached or not set up (%s)\n", - lustre_cfg_string(lcfg, 1)); - return -EINVAL; - } - - spin_lock_init(&ec->ec_lock); - INIT_LIST_HEAD(&ec->ec_objects); - INIT_LIST_HEAD(&ec->ec_locks); - ec->ec_unique = 0; - - ocd = kzalloc(sizeof(*ocd), GFP_NOFS); - if (!ocd) - return -ENOMEM; - - ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | - OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_GRANT | OBD_CONNECT_FULL20 | - OBD_CONNECT_64BITHASH | OBD_CONNECT_LVB_TYPE | - OBD_CONNECT_FID; - ocd->ocd_brw_size = DT_MAX_BRW_SIZE; - ocd->ocd_version = LUSTRE_VERSION_CODE; - ocd->ocd_group = FID_SEQ_ECHO; - - rc = obd_connect(env, &ec->ec_exp, tgt, &echo_uuid, ocd, NULL); - - kfree(ocd); - - if (rc != 0) { - CERROR("fail to connect to device %s\n", - lustre_cfg_string(lcfg, 1)); - return rc; - } - - return rc; -} - -static int echo_client_cleanup(struct obd_device *obddev) -{ - struct echo_client_obd *ec = &obddev->u.echo_client; - int rc; - - if (!list_empty(&obddev->obd_exports)) { - CERROR("still has clients!\n"); - return -EBUSY; - } - - LASSERT(atomic_read(&ec->ec_exp->exp_refcount) > 0); - rc = obd_disconnect(ec->ec_exp); - if (rc != 0) - CERROR("fail to disconnect device: %d\n", rc); - - return rc; -} - -static int echo_client_connect(const struct lu_env *env, - struct obd_export **exp, - struct obd_device *src, struct obd_uuid *cluuid, - struct obd_connect_data *data, void *localdata) -{ - int rc; - struct lustre_handle conn = { 0 }; - - rc = class_connect(&conn, src, cluuid); - if (rc == 0) - *exp = class_conn2export(&conn); - - return rc; -} - -static int echo_client_disconnect(struct obd_export *exp) -{ - int rc; - - if (!exp) { - rc = -EINVAL; - goto out; - } - - rc = class_disconnect(exp); - goto out; - out: - return rc; -} - -static struct obd_ops echo_client_obd_ops = { - .owner = THIS_MODULE, - .iocontrol = echo_client_iocontrol, - .connect = echo_client_connect, - .disconnect = echo_client_disconnect -}; - -static int echo_client_init(void) -{ - int rc; - - rc = lu_kmem_init(echo_caches); - if (rc == 0) { - rc = 
class_register_type(&echo_client_obd_ops, NULL, - LUSTRE_ECHO_CLIENT_NAME, - &echo_device_type); - if (rc) - lu_kmem_fini(echo_caches); - } - return rc; -} - -static void echo_client_exit(void) -{ - class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); - lu_kmem_fini(echo_caches); -} - -static int __init obdecho_init(void) -{ - int rc; - - LCONSOLE_INFO("Echo OBD driver; http://www.lustre.org/\n"); - - LASSERT(PAGE_SIZE % OBD_ECHO_BLOCK_SIZE == 0); - - rc = libcfs_setup(); - if (rc) - return rc; - - return echo_client_init(); -} - -static void /*__exit*/ obdecho_exit(void) -{ - echo_client_exit(); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Echo Client test driver"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(obdecho_init); -module_exit(obdecho_exit); - -/** @} echo_client */ diff --git a/drivers/staging/lustre/lustre/obdecho/echo_internal.h b/drivers/staging/lustre/lustre/obdecho/echo_internal.h deleted file mode 100644 index 42faa164fabbc9edf9aaad5857cfe67872cee03d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/obdecho/echo_internal.h +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Whamcloud, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/obdecho/echo_internal.h - */ - -#ifndef _ECHO_INTERNAL_H -#define _ECHO_INTERNAL_H - -/* The persistent object (i.e. actually stores stuff!) */ -#define ECHO_PERSISTENT_OBJID 1ULL -#define ECHO_PERSISTENT_SIZE ((__u64)(1 << 20)) - -/* block size to use for data verification */ -#define OBD_ECHO_BLOCK_SIZE (4 << 10) - -#endif diff --git a/drivers/staging/lustre/lustre/osc/Makefile b/drivers/staging/lustre/lustre/osc/Makefile deleted file mode 100644 index 30dec90e64e8db70b2f3d667dff47ac263611a15..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += osc.o -osc-y := osc_request.o osc_dev.o osc_object.o \ - osc_page.o osc_lock.o osc_io.o osc_quota.o osc_cache.o lproc_osc.o diff --git a/drivers/staging/lustre/lustre/osc/lproc_osc.c b/drivers/staging/lustre/lustre/osc/lproc_osc.c deleted file mode 100644 index 6a705bc5420cd3c4d87b75bd8b67d304d4e98c45..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/lproc_osc.c +++ /dev/null @@ -1,838 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include "osc_internal.h" - -static ssize_t active_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%d\n", !dev->u.cli.cl_import->imp_deactive); -} - -static ssize_t active_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - if (val > 1) - return -ERANGE; - - /* opposite senses */ - if (dev->u.cli.cl_import->imp_deactive == val) - rc = ptlrpc_set_import_active(dev->u.cli.cl_import, val); - else - CDEBUG(D_CONFIG, "activate %ld: ignoring repeat request\n", - val); - - return count; -} -LUSTRE_RW_ATTR(active); - -static ssize_t max_rpcs_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - - return sprintf(buf, "%u\n", cli->cl_max_rpcs_in_flight); -} - -static ssize_t max_rpcs_in_flight_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int rc; - unsigned long val; - int adding, added, req_count; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val < 1 || val > OSC_MAX_RIF_MAX) - return -ERANGE; - - adding = val - cli->cl_max_rpcs_in_flight; - req_count = atomic_read(&osc_pool_req_count); - if (adding > 0 && req_count < osc_reqpool_maxreqcount) { - /* - * There might be some race which will cause over-limit - * allocation, but it is fine. 
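-		 * For example, with osc_reqpool_maxreqcount = 1000 and
-		 * req_count = 950, a request to add 100 more is clamped
-		 * to 50 by the check below.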
- */ - if (req_count + adding > osc_reqpool_maxreqcount) - adding = osc_reqpool_maxreqcount - req_count; - - added = osc_rq_pool->prp_populate(osc_rq_pool, adding); - atomic_add(added, &osc_pool_req_count); - } - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_max_rpcs_in_flight = val; - client_adjust_max_dirty(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LUSTRE_RW_ATTR(max_rpcs_in_flight); - -static ssize_t max_dirty_mb_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - long val; - int mult; - - spin_lock(&cli->cl_loi_list_lock); - val = cli->cl_dirty_max_pages; - spin_unlock(&cli->cl_loi_list_lock); - - mult = 1 << (20 - PAGE_SHIFT); - return lprocfs_read_frac_helper(buf, PAGE_SIZE, val, mult); -} - -static ssize_t max_dirty_mb_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int rc; - unsigned long pages_number; - - rc = kstrtoul(buffer, 10, &pages_number); - if (rc) - return rc; - - pages_number *= 1 << (20 - PAGE_SHIFT); /* MB -> pages */ - - if (pages_number <= 0 || - pages_number >= OSC_MAX_DIRTY_MB_MAX << (20 - PAGE_SHIFT) || - pages_number > totalram_pages / 4) /* 1/4 of RAM */ - return -ERANGE; - - spin_lock(&cli->cl_loi_list_lock); - cli->cl_dirty_max_pages = pages_number; - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LUSTRE_RW_ATTR(max_dirty_mb); - -static int osc_cached_mb_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *dev = m->private; - struct client_obd *cli = &dev->u.cli; - int shift = 20 - PAGE_SHIFT; - - seq_printf(m, - "used_mb: %ld\n" - "busy_cnt: %ld\n" - "reclaim: %llu\n", - (atomic_long_read(&cli->cl_lru_in_list) + - atomic_long_read(&cli->cl_lru_busy)) >> shift, - atomic_long_read(&cli->cl_lru_busy), - cli->cl_lru_reclaim); - - return 0; -} - -/* shrink the number of caching pages to a specific number */ -static ssize_t osc_cached_mb_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *dev = ((struct seq_file *)file->private_data)->private; - struct client_obd *cli = &dev->u.cli; - long pages_number, rc; - char kernbuf[128]; - int mult; - u64 val; - - if (count >= sizeof(kernbuf)) - return -EINVAL; - - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - kernbuf[count] = 0; - - mult = 1 << (20 - PAGE_SHIFT); - buffer += lprocfs_find_named_value(kernbuf, "used_mb:", &count) - - kernbuf; - rc = lprocfs_write_frac_u64_helper(buffer, count, &val, mult); - if (rc) - return rc; - - if (val > LONG_MAX) - return -ERANGE; - pages_number = (long)val; - - if (pages_number < 0) - return -ERANGE; - - rc = atomic_long_read(&cli->cl_lru_in_list) - pages_number; - if (rc > 0) { - struct lu_env *env; - u16 refcheck; - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - (void)osc_lru_shrink(env, cli, rc, true); - cl_env_put(env, &refcheck); - } - } - - return count; -} - -LPROC_SEQ_FOPS(osc_cached_mb); - -static ssize_t cur_dirty_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int len; - - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_dirty_pages << PAGE_SHIFT); 
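-	/* cl_dirty_pages is sampled under cl_loi_list_lock so the
-	 * reported byte count is a consistent snapshot
-	 */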
- spin_unlock(&cli->cl_loi_list_lock); - - return len; -} -LUSTRE_RO_ATTR(cur_dirty_bytes); - -static ssize_t cur_grant_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int len; - - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_avail_grant); - spin_unlock(&cli->cl_loi_list_lock); - - return len; -} - -static ssize_t cur_grant_bytes_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &obd->u.cli; - int rc; - unsigned long long val; - - rc = kstrtoull(buffer, 10, &val); - if (rc) - return rc; - - /* this is only for shrinking grant */ - spin_lock(&cli->cl_loi_list_lock); - if (val >= cli->cl_avail_grant) { - spin_unlock(&cli->cl_loi_list_lock); - return -EINVAL; - } - spin_unlock(&cli->cl_loi_list_lock); - - if (cli->cl_import->imp_state == LUSTRE_IMP_FULL) - rc = osc_shrink_grant_to_target(cli, val); - if (rc) - return rc; - return count; -} -LUSTRE_RW_ATTR(cur_grant_bytes); - -static ssize_t cur_lost_grant_bytes_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - int len; - - spin_lock(&cli->cl_loi_list_lock); - len = sprintf(buf, "%lu\n", cli->cl_lost_grant); - spin_unlock(&cli->cl_loi_list_lock); - - return len; -} -LUSTRE_RO_ATTR(cur_lost_grant_bytes); - -static ssize_t grant_shrink_interval_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%d\n", obd->u.cli.cl_grant_shrink_interval); -} - -static ssize_t grant_shrink_interval_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - if (val <= 0) - return -ERANGE; - - obd->u.cli.cl_grant_shrink_interval = val; - - return count; -} -LUSTRE_RW_ATTR(grant_shrink_interval); - -static ssize_t checksums_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%d\n", obd->u.cli.cl_checksum ? 1 : 0); -} - -static ssize_t checksums_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - obd->u.cli.cl_checksum = (val ? 
1 : 0); - - return count; -} -LUSTRE_RW_ATTR(checksums); - -static int osc_checksum_type_seq_show(struct seq_file *m, void *v) -{ - struct obd_device *obd = m->private; - int i; - - DECLARE_CKSUM_NAME; - - if (!obd) - return 0; - - for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { - if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) - continue; - if (obd->u.cli.cl_cksum_type == (1 << i)) - seq_printf(m, "[%s] ", cksum_name[i]); - else - seq_printf(m, "%s ", cksum_name[i]); - } - seq_putc(m, '\n'); - return 0; -} - -static ssize_t osc_checksum_type_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - int i; - - DECLARE_CKSUM_NAME; - char kernbuf[10]; - - if (!obd) - return 0; - - if (count > sizeof(kernbuf) - 1) - return -EINVAL; - if (copy_from_user(kernbuf, buffer, count)) - return -EFAULT; - if (count > 0 && kernbuf[count - 1] == '\n') - kernbuf[count - 1] = '\0'; - else - kernbuf[count] = '\0'; - - for (i = 0; i < ARRAY_SIZE(cksum_name); i++) { - if (((1 << i) & obd->u.cli.cl_supp_cksum_types) == 0) - continue; - if (!strcmp(kernbuf, cksum_name[i])) { - obd->u.cli.cl_cksum_type = 1 << i; - return count; - } - } - return -EINVAL; -} - -LPROC_SEQ_FOPS(osc_checksum_type); - -static ssize_t resend_count_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%u\n", atomic_read(&obd->u.cli.cl_resends)); -} - -static ssize_t resend_count_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - int rc; - unsigned long val; - - rc = kstrtoul(buffer, 10, &val); - if (rc) - return rc; - - atomic_set(&obd->u.cli.cl_resends, val); - - return count; -} -LUSTRE_RW_ATTR(resend_count); - -static ssize_t contention_seconds_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - - return sprintf(buf, "%u\n", od->od_contention_time); -} - -static ssize_t contention_seconds_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - int rc; - int val; - - rc = kstrtoint(buffer, 10, &val); - if (rc) - return rc; - - if (val < 0) - return -EINVAL; - - od->od_contention_time = val; - - return count; -} -LUSTRE_RW_ATTR(contention_seconds); - -static ssize_t lockless_truncate_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - - return sprintf(buf, "%u\n", od->od_lockless_truncate); -} - -static ssize_t lockless_truncate_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *obd = container_of(kobj, struct obd_device, - obd_kobj); - struct osc_device *od = obd2osc_dev(obd); - int rc; - unsigned int val; - - rc = kstrtouint(buffer, 10, &val); - if (rc) - return rc; - - od->od_lockless_truncate = val; - - return count; -} -LUSTRE_RW_ATTR(lockless_truncate); - -static ssize_t destroys_in_flight_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device 
*obd = container_of(kobj, struct obd_device, - obd_kobj); - - return sprintf(buf, "%u\n", - atomic_read(&obd->u.cli.cl_destroy_in_flight)); -} -LUSTRE_RO_ATTR(destroys_in_flight); - -static ssize_t max_pages_per_rpc_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - - return sprintf(buf, "%d\n", cli->cl_max_pages_per_rpc); -} - -static ssize_t max_pages_per_rpc_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - struct obd_connect_data *ocd = &cli->cl_import->imp_connect_data; - int chunk_mask, rc; - unsigned long long val; - - rc = kstrtoull(buffer, 10, &val); - if (rc) - return rc; - - /* if the max_pages is specified in bytes, convert to pages */ - if (val >= ONE_MB_BRW_SIZE) - val >>= PAGE_SHIFT; - - chunk_mask = ~((1 << (cli->cl_chunkbits - PAGE_SHIFT)) - 1); - /* max_pages_per_rpc must be chunk aligned */ - val = (val + ~chunk_mask) & chunk_mask; - if (!val || (ocd->ocd_brw_size && - val > ocd->ocd_brw_size >> PAGE_SHIFT)) { - return -ERANGE; - } - spin_lock(&cli->cl_loi_list_lock); - cli->cl_max_pages_per_rpc = val; - client_adjust_max_dirty(cli); - spin_unlock(&cli->cl_loi_list_lock); - - return count; -} -LUSTRE_RW_ATTR(max_pages_per_rpc); - -static ssize_t unstable_stats_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct obd_device *dev = container_of(kobj, struct obd_device, - obd_kobj); - struct client_obd *cli = &dev->u.cli; - long pages; - int mb; - - pages = atomic_long_read(&cli->cl_unstable_count); - mb = (pages * PAGE_SIZE) >> 20; - - return sprintf(buf, "unstable_pages: %20ld\n" - "unstable_mb: %10d\n", pages, mb); -} -LUSTRE_RO_ATTR(unstable_stats); - -LPROC_SEQ_FOPS_RO_TYPE(osc, connect_flags); -LPROC_SEQ_FOPS_RO_TYPE(osc, server_uuid); -LPROC_SEQ_FOPS_RO_TYPE(osc, conn_uuid); -LPROC_SEQ_FOPS_RO_TYPE(osc, timeouts); -LPROC_SEQ_FOPS_RO_TYPE(osc, state); - -LPROC_SEQ_FOPS_WR_ONLY(osc, ping); - -LPROC_SEQ_FOPS_RW_TYPE(osc, import); -LPROC_SEQ_FOPS_RW_TYPE(osc, pinger_recov); - -static struct lprocfs_vars lprocfs_osc_obd_vars[] = { - { "ping", &osc_ping_fops, NULL, 0222 }, - { "connect_flags", &osc_connect_flags_fops, NULL, 0 }, - /*{ "filegroups", lprocfs_rd_filegroups, NULL, 0 },*/ - { "ost_server_uuid", &osc_server_uuid_fops, NULL, 0 }, - { "ost_conn_uuid", &osc_conn_uuid_fops, NULL, 0 }, - { "osc_cached_mb", &osc_cached_mb_fops, NULL }, - { "checksum_type", &osc_checksum_type_fops, NULL }, - { "timeouts", &osc_timeouts_fops, NULL, 0 }, - { "import", &osc_import_fops, NULL }, - { "state", &osc_state_fops, NULL, 0 }, - { "pinger_recov", &osc_pinger_recov_fops, NULL }, - { NULL } -}; - -#define pct(a, b) (b ? 
a * 100 / b : 0) - -static int osc_rpc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - unsigned long read_tot = 0, write_tot = 0, read_cum, write_cum; - int i; - - ktime_get_real_ts64(&now); - - spin_lock(&cli->cl_loi_list_lock); - - seq_printf(seq, "snapshot_time: %llu.%9lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "read RPCs in flight: %d\n", - cli->cl_r_in_flight); - seq_printf(seq, "write RPCs in flight: %d\n", - cli->cl_w_in_flight); - seq_printf(seq, "pending write pages: %d\n", - atomic_read(&cli->cl_pending_w_pages)); - seq_printf(seq, "pending read pages: %d\n", - atomic_read(&cli->cl_pending_r_pages)); - - seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_puts(seq, "pages per rpc rpcs % cum % |"); - seq_puts(seq, " rpcs % cum %\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_page_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_page_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_page_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_page_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - 1 << i, r, pct(r, read_tot), - pct(read_cum, read_tot), w, - pct(w, write_tot), - pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_puts(seq, "rpcs in flight rpcs % cum % |"); - seq_puts(seq, " rpcs % cum %\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_rpc_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_rpc_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_rpc_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_rpc_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - i, r, pct(r, read_tot), - pct(read_cum, read_tot), w, - pct(w, write_tot), - pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - seq_puts(seq, "\n\t\t\tread\t\t\twrite\n"); - seq_puts(seq, "offset rpcs % cum % |"); - seq_puts(seq, " rpcs % cum %\n"); - - read_tot = lprocfs_oh_sum(&cli->cl_read_offset_hist); - write_tot = lprocfs_oh_sum(&cli->cl_write_offset_hist); - - read_cum = 0; - write_cum = 0; - for (i = 0; i < OBD_HIST_MAX; i++) { - unsigned long r = cli->cl_read_offset_hist.oh_buckets[i]; - unsigned long w = cli->cl_write_offset_hist.oh_buckets[i]; - - read_cum += r; - write_cum += w; - seq_printf(seq, "%d:\t\t%10lu %3lu %3lu | %10lu %3lu %3lu\n", - (i == 0) ? 
0 : 1 << (i - 1), - r, pct(r, read_tot), pct(read_cum, read_tot), - w, pct(w, write_tot), pct(write_cum, write_tot)); - if (read_cum == read_tot && write_cum == write_tot) - break; - } - - spin_unlock(&cli->cl_loi_list_lock); - - return 0; -} - -#undef pct - -static ssize_t osc_rpc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - - lprocfs_oh_clear(&cli->cl_read_rpc_hist); - lprocfs_oh_clear(&cli->cl_write_rpc_hist); - lprocfs_oh_clear(&cli->cl_read_page_hist); - lprocfs_oh_clear(&cli->cl_write_page_hist); - lprocfs_oh_clear(&cli->cl_read_offset_hist); - lprocfs_oh_clear(&cli->cl_write_offset_hist); - - return len; -} - -LPROC_SEQ_FOPS(osc_rpc_stats); - -static int osc_stats_seq_show(struct seq_file *seq, void *v) -{ - struct timespec64 now; - struct obd_device *dev = seq->private; - struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; - - ktime_get_real_ts64(&now); - - seq_printf(seq, "snapshot_time: %llu.%9lu (secs.usecs)\n", - (s64)now.tv_sec, (unsigned long)now.tv_nsec); - seq_printf(seq, "lockless_write_bytes\t\t%llu\n", - stats->os_lockless_writes); - seq_printf(seq, "lockless_read_bytes\t\t%llu\n", - stats->os_lockless_reads); - seq_printf(seq, "lockless_truncate\t\t%llu\n", - stats->os_lockless_truncates); - return 0; -} - -static ssize_t osc_stats_seq_write(struct file *file, - const char __user *buf, - size_t len, loff_t *off) -{ - struct seq_file *seq = file->private_data; - struct obd_device *dev = seq->private; - struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; - - memset(stats, 0, sizeof(*stats)); - return len; -} - -LPROC_SEQ_FOPS(osc_stats); - -void lproc_osc_attach_seqstat(struct obd_device *dev) -{ - debugfs_create_file("osc_stats", 0644, dev->obd_debugfs_entry, dev, - &osc_stats_fops); - debugfs_create_file("rpc_stats", 0644, dev->obd_debugfs_entry, dev, - &osc_rpc_stats_fops); -} - -static struct attribute *osc_attrs[] = { - &lustre_attr_active.attr, - &lustre_attr_checksums.attr, - &lustre_attr_contention_seconds.attr, - &lustre_attr_cur_dirty_bytes.attr, - &lustre_attr_cur_grant_bytes.attr, - &lustre_attr_cur_lost_grant_bytes.attr, - &lustre_attr_destroys_in_flight.attr, - &lustre_attr_grant_shrink_interval.attr, - &lustre_attr_lockless_truncate.attr, - &lustre_attr_max_dirty_mb.attr, - &lustre_attr_max_pages_per_rpc.attr, - &lustre_attr_max_rpcs_in_flight.attr, - &lustre_attr_resend_count.attr, - &lustre_attr_unstable_stats.attr, - NULL, -}; - -static const struct attribute_group osc_attr_group = { - .attrs = osc_attrs, -}; - -void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) -{ - lvars->sysfs_vars = &osc_attr_group; - lvars->obd_vars = lprocfs_osc_obd_vars; -} diff --git a/drivers/staging/lustre/lustre/osc/osc_cache.c b/drivers/staging/lustre/lustre/osc/osc_cache.c deleted file mode 100644 index f269830048439c9a4007f7bbfeabcda7e018cefe..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_cache.c +++ /dev/null @@ -1,3306 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - * - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * osc cache management. - * - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include "osc_cl_internal.h" -#include "osc_internal.h" - -static int extent_debug; /* set it to be true for more debug */ - -static void osc_update_pending(struct osc_object *obj, int cmd, int delta); -static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext, - enum osc_extent_state state); -static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, - struct osc_async_page *oap, int sent, int rc); -static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, - int cmd); -static int osc_refresh_count(const struct lu_env *env, - struct osc_async_page *oap, int cmd); -static int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, struct osc_object *osc); -static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, - unsigned int lost_grant); - -static void osc_extent_tree_dump0(int level, struct osc_object *obj, - const char *func, int line); -#define osc_extent_tree_dump(lvl, obj) \ - osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) - -/** \addtogroup osc - * @{ - */ - -/* ------------------ osc extent ------------------ */ -static inline char *ext_flags(struct osc_extent *ext, char *flags) -{ - char *buf = flags; - *buf++ = ext->oe_rw ? 'r' : 'w'; - if (ext->oe_intree) - *buf++ = 'i'; - if (ext->oe_sync) - *buf++ = 'S'; - if (ext->oe_srvlock) - *buf++ = 's'; - if (ext->oe_hp) - *buf++ = 'h'; - if (ext->oe_urgent) - *buf++ = 'u'; - if (ext->oe_memalloc) - *buf++ = 'm'; - if (ext->oe_trunc_pending) - *buf++ = 't'; - if (ext->oe_fsync_wait) - *buf++ = 'Y'; - *buf = 0; - return flags; -} - -static inline char list_empty_marker(struct list_head *list) -{ - return list_empty(list) ? '-' : '+'; -} - -#define EXTSTR "[%lu -> %lu/%lu]" -#define EXTPARA(ext) (ext)->oe_start, (ext)->oe_end, (ext)->oe_max_end -static const char *oes_strings[] = { - "inv", "active", "cache", "locking", "lockdone", "rpc", "trunc", NULL }; - -#define OSC_EXTENT_DUMP(lvl, extent, fmt, ...) do { \ - struct osc_extent *__ext = (extent); \ - char __buf[16]; \ - \ - CDEBUG(lvl, \ - "extent %p@{" EXTSTR ", " \ - "[%d|%d|%c|%s|%s|%p], [%d|%d|%c|%c|%p|%u|%p]} " fmt, \ - /* ----- extent part 0 ----- */ \ - __ext, EXTPARA(__ext), \ - /* ----- part 1 ----- */ \ - atomic_read(&__ext->oe_refc), \ - atomic_read(&__ext->oe_users), \ - list_empty_marker(&__ext->oe_link), \ - oes_strings[__ext->oe_state], ext_flags(__ext, __buf), \ - __ext->oe_obj, \ - /* ----- part 2 ----- */ \ - __ext->oe_grants, __ext->oe_nr_pages, \ - list_empty_marker(&__ext->oe_pages), \ - waitqueue_active(&__ext->oe_waitq) ? 
'+' : '-', \ - __ext->oe_dlmlock, __ext->oe_mppr, __ext->oe_owner, \ - /* ----- part 4 ----- */ \ - ## __VA_ARGS__); \ - if (lvl == D_ERROR && __ext->oe_dlmlock) \ - LDLM_ERROR(__ext->oe_dlmlock, "extent: %p", __ext); \ - else \ - LDLM_DEBUG(__ext->oe_dlmlock, "extent: %p", __ext); \ -} while (0) - -#undef EASSERTF -#define EASSERTF(expr, ext, fmt, args...) do { \ - if (!(expr)) { \ - OSC_EXTENT_DUMP(D_ERROR, (ext), fmt, ##args); \ - osc_extent_tree_dump(D_ERROR, (ext)->oe_obj); \ - LASSERT(expr); \ - } \ -} while (0) - -#undef EASSERT -#define EASSERT(expr, ext) EASSERTF(expr, ext, "\n") - -static inline struct osc_extent *rb_extent(struct rb_node *n) -{ - return rb_entry_safe(n, struct osc_extent, oe_node); -} - -static inline struct osc_extent *next_extent(struct osc_extent *ext) -{ - if (!ext) - return NULL; - - LASSERT(ext->oe_intree); - return rb_extent(rb_next(&ext->oe_node)); -} - -static inline struct osc_extent *prev_extent(struct osc_extent *ext) -{ - if (!ext) - return NULL; - - LASSERT(ext->oe_intree); - return rb_extent(rb_prev(&ext->oe_node)); -} - -static inline struct osc_extent *first_extent(struct osc_object *obj) -{ - return rb_extent(rb_first(&obj->oo_root)); -} - -/* object must be locked by caller. */ -static int osc_extent_sanity_check0(struct osc_extent *ext, - const char *func, const int line) -{ - struct osc_object *obj = ext->oe_obj; - struct osc_async_page *oap; - size_t page_count; - int rc = 0; - - if (!osc_object_is_locked(obj)) { - rc = 9; - goto out; - } - - if (ext->oe_state >= OES_STATE_MAX) { - rc = 10; - goto out; - } - - if (atomic_read(&ext->oe_refc) <= 0) { - rc = 20; - goto out; - } - - if (atomic_read(&ext->oe_refc) < atomic_read(&ext->oe_users)) { - rc = 30; - goto out; - } - - switch (ext->oe_state) { - case OES_INV: - if (ext->oe_nr_pages > 0 || !list_empty(&ext->oe_pages)) - rc = 35; - else - rc = 0; - goto out; - case OES_ACTIVE: - if (atomic_read(&ext->oe_users) == 0) { - rc = 40; - goto out; - } - if (ext->oe_hp) { - rc = 50; - goto out; - } - if (ext->oe_fsync_wait && !ext->oe_urgent) { - rc = 55; - goto out; - } - break; - case OES_CACHE: - if (ext->oe_grants == 0) { - rc = 60; - goto out; - } - if (ext->oe_fsync_wait && !ext->oe_urgent && !ext->oe_hp) { - rc = 65; - goto out; - } - /* fall through */ - default: - if (atomic_read(&ext->oe_users) > 0) { - rc = 70; - goto out; - } - } - - if (ext->oe_max_end < ext->oe_end || ext->oe_end < ext->oe_start) { - rc = 80; - goto out; - } - - if (ext->oe_sync && ext->oe_grants > 0) { - rc = 90; - goto out; - } - - if (ext->oe_dlmlock && !ldlm_is_failed(ext->oe_dlmlock)) { - struct ldlm_extent *extent; - - extent = &ext->oe_dlmlock->l_policy_data.l_extent; - if (!(extent->start <= cl_offset(osc2cl(obj), ext->oe_start) && - extent->end >= cl_offset(osc2cl(obj), ext->oe_max_end))) { - rc = 100; - goto out; - } - - if (!(ext->oe_dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP))) { - rc = 102; - goto out; - } - } - - if (ext->oe_nr_pages > ext->oe_mppr) { - rc = 105; - goto out; - } - - /* Do not verify page list if extent is in RPC. This is because an - * in-RPC extent is supposed to be exclusively accessible w/o lock. 
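-	 * The owner of an in-RPC extent may touch its page list without
-	 * taking the object lock, so walking the list here could race.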
- */ - if (ext->oe_state > OES_CACHE) { - rc = 0; - goto out; - } - - if (!extent_debug) { - rc = 0; - goto out; - } - - page_count = 0; - list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - pgoff_t index = osc_index(oap2osc(oap)); - ++page_count; - if (index > ext->oe_end || index < ext->oe_start) { - rc = 110; - goto out; - } - } - if (page_count != ext->oe_nr_pages) { - rc = 120; - goto out; - } - -out: - if (rc != 0) - OSC_EXTENT_DUMP(D_ERROR, ext, - "%s:%d sanity check %p failed with rc = %d\n", - func, line, ext, rc); - return rc; -} - -#define sanity_check_nolock(ext) \ - osc_extent_sanity_check0(ext, __func__, __LINE__) - -#define sanity_check(ext) ({ \ - int __res; \ - osc_object_lock((ext)->oe_obj); \ - __res = sanity_check_nolock(ext); \ - osc_object_unlock((ext)->oe_obj); \ - __res; \ -}) - -/** - * sanity check - to make sure there is no overlapped extent in the tree. - */ -static int osc_extent_is_overlapped(struct osc_object *obj, - struct osc_extent *ext) -{ - struct osc_extent *tmp; - - LASSERT(osc_object_is_locked(obj)); - - if (!extent_debug) - return 0; - - for (tmp = first_extent(obj); tmp; tmp = next_extent(tmp)) { - if (tmp == ext) - continue; - if (tmp->oe_end >= ext->oe_start && - tmp->oe_start <= ext->oe_end) - return 1; - } - return 0; -} - -static void osc_extent_state_set(struct osc_extent *ext, int state) -{ - LASSERT(osc_object_is_locked(ext->oe_obj)); - LASSERT(state >= OES_INV && state < OES_STATE_MAX); - - /* Never try to sanity check a state changing extent :-) */ - /* LASSERT(sanity_check_nolock(ext) == 0); */ - - /* TODO: validate the state machine */ - ext->oe_state = state; - wake_up_all(&ext->oe_waitq); -} - -static struct osc_extent *osc_extent_alloc(struct osc_object *obj) -{ - struct osc_extent *ext; - - ext = kmem_cache_zalloc(osc_extent_kmem, GFP_NOFS); - if (!ext) - return NULL; - - RB_CLEAR_NODE(&ext->oe_node); - ext->oe_obj = obj; - cl_object_get(osc2cl(obj)); - atomic_set(&ext->oe_refc, 1); - atomic_set(&ext->oe_users, 0); - INIT_LIST_HEAD(&ext->oe_link); - ext->oe_state = OES_INV; - INIT_LIST_HEAD(&ext->oe_pages); - init_waitqueue_head(&ext->oe_waitq); - ext->oe_dlmlock = NULL; - - return ext; -} - -static void osc_extent_free(struct osc_extent *ext) -{ - kmem_cache_free(osc_extent_kmem, ext); -} - -static struct osc_extent *osc_extent_get(struct osc_extent *ext) -{ - LASSERT(atomic_read(&ext->oe_refc) >= 0); - atomic_inc(&ext->oe_refc); - return ext; -} - -static void osc_extent_put(const struct lu_env *env, struct osc_extent *ext) -{ - LASSERT(atomic_read(&ext->oe_refc) > 0); - if (atomic_dec_and_test(&ext->oe_refc)) { - LASSERT(list_empty(&ext->oe_link)); - LASSERT(atomic_read(&ext->oe_users) == 0); - LASSERT(ext->oe_state == OES_INV); - LASSERT(!ext->oe_intree); - - if (ext->oe_dlmlock) { - lu_ref_add(&ext->oe_dlmlock->l_reference, - "osc_extent", ext); - LDLM_LOCK_PUT(ext->oe_dlmlock); - ext->oe_dlmlock = NULL; - } - cl_object_put(env, osc2cl(ext->oe_obj)); - osc_extent_free(ext); - } -} - -/** - * osc_extent_put_trust() is a special version of osc_extent_put() when - * it's known that the caller is not the last user. This is to address the - * problem of lacking of lu_env ;-). - */ -static void osc_extent_put_trust(struct osc_extent *ext) -{ - LASSERT(atomic_read(&ext->oe_refc) > 1); - LASSERT(osc_object_is_locked(ext->oe_obj)); - atomic_dec(&ext->oe_refc); -} - -/** - * Return the extent which includes pgoff @index, or return the greatest - * previous extent in the tree. 
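- * For example, with extents [0, 15] and [32, 47] in the tree, a search
- * for index 40 returns [32, 47], while a search for index 20 falls in
- * the gap and returns the preceding [0, 15].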
- */ -static struct osc_extent *osc_extent_search(struct osc_object *obj, - pgoff_t index) -{ - struct rb_node *n = obj->oo_root.rb_node; - struct osc_extent *tmp, *p = NULL; - - LASSERT(osc_object_is_locked(obj)); - while (n) { - tmp = rb_extent(n); - if (index < tmp->oe_start) { - n = n->rb_left; - } else if (index > tmp->oe_end) { - p = rb_extent(n); - n = n->rb_right; - } else { - return tmp; - } - } - return p; -} - -/* - * Return the extent covering @index, otherwise return NULL. - * caller must have held object lock. - */ -static struct osc_extent *osc_extent_lookup(struct osc_object *obj, - pgoff_t index) -{ - struct osc_extent *ext; - - ext = osc_extent_search(obj, index); - if (ext && ext->oe_start <= index && index <= ext->oe_end) - return osc_extent_get(ext); - return NULL; -} - -/* caller must have held object lock. */ -static void osc_extent_insert(struct osc_object *obj, struct osc_extent *ext) -{ - struct rb_node **n = &obj->oo_root.rb_node; - struct rb_node *parent = NULL; - struct osc_extent *tmp; - - LASSERT(ext->oe_intree == 0); - LASSERT(ext->oe_obj == obj); - LASSERT(osc_object_is_locked(obj)); - while (*n) { - tmp = rb_extent(*n); - parent = *n; - - if (ext->oe_end < tmp->oe_start) - n = &(*n)->rb_left; - else if (ext->oe_start > tmp->oe_end) - n = &(*n)->rb_right; - else - EASSERTF(0, tmp, EXTSTR "\n", EXTPARA(ext)); - } - rb_link_node(&ext->oe_node, parent, n); - rb_insert_color(&ext->oe_node, &obj->oo_root); - osc_extent_get(ext); - ext->oe_intree = 1; -} - -/* caller must have held object lock. */ -static void osc_extent_erase(struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - LASSERT(osc_object_is_locked(obj)); - if (ext->oe_intree) { - rb_erase(&ext->oe_node, &obj->oo_root); - ext->oe_intree = 0; - /* rbtree held a refcount */ - osc_extent_put_trust(ext); - } -} - -static struct osc_extent *osc_extent_hold(struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - LASSERT(osc_object_is_locked(obj)); - LASSERT(ext->oe_state == OES_ACTIVE || ext->oe_state == OES_CACHE); - if (ext->oe_state == OES_CACHE) { - osc_extent_state_set(ext, OES_ACTIVE); - osc_update_pending(obj, OBD_BRW_WRITE, -ext->oe_nr_pages); - } - atomic_inc(&ext->oe_users); - list_del_init(&ext->oe_link); - return osc_extent_get(ext); -} - -static void __osc_extent_remove(struct osc_extent *ext) -{ - LASSERT(osc_object_is_locked(ext->oe_obj)); - LASSERT(list_empty(&ext->oe_pages)); - osc_extent_erase(ext); - list_del_init(&ext->oe_link); - osc_extent_state_set(ext, OES_INV); - OSC_EXTENT_DUMP(D_CACHE, ext, "destroyed.\n"); -} - -static void osc_extent_remove(struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - osc_object_lock(obj); - __osc_extent_remove(ext); - osc_object_unlock(obj); -} - -/** - * This function is used to merge extents to get better performance. It checks - * if @cur and @victim are contiguous at chunk level. 
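- * With 64 KiB chunks on 4 KiB pages (ppc_bits = 4), an extent ending in
- * chunk 7 may merge with one starting in chunk 8, provided both share
- * the same DLM lock and the same RPC slot (oe_max_end).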
- */ -static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, - struct osc_extent *victim) -{ - struct osc_object *obj = cur->oe_obj; - pgoff_t chunk_start; - pgoff_t chunk_end; - int ppc_bits; - - LASSERT(cur->oe_state == OES_CACHE); - LASSERT(osc_object_is_locked(obj)); - if (!victim) - return -EINVAL; - - if (victim->oe_state != OES_CACHE || victim->oe_fsync_wait) - return -EBUSY; - - if (cur->oe_max_end != victim->oe_max_end) - return -ERANGE; - - LASSERT(cur->oe_dlmlock == victim->oe_dlmlock); - ppc_bits = osc_cli(obj)->cl_chunkbits - PAGE_SHIFT; - chunk_start = cur->oe_start >> ppc_bits; - chunk_end = cur->oe_end >> ppc_bits; - if (chunk_start != (victim->oe_end >> ppc_bits) + 1 && - chunk_end + 1 != victim->oe_start >> ppc_bits) - return -ERANGE; - - OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); - - cur->oe_start = min(cur->oe_start, victim->oe_start); - cur->oe_end = max(cur->oe_end, victim->oe_end); - cur->oe_grants += victim->oe_grants; - cur->oe_nr_pages += victim->oe_nr_pages; - /* only the following bits are needed to merge */ - cur->oe_urgent |= victim->oe_urgent; - cur->oe_memalloc |= victim->oe_memalloc; - list_splice_init(&victim->oe_pages, &cur->oe_pages); - list_del_init(&victim->oe_link); - victim->oe_nr_pages = 0; - - osc_extent_get(victim); - __osc_extent_remove(victim); - osc_extent_put(env, victim); - - OSC_EXTENT_DUMP(D_CACHE, cur, "after merging %p.\n", victim); - return 0; -} - -/** - * Drop user count of osc_extent, and unplug IO asynchronously. - */ -void osc_extent_release(const struct lu_env *env, struct osc_extent *ext) -{ - struct osc_object *obj = ext->oe_obj; - - LASSERT(atomic_read(&ext->oe_users) > 0); - LASSERT(sanity_check(ext) == 0); - LASSERT(ext->oe_grants > 0); - - if (atomic_dec_and_lock(&ext->oe_users, &obj->oo_lock)) { - LASSERT(ext->oe_state == OES_ACTIVE); - if (ext->oe_trunc_pending) { - /* a truncate process is waiting for this extent. - * This may happen due to a race, check - * osc_cache_truncate_start(). - */ - osc_extent_state_set(ext, OES_TRUNC); - ext->oe_trunc_pending = 0; - } else { - osc_extent_state_set(ext, OES_CACHE); - osc_update_pending(obj, OBD_BRW_WRITE, - ext->oe_nr_pages); - - /* try to merge the previous and next extent. */ - osc_extent_merge(env, ext, prev_extent(ext)); - osc_extent_merge(env, ext, next_extent(ext)); - - if (ext->oe_urgent) - list_move_tail(&ext->oe_link, - &obj->oo_urgent_exts); - } - osc_object_unlock(obj); - - osc_io_unplug_async(env, osc_cli(obj), obj); - } - osc_extent_put(env, ext); -} - -static inline int overlapped(struct osc_extent *ex1, struct osc_extent *ex2) -{ - return !(ex1->oe_end < ex2->oe_start || ex2->oe_end < ex1->oe_start); -} - -/** - * Find or create an extent which includes @index, core function to manage - * extent tree. 
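- * The lookup has three outcomes: an existing extent is reused (possibly
- * grown by a front or rear merge), a busy conflicting extent makes the
- * caller wait and restart, or a fresh extent is inserted into the tree.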
- */ -static struct osc_extent *osc_extent_find(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - unsigned int *grants) -{ - struct client_obd *cli = osc_cli(obj); - struct osc_lock *olck; - struct cl_lock_descr *descr; - struct osc_extent *cur; - struct osc_extent *ext; - struct osc_extent *conflict = NULL; - struct osc_extent *found = NULL; - pgoff_t chunk; - pgoff_t max_end; - unsigned int max_pages; /* max_pages_per_rpc */ - unsigned int chunksize; - int ppc_bits; /* pages per chunk bits */ - pgoff_t chunk_mask; - int rc; - - cur = osc_extent_alloc(obj); - if (!cur) - return ERR_PTR(-ENOMEM); - - olck = osc_env_io(env)->oi_write_osclock; - LASSERTF(olck, "page %lu is not covered by lock\n", index); - LASSERT(olck->ols_state == OLS_GRANTED); - - descr = &olck->ols_cl.cls_lock->cll_descr; - LASSERT(descr->cld_mode >= CLM_WRITE); - - LASSERT(cli->cl_chunkbits >= PAGE_SHIFT); - ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; - chunk_mask = ~((1 << ppc_bits) - 1); - chunksize = 1 << cli->cl_chunkbits; - chunk = index >> ppc_bits; - - /* align end to rpc edge, rpc size may not be a power 2 integer. */ - max_pages = cli->cl_max_pages_per_rpc; - LASSERT((max_pages & ~chunk_mask) == 0); - max_end = index - (index % max_pages) + max_pages - 1; - max_end = min_t(pgoff_t, max_end, descr->cld_end); - - /* initialize new extent by parameters so far */ - cur->oe_max_end = max_end; - cur->oe_start = index & chunk_mask; - cur->oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1; - if (cur->oe_start < descr->cld_start) - cur->oe_start = descr->cld_start; - if (cur->oe_end > max_end) - cur->oe_end = max_end; - cur->oe_grants = 0; - cur->oe_mppr = max_pages; - if (olck->ols_dlmlock) { - LASSERT(olck->ols_hold); - cur->oe_dlmlock = LDLM_LOCK_GET(olck->ols_dlmlock); - lu_ref_add(&olck->ols_dlmlock->l_reference, "osc_extent", cur); - } - - /* grants has been allocated by caller */ - LASSERTF(*grants >= chunksize + cli->cl_extent_tax, - "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax); - LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR "\n", - EXTPARA(cur)); - -restart: - osc_object_lock(obj); - ext = osc_extent_search(obj, cur->oe_start); - if (!ext) - ext = first_extent(obj); - while (ext) { - pgoff_t ext_chk_start = ext->oe_start >> ppc_bits; - pgoff_t ext_chk_end = ext->oe_end >> ppc_bits; - - LASSERT(sanity_check_nolock(ext) == 0); - if (chunk > ext_chk_end + 1) - break; - - /* if covering by different locks, no chance to match */ - if (olck->ols_dlmlock != ext->oe_dlmlock) { - EASSERTF(!overlapped(ext, cur), ext, - EXTSTR "\n", EXTPARA(cur)); - - ext = next_extent(ext); - continue; - } - - /* discontiguous chunks? */ - if (chunk + 1 < ext_chk_start) { - ext = next_extent(ext); - continue; - } - - /* ok, from now on, ext and cur have these attrs: - * 1. covered by the same lock - * 2. contiguous at chunk level or overlapping. - */ - - if (overlapped(ext, cur)) { - /* cur is the minimum unit, so overlapping means - * full contain. - */ - EASSERTF((ext->oe_start <= cur->oe_start && - ext->oe_end >= cur->oe_end), - ext, EXTSTR "\n", EXTPARA(cur)); - - if (ext->oe_state > OES_CACHE || ext->oe_fsync_wait) { - /* for simplicity, we wait for this extent to - * finish before going forward. 
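-				 * Once it drains to OES_INV we jump back
-				 * to the restart label and search again.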
-				conflict = osc_extent_get(ext);
-				break;
-			}
-
-			found = osc_extent_hold(ext);
-			break;
-		}
-
-		/* non-overlapped extent */
-		if (ext->oe_state != OES_CACHE || ext->oe_fsync_wait) {
-			/* we can't do anything for a non OES_CACHE extent, or
-			 * if there is someone waiting for this extent to be
-			 * flushed, try the next one.
-			 */
-			ext = next_extent(ext);
-			continue;
-		}
-
-		/* check if they belong to the same RPC slot before trying to
-		 * merge. The extents are non-overlapping and contiguous at
-		 * chunk level to get here.
-		 */
-		if (ext->oe_max_end != max_end) {
-			/* if they don't belong to the same RPC slot or
-			 * max_pages_per_rpc has ever changed, do not merge.
-			 */
-			ext = next_extent(ext);
-			continue;
-		}
-
-		/* it's required that an extent must be contiguous at chunk
-		 * level so that we know the whole extent is covered by grant
-		 * (the pages in the extent are NOT required to be contiguous).
-		 * Otherwise, it would be too difficult to know which
-		 * chunks have grants allocated.
-		 */
-
-		/* try to do front merge - extend ext's start */
-		if (chunk + 1 == ext_chk_start) {
-			/* ext must be chunk size aligned */
-			EASSERT((ext->oe_start & ~chunk_mask) == 0, ext);
-
-			/* pull ext's start back to cover cur */
-			ext->oe_start = cur->oe_start;
-			ext->oe_grants += chunksize;
-			LASSERT(*grants >= chunksize);
-			*grants -= chunksize;
-
-			found = osc_extent_hold(ext);
-		} else if (chunk == ext_chk_end + 1) {
-			/* rear merge */
-			ext->oe_end = cur->oe_end;
-			ext->oe_grants += chunksize;
-			LASSERT(*grants >= chunksize);
-			*grants -= chunksize;
-
-			/* try to merge with the next one because we just
-			 * filled in a gap
-			 */
-			if (osc_extent_merge(env, ext, next_extent(ext)) == 0)
-				/* we can save extent tax from the next extent */
-				*grants += cli->cl_extent_tax;
-
-			found = osc_extent_hold(ext);
-		}
-		if (found)
-			break;
-
-		ext = next_extent(ext);
-	}
-
-	osc_extent_tree_dump(D_CACHE, obj);
-	if (found) {
-		LASSERT(!conflict);
-		if (!IS_ERR(found)) {
-			LASSERT(found->oe_dlmlock == cur->oe_dlmlock);
-			OSC_EXTENT_DUMP(D_CACHE, found,
-					"found caching ext for %lu.\n", index);
-		}
-	} else if (!conflict) {
-		/* create a new extent */
-		EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur);
-		cur->oe_grants = chunksize + cli->cl_extent_tax;
-		LASSERT(*grants >= cur->oe_grants);
-		*grants -= cur->oe_grants;
-
-		cur->oe_state = OES_CACHE;
-		found = osc_extent_hold(cur);
-		osc_extent_insert(obj, cur);
-		OSC_EXTENT_DUMP(D_CACHE, cur, "add into tree %lu/%lu.\n",
-				index, descr->cld_end);
-	}
-	osc_object_unlock(obj);
-
-	if (conflict) {
-		LASSERT(!found);
-
-		/* waiting for IO to finish. Please notice that it's impossible
-		 * to be an OES_TRUNC extent.
-		 */
-		rc = osc_extent_wait(env, conflict, OES_INV);
-		osc_extent_put(env, conflict);
-		conflict = NULL;
-		if (rc < 0) {
-			found = ERR_PTR(rc);
-			goto out;
-		}
-
-		goto restart;
-	}
-
-out:
-	osc_extent_put(env, cur);
-	return found;
-}
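The chunk and RPC-slot alignment at the top of osc_extent_find() is easier to
follow with concrete numbers. Below is a minimal userspace sketch of that
arithmetic only; the index, ppc_bits and max_pages values are assumptions
chosen for illustration, not values taken from the code above.

    #include <stdio.h>

    int main(void)
    {
        unsigned long index = 37;       /* page index being written (assumed) */
        int ppc_bits = 4;               /* 16 pages per chunk (assumed) */
        unsigned long chunk_mask = ~((1UL << ppc_bits) - 1);
        unsigned long max_pages = 256;  /* cl_max_pages_per_rpc (assumed) */

        /* a fresh extent starts out covering exactly one chunk ... */
        unsigned long oe_start = index & chunk_mask;
        unsigned long oe_end = ((index + ~chunk_mask + 1) & chunk_mask) - 1;
        /* ... and is never allowed to grow past the end of its RPC slot */
        unsigned long max_end = index - (index % max_pages) + max_pages - 1;

        /* prints: index 37 -> extent [32, 47], rpc slot end 255 */
        printf("index %lu -> extent [%lu, %lu], rpc slot end %lu\n",
               index, oe_start, oe_end, max_end);
        return 0;
    }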
-/**
- * Called when IO is finished to an extent.
- */
-int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext,
-		      int sent, int rc)
-{
-	struct client_obd *cli = osc_cli(ext->oe_obj);
-	struct osc_async_page *oap;
-	struct osc_async_page *tmp;
-	int nr_pages = ext->oe_nr_pages;
-	int lost_grant = 0;
-	int blocksize = cli->cl_import->imp_obd->obd_osfs.os_bsize ?: 4096;
-	__u64 last_off = 0;
-	int last_count = -1;
-
-	OSC_EXTENT_DUMP(D_CACHE, ext, "extent finished.\n");
-
-	ext->oe_rc = rc ?: ext->oe_nr_pages;
-	EASSERT(ergo(rc == 0, ext->oe_state == OES_RPC), ext);
-
-	osc_lru_add_batch(cli, &ext->oe_pages);
-	list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) {
-		list_del_init(&oap->oap_rpc_item);
-		list_del_init(&oap->oap_pending_item);
-		if (last_off <= oap->oap_obj_off) {
-			last_off = oap->oap_obj_off;
-			last_count = oap->oap_count;
-		}
-
-		--ext->oe_nr_pages;
-		osc_ap_completion(env, cli, oap, sent, rc);
-	}
-	EASSERT(ext->oe_nr_pages == 0, ext);
-
-	if (!sent) {
-		lost_grant = ext->oe_grants;
-	} else if (blocksize < PAGE_SIZE &&
-		   last_count != PAGE_SIZE) {
-		/* For short writes we shouldn't count parts of pages that
-		 * span a whole chunk on the OST side, or our accounting goes
-		 * wrong. Should match the code in filter_grant_check.
-		 */
-		int offset = last_off & ~PAGE_MASK;
-		int count = last_count + (offset & (blocksize - 1));
-		int end = (offset + last_count) & (blocksize - 1);
-
-		if (end)
-			count += blocksize - end;
-
-		lost_grant = PAGE_SIZE - count;
-	}
-	if (ext->oe_grants > 0)
-		osc_free_grant(cli, nr_pages, lost_grant);
-
-	osc_extent_remove(ext);
-	/* put the refcount for RPC */
-	osc_extent_put(env, ext);
-	return 0;
-}
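The short-write grant accounting in osc_extent_finish() is worth a worked
example. The standalone sketch below mirrors only the lost_grant arithmetic;
the 4KB page and 512-byte OST block size are illustrative assumptions.

    #include <stdio.h>

    #define DEMO_PAGE_SIZE 4096             /* assumed client page size */

    /* mirrors the lost_grant computation above for a sent partial page */
    static int lost_grant_short_write(unsigned int page_offset, int last_count,
                                      int blocksize)
    {
        int count = last_count + (page_offset & (blocksize - 1));
        int end = (page_offset + last_count) & (blocksize - 1);

        if (end)
            count += blocksize - end;       /* round up to a full block */

        return DEMO_PAGE_SIZE - count;      /* grant the OST will not charge */
    }

    int main(void)
    {
        /* a 300-byte write at page offset 100 touches one 512-byte block,
         * so 4096 - 512 = 3584 bytes of the page's grant come back */
        printf("%d\n", lost_grant_short_write(100, 300, 512));
        return 0;
    }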
-static int extent_wait_cb(struct osc_extent *ext, enum osc_extent_state state)
-{
-	int ret;
-
-	osc_object_lock(ext->oe_obj);
-	ret = ext->oe_state == state;
-	osc_object_unlock(ext->oe_obj);
-
-	return ret;
-}
-
-/**
- * Wait for the extent's state to become @state.
- */
-static int osc_extent_wait(const struct lu_env *env, struct osc_extent *ext,
-			   enum osc_extent_state state)
-{
-	struct osc_object *obj = ext->oe_obj;
-	int rc = 0;
-
-	osc_object_lock(obj);
-	LASSERT(sanity_check_nolock(ext) == 0);
-	/* `Kick' this extent only if the caller is waiting for it to be
-	 * written out.
-	 */
-	if (state == OES_INV && !ext->oe_urgent && !ext->oe_hp &&
-	    !ext->oe_trunc_pending) {
-		if (ext->oe_state == OES_ACTIVE) {
-			ext->oe_urgent = 1;
-		} else if (ext->oe_state == OES_CACHE) {
-			ext->oe_urgent = 1;
-			osc_extent_hold(ext);
-			rc = 1;
-		}
-	}
-	osc_object_unlock(obj);
-	if (rc == 1)
-		osc_extent_release(env, ext);
-
-	/* wait for the extent until its state becomes @state */
-	rc = wait_event_idle_timeout(ext->oe_waitq,
-				     extent_wait_cb(ext, state), 600 * HZ);
-	if (rc == 0) {
-		OSC_EXTENT_DUMP(D_ERROR, ext,
-				"%s: wait ext to %u timed out, recovery in progress?\n",
-				cli_name(osc_cli(obj)), state);
-
-		wait_event_idle(ext->oe_waitq, extent_wait_cb(ext, state));
-	}
-	if (ext->oe_rc < 0)
-		rc = ext->oe_rc;
-	else
-		rc = 0;
-	return rc;
-}
-
-/**
- * Discard pages with index greater than @trunc_index. If @ext overlaps
- * @trunc_index, then a partial truncate happens.
- */
-static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index,
-			       bool partial)
-{
-	struct lu_env *env;
-	struct cl_io *io;
-	struct osc_object *obj = ext->oe_obj;
-	struct client_obd *cli = osc_cli(obj);
-	struct osc_async_page *oap;
-	struct osc_async_page *tmp;
-	int pages_in_chunk = 0;
-	int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT;
-	__u64 trunc_chunk = trunc_index >> ppc_bits;
-	int grants = 0;
-	int nr_pages = 0;
-	int rc = 0;
-	u16 refcheck;
-
-	LASSERT(sanity_check(ext) == 0);
-	EASSERT(ext->oe_state == OES_TRUNC, ext);
-	EASSERT(!ext->oe_urgent, ext);
-
-	/* Request a new lu_env.
-	 * We can't use the env from osc_cache_truncate_start() because
-	 * it's from lov_io_sub and not fully initialized.
-	 */
-	env = cl_env_get(&refcheck);
-	io = &osc_env_info(env)->oti_io;
-	io->ci_obj = cl_object_top(osc2cl(obj));
-	io->ci_ignore_layout = 1;
-	rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
-	if (rc < 0)
-		goto out;
-
-	/* discard all pages with index greater than trunc_index */
-	list_for_each_entry_safe(oap, tmp, &ext->oe_pages, oap_pending_item) {
-		pgoff_t index = osc_index(oap2osc(oap));
-		struct cl_page *page = oap2cl_page(oap);
-
-		LASSERT(list_empty(&oap->oap_rpc_item));
-
-		/* only discard the pages with their index greater than
-		 * trunc_index, and ...
-		 */
-		if (index < trunc_index ||
-		    (index == trunc_index && partial)) {
-			/* account how many pages remain in the chunk
-			 * so that we can calculate grants correctly.
-			 */
-			if (index >> ppc_bits == trunc_chunk)
-				++pages_in_chunk;
-			continue;
-		}
-
-		list_del_init(&oap->oap_pending_item);
-
-		cl_page_get(page);
-		lu_ref_add(&page->cp_reference, "truncate", current);
-
-		if (cl_page_own(env, io, page) == 0) {
-			cl_page_discard(env, io, page);
-			cl_page_disown(env, io, page);
-		} else {
-			LASSERT(page->cp_state == CPS_FREEING);
-			LASSERT(0);
-		}
-
-		lu_ref_del(&page->cp_reference, "truncate", current);
-		cl_page_put(env, page);
-
-		--ext->oe_nr_pages;
-		++nr_pages;
-	}
-	EASSERTF(ergo(ext->oe_start >= trunc_index + !!partial,
-		      ext->oe_nr_pages == 0),
-		 ext, "trunc_index %lu, partial %d\n", trunc_index, partial);
-
-	osc_object_lock(obj);
-	if (ext->oe_nr_pages == 0) {
-		LASSERT(pages_in_chunk == 0);
-		grants = ext->oe_grants;
-		ext->oe_grants = 0;
-	} else { /* calculate how many grants we can free */
-		int chunks = (ext->oe_end >> ppc_bits) - trunc_chunk;
-		pgoff_t last_index;
-
-		/* if there are no pages in this chunk, we can also free grants
-		 * for the last chunk
-		 */
-		if (pages_in_chunk == 0) {
-			/* if this is the 1st chunk and no pages in this chunk,
-			 * ext->oe_nr_pages must be zero, so we should be in
-			 * the other if-clause.
-			 */
-			LASSERT(trunc_chunk > 0);
-			--trunc_chunk;
-			++chunks;
-		}
-
-		/* this is what we can free from this extent */
-		grants = chunks << cli->cl_chunkbits;
-		ext->oe_grants -= grants;
-		last_index = ((trunc_chunk + 1) << ppc_bits) - 1;
-		ext->oe_end = min(last_index, ext->oe_max_end);
-		LASSERT(ext->oe_end >= ext->oe_start);
-		LASSERT(ext->oe_grants > 0);
-	}
-	osc_object_unlock(obj);
-
-	if (grants > 0 || nr_pages > 0)
-		osc_free_grant(cli, nr_pages, grants);
-
-out:
-	cl_io_fini(env, io);
-	cl_env_put(env, &refcheck);
-	return rc;
-}
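The grant-freeing arithmetic at the end of osc_extent_truncate() can also be
checked in isolation. A small sketch with an assumed geometry (4KB pages,
64KB chunks, so ppc_bits = 4 and chunkbits = 16):

    #include <stdio.h>

    int main(void)
    {
        int ppc_bits = 4, chunkbits = 16;   /* assumed geometry */
        unsigned long oe_end = 63;          /* extent ends in chunk 3 */
        unsigned long trunc_chunk = 1;      /* truncating inside chunk 1 */
        int pages_in_chunk = 0;             /* chunk 1 was emptied as well */
        int chunks = (oe_end >> ppc_bits) - trunc_chunk;

        if (pages_in_chunk == 0) {          /* the last chunk is free too */
            --trunc_chunk;
            ++chunks;
        }
        /* prints: freeing 3 chunks = 196608 bytes of grant */
        printf("freeing %d chunks = %d bytes of grant\n",
               chunks, chunks << chunkbits);
        return 0;
    }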
-/**
- * This function is used to make the extent prepared for transfer.
- * A race with page flushing in ll_writepage() has to be handled cautiously.
- */
-static int osc_extent_make_ready(const struct lu_env *env,
-				 struct osc_extent *ext)
-{
-	struct osc_async_page *oap;
-	struct osc_async_page *last = NULL;
-	struct osc_object *obj = ext->oe_obj;
-	unsigned int page_count = 0;
-	int rc;
-
-	/* we're going to grab page lock, so object lock must not be taken. */
-	LASSERT(sanity_check(ext) == 0);
-	/* in locking state, any process should not touch this extent. */
-	EASSERT(ext->oe_state == OES_LOCKING, ext);
-	EASSERT(ext->oe_owner, ext);
-
-	OSC_EXTENT_DUMP(D_CACHE, ext, "make ready\n");
-
-	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
-		++page_count;
-		if (!last || last->oap_obj_off < oap->oap_obj_off)
-			last = oap;
-
-		/* checking ASYNC_READY is race safe */
-		if ((oap->oap_async_flags & ASYNC_READY) != 0)
-			continue;
-
-		rc = osc_make_ready(env, oap, OBD_BRW_WRITE);
-		switch (rc) {
-		case 0:
-			spin_lock(&oap->oap_lock);
-			oap->oap_async_flags |= ASYNC_READY;
-			spin_unlock(&oap->oap_lock);
-			break;
-		case -EALREADY:
-			LASSERT((oap->oap_async_flags & ASYNC_READY) != 0);
-			break;
-		default:
-			LASSERTF(0, "unknown return code: %d\n", rc);
-		}
-	}
-
-	LASSERT(page_count == ext->oe_nr_pages);
-	LASSERT(last);
-	/* the last page is the only one whose count we need to refresh
-	 * against the size of the file.
-	 */
-	if (!(last->oap_async_flags & ASYNC_COUNT_STABLE)) {
-		int last_oap_count = osc_refresh_count(env, last, OBD_BRW_WRITE);
-
-		LASSERT(last_oap_count > 0);
-		LASSERT(last->oap_page_off + last_oap_count <= PAGE_SIZE);
-		last->oap_count = last_oap_count;
-		spin_lock(&last->oap_lock);
-		last->oap_async_flags |= ASYNC_COUNT_STABLE;
-		spin_unlock(&last->oap_lock);
-	}
-
-	/* for the rest of the pages, we don't need to call
-	 * osc_refresh_count() because it's known they are not the last page
-	 */
-	list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) {
-		if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
-			oap->oap_count = PAGE_SIZE - oap->oap_page_off;
-			/* take the lock of the page being updated, not the
-			 * last page's lock
-			 */
-			spin_lock(&oap->oap_lock);
-			oap->oap_async_flags |= ASYNC_COUNT_STABLE;
-			spin_unlock(&oap->oap_lock);
-		}
-	}
-
-	osc_object_lock(obj);
-	osc_extent_state_set(ext, OES_RPC);
-	osc_object_unlock(obj);
-	/* get a refcount for RPC. */
-	osc_extent_get(ext);
-
-	return 0;
-}
-
-/**
- * Quick and simple version of osc_extent_find(). This function is frequently
- * called to expand the extent for the same IO. To expand the extent, the
- * page index must be in the same or next chunk of ext->oe_end.
- */ -static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, - unsigned int *grants) -{ - struct osc_object *obj = ext->oe_obj; - struct client_obd *cli = osc_cli(obj); - struct osc_extent *next; - int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; - pgoff_t chunk = index >> ppc_bits; - pgoff_t end_chunk; - pgoff_t end_index; - unsigned int chunksize = 1 << cli->cl_chunkbits; - int rc = 0; - - LASSERT(ext->oe_max_end >= index && ext->oe_start <= index); - osc_object_lock(obj); - LASSERT(sanity_check_nolock(ext) == 0); - end_chunk = ext->oe_end >> ppc_bits; - if (chunk > end_chunk + 1) { - rc = -ERANGE; - goto out; - } - - if (end_chunk >= chunk) { - rc = 0; - goto out; - } - - LASSERT(end_chunk + 1 == chunk); - /* try to expand this extent to cover @index */ - end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); - - next = next_extent(ext); - if (next && next->oe_start <= end_index) { - /* complex mode - overlapped with the next extent, - * this case will be handled by osc_extent_find() - */ - rc = -EAGAIN; - goto out; - } - - ext->oe_end = end_index; - ext->oe_grants += chunksize; - LASSERT(*grants >= chunksize); - *grants -= chunksize; - EASSERTF(osc_extent_is_overlapped(obj, ext) == 0, ext, - "overlapped after expanding for %lu.\n", index); - -out: - osc_object_unlock(obj); - return rc; -} - -static void osc_extent_tree_dump0(int level, struct osc_object *obj, - const char *func, int line) -{ - struct osc_extent *ext; - int cnt; - - CDEBUG(level, "Dump object %p extents at %s:%d, mppr: %u.\n", - obj, func, line, osc_cli(obj)->cl_max_pages_per_rpc); - - /* osc_object_lock(obj); */ - cnt = 1; - for (ext = first_extent(obj); ext; ext = next_extent(ext)) - OSC_EXTENT_DUMP(level, ext, "in tree %d.\n", cnt++); - - cnt = 1; - list_for_each_entry(ext, &obj->oo_hp_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "hp %d.\n", cnt++); - - cnt = 1; - list_for_each_entry(ext, &obj->oo_urgent_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "urgent %d.\n", cnt++); - - cnt = 1; - list_for_each_entry(ext, &obj->oo_reading_exts, oe_link) - OSC_EXTENT_DUMP(level, ext, "reading %d.\n", cnt++); - /* osc_object_unlock(obj); */ -} - -/* ------------------ osc extent end ------------------ */ - -static inline int osc_is_ready(struct osc_object *osc) -{ - return !list_empty(&osc->oo_ready_item) || - !list_empty(&osc->oo_hp_ready_item); -} - -#define OSC_IO_DEBUG(OSC, STR, args...) 
\ - CDEBUG(D_CACHE, "obj %p ready %d|%c|%c wr %d|%c|%c rd %d|%c " STR, \ - (OSC), osc_is_ready(OSC), \ - list_empty_marker(&(OSC)->oo_hp_ready_item), \ - list_empty_marker(&(OSC)->oo_ready_item), \ - atomic_read(&(OSC)->oo_nr_writes), \ - list_empty_marker(&(OSC)->oo_hp_exts), \ - list_empty_marker(&(OSC)->oo_urgent_exts), \ - atomic_read(&(OSC)->oo_nr_reads), \ - list_empty_marker(&(OSC)->oo_reading_exts), \ - ##args) - -static int osc_make_ready(const struct lu_env *env, struct osc_async_page *oap, - int cmd) -{ - struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = oap2cl_page(oap); - int result; - - LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ - - result = cl_page_make_ready(env, page, CRT_WRITE); - if (result == 0) - opg->ops_submit_time = jiffies; - return result; -} - -static int osc_refresh_count(const struct lu_env *env, - struct osc_async_page *oap, int cmd) -{ - struct osc_page *opg = oap2osc_page(oap); - pgoff_t index = osc_index(oap2osc(oap)); - struct cl_object *obj; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - - int result; - loff_t kms; - - /* readpage queues with _COUNT_STABLE, shouldn't get here. */ - LASSERT(!(cmd & OBD_BRW_READ)); - obj = opg->ops_cl.cpl_obj; - - cl_object_attr_lock(obj); - result = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - if (result < 0) - return result; - kms = attr->cat_kms; - if (cl_offset(obj, index) >= kms) - /* catch race with truncate */ - return 0; - else if (cl_offset(obj, index + 1) > kms) - /* catch sub-page write at end of file */ - return kms % PAGE_SIZE; - else - return PAGE_SIZE; -} - -static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, - int cmd, int rc) -{ - struct osc_page *opg = oap2osc_page(oap); - struct cl_page *page = oap2cl_page(oap); - enum cl_req_type crt; - int srvlock; - - cmd &= ~OBD_BRW_NOQUOTA; - LASSERTF(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ), - "cp_state:%u, cmd:%d\n", page->cp_state, cmd); - LASSERTF(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE), - "cp_state:%u, cmd:%d\n", page->cp_state, cmd); - LASSERT(opg->ops_transfer_pinned); - - crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; - /* Clear opg->ops_transfer_pinned before VM lock is released. */ - opg->ops_transfer_pinned = 0; - - opg->ops_submit_time = 0; - srvlock = oap->oap_brw_flags & OBD_BRW_SRVLOCK; - - /* statistic */ - if (rc == 0 && srvlock) { - struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; - struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; - size_t bytes = oap->oap_count; - - if (crt == CRT_READ) - stats->os_lockless_reads += bytes; - else - stats->os_lockless_writes += bytes; - } - - /* - * This has to be the last operation with the page, as locks are - * released in cl_page_completion() and nothing except for the - * reference counter protects page from concurrent reclaim. - */ - lu_ref_del(&page->cp_reference, "transfer", page); - - cl_page_completion(env, page, crt, rc); - cl_page_put(env, page); - - return 0; -} - -#define OSC_DUMP_GRANT(lvl, cli, fmt, args...) 
-do {									\
-	struct client_obd *__tmp = (cli);				\
-	CDEBUG(lvl, "%s: grant { dirty: %lu/%lu dirty_pages: %ld/%lu "	\
-	       "dropped: %ld avail: %ld, reserved: %ld, flight: %d }"	\
-	       "lru {in list: %ld, left: %ld, waiters: %d }" fmt "\n",	\
-	       cli_name(__tmp),						\
-	       __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages,	\
-	       atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages,	\
-	       __tmp->cl_lost_grant, __tmp->cl_avail_grant,		\
-	       __tmp->cl_reserved_grant, __tmp->cl_w_in_flight,		\
-	       atomic_long_read(&__tmp->cl_lru_in_list),		\
-	       atomic_long_read(&__tmp->cl_lru_busy),			\
-	       atomic_read(&__tmp->cl_lru_shrinkers), ##args);		\
-} while (0)
-
-/* caller must hold loi_list_lock */
-static void osc_consume_write_grant(struct client_obd *cli,
-				    struct brw_page *pga)
-{
-	assert_spin_locked(&cli->cl_loi_list_lock);
-	LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
-	atomic_long_inc(&obd_dirty_pages);
-	cli->cl_dirty_pages++;
-	pga->flag |= OBD_BRW_FROM_GRANT;
-	CDEBUG(D_CACHE, "using %lu grant credits for brw %p page %p\n",
-	       PAGE_SIZE, pga, pga->pg);
-	osc_update_next_shrink(cli);
-}
-
-/* the companion to osc_consume_write_grant, called when a brw has completed.
- * must be called with the loi lock held.
- */
-static void osc_release_write_grant(struct client_obd *cli,
-				    struct brw_page *pga)
-{
-	assert_spin_locked(&cli->cl_loi_list_lock);
-	if (!(pga->flag & OBD_BRW_FROM_GRANT))
-		return;
-
-	pga->flag &= ~OBD_BRW_FROM_GRANT;
-	atomic_long_dec(&obd_dirty_pages);
-	cli->cl_dirty_pages--;
-	if (pga->flag & OBD_BRW_NOCACHE) {
-		pga->flag &= ~OBD_BRW_NOCACHE;
-		atomic_long_dec(&obd_dirty_transit_pages);
-		cli->cl_dirty_transit--;
-	}
-}
-
-/**
- * To avoid sleeping with the object lock held, it's good for us to allocate
- * enough grants before entering the critical section.
- *
- * spin_lock held by caller
- */
-static int osc_reserve_grant(struct client_obd *cli, unsigned int bytes)
-{
-	int rc = -EDQUOT;
-
-	if (cli->cl_avail_grant >= bytes) {
-		cli->cl_avail_grant -= bytes;
-		cli->cl_reserved_grant += bytes;
-		rc = 0;
-	}
-	return rc;
-}
-
-static void __osc_unreserve_grant(struct client_obd *cli,
-				  unsigned int reserved, unsigned int unused)
-{
-	/* it's quite normal for us to get more grant back than was reserved.
-	 * Consider the case where two extents are merged by adding a new
-	 * chunk: we can save one extent tax. If the extent tax is greater
-	 * than one chunk, we can save even more grant by adding a new chunk.
-	 */
-	cli->cl_reserved_grant -= reserved;
-	if (unused > reserved) {
-		cli->cl_avail_grant += reserved;
-		cli->cl_lost_grant += unused - reserved;
-	} else {
-		cli->cl_avail_grant += unused;
-	}
-}
-
-static void osc_unreserve_grant(struct client_obd *cli,
-				unsigned int reserved, unsigned int unused)
-{
-	spin_lock(&cli->cl_loi_list_lock);
-	__osc_unreserve_grant(cli, reserved, unused);
-	if (unused > 0)
-		osc_wake_cache_waiters(cli);
-	spin_unlock(&cli->cl_loi_list_lock);
-}
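The reserve/unreserve pairing above moves grant between three counters. The
toy model below mirrors only that counter arithmetic; the struct and the
numbers are assumptions for illustration, not the kernel's client_obd.

    #include <stdio.h>

    struct demo_grant {
        long avail;     /* plays the role of cl_avail_grant */
        long reserved;  /* plays the role of cl_reserved_grant */
        long lost;      /* plays the role of cl_lost_grant */
    };

    static int demo_reserve(struct demo_grant *g, unsigned int bytes)
    {
        if (g->avail < bytes)
            return -1;              /* -EDQUOT in the code above */
        g->avail -= bytes;
        g->reserved += bytes;
        return 0;
    }

    static void demo_unreserve(struct demo_grant *g, unsigned int reserved,
                               unsigned int unused)
    {
        g->reserved -= reserved;
        if (unused > reserved) {
            /* more came back than was reserved (e.g. a merge saved an
             * extent tax); the surplus is tracked as lost grant */
            g->avail += reserved;
            g->lost += unused - reserved;
        } else {
            g->avail += unused;
        }
    }

    int main(void)
    {
        struct demo_grant g = { .avail = 1 << 20 };
        unsigned int chunk = 1 << 16, tax = 4096;

        demo_reserve(&g, chunk + tax);        /* before the critical section */
        demo_unreserve(&g, chunk + tax, tax); /* extent merged: tax unused */
        /* prints: avail 983040 reserved 0 lost 0 */
        printf("avail %ld reserved %ld lost %ld\n",
               g.avail, g.reserved, g.lost);
        return 0;
    }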
-/**
- * Free grant after IO is finished or canceled.
- *
- * @lost_grant is used to remember how many grants we have allocated but not
- * used; we should return these grants to the OST. There are two cases where
- * grants can be lost:
- * 1. truncate;
- * 2. the blocksize at the OST is less than PAGE_SIZE and a partial page was
- *    written. In this case the OST may use fewer chunks to serve this
- *    partial write. OSTs don't actually know the page size on the client
- *    side, so clients have to calculate the lost grant from the blocksize
- *    on the OST. See filter_grant_check() for details.
- */
-static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages,
-			   unsigned int lost_grant)
-{
-	unsigned long grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax;
-
-	spin_lock(&cli->cl_loi_list_lock);
-	atomic_long_sub(nr_pages, &obd_dirty_pages);
-	cli->cl_dirty_pages -= nr_pages;
-	cli->cl_lost_grant += lost_grant;
-	if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) {
-		/* borrow some grant from truncate to avoid the case that
-		 * truncate uses up all avail grant
-		 */
-		cli->cl_lost_grant -= grant;
-		cli->cl_avail_grant += grant;
-	}
-	osc_wake_cache_waiters(cli);
-	spin_unlock(&cli->cl_loi_list_lock);
-	CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n",
-	       lost_grant, cli->cl_lost_grant,
-	       cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_SHIFT);
-}
-
-/**
- * The companion to osc_enter_cache(), called when @oap is no longer part of
- * the dirty accounting due to error.
- */
-static void osc_exit_cache(struct client_obd *cli, struct osc_async_page *oap)
-{
-	spin_lock(&cli->cl_loi_list_lock);
-	osc_release_write_grant(cli, &oap->oap_brw_page);
-	spin_unlock(&cli->cl_loi_list_lock);
-}
-
-/**
- * Non-blocking version of osc_enter_cache() that consumes grant only when it
- * is available.
- */
-static int osc_enter_cache_try(struct client_obd *cli,
-			       struct osc_async_page *oap,
-			       int bytes, int transient)
-{
-	int rc;
-
-	OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes);
-
-	rc = osc_reserve_grant(cli, bytes);
-	if (rc < 0)
-		return 0;
-
-	if (cli->cl_dirty_pages < cli->cl_dirty_max_pages &&
-	    atomic_long_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) {
-		osc_consume_write_grant(cli, &oap->oap_brw_page);
-		if (transient) {
-			cli->cl_dirty_transit++;
-			atomic_long_inc(&obd_dirty_transit_pages);
-			oap->oap_brw_flags |= OBD_BRW_NOCACHE;
-		}
-		rc = 1;
-	} else {
-		__osc_unreserve_grant(cli, bytes, bytes);
-		rc = 0;
-	}
-	return rc;
-}
-
-static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
-{
-	int rc;
-
-	spin_lock(&cli->cl_loi_list_lock);
-	rc = list_empty(&ocw->ocw_entry);
-	spin_unlock(&cli->cl_loi_list_lock);
-	return rc;
-}
-
-/**
- * The main entry to reserve dirty page accounting. Usually the grant reserved
- * in this function will be freed in bulk in osc_free_grant() unless it fails
- * to add the osc cache, in which case it will be freed in osc_exit_cache().
- *
- * The process will be put to sleep if it has already run out of grant.
- */
-static int osc_enter_cache(const struct lu_env *env, struct client_obd *cli,
-			   struct osc_async_page *oap, int bytes)
-{
-	struct osc_object *osc = oap->oap_obj;
-	struct lov_oinfo *loi = osc->oo_oinfo;
-	struct osc_cache_waiter ocw;
-	unsigned long timeout = (AT_OFF ? obd_timeout : at_max) * HZ;
-	int rc = -EDQUOT;
-
-	OSC_DUMP_GRANT(D_CACHE, cli, "need:%d\n", bytes);
-
-	spin_lock(&cli->cl_loi_list_lock);
-
-	/* force the caller to try sync io. this can jump the list
-	 * of queued writes and create a discontiguous rpc stream
-	 */
-	if (OBD_FAIL_CHECK(OBD_FAIL_OSC_NO_GRANT) ||
-	    !cli->cl_dirty_max_pages || cli->cl_ar.ar_force_sync ||
-	    loi->loi_ar.ar_force_sync) {
-		OSC_DUMP_GRANT(D_CACHE, cli, "forced sync i/o\n");
-		rc = -EDQUOT;
-		goto out;
-	}
-
-	/* Hopefully the normal case - cache space and write credits available */
-	if (osc_enter_cache_try(cli, oap, bytes, 0)) {
-		OSC_DUMP_GRANT(D_CACHE, cli, "granted from cache\n");
-		rc = 0;
-		goto out;
-	}
-
-	/* We can get here for two reasons: too many dirty pages in cache, or
-	 * we have run out of grants. In both cases we should write dirty
-	 * pages out. Adding a cache waiter will trigger urgent write-out no
-	 * matter what the RPC size will be. The exit condition is no
-	 * available grants and no dirty pages in cache, which really means
-	 * there is no space on the OST.
-	 */
-	init_waitqueue_head(&ocw.ocw_waitq);
-	ocw.ocw_oap = oap;
-	ocw.ocw_grant = bytes;
-	while (cli->cl_dirty_pages > 0 || cli->cl_w_in_flight > 0) {
-		list_add_tail(&ocw.ocw_entry, &cli->cl_cache_waiters);
-		ocw.ocw_rc = 0;
-		spin_unlock(&cli->cl_loi_list_lock);
-
-		osc_io_unplug_async(env, cli, NULL);
-
-		CDEBUG(D_CACHE, "%s: sleeping for cache space @ %p for %p\n",
-		       cli_name(cli), &ocw, oap);
-
-		rc = wait_event_idle_timeout(ocw.ocw_waitq,
-					     ocw_granted(cli, &ocw), timeout);
-
-		spin_lock(&cli->cl_loi_list_lock);
-
-		if (rc == 0) {
-			/* the wait timed out */
-			list_del_init(&ocw.ocw_entry);
-			rc = -ETIMEDOUT;
-			break;
-		}
-		LASSERT(list_empty(&ocw.ocw_entry));
-		rc = ocw.ocw_rc;
-
-		if (rc != -EDQUOT)
-			break;
-		if (osc_enter_cache_try(cli, oap, bytes, 0)) {
-			rc = 0;
-			break;
-		}
-	}
-
-	switch (rc) {
-	case 0:
-		OSC_DUMP_GRANT(D_CACHE, cli, "finally got grant space\n");
-		break;
-	case -ETIMEDOUT:
-		OSC_DUMP_GRANT(D_CACHE, cli,
-			       "timeout, fall back to sync i/o\n");
-		osc_extent_tree_dump(D_CACHE, osc);
-		/* fall back to synchronous I/O */
-		rc = -EDQUOT;
-		break;
-	case -EINTR:
-		/* Ensures restartability - LU-3581 */
-		OSC_DUMP_GRANT(D_CACHE, cli, "interrupted\n");
-		rc = -ERESTARTSYS;
-		break;
-	case -EDQUOT:
-		OSC_DUMP_GRANT(D_CACHE, cli,
-			       "no grant space, fall back to sync i/o\n");
-		break;
-	default:
-		CDEBUG(D_CACHE, "%s: event for cache space @ %p never arrived due to %d, fall back to sync i/o\n",
-		       cli_name(cli), &ocw, rc);
-		break;
-	}
-out:
-	spin_unlock(&cli->cl_loi_list_lock);
-	return rc;
-}
-
-/* caller must hold loi_list_lock */
-void osc_wake_cache_waiters(struct client_obd *cli)
-{
-	struct list_head *l, *tmp;
-	struct osc_cache_waiter *ocw;
-
-	list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
-		ocw = list_entry(l, struct osc_cache_waiter, ocw_entry);
-		list_del_init(&ocw->ocw_entry);
-
-		ocw->ocw_rc = -EDQUOT;
-		/* we can't dirty more */
-		if ((cli->cl_dirty_pages > cli->cl_dirty_max_pages) ||
-		    (atomic_long_read(&obd_dirty_pages) + 1 >
-		     obd_max_dirty_pages)) {
-			CDEBUG(D_CACHE, "no dirty room: dirty: %ld osc max %ld, sys max %ld\n",
-			       cli->cl_dirty_pages, cli->cl_dirty_max_pages,
-			       obd_max_dirty_pages);
-			goto wakeup;
-		}
-
-		if (osc_enter_cache_try(cli, ocw->ocw_oap, ocw->ocw_grant, 0))
-			ocw->ocw_rc = 0;
-wakeup:
-		CDEBUG(D_CACHE, "wake up %p for oap %p, avail grant %ld, %d\n",
-		       ocw, ocw->ocw_oap, cli->cl_avail_grant, ocw->ocw_rc);
-
-		wake_up(&ocw->ocw_waitq);
-	}
-}
-
-static int osc_max_rpc_in_flight(struct client_obd *cli, struct osc_object *osc)
-{
-	int hprpc = !!list_empty(&osc->oo_hp_exts);
-
-	return rpcs_in_flight(cli) >= cli->cl_max_rpcs_in_flight + hprpc;
-}
-
-/* This maintains the lists of pending pages to read/write for a given object
- * (lop). This is used by osc_check_rpcs->osc_next_obj() and osc_list_maint()
- * to quickly find objects that are ready to send an RPC.
- */
-static int osc_makes_rpc(struct client_obd *cli, struct osc_object *osc,
-			 int cmd)
-{
-	int invalid_import = 0;
-
-	/* if we have an invalid import we want to drain the queued pages
-	 * by forcing them through rpcs that immediately fail and complete
-	 * the pages.
recovery relies on this to empty the queued pages - * before canceling the locks and evicting down the llite pages - */ - if (!cli->cl_import || cli->cl_import->imp_invalid) - invalid_import = 1; - - if (cmd & OBD_BRW_WRITE) { - if (atomic_read(&osc->oo_nr_writes) == 0) - return 0; - if (invalid_import) { - CDEBUG(D_CACHE, "invalid import forcing RPC\n"); - return 1; - } - if (!list_empty(&osc->oo_hp_exts)) { - CDEBUG(D_CACHE, "high prio request forcing RPC\n"); - return 1; - } - if (!list_empty(&osc->oo_urgent_exts)) { - CDEBUG(D_CACHE, "urgent request forcing RPC\n"); - return 1; - } - /* trigger a write rpc stream as long as there are dirtiers - * waiting for space. as they're waiting, they're not going to - * create more pages to coalesce with what's waiting.. - */ - if (!list_empty(&cli->cl_cache_waiters)) { - CDEBUG(D_CACHE, "cache waiters forcing RPC\n"); - return 1; - } - if (atomic_read(&osc->oo_nr_writes) >= - cli->cl_max_pages_per_rpc) - return 1; - } else { - if (atomic_read(&osc->oo_nr_reads) == 0) - return 0; - if (invalid_import) { - CDEBUG(D_CACHE, "invalid import forcing RPC\n"); - return 1; - } - /* all read are urgent. */ - if (!list_empty(&osc->oo_reading_exts)) - return 1; - } - - return 0; -} - -static void osc_update_pending(struct osc_object *obj, int cmd, int delta) -{ - struct client_obd *cli = osc_cli(obj); - - if (cmd & OBD_BRW_WRITE) { - atomic_add(delta, &obj->oo_nr_writes); - atomic_add(delta, &cli->cl_pending_w_pages); - LASSERT(atomic_read(&obj->oo_nr_writes) >= 0); - } else { - atomic_add(delta, &obj->oo_nr_reads); - atomic_add(delta, &cli->cl_pending_r_pages); - LASSERT(atomic_read(&obj->oo_nr_reads) >= 0); - } - OSC_IO_DEBUG(obj, "update pending cmd %d delta %d.\n", cmd, delta); -} - -static int osc_makes_hprpc(struct osc_object *obj) -{ - return !list_empty(&obj->oo_hp_exts); -} - -static void on_list(struct list_head *item, struct list_head *list, int should_be_on) -{ - if (list_empty(item) && should_be_on) - list_add_tail(item, list); - else if (!list_empty(item) && !should_be_on) - list_del_init(item); -} - -/* maintain the osc's cli list membership invariants so that osc_send_oap_rpc - * can find pages to build into rpcs quickly - */ -static int __osc_list_maint(struct client_obd *cli, struct osc_object *osc) -{ - if (osc_makes_hprpc(osc)) { - /* HP rpc */ - on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, 0); - on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 1); - } else { - on_list(&osc->oo_hp_ready_item, &cli->cl_loi_hp_ready_list, 0); - on_list(&osc->oo_ready_item, &cli->cl_loi_ready_list, - osc_makes_rpc(cli, osc, OBD_BRW_WRITE) || - osc_makes_rpc(cli, osc, OBD_BRW_READ)); - } - - on_list(&osc->oo_write_item, &cli->cl_loi_write_list, - atomic_read(&osc->oo_nr_writes) > 0); - - on_list(&osc->oo_read_item, &cli->cl_loi_read_list, - atomic_read(&osc->oo_nr_reads) > 0); - - return osc_is_ready(osc); -} - -static int osc_list_maint(struct client_obd *cli, struct osc_object *osc) -{ - int is_ready; - - spin_lock(&cli->cl_loi_list_lock); - is_ready = __osc_list_maint(cli, osc); - spin_unlock(&cli->cl_loi_list_lock); - - return is_ready; -} - -/* this is trying to propagate async writeback errors back up to the - * application. As an async write fails we record the error code for later if - * the app does an fsync. As long as errors persist we force future rpcs to be - * sync so that the app can get a sync error and break the cycle of queueing - * pages for which writeback will fail. 
- */ -static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, - int rc) -{ - if (rc) { - if (!ar->ar_rc) - ar->ar_rc = rc; - - ar->ar_force_sync = 1; - ar->ar_min_xid = ptlrpc_sample_next_xid(); - return; - } - - if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) - ar->ar_force_sync = 0; -} - -/* this must be called holding the loi list lock to give coverage to exit_cache, - * async_flag maintenance, and oap_request - */ -static void osc_ap_completion(const struct lu_env *env, struct client_obd *cli, - struct osc_async_page *oap, int sent, int rc) -{ - struct osc_object *osc = oap->oap_obj; - struct lov_oinfo *loi = osc->oo_oinfo; - __u64 xid = 0; - - if (oap->oap_request) { - xid = ptlrpc_req_xid(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - - /* As the transfer for this page is being done, clear the flags */ - spin_lock(&oap->oap_lock); - oap->oap_async_flags = 0; - spin_unlock(&oap->oap_lock); - oap->oap_interrupted = 0; - - if (oap->oap_cmd & OBD_BRW_WRITE && xid > 0) { - spin_lock(&cli->cl_loi_list_lock); - osc_process_ar(&cli->cl_ar, xid, rc); - osc_process_ar(&loi->loi_ar, xid, rc); - spin_unlock(&cli->cl_loi_list_lock); - } - - rc = osc_completion(env, oap, oap->oap_cmd, rc); - if (rc) - CERROR("completion on oap %p obj %p returns %d.\n", - oap, osc, rc); -} - -struct extent_rpc_data { - struct list_head *erd_rpc_list; - unsigned int erd_page_count; - unsigned int erd_max_pages; - unsigned int erd_max_chunks; - unsigned int erd_max_extents; -}; - -static inline unsigned int osc_extent_chunks(const struct osc_extent *ext) -{ - struct client_obd *cli = osc_cli(ext->oe_obj); - unsigned int ppc_bits = cli->cl_chunkbits - PAGE_SHIFT; - - return (ext->oe_end >> ppc_bits) - (ext->oe_start >> ppc_bits) + 1; -} - -/** - * Try to add extent to one RPC. 
- * We need to think about the following things:
- * - # of pages must not be over max_pages_per_rpc
- * - extent must be compatible with previous ones
- */
-static int try_to_add_extent_for_io(struct client_obd *cli,
-				    struct osc_extent *ext,
-				    struct extent_rpc_data *data)
-{
-	struct osc_extent *tmp;
-	unsigned int chunk_count;
-	struct osc_async_page *oap = list_first_entry(&ext->oe_pages,
-						      struct osc_async_page,
-						      oap_pending_item);
-
-	EASSERT((ext->oe_state == OES_CACHE || ext->oe_state == OES_LOCK_DONE),
-		ext);
-
-	if (!data->erd_max_extents)
-		return 0;
-
-	chunk_count = osc_extent_chunks(ext);
-	EASSERTF(data->erd_page_count != 0 ||
-		 chunk_count <= data->erd_max_chunks, ext,
-		 "The first extent to fit in an RPC contains %u chunks, which is over the limit %u.\n",
-		 chunk_count, data->erd_max_chunks);
-
-	if (chunk_count > data->erd_max_chunks)
-		return 0;
-
-	data->erd_max_pages = max(ext->oe_mppr, data->erd_max_pages);
-	EASSERTF(data->erd_page_count != 0 ||
-		 ext->oe_nr_pages <= data->erd_max_pages, ext,
-		 "The first extent to fit in an RPC contains %u pages, which is over the limit %u.\n",
-		 ext->oe_nr_pages, data->erd_max_pages);
-	if (data->erd_page_count + ext->oe_nr_pages > data->erd_max_pages)
-		return 0;
-
-	list_for_each_entry(tmp, data->erd_rpc_list, oe_link) {
-		struct osc_async_page *oap2;
-
-		oap2 = list_first_entry(&tmp->oe_pages, struct osc_async_page,
-					oap_pending_item);
-		EASSERT(tmp->oe_owner == current, tmp);
-		if (oap2cl_page(oap)->cp_type != oap2cl_page(oap2)->cp_type) {
-			CDEBUG(D_CACHE, "Do not permit different types of IO in one RPC\n");
-			return 0;
-		}
-
-		if (tmp->oe_srvlock != ext->oe_srvlock ||
-		    !tmp->oe_grants != !ext->oe_grants ||
-		    tmp->oe_no_merge || ext->oe_no_merge)
-			return 0;
-
-		/* remove break for strict check */
-		break;
-	}
-
-	data->erd_max_extents--;
-	data->erd_max_chunks -= chunk_count;
-	data->erd_page_count += ext->oe_nr_pages;
-	list_move_tail(&ext->oe_link, data->erd_rpc_list);
-	ext->oe_owner = current;
-	return 1;
-}
-
-static inline unsigned int osc_max_write_chunks(const struct client_obd *cli)
-{
-	/*
-	 * LU-8135:
-	 *
-	 * The maximum size of a single transaction is about 64MB in ZFS.
-	 * #define DMU_MAX_ACCESS (64 * 1024 * 1024)
-	 *
-	 * Since ZFS is a copy-on-write file system, a single dirty page in
-	 * a chunk will result in the rewrite of the whole chunk, therefore
-	 * an RPC shouldn't be allowed to contain too many chunks, otherwise
-	 * it will make the transaction size much bigger than 64MB, especially
-	 * with a big block size for ZFS.
-	 *
-	 * This piece of code is to make sure that OSC won't send write RPCs
-	 * with too many chunks. The maximum chunk size that an RPC can cover
-	 * is set to PTLRPC_MAX_BRW_SIZE, which is defined to 16MB. Ideally
-	 * the OST should tell the client what the biggest transaction size
-	 * is, but it's good enough for now.
-	 *
-	 * This limitation doesn't apply to ldiskfs, which allows as many
-	 * chunks in one RPC as we want. However, there is no benefit in
-	 * having too many discontiguous pages in one RPC.
-	 *
-	 * An osc_extent won't cover more than an RPC size, so the number of
-	 * chunks in an osc_extent won't be bigger than
-	 * PTLRPC_MAX_BRW_SIZE >> chunkbits.
-	 */
-	return PTLRPC_MAX_BRW_SIZE >> cli->cl_chunkbits;
-}
-
-/**
- * In order to prevent multiple ptlrpcd threads from breaking contiguous
- * extents, get_write_extents() takes all appropriate extents atomically.
- *
- * The following policy is used to collect extents for IO:
- * 1. Add as many HP extents as possible;
- * 2.
Add the first urgent extent in urgent extent list and take it out of - * urgent list; - * 3. Add subsequent extents of this urgent extent; - * 4. If urgent list is not empty, goto 2; - * 5. Traverse the extent tree from the 1st extent; - * 6. Above steps exit if there is no space in this RPC. - */ -static unsigned int get_write_extents(struct osc_object *obj, - struct list_head *rpclist) -{ - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_extent *temp; - struct extent_rpc_data data = { - .erd_rpc_list = rpclist, - .erd_page_count = 0, - .erd_max_pages = cli->cl_max_pages_per_rpc, - .erd_max_chunks = osc_max_write_chunks(cli), - .erd_max_extents = 256, - }; - - LASSERT(osc_object_is_locked(obj)); - list_for_each_entry_safe(ext, temp, &obj->oo_hp_exts, oe_link) { - LASSERT(ext->oe_state == OES_CACHE); - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext); - } - if (data.erd_page_count == data.erd_max_pages) - return data.erd_page_count; - - while (!list_empty(&obj->oo_urgent_exts)) { - ext = list_entry(obj->oo_urgent_exts.next, - struct osc_extent, oe_link); - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - - if (!ext->oe_intree) - continue; - - while ((ext = next_extent(ext)) != NULL) { - if ((ext->oe_state != OES_CACHE) || - (!list_empty(&ext->oe_link) && - ext->oe_owner)) - continue; - - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - } - } - if (data.erd_page_count == data.erd_max_pages) - return data.erd_page_count; - - ext = first_extent(obj); - while (ext) { - if ((ext->oe_state != OES_CACHE) || - /* this extent may be already in current rpclist */ - (!list_empty(&ext->oe_link) && ext->oe_owner)) { - ext = next_extent(ext); - continue; - } - - if (!try_to_add_extent_for_io(cli, ext, &data)) - return data.erd_page_count; - - ext = next_extent(ext); - } - return data.erd_page_count; -} - -static int -osc_send_write_rpc(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) - __must_hold(osc) -{ - LIST_HEAD(rpclist); - struct osc_extent *ext; - struct osc_extent *tmp; - struct osc_extent *first = NULL; - u32 page_count = 0; - int srvlock = 0; - int rc = 0; - - LASSERT(osc_object_is_locked(osc)); - - page_count = get_write_extents(osc, &rpclist); - LASSERT(equi(page_count == 0, list_empty(&rpclist))); - - if (list_empty(&rpclist)) - return 0; - - osc_update_pending(osc, OBD_BRW_WRITE, -page_count); - - list_for_each_entry(ext, &rpclist, oe_link) { - LASSERT(ext->oe_state == OES_CACHE || - ext->oe_state == OES_LOCK_DONE); - if (ext->oe_state == OES_CACHE) - osc_extent_state_set(ext, OES_LOCKING); - else - osc_extent_state_set(ext, OES_RPC); - } - - /* we're going to grab page lock, so release object lock because - * lock order is page lock -> object lock. 
- */
-	osc_object_unlock(osc);
-
-	list_for_each_entry_safe(ext, tmp, &rpclist, oe_link) {
-		if (ext->oe_state == OES_LOCKING) {
-			rc = osc_extent_make_ready(env, ext);
-			if (unlikely(rc < 0)) {
-				list_del_init(&ext->oe_link);
-				osc_extent_finish(env, ext, 0, rc);
-				continue;
-			}
-		}
-		if (!first) {
-			first = ext;
-			srvlock = ext->oe_srvlock;
-		} else {
-			LASSERT(srvlock == ext->oe_srvlock);
-		}
-	}
-
-	if (!list_empty(&rpclist)) {
-		LASSERT(page_count > 0);
-		rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_WRITE);
-		LASSERT(list_empty(&rpclist));
-	}
-
-	osc_object_lock(osc);
-	return rc;
-}
-
-/**
- * Prepare pages for ASYNC io and put pages in the send queue.
- *
- * \param cmd OBD_BRW_* macros
- * \param lop pending pages
- *
- * \return zero if no page was added to the send queue.
- * \return 1 if pages were successfully added to the send queue.
- * \return negative on errors.
- */
-static int
-osc_send_read_rpc(const struct lu_env *env, struct client_obd *cli,
-		  struct osc_object *osc)
-	__must_hold(osc)
-{
-	struct osc_extent *ext;
-	struct osc_extent *next;
-	LIST_HEAD(rpclist);
-	struct extent_rpc_data data = {
-		.erd_rpc_list = &rpclist,
-		.erd_page_count = 0,
-		.erd_max_pages = cli->cl_max_pages_per_rpc,
-		.erd_max_chunks = UINT_MAX,
-		.erd_max_extents = UINT_MAX,
-	};
-	int rc = 0;
-
-	LASSERT(osc_object_is_locked(osc));
-	list_for_each_entry_safe(ext, next, &osc->oo_reading_exts, oe_link) {
-		EASSERT(ext->oe_state == OES_LOCK_DONE, ext);
-		if (!try_to_add_extent_for_io(cli, ext, &data))
-			break;
-		osc_extent_state_set(ext, OES_RPC);
-		EASSERT(ext->oe_nr_pages <= data.erd_max_pages, ext);
-	}
-	LASSERT(data.erd_page_count <= data.erd_max_pages);
-
-	osc_update_pending(osc, OBD_BRW_READ, -data.erd_page_count);
-
-	if (!list_empty(&rpclist)) {
-		osc_object_unlock(osc);
-
-		rc = osc_build_rpc(env, cli, &rpclist, OBD_BRW_READ);
-		LASSERT(list_empty(&rpclist));
-
-		osc_object_lock(osc);
-	}
-	return rc;
-}
-
-#define list_to_obj(list, item) ({					\
-	struct list_head *__tmp = (list)->next;				\
-	list_del_init(__tmp);						\
-	list_entry(__tmp, struct osc_object, oo_##item);		\
-})
-
-/* This is called by osc_check_rpcs() to find which objects have pages that
- * we could be sending. These lists are maintained by osc_makes_rpc().
- */
-static struct osc_object *osc_next_obj(struct client_obd *cli)
-{
-	/* First return objects that have blocked locks so that they
-	 * will be flushed quickly and other clients can get the lock,
-	 * then objects which have pages ready to be stuffed into RPCs
-	 */
-	if (!list_empty(&cli->cl_loi_hp_ready_list))
-		return list_to_obj(&cli->cl_loi_hp_ready_list, hp_ready_item);
-	if (!list_empty(&cli->cl_loi_ready_list))
-		return list_to_obj(&cli->cl_loi_ready_list, ready_item);
-
-	/* then if we have cache waiters, return all objects with queued
-	 * writes.
This is especially important when many small files - * have filled up the cache and not been fired into rpcs because - * they don't pass the nr_pending/object threshold - */ - if (!list_empty(&cli->cl_cache_waiters) && - !list_empty(&cli->cl_loi_write_list)) - return list_to_obj(&cli->cl_loi_write_list, write_item); - - /* then return all queued objects when we have an invalid import - * so that they get flushed - */ - if (!cli->cl_import || cli->cl_import->imp_invalid) { - if (!list_empty(&cli->cl_loi_write_list)) - return list_to_obj(&cli->cl_loi_write_list, write_item); - if (!list_empty(&cli->cl_loi_read_list)) - return list_to_obj(&cli->cl_loi_read_list, read_item); - } - return NULL; -} - -/* called with the loi list lock held */ -static void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) - __must_hold(&cli->cl_loi_list_lock) -{ - struct osc_object *osc; - int rc = 0; - - while ((osc = osc_next_obj(cli)) != NULL) { - struct cl_object *obj = osc2cl(osc); - struct lu_ref_link link; - - OSC_IO_DEBUG(osc, "%lu in flight\n", rpcs_in_flight(cli)); - - if (osc_max_rpc_in_flight(cli, osc)) { - __osc_list_maint(cli, osc); - break; - } - - cl_object_get(obj); - spin_unlock(&cli->cl_loi_list_lock); - lu_object_ref_add_at(&obj->co_lu, &link, "check", current); - - /* attempt some read/write balancing by alternating between - * reads and writes in an object. The makes_rpc checks here - * would be redundant if we were getting read/write work items - * instead of objects. we don't want send_oap_rpc to drain a - * partial read pending queue when we're given this object to - * do io on writes while there are cache waiters - */ - osc_object_lock(osc); - if (osc_makes_rpc(cli, osc, OBD_BRW_WRITE)) { - rc = osc_send_write_rpc(env, cli, osc); - if (rc < 0) { - CERROR("Write request failed with %d\n", rc); - - /* osc_send_write_rpc failed, mostly because of - * memory pressure. - * - * It can't break here, because if: - * - a page was submitted by osc_io_submit, so - * page locked; - * - no request in flight - * - no subsequent request - * The system will be in live-lock state, - * because there is no chance to call - * osc_io_unplug() and osc_check_rpcs() any - * more. pdflush can't help in this case, - * because it might be blocked at grabbing - * the page lock as we mentioned. - * - * Anyway, continue to drain pages. 
- */ - /* break; */ - } - } - if (osc_makes_rpc(cli, osc, OBD_BRW_READ)) { - rc = osc_send_read_rpc(env, cli, osc); - if (rc < 0) - CERROR("Read request failed with %d\n", rc); - } - osc_object_unlock(osc); - - osc_list_maint(cli, osc); - lu_object_ref_del_at(&obj->co_lu, &link, "check", current); - cl_object_put(env, obj); - - spin_lock(&cli->cl_loi_list_lock); - } -} - -static int osc_io_unplug0(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc, int async) -{ - int rc = 0; - - if (osc && osc_list_maint(cli, osc) == 0) - return 0; - - if (!async) { - spin_lock(&cli->cl_loi_list_lock); - osc_check_rpcs(env, cli); - spin_unlock(&cli->cl_loi_list_lock); - } else { - CDEBUG(D_CACHE, "Queue writeback work for client %p.\n", cli); - LASSERT(cli->cl_writeback_work); - rc = ptlrpcd_queue_work(cli->cl_writeback_work); - } - return rc; -} - -static int osc_io_unplug_async(const struct lu_env *env, - struct client_obd *cli, struct osc_object *osc) -{ - return osc_io_unplug0(env, cli, osc, 1); -} - -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc) -{ - (void)osc_io_unplug0(env, cli, osc, 0); -} - -int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, - struct page *page, loff_t offset) -{ - struct obd_export *exp = osc_export(osc); - struct osc_async_page *oap = &ops->ops_oap; - - if (!page) - return cfs_size_round(sizeof(*oap)); - - oap->oap_magic = OAP_MAGIC; - oap->oap_cli = &exp->exp_obd->u.cli; - oap->oap_obj = osc; - - oap->oap_page = page; - oap->oap_obj_off = offset; - LASSERT(!(offset & ~PAGE_MASK)); - - if (capable(CAP_SYS_RESOURCE)) - oap->oap_brw_flags = OBD_BRW_NOQUOTA; - - INIT_LIST_HEAD(&oap->oap_pending_item); - INIT_LIST_HEAD(&oap->oap_rpc_item); - - spin_lock_init(&oap->oap_lock); - CDEBUG(D_INFO, "oap %p page %p obj off %llu\n", - oap, page, oap->oap_obj_off); - return 0; -} - -int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) -{ - struct osc_io *oio = osc_env_io(env); - struct osc_extent *ext = NULL; - struct osc_async_page *oap = &ops->ops_oap; - struct client_obd *cli = oap->oap_cli; - struct osc_object *osc = oap->oap_obj; - pgoff_t index; - unsigned int grants = 0, tmp; - int brw_flags = OBD_BRW_ASYNC; - int cmd = OBD_BRW_WRITE; - int need_release = 0; - int rc = 0; - - if (oap->oap_magic != OAP_MAGIC) - return -EINVAL; - - if (!cli->cl_import || cli->cl_import->imp_invalid) - return -EIO; - - if (!list_empty(&oap->oap_pending_item) || - !list_empty(&oap->oap_rpc_item)) - return -EBUSY; - - /* Set the OBD_BRW_SRVLOCK before the page is queued. */ - brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0; - if (capable(CAP_SYS_RESOURCE)) { - brw_flags |= OBD_BRW_NOQUOTA; - cmd |= OBD_BRW_NOQUOTA; - } - - /* check if the file's owner/group is over quota */ - if (!(cmd & OBD_BRW_NOQUOTA)) { - struct cl_object *obj; - struct cl_attr *attr; - unsigned int qid[MAXQUOTAS]; - - obj = cl_object_top(&osc->oo_cl); - attr = &osc_env_info(env)->oti_attr; - - cl_object_attr_lock(obj); - rc = cl_object_attr_get(env, obj, attr); - cl_object_attr_unlock(obj); - - qid[USRQUOTA] = attr->cat_uid; - qid[GRPQUOTA] = attr->cat_gid; - if (rc == 0 && osc_quota_chkdq(cli, qid) == NO_QUOTA) - rc = -EDQUOT; - if (rc) - return rc; - } - - oap->oap_cmd = cmd; - oap->oap_page_off = ops->ops_from; - oap->oap_count = ops->ops_to - ops->ops_from; - /* - * No need to hold a lock here, - * since this page is not in any list yet. 
- */ - oap->oap_async_flags = 0; - oap->oap_brw_flags = brw_flags; - - OSC_IO_DEBUG(osc, "oap %p page %p added for cmd %d\n", - oap, oap->oap_page, oap->oap_cmd & OBD_BRW_RWMASK); - - index = osc_index(oap2osc(oap)); - - /* Add this page into extent by the following steps: - * 1. if there exists an active extent for this IO, mostly this page - * can be added to the active extent and sometimes we need to - * expand extent to accommodate this page; - * 2. otherwise, a new extent will be allocated. - */ - - ext = oio->oi_active; - if (ext && ext->oe_start <= index && ext->oe_max_end >= index) { - /* one chunk plus extent overhead must be enough to write this - * page - */ - grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; - if (ext->oe_end >= index) - grants = 0; - - /* it doesn't need any grant to dirty this page */ - spin_lock(&cli->cl_loi_list_lock); - rc = osc_enter_cache_try(cli, oap, grants, 0); - spin_unlock(&cli->cl_loi_list_lock); - if (rc == 0) { /* try failed */ - grants = 0; - need_release = 1; - } else if (ext->oe_end < index) { - tmp = grants; - /* try to expand this extent */ - rc = osc_extent_expand(ext, index, &tmp); - if (rc < 0) { - need_release = 1; - /* don't free reserved grant */ - } else { - OSC_EXTENT_DUMP(D_CACHE, ext, - "expanded for %lu.\n", index); - osc_unreserve_grant(cli, grants, tmp); - grants = 0; - } - } - rc = 0; - } else if (ext) { - /* index is located outside of active extent */ - need_release = 1; - } - if (need_release) { - osc_extent_release(env, ext); - oio->oi_active = NULL; - ext = NULL; - } - - if (!ext) { - tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; - - /* try to find new extent to cover this page */ - LASSERT(!oio->oi_active); - /* we may have allocated grant for this page if we failed - * to expand the previous active extent. - */ - LASSERT(ergo(grants > 0, grants >= tmp)); - - rc = 0; - if (grants == 0) { - /* we haven't allocated grant for this page. */ - rc = osc_enter_cache(env, cli, oap, tmp); - if (rc == 0) - grants = tmp; - } - - tmp = grants; - if (rc == 0) { - ext = osc_extent_find(env, osc, index, &tmp); - if (IS_ERR(ext)) { - LASSERT(tmp == grants); - osc_exit_cache(cli, oap); - rc = PTR_ERR(ext); - ext = NULL; - } else { - oio->oi_active = ext; - } - } - if (grants > 0) - osc_unreserve_grant(cli, grants, tmp); - } - - LASSERT(ergo(rc == 0, ext)); - if (ext) { - EASSERTF(ext->oe_end >= index && ext->oe_start <= index, - ext, "index = %lu.\n", index); - LASSERT((oap->oap_brw_flags & OBD_BRW_FROM_GRANT) != 0); - - osc_object_lock(osc); - if (ext->oe_nr_pages == 0) - ext->oe_srvlock = ops->ops_srvlock; - else - LASSERT(ext->oe_srvlock == ops->ops_srvlock); - ++ext->oe_nr_pages; - list_add_tail(&oap->oap_pending_item, &ext->oe_pages); - osc_object_unlock(osc); - } - return rc; -} - -int osc_teardown_async_page(const struct lu_env *env, - struct osc_object *obj, struct osc_page *ops) -{ - struct osc_async_page *oap = &ops->ops_oap; - int rc = 0; - - LASSERT(oap->oap_magic == OAP_MAGIC); - - CDEBUG(D_INFO, "teardown oap %p page %p at index %lu.\n", - oap, ops, osc_index(oap2osc(oap))); - - if (!list_empty(&oap->oap_rpc_item)) { - CDEBUG(D_CACHE, "oap %p is not in cache.\n", oap); - rc = -EBUSY; - } else if (!list_empty(&oap->oap_pending_item)) { - struct osc_extent *ext = NULL; - - osc_object_lock(obj); - ext = osc_extent_lookup(obj, osc_index(oap2osc(oap))); - osc_object_unlock(obj); - /* only truncated pages are allowed to be taken out. - * See osc_extent_truncate() and osc_cache_truncate_start() - * for details. 
- */ - if (ext && ext->oe_state != OES_TRUNC) { - OSC_EXTENT_DUMP(D_ERROR, ext, "trunc at %lu.\n", - osc_index(oap2osc(oap))); - rc = -EBUSY; - } - if (ext) - osc_extent_put(env, ext); - } - return rc; -} - -/** - * This is called when a page is picked up by kernel to write out. - * - * We should find out the corresponding extent and add the whole extent - * into urgent list. The extent may be being truncated or used, handle it - * carefully. - */ -int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops) -{ - struct osc_extent *ext = NULL; - struct osc_object *obj = cl2osc(ops->ops_cl.cpl_obj); - struct cl_page *cp = ops->ops_cl.cpl_page; - pgoff_t index = osc_index(ops); - struct osc_async_page *oap = &ops->ops_oap; - bool unplug = false; - int rc = 0; - - osc_object_lock(obj); - ext = osc_extent_lookup(obj, index); - if (!ext) { - osc_extent_tree_dump(D_ERROR, obj); - LASSERTF(0, "page index %lu is NOT covered.\n", index); - } - - switch (ext->oe_state) { - case OES_RPC: - case OES_LOCK_DONE: - CL_PAGE_DEBUG(D_ERROR, env, cp, "flush an in-rpc page?\n"); - LASSERT(0); - break; - case OES_LOCKING: - /* If we know this extent is being written out, we should abort - * so that the writer can make this page ready. Otherwise, there - * exists a deadlock problem because other process can wait for - * page writeback bit holding page lock; and meanwhile in - * vvp_page_make_ready(), we need to grab page lock before - * really sending the RPC. - */ - case OES_TRUNC: - /* race with truncate, page will be redirtied */ - case OES_ACTIVE: - /* The extent is active so we need to abort and let the caller - * re-dirty the page. If we continued on here, and we were the - * one making the extent active, we could deadlock waiting for - * the page writeback to clear but it won't because the extent - * is active and won't be written out. - */ - rc = -EAGAIN; - goto out; - default: - break; - } - - rc = cl_page_prep(env, io, cp, CRT_WRITE); - if (rc) - goto out; - - spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_READY | ASYNC_URGENT; - spin_unlock(&oap->oap_lock); - - if (current->flags & PF_MEMALLOC) - ext->oe_memalloc = 1; - - ext->oe_urgent = 1; - if (ext->oe_state == OES_CACHE) { - OSC_EXTENT_DUMP(D_CACHE, ext, - "flush page %p make it urgent.\n", oap); - if (list_empty(&ext->oe_link)) - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); - unplug = true; - } - rc = 0; - -out: - osc_object_unlock(obj); - osc_extent_put(env, ext); - if (unplug) - osc_io_unplug_async(env, osc_cli(obj), obj); - return rc; -} - -/** - * this is called when a sync waiter receives an interruption. Its job is to - * get the caller woken as soon as possible. If its page hasn't been put in an - * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as - * desiring interruption which will forcefully complete the rpc once the rpc - * has timed out. 
- */ -int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops) -{ - struct osc_async_page *oap = &ops->ops_oap; - struct osc_object *obj = oap->oap_obj; - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_extent *found = NULL; - struct list_head *plist; - pgoff_t index = osc_index(ops); - int rc = -EBUSY; - int cmd; - - LASSERT(!oap->oap_interrupted); - oap->oap_interrupted = 1; - - /* Find out the caching extent */ - osc_object_lock(obj); - if (oap->oap_cmd & OBD_BRW_WRITE) { - plist = &obj->oo_urgent_exts; - cmd = OBD_BRW_WRITE; - } else { - plist = &obj->oo_reading_exts; - cmd = OBD_BRW_READ; - } - list_for_each_entry(ext, plist, oe_link) { - if (ext->oe_start <= index && ext->oe_end >= index) { - LASSERT(ext->oe_state == OES_LOCK_DONE); - /* For OES_LOCK_DONE state extent, it has already held - * a refcount for RPC. - */ - found = osc_extent_get(ext); - break; - } - } - if (found) { - list_del_init(&found->oe_link); - osc_update_pending(obj, cmd, -found->oe_nr_pages); - osc_object_unlock(obj); - - osc_extent_finish(env, found, 0, -EINTR); - osc_extent_put(env, found); - rc = 0; - } else { - osc_object_unlock(obj); - /* ok, it's been put in an rpc. only one oap gets a request - * reference - */ - if (oap->oap_request) { - ptlrpc_mark_interrupted(oap->oap_request); - ptlrpcd_wake(oap->oap_request); - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; - } - } - - osc_list_maint(cli, obj); - return rc; -} - -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags) -{ - struct client_obd *cli = osc_cli(obj); - struct osc_extent *ext; - struct osc_async_page *oap, *tmp; - int page_count = 0; - int mppr = cli->cl_max_pages_per_rpc; - bool can_merge = true; - pgoff_t start = CL_PAGE_EOF; - pgoff_t end = 0; - - list_for_each_entry(oap, list, oap_pending_item) { - struct osc_page *opg = oap2osc_page(oap); - pgoff_t index = osc_index(opg); - - if (index > end) - end = index; - if (index < start) - start = index; - ++page_count; - mppr <<= (page_count > mppr); - - if (unlikely(opg->ops_from > 0 || opg->ops_to < PAGE_SIZE)) - can_merge = false; - } - - ext = osc_extent_alloc(obj); - if (!ext) { - list_for_each_entry_safe(oap, tmp, list, oap_pending_item) { - list_del_init(&oap->oap_pending_item); - osc_ap_completion(env, cli, oap, 0, -ENOMEM); - } - return -ENOMEM; - } - - ext->oe_rw = !!(cmd & OBD_BRW_READ); - ext->oe_sync = 1; - ext->oe_no_merge = !can_merge; - ext->oe_urgent = 1; - ext->oe_start = start; - ext->oe_end = end; - ext->oe_max_end = end; - ext->oe_obj = obj; - ext->oe_srvlock = !!(brw_flags & OBD_BRW_SRVLOCK); - ext->oe_nr_pages = page_count; - ext->oe_mppr = mppr; - list_splice_init(list, &ext->oe_pages); - - osc_object_lock(obj); - /* Reuse the initial refcount for RPC, don't drop it */ - osc_extent_state_set(ext, OES_LOCK_DONE); - if (cmd & OBD_BRW_WRITE) { - list_add_tail(&ext->oe_link, &obj->oo_urgent_exts); - osc_update_pending(obj, OBD_BRW_WRITE, page_count); - } else { - list_add_tail(&ext->oe_link, &obj->oo_reading_exts); - osc_update_pending(obj, OBD_BRW_READ, page_count); - } - osc_object_unlock(obj); - - osc_io_unplug_async(env, cli, obj); - return 0; -} - -/** - * Called by osc_io_setattr_start() to freeze and destroy covering extents. 
- */
-int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj,
-			     u64 size, struct osc_extent **extp)
-{
-	struct client_obd *cli = osc_cli(obj);
-	struct osc_extent *ext;
-	struct osc_extent *temp;
-	struct osc_extent *waiting = NULL;
-	pgoff_t index;
-	LIST_HEAD(list);
-	int result = 0;
-	bool partial;
-
-	/* Pages with index greater than or equal to @index will be
-	 * truncated.
-	 */
-	index = cl_index(osc2cl(obj), size);
-	partial = size > cl_offset(osc2cl(obj), index);
-
-again:
-	osc_object_lock(obj);
-	ext = osc_extent_search(obj, index);
-	if (!ext)
-		ext = first_extent(obj);
-	else if (ext->oe_end < index)
-		ext = next_extent(ext);
-	while (ext) {
-		EASSERT(ext->oe_state != OES_TRUNC, ext);
-
-		if (ext->oe_state > OES_CACHE || ext->oe_urgent) {
-			/* If ext is in the urgent state, a page in it must
-			 * already have been flushed by write_page(). We have
-			 * to wait for this extent because we can't truncate
-			 * that page.
-			 */
-			OSC_EXTENT_DUMP(D_CACHE, ext,
-					"waiting for busy extent\n");
-			waiting = osc_extent_get(ext);
-			break;
-		}
-
-		OSC_EXTENT_DUMP(D_CACHE, ext, "try to trunc:%llu.\n", size);
-
-		osc_extent_get(ext);
-		if (ext->oe_state == OES_ACTIVE) {
-			/* Although we grab the inode mutex for the write
-			 * path, we release it before releasing the extent
-			 * (in osc_io_end()), so there is a race window in
-			 * which an extent is still in OES_ACTIVE when
-			 * truncate starts.
-			 */
-			LASSERT(!ext->oe_trunc_pending);
-			ext->oe_trunc_pending = 1;
-		} else {
-			EASSERT(ext->oe_state == OES_CACHE, ext);
-			osc_extent_state_set(ext, OES_TRUNC);
-			osc_update_pending(obj, OBD_BRW_WRITE,
-					   -ext->oe_nr_pages);
-		}
-		EASSERT(list_empty(&ext->oe_link), ext);
-		list_add_tail(&ext->oe_link, &list);
-
-		ext = next_extent(ext);
-	}
-	osc_object_unlock(obj);
-
-	osc_list_maint(cli, obj);
-
-	list_for_each_entry_safe(ext, temp, &list, oe_link) {
-		int rc;
-
-		list_del_init(&ext->oe_link);
-
-		/* The extent may be in OES_ACTIVE state because the inode
-		 * mutex is released before osc_io_end() in the file write
-		 * case.
-		 */
-		if (ext->oe_state != OES_TRUNC)
-			osc_extent_wait(env, ext, OES_TRUNC);
-
-		rc = osc_extent_truncate(ext, index, partial);
-		if (rc < 0) {
-			if (result == 0)
-				result = rc;
-
-			OSC_EXTENT_DUMP(D_ERROR, ext,
-					"truncate error %d\n", rc);
-		} else if (ext->oe_nr_pages == 0) {
-			osc_extent_remove(ext);
-		} else {
-			/* This must be an overlapping extent, which means
-			 * only part of the pages in this extent have been
-			 * truncated.
-			 */
-			EASSERTF(ext->oe_start <= index, ext,
-				 "trunc index = %lu/%d.\n", index, partial);
-			/* fix index to skip this partially truncated extent */
-			index = ext->oe_end + 1;
-			partial = false;
-
-			/* We need to hold this extent in OES_TRUNC state so
-			 * that no writeback will happen; this avoids
-			 * BUG 17397.
-			 * Only a partial truncate can reach here, so if @size
-			 * is not zero the caller must provide a valid @extp.
-			 */
-			LASSERT(!*extp);
-			*extp = osc_extent_get(ext);
-			OSC_EXTENT_DUMP(D_CACHE, ext,
-					"trunc at %llu\n", size);
-		}
-		osc_extent_put(env, ext);
-	}
-	if (waiting) {
-		int rc;
-
-		/* Ignore the result of osc_extent_wait; the write initiator
-		 * should take care of it.
-		 */
-		rc = osc_extent_wait(env, waiting, OES_INV);
-		if (rc < 0)
-			OSC_EXTENT_DUMP(D_CACHE, waiting, "error: %d.\n", rc);
-
-		osc_extent_put(env, waiting);
-		waiting = NULL;
-		goto again;
-	}
-	return result;
-}
-
-/**
- * Called after osc_io_setattr_end to add oio->oi_trunc back to cache.
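osc_cache_truncate_start() above first converts the byte size into a page index plus a "partial" flag. With cl_index()/cl_offset() behaving as shift-by-PAGE_SHIFT helpers (an assumption made for this sketch), the computation reduces to:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT_DEMO 12	/* assume 4 KiB pages for the example */

/* First page index affected by a truncate to 'size' bytes, and whether
 * that page is only partially truncated (mirrors truncate_start). */
static void trunc_point(unsigned long long size,
			unsigned long *index, bool *partial)
{
	*index = size >> PAGE_SHIFT_DEMO;
	*partial = size > ((unsigned long long)*index << PAGE_SHIFT_DEMO);
}

int main(void)
{
	unsigned long idx;
	bool part;

	trunc_point(8192, &idx, &part);	/* page-aligned: idx=2, partial=0 */
	printf("%lu %d\n", idx, part);
	trunc_point(8193, &idx, &part);	/* idx=2, partial=1: keep head */
	printf("%lu %d\n", idx, part);
	return 0;
}

A partial first page is why the loop later bumps @index past a partially truncated extent and clears @partial before rescanning.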
- */
-void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext)
-{
-	if (ext) {
-		struct osc_object *obj = ext->oe_obj;
-		bool unplug = false;
-
-		EASSERT(ext->oe_nr_pages > 0, ext);
-		EASSERT(ext->oe_state == OES_TRUNC, ext);
-		EASSERT(!ext->oe_urgent, ext);
-
-		OSC_EXTENT_DUMP(D_CACHE, ext, "trunc -> cache.\n");
-		osc_object_lock(obj);
-		osc_extent_state_set(ext, OES_CACHE);
-		if (ext->oe_fsync_wait && !ext->oe_urgent) {
-			ext->oe_urgent = 1;
-			list_move_tail(&ext->oe_link, &obj->oo_urgent_exts);
-			unplug = true;
-		}
-		osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages);
-		osc_object_unlock(obj);
-		osc_extent_put(env, ext);
-
-		if (unplug)
-			osc_io_unplug_async(env, osc_cli(obj), obj);
-	}
-}
-
-/**
- * Wait for extents in a specific range to be written out.
- * The caller must have called osc_cache_writeback_range() to issue the IO;
- * otherwise this function can take a long time to finish.
- *
- * The caller must hold the inode mutex, or cancel the exclusive DLM lock,
- * so that nobody else can dirty this range of the file while we're waiting
- * for extents to be written.
- */
-int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj,
-			 pgoff_t start, pgoff_t end)
-{
-	struct osc_extent *ext;
-	pgoff_t index = start;
-	int result = 0;
-
-again:
-	osc_object_lock(obj);
-	ext = osc_extent_search(obj, index);
-	if (!ext)
-		ext = first_extent(obj);
-	else if (ext->oe_end < index)
-		ext = next_extent(ext);
-	while (ext) {
-		int rc;
-
-		if (ext->oe_start > end)
-			break;
-
-		if (!ext->oe_fsync_wait) {
-			ext = next_extent(ext);
-			continue;
-		}
-
-		EASSERT(ergo(ext->oe_state == OES_CACHE,
-			     ext->oe_hp || ext->oe_urgent), ext);
-		EASSERT(ergo(ext->oe_state == OES_ACTIVE,
-			     !ext->oe_hp && ext->oe_urgent), ext);
-
-		index = ext->oe_end + 1;
-		osc_extent_get(ext);
-		osc_object_unlock(obj);
-
-		rc = osc_extent_wait(env, ext, OES_INV);
-		if (result == 0)
-			result = rc;
-		osc_extent_put(env, ext);
-		goto again;
-	}
-	osc_object_unlock(obj);
-
-	OSC_IO_DEBUG(obj, "sync file range.\n");
-	return result;
-}
-
-/**
- * Called to write out a range of an osc object.
- *
- * @hp     : should be set if this is caused by lock cancellation;
- * @discard: set if dirty pages should be dropped - the file is being deleted
- *	     or truncated, which implies there are no partially discarded
- *	     extents.
- *
- * Returns how many pages will be issued, or an error code if an error
- * occurred.
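osc_cache_writeback_range() below routes each cached extent to one of three queues depending on @hp, @discard and the extent's current flags. A compact restatement of that routing policy, with illustrative types only:

#include <stdio.h>

enum queue { Q_KEEP, Q_HP, Q_URGENT, Q_DISCARD };

/* Queue-selection policy for a cached extent, mirroring the OES_CACHE
 * branch below (the real code also asserts oe_hp is not already set
 * on the high-priority path). */
static enum queue route_extent(int hp, int discard,
			       int already_hp, int already_urgent)
{
	if (discard)
		return Q_DISCARD;	/* lock cancel: drop the pages */
	if (hp && !already_hp)
		return Q_HP;		/* high-priority writeback */
	if (!already_urgent)
		return Q_URGENT;	/* ordinary urgent writeback */
	return Q_KEEP;			/* already queued appropriately */
}

int main(void)
{
	printf("%d %d %d\n",
	       route_extent(0, 1, 0, 0),	/* Q_DISCARD */
	       route_extent(1, 0, 0, 0),	/* Q_HP */
	       route_extent(0, 0, 0, 1));	/* Q_KEEP */
	return 0;
}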
- */ -int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end, int hp, int discard) -{ - struct osc_extent *ext; - LIST_HEAD(discard_list); - bool unplug = false; - int result = 0; - - osc_object_lock(obj); - ext = osc_extent_search(obj, start); - if (!ext) - ext = first_extent(obj); - else if (ext->oe_end < start) - ext = next_extent(ext); - while (ext) { - if (ext->oe_start > end) - break; - - ext->oe_fsync_wait = 1; - switch (ext->oe_state) { - case OES_CACHE: - result += ext->oe_nr_pages; - if (!discard) { - struct list_head *list = NULL; - - if (hp) { - EASSERT(!ext->oe_hp, ext); - ext->oe_hp = 1; - list = &obj->oo_hp_exts; - } else if (!ext->oe_urgent) { - ext->oe_urgent = 1; - list = &obj->oo_urgent_exts; - } - if (list) - list_move_tail(&ext->oe_link, list); - unplug = true; - } else { - /* the only discarder is lock cancelling, so - * [start, end] must contain this extent - */ - EASSERT(ext->oe_start >= start && - ext->oe_max_end <= end, ext); - osc_extent_state_set(ext, OES_LOCKING); - ext->oe_owner = current; - list_move_tail(&ext->oe_link, &discard_list); - osc_update_pending(obj, OBD_BRW_WRITE, - -ext->oe_nr_pages); - } - break; - case OES_ACTIVE: - /* It's pretty bad to wait for ACTIVE extents, because - * we don't know how long we will wait for it to be - * flushed since it may be blocked at awaiting more - * grants. We do this for the correctness of fsync. - */ - LASSERT(hp == 0 && discard == 0); - ext->oe_urgent = 1; - break; - case OES_TRUNC: - /* this extent is being truncated, can't do anything - * for it now. it will be set to urgent after truncate - * is finished in osc_cache_truncate_end(). - */ - default: - break; - } - ext = next_extent(ext); - } - osc_object_unlock(obj); - - LASSERT(ergo(!discard, list_empty(&discard_list))); - if (!list_empty(&discard_list)) { - struct osc_extent *tmp; - int rc; - - osc_list_maint(osc_cli(obj), obj); - list_for_each_entry_safe(ext, tmp, &discard_list, oe_link) { - list_del_init(&ext->oe_link); - EASSERT(ext->oe_state == OES_LOCKING, ext); - - /* Discard caching pages. We don't actually write this - * extent out but we complete it as if we did. - */ - rc = osc_extent_make_ready(env, ext); - if (unlikely(rc < 0)) { - OSC_EXTENT_DUMP(D_ERROR, ext, - "make_ready returned %d\n", rc); - if (result >= 0) - result = rc; - } - - /* finish the extent as if the pages were sent */ - osc_extent_finish(env, ext, 0, 0); - } - } - - if (unplug) - osc_io_unplug(env, osc_cli(obj), obj); - - if (hp || discard) { - int rc; - - rc = osc_cache_wait_range(env, obj, start, end); - if (result >= 0 && rc < 0) - result = rc; - } - - OSC_IO_DEBUG(obj, "pageout [%lu, %lu], %d.\n", start, end, result); - return result; -} - -/** - * Returns a list of pages by a given [start, end] of \a obj. - * - * \param resched If not NULL, then we give up before hogging CPU for too - * long and set *resched = 1, in that case caller should implement a retry - * logic. - * - * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely - * crucial in the face of [offset, EOF] locks. - * - * Return at least one page in @queue unless there is no covered page. 
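osc_page_gang_lookup() below pulls pages out of the radix tree in OTI_PVEC_SIZE batches, takes a reference on each while still holding the tree spinlock, then drops the lock before doing the potentially blocking per-page work. A toy version of that two-phase batching skeleton over a plain array (no real locking or radix tree; all names are invented for the sketch):

#include <stdio.h>

#define BATCH 4

/* Process items in fixed-size batches: "collect under lock, work
 * outside the lock" -- the same shape as osc_page_gang_lookup(). */
static void gang_walk(const int *items, int n, void (*work)(int item))
{
	int pvec[BATCH];
	int pos = 0;

	while (pos < n) {
		int i, got = 0;

		/* phase 1: would run under the tree spinlock */
		while (got < BATCH && pos < n)
			pvec[got++] = items[pos++];	/* + grab a ref */

		/* phase 2: lock dropped; safe to block per item */
		for (i = 0; i < got; i++)
			work(pvec[i]);			/* + drop the ref */
	}
}

static void show(int v) { printf("%d ", v); }

int main(void)
{
	int items[] = { 1, 2, 3, 4, 5, 6, 7 };

	gang_walk(items, 7, show);
	printf("\n");
	return 0;
}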
- */ -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata) -{ - struct osc_page *ops; - void **pvec; - pgoff_t idx; - unsigned int nr; - unsigned int i; - unsigned int j; - int res = CLP_GANG_OKAY; - bool tree_lock = true; - - idx = start; - pvec = osc_env_info(env)->oti_pvec; - spin_lock(&osc->oo_tree_lock); - while ((nr = radix_tree_gang_lookup(&osc->oo_tree, pvec, - idx, OTI_PVEC_SIZE)) > 0) { - struct cl_page *page; - bool end_of_region = false; - - for (i = 0, j = 0; i < nr; ++i) { - ops = pvec[i]; - pvec[i] = NULL; - - idx = osc_index(ops); - if (idx > end) { - end_of_region = true; - break; - } - - page = ops->ops_cl.cpl_page; - LASSERT(page->cp_type == CPT_CACHEABLE); - if (page->cp_state == CPS_FREEING) - continue; - - cl_page_get(page); - lu_ref_add_atomic(&page->cp_reference, - "gang_lookup", current); - pvec[j++] = ops; - } - ++idx; - - /* - * Here a delicate locking dance is performed. Current thread - * holds a reference to a page, but has to own it before it - * can be placed into queue. Owning implies waiting, so - * radix-tree lock is to be released. After a wait one has to - * check that pages weren't truncated (cl_page_own() returns - * error in the latter case). - */ - spin_unlock(&osc->oo_tree_lock); - tree_lock = false; - - for (i = 0; i < j; ++i) { - ops = pvec[i]; - if (res == CLP_GANG_OKAY) - res = (*cb)(env, io, ops, cbdata); - - page = ops->ops_cl.cpl_page; - lu_ref_del(&page->cp_reference, "gang_lookup", current); - cl_page_put(env, page); - } - if (nr < OTI_PVEC_SIZE || end_of_region) - break; - - if (res == CLP_GANG_OKAY && need_resched()) - res = CLP_GANG_RESCHED; - if (res != CLP_GANG_OKAY) - break; - - spin_lock(&osc->oo_tree_lock); - tree_lock = true; - } - if (tree_lock) - spin_unlock(&osc->oo_tree_lock); - return res; -} - -/** - * Check if page @page is covered by an extra lock or discard it. - */ -static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct osc_thread_info *info = osc_env_info(env); - struct osc_object *osc = cbdata; - pgoff_t index; - - index = osc_index(ops); - if (index >= info->oti_fn_index) { - struct ldlm_lock *tmp; - struct cl_page *page = ops->ops_cl.cpl_page; - - /* refresh non-overlapped index */ - tmp = osc_dlmlock_at_pgoff(env, osc, index, - OSC_DAP_FL_TEST_LOCK); - if (tmp) { - __u64 end = tmp->l_policy_data.l_extent.end; - /* Cache the first-non-overlapped index so as to skip - * all pages within [index, oti_fn_index). This is safe - * because if tmp lock is canceled, it will discard - * these pages. - */ - info->oti_fn_index = cl_index(osc2cl(osc), end + 1); - if (end == OBD_OBJECT_EOF) - info->oti_fn_index = CL_PAGE_EOF; - LDLM_LOCK_PUT(tmp); - } else if (cl_page_own(env, io, page) == 0) { - /* discard the page */ - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - } else { - LASSERT(page->cp_state == CPS_FREEING); - } - } - - info->oti_next_index = index + 1; - return CLP_GANG_OKAY; -} - -static int discard_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct osc_thread_info *info = osc_env_info(env); - struct cl_page *page = ops->ops_cl.cpl_page; - - /* page is top page. 
*/ - info->oti_next_index = osc_index(ops) + 1; - if (cl_page_own(env, io, page) == 0) { - if (page->cp_type == CPT_CACHEABLE && - PageDirty(cl_page_vmpage(page))) - CL_PAGE_DEBUG(D_ERROR, env, page, - "discard dirty page?\n"); - - /* discard the page */ - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - } else { - LASSERT(page->cp_state == CPS_FREEING); - } - - return CLP_GANG_OKAY; -} - -/** - * Discard pages protected by the given lock. This function traverses radix - * tree to find all covering pages and discard them. If a page is being covered - * by other locks, it should remain in cache. - * - * If error happens on any step, the process continues anyway (the reasoning - * behind this being that lock cancellation cannot be delayed indefinitely). - */ -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, enum cl_lock_mode mode) -{ - struct osc_thread_info *info = osc_env_info(env); - struct cl_io *io = &info->oti_io; - osc_page_gang_cbt cb; - int res; - int result; - - io->ci_obj = cl_object_top(osc2cl(osc)); - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result != 0) - goto out; - - cb = mode == CLM_READ ? check_and_discard_cb : discard_cb; - info->oti_fn_index = start; - info->oti_next_index = start; - do { - res = osc_page_gang_lookup(env, io, osc, - info->oti_next_index, end, cb, osc); - if (info->oti_next_index > end) - break; - - if (res == CLP_GANG_RESCHED) - cond_resched(); - } while (res != CLP_GANG_OKAY); -out: - cl_io_fini(env, io); - return result; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h b/drivers/staging/lustre/lustre/osc/osc_cl_internal.h deleted file mode 100644 index 2d3cba16ef345df1f250723bd22107683913b149..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_cl_internal.h +++ /dev/null @@ -1,681 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Internal interfaces of OSC layer. 
- * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#ifndef OSC_CL_INTERNAL_H -#define OSC_CL_INTERNAL_H - -#include -/* osc_build_res_name() */ -#include -#include "osc_internal.h" - -/** \defgroup osc osc - * @{ - */ - -struct osc_extent; - -/** - * State maintained by osc layer for each IO context. - */ -struct osc_io { - /** super class */ - struct cl_io_slice oi_cl; - /** true if this io is lockless. */ - unsigned int oi_lockless:1, - /** true if this io is counted as active IO */ - oi_is_active:1; - /** how many LRU pages are reserved for this IO */ - unsigned long oi_lru_reserved; - - /** active extents, we know how many bytes is going to be written, - * so having an active extent will prevent it from being fragmented - */ - struct osc_extent *oi_active; - /** partially truncated extent, we need to hold this extent to prevent - * page writeback from happening. - */ - struct osc_extent *oi_trunc; - - /** write osc_lock for this IO, used by osc_extent_find(). */ - struct osc_lock *oi_write_osclock; - struct obdo oi_oa; - struct osc_async_cbargs { - bool opc_rpc_sent; - int opc_rc; - struct completion opc_sync; - } oi_cbarg; -}; - -/** - * State maintained by osc layer for the duration of a system call. - */ -struct osc_session { - struct osc_io os_io; -}; - -#define OTI_PVEC_SIZE 256 -struct osc_thread_info { - struct ldlm_res_id oti_resname; - union ldlm_policy_data oti_policy; - struct cl_lock_descr oti_descr; - struct cl_attr oti_attr; - struct lustre_handle oti_handle; - struct cl_page_list oti_plist; - struct cl_io oti_io; - void *oti_pvec[OTI_PVEC_SIZE]; - /** - * Fields used by cl_lock_discard_pages(). - */ - pgoff_t oti_next_index; - pgoff_t oti_fn_index; /* first non-overlapped index */ - struct cl_sync_io oti_anchor; - struct cl_req_attr oti_req_attr; -}; - -struct osc_object { - struct cl_object oo_cl; - struct lov_oinfo *oo_oinfo; - /** - * True if locking against this stripe got -EUSERS. - */ - int oo_contended; - unsigned long oo_contention_time; - /** - * used by the osc to keep track of what objects to build into rpcs. - * Protected by client_obd->cli_loi_list_lock. - */ - struct list_head oo_ready_item; - struct list_head oo_hp_ready_item; - struct list_head oo_write_item; - struct list_head oo_read_item; - - /** - * extent is a red black tree to manage (async) dirty pages. - */ - struct rb_root oo_root; - /** - * Manage write(dirty) extents. - */ - struct list_head oo_hp_exts; /* list of hp extents */ - struct list_head oo_urgent_exts; /* list of writeback extents */ - struct list_head oo_rpc_exts; - - struct list_head oo_reading_exts; - - atomic_t oo_nr_reads; - atomic_t oo_nr_writes; - - /** Protect extent tree. Will be used to protect - * oo_{read|write}_pages soon. 
- */ - spinlock_t oo_lock; - - /** - * Radix tree for caching pages - */ - struct radix_tree_root oo_tree; - spinlock_t oo_tree_lock; - unsigned long oo_npages; - - /* Protect osc_lock this osc_object has */ - spinlock_t oo_ol_spin; - struct list_head oo_ol_list; - - /** number of active IOs of this object */ - atomic_t oo_nr_ios; - wait_queue_head_t oo_io_waitq; -}; - -static inline void osc_object_lock(struct osc_object *obj) -{ - spin_lock(&obj->oo_lock); -} - -static inline int osc_object_trylock(struct osc_object *obj) -{ - return spin_trylock(&obj->oo_lock); -} - -static inline void osc_object_unlock(struct osc_object *obj) -{ - spin_unlock(&obj->oo_lock); -} - -static inline int osc_object_is_locked(struct osc_object *obj) -{ -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) - return spin_is_locked(&obj->oo_lock); -#else - /* - * It is not perfect to return true all the time. - * But since this function is only used for assertion - * and checking, it seems OK. - */ - return 1; -#endif -} - -/* - * Lock "micro-states" for osc layer. - */ -enum osc_lock_state { - OLS_NEW, - OLS_ENQUEUED, - OLS_UPCALL_RECEIVED, - OLS_GRANTED, - OLS_CANCELLED -}; - -/** - * osc-private state of cl_lock. - * - * Interaction with DLM. - * - * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in - * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_dlmlock. - * - * This pointer is protected through a reference, acquired by - * osc_lock_upcall0(). Also, an additional reference is acquired by - * ldlm_lock_addref() call protecting the lock from cancellation, until - * osc_lock_unuse() releases it. - * - * Below is a description of how lock references are acquired and released - * inside of DLM. - * - * - When new lock is created and enqueued to the server (ldlm_cli_enqueue()) - * - ldlm_lock_create() - * - ldlm_lock_new(): initializes a lock with 2 references. One for - * the caller (released when reply from the server is received, or on - * error), and another for the hash table. - * - ldlm_lock_addref_internal(): protects the lock from cancellation. - * - * - When reply is received from the server (osc_enqueue_interpret()) - * - ldlm_cli_enqueue_fini() - * - LDLM_LOCK_PUT(): releases caller reference acquired by - * ldlm_lock_new(). - * - if (rc != 0) - * ldlm_lock_decref(): error case: matches ldlm_cli_enqueue(). - * - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue(). - * - * - When lock is being cancelled (ldlm_lock_cancel()) - * - ldlm_lock_destroy() - * - LDLM_LOCK_PUT(): releases hash-table reference acquired by - * ldlm_lock_new(). - * - * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called - * either when lock is cancelled (osc_lock_blocking()), or when locks is - * deleted without cancellation (e.g., from cl_locks_prune()). In the latter - * case ldlm lock remains in memory, and can be re-attached to osc_lock in the - * future. - */ -struct osc_lock { - struct cl_lock_slice ols_cl; - /** Internal lock to protect states, etc. 
*/ - spinlock_t ols_lock; - /** Owner sleeps on this channel for state change */ - struct cl_sync_io *ols_owner; - /** waiting list for this lock to be cancelled */ - struct list_head ols_waiting_list; - /** wait entry of ols_waiting_list */ - struct list_head ols_wait_entry; - /** list entry for osc_object::oo_ol_list */ - struct list_head ols_nextlock_oscobj; - - /** underlying DLM lock */ - struct ldlm_lock *ols_dlmlock; - /** DLM flags with which osc_lock::ols_lock was enqueued */ - __u64 ols_flags; - /** osc_lock::ols_lock handle */ - struct lustre_handle ols_handle; - struct ldlm_enqueue_info ols_einfo; - enum osc_lock_state ols_state; - /** lock value block */ - struct ost_lvb ols_lvb; - - /** - * true, if ldlm_lock_addref() was called against - * osc_lock::ols_lock. This is used for sanity checking. - * - * \see osc_lock::ols_has_ref - */ - unsigned ols_hold :1, - /** - * this is much like osc_lock::ols_hold, except that this bit is - * cleared _after_ reference in released in osc_lock_unuse(). This - * fine distinction is needed because: - * - * - if ldlm lock still has a reference, osc_ast_data_get() needs - * to return associated cl_lock (so that a flag is needed that is - * cleared after ldlm_lock_decref() returned), and - * - * - ldlm_lock_decref() can invoke blocking ast (for a - * LDLM_FL_CBPENDING lock), and osc_lock functions like - * osc_lock_cancel() called from there need to know whether to - * release lock reference (so that a flag is needed that is - * cleared before ldlm_lock_decref() is called). - */ - ols_has_ref:1, - /** - * inherit the lockless attribute from top level cl_io. - * If true, osc_lock_enqueue is able to tolerate the -EUSERS error. - */ - ols_locklessable:1, - /** - * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat - * the EVAVAIL error as tolerable, this will make upper logic happy - * to wait all glimpse locks to each OSTs to be completed. - * Glimpse lock converts to normal lock if the server lock is - * granted. - * Glimpse lock should be destroyed immediately after use. - */ - ols_glimpse:1, - /** - * For async glimpse lock. - */ - ols_agl:1; -}; - -/** - * Page state private for osc layer. - */ -struct osc_page { - struct cl_page_slice ops_cl; - /** - * Page queues used by osc to detect when RPC can be formed. - */ - struct osc_async_page ops_oap; - /** - * An offset within page from which next transfer starts. This is used - * by cl_page_clip() to submit partial page transfers. - */ - int ops_from; - /** - * An offset within page at which next transfer ends. - * - * \see osc_page::ops_from. - */ - int ops_to; - /** - * Boolean, true iff page is under transfer. Used for sanity checking. - */ - unsigned ops_transfer_pinned:1, - /** - * in LRU? - */ - ops_in_lru:1, - /** - * Set if the page must be transferred with OBD_BRW_SRVLOCK. - */ - ops_srvlock:1; - /** - * lru page list. See osc_lru_{del|use}() in osc_page.c for usage. - */ - struct list_head ops_lru; - /** - * Submit time - the time when the page is starting RPC. For debugging. 
- */ - unsigned long ops_submit_time; -}; - -extern struct kmem_cache *osc_lock_kmem; -extern struct kmem_cache *osc_object_kmem; -extern struct kmem_cache *osc_thread_kmem; -extern struct kmem_cache *osc_session_kmem; -extern struct kmem_cache *osc_extent_kmem; - -extern struct lu_device_type osc_device_type; -extern struct lu_context_key osc_key; -extern struct lu_context_key osc_session_key; - -#define OSC_FLAGS (ASYNC_URGENT | ASYNC_READY) - -int osc_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io); -int osc_io_init(const struct lu_env *env, - struct cl_object *obj, struct cl_io *io); -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *hdr, - struct lu_device *dev); -int osc_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t ind); - -void osc_index2policy(union ldlm_policy_data *policy, - const struct cl_object *obj, - pgoff_t start, pgoff_t end); -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb); - -void osc_lru_add_batch(struct client_obd *cli, struct list_head *list); -void osc_page_submit(const struct lu_env *env, struct osc_page *opg, - enum cl_req_type crt, int brw_flags); -int osc_cancel_async_page(const struct lu_env *env, struct osc_page *ops); -int osc_set_async_flags(struct osc_object *obj, struct osc_page *opg, - u32 async_flags); -int osc_prep_async_page(struct osc_object *osc, struct osc_page *ops, - struct page *page, loff_t offset); -int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops); -int osc_page_cache_add(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io); -int osc_teardown_async_page(const struct lu_env *env, struct osc_object *obj, - struct osc_page *ops); -int osc_flush_async_page(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops); -int osc_queue_sync_pages(const struct lu_env *env, struct osc_object *obj, - struct list_head *list, int cmd, int brw_flags); -int osc_cache_truncate_start(const struct lu_env *env, struct osc_object *obj, - u64 size, struct osc_extent **extp); -void osc_cache_truncate_end(const struct lu_env *env, struct osc_extent *ext); -int osc_cache_writeback_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end, int hp, int discard); -int osc_cache_wait_range(const struct lu_env *env, struct osc_object *obj, - pgoff_t start, pgoff_t end); -void osc_io_unplug(const struct lu_env *env, struct client_obd *cli, - struct osc_object *osc); -int lru_queue_work(const struct lu_env *env, void *data); - -void osc_object_set_contended(struct osc_object *obj); -void osc_object_clear_contended(struct osc_object *obj); -int osc_object_is_contended(struct osc_object *obj); - -int osc_lock_is_lockless(const struct osc_lock *olck); - -/***************************************************************************** - * - * Accessors. 
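The accessors that follow (cl2osc(), cl2osc_page(), oap2osc() and friends) are thin container_of() wrappers: given a pointer to an embedded slice, they recover the enclosing object. A self-contained illustration of the same idiom in generic C, not the driver's types:

#include <stddef.h>
#include <stdio.h>

#define container_of_demo(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct slice { int kind; };

struct object {
	int id;
	struct slice sl;	/* embedded, like oo_cl in osc_object */
};

int main(void)
{
	struct object o = { .id = 42, .sl = { .kind = 1 } };
	struct slice *s = &o.sl;

	/* Recover the enclosing object from the embedded slice. */
	struct object *back = container_of_demo(s, struct object, sl);

	printf("id=%d\n", back->id);	/* 42 */
	return 0;
}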
- * - */ - -static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) -{ - struct osc_thread_info *info; - - info = lu_context_key_get(&env->le_ctx, &osc_key); - LASSERT(info); - return info; -} - -static inline struct osc_session *osc_env_session(const struct lu_env *env) -{ - struct osc_session *ses; - - ses = lu_context_key_get(env->le_ses, &osc_session_key); - LASSERT(ses); - return ses; -} - -static inline struct osc_io *osc_env_io(const struct lu_env *env) -{ - return &osc_env_session(env)->os_io; -} - -static inline int osc_is_object(const struct lu_object *obj) -{ - return obj->lo_dev->ld_type == &osc_device_type; -} - -static inline struct osc_device *lu2osc_dev(const struct lu_device *d) -{ - LINVRNT(d->ld_type == &osc_device_type); - return container_of(d, struct osc_device, od_cl.cd_lu_dev); -} - -static inline struct obd_export *osc_export(const struct osc_object *obj) -{ - return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; -} - -static inline struct client_obd *osc_cli(const struct osc_object *obj) -{ - return &osc_export(obj)->exp_obd->u.cli; -} - -static inline struct osc_object *cl2osc(const struct cl_object *obj) -{ - LINVRNT(osc_is_object(&obj->co_lu)); - return container_of(obj, struct osc_object, oo_cl); -} - -static inline struct cl_object *osc2cl(const struct osc_object *obj) -{ - return (struct cl_object *)&obj->oo_cl; -} - -static inline enum ldlm_mode osc_cl_lock2ldlm(enum cl_lock_mode mode) -{ - LASSERT(mode == CLM_READ || mode == CLM_WRITE || mode == CLM_GROUP); - if (mode == CLM_READ) - return LCK_PR; - else if (mode == CLM_WRITE) - return LCK_PW; - else - return LCK_GROUP; -} - -static inline enum cl_lock_mode osc_ldlm2cl_lock(enum ldlm_mode mode) -{ - LASSERT(mode == LCK_PR || mode == LCK_PW || mode == LCK_GROUP); - if (mode == LCK_PR) - return CLM_READ; - else if (mode == LCK_PW) - return CLM_WRITE; - else - return CLM_GROUP; -} - -static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) -{ - LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); - return container_of(slice, struct osc_page, ops_cl); -} - -static inline struct osc_page *oap2osc(struct osc_async_page *oap) -{ - return container_of_safe(oap, struct osc_page, ops_oap); -} - -static inline pgoff_t osc_index(struct osc_page *opg) -{ - return opg->ops_cl.cpl_index; -} - -static inline struct cl_page *oap2cl_page(struct osc_async_page *oap) -{ - return oap2osc(oap)->ops_cl.cpl_page; -} - -static inline struct osc_page *oap2osc_page(struct osc_async_page *oap) -{ - return (struct osc_page *)container_of(oap, struct osc_page, ops_oap); -} - -static inline struct osc_page * -osc_cl_page_osc(struct cl_page *page, struct osc_object *osc) -{ - const struct cl_page_slice *slice; - - LASSERT(osc); - slice = cl_object_page_slice(&osc->oo_cl, page); - return cl2osc_page(slice); -} - -static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) -{ - LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); - return container_of(slice, struct osc_lock, ols_cl); -} - -static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) -{ - return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); -} - -static inline int osc_io_srvlock(struct osc_io *oio) -{ - return (oio->oi_lockless && !oio->oi_cl.cis_io->ci_no_srvlock); -} - -enum osc_extent_state { - OES_INV = 0, /** extent is just initialized or destroyed */ - OES_ACTIVE = 1, /** process is using this extent */ - OES_CACHE = 2, /** extent is ready for IO */ - OES_LOCKING = 3, /** locking page to prepare IO */ - 
OES_LOCK_DONE = 4, /** locking finished, ready to send */ - OES_RPC = 5, /** in RPC */ - OES_TRUNC = 6, /** being truncated */ - OES_STATE_MAX -}; - -/** - * osc_extent data to manage dirty pages. - * osc_extent has the following attributes: - * 1. all pages in the same must be in one RPC in write back; - * 2. # of pages must be less than max_pages_per_rpc - implied by 1; - * 3. must be covered by only 1 osc_lock; - * 4. exclusive. It's impossible to have overlapped osc_extent. - * - * The lifetime of an extent is from when the 1st page is dirtied to when - * all pages inside it are written out. - * - * LOCKING ORDER - * ============= - * page lock -> cl_loi_list_lock -> object lock(osc_object::oo_lock) - */ -struct osc_extent { - /** red-black tree node */ - struct rb_node oe_node; - /** osc_object of this extent */ - struct osc_object *oe_obj; - /** refcount, removed from red-black tree if reaches zero. */ - atomic_t oe_refc; - /** busy if non-zero */ - atomic_t oe_users; - /** link list of osc_object's oo_{hp|urgent|locking}_exts. */ - struct list_head oe_link; - /** state of this extent */ - enum osc_extent_state oe_state; - /** flags for this extent. */ - unsigned int oe_intree:1, - /** 0 is write, 1 is read */ - oe_rw:1, - /** sync extent, queued by osc_queue_sync_pages() */ - oe_sync:1, - /** set if this extent has partial, sync pages. - * Extents with partial page(s) can't merge with others in RPC - */ - oe_no_merge:1, - oe_srvlock:1, - oe_memalloc:1, - /** an ACTIVE extent is going to be truncated, so when this extent - * is released, it will turn into TRUNC state instead of CACHE. - */ - oe_trunc_pending:1, - /** this extent should be written asap and someone may wait for the - * write to finish. This bit is usually set along with urgent if - * the extent was CACHE state. - * fsync_wait extent can't be merged because new extent region may - * exceed fsync range. - */ - oe_fsync_wait:1, - /** covering lock is being canceled */ - oe_hp:1, - /** this extent should be written back asap. set if one of pages is - * called by page WB daemon, or sync write or reading requests. - */ - oe_urgent:1; - /** how many grants allocated for this extent. - * Grant allocated for this extent. There is no grant allocated - * for reading extents and sync write extents. - */ - unsigned int oe_grants; - /** # of dirty pages in this extent */ - unsigned int oe_nr_pages; - /** list of pending oap pages. Pages in this list are NOT sorted. */ - struct list_head oe_pages; - /** Since an extent has to be written out in atomic, this is used to - * remember the next page need to be locked to write this extent out. - * Not used right now. - */ - struct osc_page *oe_next_page; - /** start and end index of this extent, include start and end - * themselves. Page offset here is the page index of osc_pages. - * oe_start is used as keyword for red-black tree. - */ - pgoff_t oe_start; - pgoff_t oe_end; - /** maximum ending index of this extent, this is limited by - * max_pages_per_rpc, lock extent and chunk size. - */ - pgoff_t oe_max_end; - /** waitqueue - for those who want to be notified if this extent's - * state has changed. - */ - wait_queue_head_t oe_waitq; - /** lock covering this extent */ - struct ldlm_lock *oe_dlmlock; - /** terminator of this extent. Must be true if this extent is in IO. */ - struct task_struct *oe_owner; - /** return value of writeback. If somebody is waiting for this extent, - * this value can be known by outside world. 
- */ - int oe_rc; - /** max pages per rpc when this extent was created */ - unsigned int oe_mppr; -}; - -int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, - int sent, int rc); -void osc_extent_release(const struct lu_env *env, struct osc_extent *ext); - -int osc_lock_discard_pages(const struct lu_env *env, struct osc_object *osc, - pgoff_t start, pgoff_t end, enum cl_lock_mode mode); - -typedef int (*osc_page_gang_cbt)(const struct lu_env *, struct cl_io *, - struct osc_page *, void *); -int osc_page_gang_lookup(const struct lu_env *env, struct cl_io *io, - struct osc_object *osc, pgoff_t start, pgoff_t end, - osc_page_gang_cbt cb, void *cbdata); -/** @} osc */ - -#endif /* OSC_CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/osc/osc_dev.c b/drivers/staging/lustre/lustre/osc/osc_dev.c deleted file mode 100644 index 2b5f324743e2533da2dc740eba8d3db931693ac7..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_dev.c +++ /dev/null @@ -1,246 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_device, for OSC layer. - * - * Author: Nikita Danilov - */ - -#define DEBUG_SUBSYSTEM S_OSC - -/* class_name2obd() */ -#include - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -struct kmem_cache *osc_lock_kmem; -struct kmem_cache *osc_object_kmem; -struct kmem_cache *osc_thread_kmem; -struct kmem_cache *osc_session_kmem; -struct kmem_cache *osc_extent_kmem; -struct kmem_cache *osc_quota_kmem; - -struct lu_kmem_descr osc_caches[] = { - { - .ckd_cache = &osc_lock_kmem, - .ckd_name = "osc_lock_kmem", - .ckd_size = sizeof(struct osc_lock) - }, - { - .ckd_cache = &osc_object_kmem, - .ckd_name = "osc_object_kmem", - .ckd_size = sizeof(struct osc_object) - }, - { - .ckd_cache = &osc_thread_kmem, - .ckd_name = "osc_thread_kmem", - .ckd_size = sizeof(struct osc_thread_info) - }, - { - .ckd_cache = &osc_session_kmem, - .ckd_name = "osc_session_kmem", - .ckd_size = sizeof(struct osc_session) - }, - { - .ckd_cache = &osc_extent_kmem, - .ckd_name = "osc_extent_kmem", - .ckd_size = sizeof(struct osc_extent) - }, - { - .ckd_cache = &osc_quota_kmem, - .ckd_name = "osc_quota_kmem", - .ckd_size = sizeof(struct osc_quota_info) - }, - { - .ckd_cache = NULL - } -}; - -/***************************************************************************** - * - * Type conversions. 
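The osc_caches[] table above follows a common Lustre pattern: a NULL-terminated descriptor array that a helper (lu_kmem_init() in the Lustre tree) walks to create one slab cache per entry. A userspace sketch of the same table-driven initialization, with malloc standing in for kmem_cache_create():

#include <stdio.h>
#include <stdlib.h>

struct kmem_descr {
	void **cache;		/* where to store the created "cache" */
	const char *name;
	size_t size;
};

/* Walk a NULL-terminated descriptor table, creating each cache. */
static int demo_kmem_init(struct kmem_descr *d)
{
	for (; d->cache; d++) {
		*d->cache = malloc(d->size);	/* stand-in allocation */
		if (!*d->cache)
			return -1;
		printf("created %s (%zu bytes)\n", d->name, d->size);
	}
	return 0;
}

static void *lock_cache, *object_cache;

static struct kmem_descr caches[] = {
	{ &lock_cache,   "lock_kmem",   64 },
	{ &object_cache, "object_kmem", 128 },
	{ NULL, NULL, 0 }			/* terminator */
};

int main(void)
{
	return demo_kmem_init(caches);
}

The terminating entry with a NULL cache pointer plays the same role as the `.ckd_cache = NULL` sentinel above.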
- * - */ - -static struct lu_device *osc2lu_dev(struct osc_device *osc) -{ - return &osc->od_cl.cd_lu_dev; -} - -/***************************************************************************** - * - * Osc device and device type functions. - * - */ - -static void *osc_key_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct osc_thread_info *info; - - info = kmem_cache_zalloc(osc_thread_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void osc_key_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct osc_thread_info *info = data; - - kmem_cache_free(osc_thread_kmem, info); -} - -struct lu_context_key osc_key = { - .lct_tags = LCT_CL_THREAD, - .lct_init = osc_key_init, - .lct_fini = osc_key_fini -}; - -static void *osc_session_init(const struct lu_context *ctx, - struct lu_context_key *key) -{ - struct osc_session *info; - - info = kmem_cache_zalloc(osc_session_kmem, GFP_NOFS); - if (!info) - info = ERR_PTR(-ENOMEM); - return info; -} - -static void osc_session_fini(const struct lu_context *ctx, - struct lu_context_key *key, void *data) -{ - struct osc_session *info = data; - - kmem_cache_free(osc_session_kmem, info); -} - -struct lu_context_key osc_session_key = { - .lct_tags = LCT_SESSION, - .lct_init = osc_session_init, - .lct_fini = osc_session_fini -}; - -/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */ -LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); - -static int osc_cl_process_config(const struct lu_env *env, - struct lu_device *d, struct lustre_cfg *cfg) -{ - return osc_process_config_base(d->ld_obd, cfg); -} - -static const struct lu_device_operations osc_lu_ops = { - .ldo_object_alloc = osc_object_alloc, - .ldo_process_config = osc_cl_process_config, - .ldo_recovery_complete = NULL -}; - -static int osc_device_init(const struct lu_env *env, struct lu_device *d, - const char *name, struct lu_device *next) -{ - return 0; -} - -static struct lu_device *osc_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - return NULL; -} - -static struct lu_device *osc_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct osc_device *od = lu2osc_dev(d); - - cl_device_fini(lu2cl_dev(d)); - kfree(od); - return NULL; -} - -static struct lu_device *osc_device_alloc(const struct lu_env *env, - struct lu_device_type *t, - struct lustre_cfg *cfg) -{ - struct lu_device *d; - struct osc_device *od; - struct obd_device *obd; - int rc; - - od = kzalloc(sizeof(*od), GFP_NOFS); - if (!od) - return ERR_PTR(-ENOMEM); - - cl_device_init(&od->od_cl, t); - d = osc2lu_dev(od); - d->ld_ops = &osc_lu_ops; - - /* Setup OSC OBD */ - obd = class_name2obd(lustre_cfg_string(cfg, 0)); - LASSERT(obd); - rc = osc_setup(obd, cfg); - if (rc) { - osc_device_free(env, d); - return ERR_PTR(rc); - } - od->od_exp = obd->obd_self_export; - return d; -} - -static const struct lu_device_type_operations osc_device_type_ops = { - .ldto_init = osc_type_init, - .ldto_fini = osc_type_fini, - - .ldto_start = osc_type_start, - .ldto_stop = osc_type_stop, - - .ldto_device_alloc = osc_device_alloc, - .ldto_device_free = osc_device_free, - - .ldto_device_init = osc_device_init, - .ldto_device_fini = osc_device_fini -}; - -struct lu_device_type osc_device_type = { - .ldt_tags = LU_DEVICE_CL, - .ldt_name = LUSTRE_OSC_NAME, - .ldt_ops = &osc_device_type_ops, - .ldt_ctx_tags = LCT_CL_THREAD -}; - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_internal.h 
b/drivers/staging/lustre/lustre/osc/osc_internal.h deleted file mode 100644 index 4ddba1354befecf078d940fcfdd709c486953b74..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_internal.h +++ /dev/null @@ -1,237 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef OSC_INTERNAL_H -#define OSC_INTERNAL_H - -#define OAP_MAGIC 8675309 - -extern atomic_t osc_pool_req_count; -extern unsigned int osc_reqpool_maxreqcount; -extern struct ptlrpc_request_pool *osc_rq_pool; - -struct lu_env; - -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - * page is added to an rpc - */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - * to give the caller a chance to update - * or cancel the size of the io - */ - ASYNC_HP = 0x10, -}; - -struct osc_async_page { - int oap_magic; - unsigned short oap_cmd; - unsigned short oap_interrupted:1; - - struct list_head oap_pending_item; - struct list_head oap_rpc_item; - - u64 oap_obj_off; - unsigned int oap_page_off; - enum async_flags oap_async_flags; - - struct brw_page oap_brw_page; - - struct ptlrpc_request *oap_request; - struct client_obd *oap_cli; - struct osc_object *oap_obj; - - spinlock_t oap_lock; -}; - -#define oap_page oap_brw_page.pg -#define oap_count oap_brw_page.count -#define oap_brw_flags oap_brw_page.flag - -static inline struct osc_async_page *brw_page2oap(struct brw_page *pga) -{ - return (struct osc_async_page *)container_of(pga, struct osc_async_page, - oap_brw_page); -} - -struct osc_cache_waiter { - struct list_head ocw_entry; - wait_queue_head_t ocw_waitq; - struct osc_async_page *ocw_oap; - int ocw_grant; - int ocw_rc; -}; - -void osc_wake_cache_waiters(struct client_obd *cli); -int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes); -void osc_update_next_shrink(struct client_obd *cli); - -/* - * cl integration. 
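The async_flags bits above are OR-ed together and tested as a mask under oap_lock (osc_flush_async_page() and osc_io_submit() earlier in this patch both do so). A minimal illustration of the flag discipline, with demo names rather than the driver's enum and without the spinlock a real multi-threaded user would need:

#include <stdio.h>

enum demo_async_flags {
	DEMO_ASYNC_READY	= 0x1,	/* no make_ready call needed */
	DEMO_ASYNC_URGENT	= 0x2,	/* must go into an RPC now */
	DEMO_ASYNC_COUNT_STABLE	= 0x4,	/* byte count will not change */
	DEMO_ASYNC_HP		= 0x10,	/* high priority */
};

int main(void)
{
	unsigned int flags = 0;

	/* Mark a page ready and urgent, as the flush path does. */
	flags |= DEMO_ASYNC_READY | DEMO_ASYNC_URGENT;

	if (flags & DEMO_ASYNC_URGENT)
		printf("urgent, flags=0x%x\n", flags);

	flags &= ~DEMO_ASYNC_URGENT;	/* clear a single bit */
	printf("after clear, flags=0x%x\n", flags);
	return 0;
}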
- */ -#include - -extern struct ptlrpc_request_set *PTLRPCD_SET; - -typedef int (*osc_enqueue_upcall_f)(void *cookie, struct lustre_handle *lockh, - int rc); - -int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, - __u64 *flags, union ldlm_policy_data *policy, - struct ost_lvb *lvb, int kms_valid, - osc_enqueue_upcall_f upcall, - void *cookie, struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset, int async, int agl); - -int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, - enum ldlm_type type, union ldlm_policy_data *policy, - enum ldlm_mode mode, __u64 *flags, void *data, - struct lustre_handle *lockh, int unref); - -int osc_setattr_async(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); -int osc_sync_base(struct osc_object *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset); - -int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); -int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, - struct list_head *ext_list, int cmd); -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force); -unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages); -void osc_lru_unreserve(struct client_obd *cli, unsigned long npages); - -unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock); - -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); - -void lproc_osc_attach_seqstat(struct obd_device *dev); -void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars); - -extern struct lu_device_type osc_device_type; - -static inline int osc_recoverable_error(int rc) -{ - return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || - rc == -EAGAIN || rc == -EINPROGRESS); -} - -static inline unsigned long rpcs_in_flight(struct client_obd *cli) -{ - return cli->cl_r_in_flight + cli->cl_w_in_flight; -} - -static inline char *cli_name(struct client_obd *cli) -{ - return cli->cl_import->imp_obd->obd_name; -} - -struct osc_device { - struct cl_device od_cl; - struct obd_export *od_exp; - - /* Write stats is actually protected by client_obd's lock. 
*/ - struct osc_stats { - u64 os_lockless_writes; /* by bytes */ - u64 os_lockless_reads; /* by bytes */ - u64 os_lockless_truncates; /* by times */ - } od_stats; - - /* configuration item(s) */ - int od_contention_time; - int od_lockless_truncate; -}; - -static inline struct osc_device *obd2osc_dev(const struct obd_device *d) -{ - return container_of_safe(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); -} - -extern struct lu_kmem_descr osc_caches[]; - -extern struct kmem_cache *osc_quota_kmem; -struct osc_quota_info { - /** linkage for quota hash table */ - struct rhash_head oqi_hash; - u32 oqi_id; - struct rcu_head rcu; -}; - -int osc_quota_setup(struct obd_device *obd); -int osc_quota_cleanup(struct obd_device *obd); -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], - u32 valid, u32 flags); -int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]); -int osc_quotactl(struct obd_device *unused, struct obd_export *exp, - struct obd_quotactl *oqctl); -void osc_inc_unstable_pages(struct ptlrpc_request *req); -void osc_dec_unstable_pages(struct ptlrpc_request *req); -bool osc_over_unstable_soft_limit(struct client_obd *cli); - -/** - * Bit flags for osc_dlm_lock_at_pageoff(). - */ -enum osc_dap_flags { - /** - * Just check if the desired lock exists, it won't hold reference - * count on lock. - */ - OSC_DAP_FL_TEST_LOCK = BIT(0), - /** - * Return the lock even if it is being canceled. - */ - OSC_DAP_FL_CANCELING = BIT(1), -}; - -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags flags); - -int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc); - -/** osc shrink list to link all osc client obd */ -extern struct list_head osc_shrink_list; -/** spin lock to protect osc_shrink_list */ -extern spinlock_t osc_shrink_lock; -unsigned long osc_cache_shrink_count(struct shrinker *sk, - struct shrink_control *sc); -unsigned long osc_cache_shrink_scan(struct shrinker *sk, - struct shrink_control *sc); - -#endif /* OSC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/osc/osc_io.c b/drivers/staging/lustre/lustre/osc/osc_io.c deleted file mode 100644 index 67734a8ed331daa70e438df7f6efc5701ca96343..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_io.c +++ /dev/null @@ -1,918 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * Implementation of cl_io for OSC layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Type conversions. - * - */ - -static struct osc_io *cl2osc_io(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = container_of_safe(slice, struct osc_io, oi_cl); - - LINVRNT(oio == osc_env_io(env)); - return oio; -} - -/***************************************************************************** - * - * io operations. - * - */ - -static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) -{ -} - -static void osc_read_ahead_release(const struct lu_env *env, void *cbdata) -{ - struct ldlm_lock *dlmlock = cbdata; - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - ldlm_lock_decref(&lockh, LCK_PR); - LDLM_LOCK_PUT(dlmlock); -} - -static int osc_io_read_ahead(const struct lu_env *env, - const struct cl_io_slice *ios, - pgoff_t start, struct cl_read_ahead *ra) -{ - struct osc_object *osc = cl2osc(ios->cis_obj); - struct ldlm_lock *dlmlock; - int result = -ENODATA; - - dlmlock = osc_dlmlock_at_pgoff(env, osc, start, 0); - if (dlmlock) { - LASSERT(dlmlock->l_ast_data == osc); - if (dlmlock->l_req_mode != LCK_PR) { - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - ldlm_lock_addref(&lockh, LCK_PR); - ldlm_lock_decref(&lockh, dlmlock->l_req_mode); - } - - ra->cra_rpc_size = osc_cli(osc)->cl_max_pages_per_rpc; - ra->cra_end = cl_index(osc2cl(osc), - dlmlock->l_policy_data.l_extent.end); - ra->cra_release = osc_read_ahead_release; - ra->cra_cbdata = dlmlock; - result = 0; - } - - return result; -} - -/** - * An implementation of cl_io_operations::cio_io_submit() method for osc - * layer. Iterates over pages in the in-queue, prepares each for io by calling - * cl_page_prep() and then either submits them through osc_io_submit_page() - * or, if page is already submitted, changes osc flags through - * osc_set_async_flags(). - */ -static int osc_io_submit(const struct lu_env *env, - const struct cl_io_slice *ios, - enum cl_req_type crt, struct cl_2queue *queue) -{ - struct cl_page *page; - struct cl_page *tmp; - struct client_obd *cli = NULL; - struct osc_object *osc = NULL; /* to keep gcc happy */ - struct osc_page *opg; - struct cl_io *io; - LIST_HEAD(list); - - struct cl_page_list *qin = &queue->c2_qin; - struct cl_page_list *qout = &queue->c2_qout; - unsigned int queued = 0; - int result = 0; - int cmd; - int brw_flags; - unsigned int max_pages; - - LASSERT(qin->pl_nr > 0); - - CDEBUG(D_CACHE | D_READA, "%d %d\n", qin->pl_nr, crt); - - osc = cl2osc(ios->cis_obj); - cli = osc_cli(osc); - max_pages = cli->cl_max_pages_per_rpc; - - cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; - brw_flags = osc_io_srvlock(cl2osc_io(env, ios)) ? OBD_BRW_SRVLOCK : 0; - - /* - * NOTE: here @page is a top-level page. This is done to avoid - * creation of sub-page-list. - */ - cl_page_list_for_each_safe(page, tmp, qin) { - struct osc_async_page *oap; - - /* Top level IO. 
*/ - io = page->cp_owner; - LASSERT(io); - - opg = osc_cl_page_osc(page, osc); - oap = &opg->ops_oap; - LASSERT(osc == oap->oap_obj); - - if (!list_empty(&oap->oap_pending_item) || - !list_empty(&oap->oap_rpc_item)) { - CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", - oap, opg); - result = -EBUSY; - break; - } - - result = cl_page_prep(env, io, page, crt); - if (result != 0) { - LASSERT(result < 0); - if (result != -EALREADY) - break; - /* - * Handle -EALREADY error: for read case, the page is - * already in UPTODATE state; for write, the page - * is not dirty. - */ - result = 0; - continue; - } - - spin_lock(&oap->oap_lock); - oap->oap_async_flags = ASYNC_URGENT | ASYNC_READY; - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - spin_unlock(&oap->oap_lock); - - osc_page_submit(env, opg, crt, brw_flags); - list_add_tail(&oap->oap_pending_item, &list); - - if (page->cp_sync_io) - cl_page_list_move(qout, qin, page); - else /* async IO */ - cl_page_list_del(env, qin, page); - - if (++queued == max_pages) { - queued = 0; - result = osc_queue_sync_pages(env, osc, &list, cmd, - brw_flags); - if (result < 0) - break; - } - } - - if (queued > 0) - result = osc_queue_sync_pages(env, osc, &list, cmd, brw_flags); - - /* Update c/mtime for sync write. LU-7310 */ - if (qout->pl_nr > 0 && !result) { - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct cl_object *obj = ios->cis_obj; - - cl_object_attr_lock(obj); - attr->cat_mtime = ktime_get_real_seconds(); - attr->cat_ctime = attr->cat_mtime; - cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); - cl_object_attr_unlock(obj); - } - - CDEBUG(D_INFO, "%d/%d %d\n", qin->pl_nr, qout->pl_nr, result); - return qout->pl_nr > 0 ? 0 : result; -} - -/** - * This is called when a page is accessed within file in a way that creates - * new page, if one were missing (i.e., if there were a hole at that place in - * the file, or accessed page is beyond the current file size). - * - * Expand stripe KMS if necessary. - */ -static void osc_page_touch_at(const struct lu_env *env, - struct cl_object *obj, pgoff_t idx, size_t to) -{ - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int valid; - __u64 kms; - - /* offset within stripe */ - kms = cl_offset(obj, idx) + to; - - cl_object_attr_lock(obj); - /* - * XXX old code used - * - * ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm); - * - * here - */ - CDEBUG(D_INODE, "stripe KMS %sincreasing %llu->%llu %llu\n", - kms > loi->loi_kms ? 
"" : "not ", loi->loi_kms, kms, - loi->loi_lvb.lvb_size); - - attr->cat_ctime = ktime_get_real_seconds(); - attr->cat_mtime = attr->cat_ctime; - valid = CAT_MTIME | CAT_CTIME; - if (kms > loi->loi_kms) { - attr->cat_kms = kms; - valid |= CAT_KMS; - } - if (kms > loi->loi_lvb.lvb_size) { - attr->cat_size = kms; - valid |= CAT_SIZE; - } - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); -} - -static int osc_io_commit_async(const struct lu_env *env, - const struct cl_io_slice *ios, - struct cl_page_list *qin, int from, int to, - cl_commit_cbt cb) -{ - struct cl_io *io = ios->cis_io; - struct osc_io *oio = cl2osc_io(env, ios); - struct osc_object *osc = cl2osc(ios->cis_obj); - struct cl_page *page; - struct cl_page *last_page; - struct osc_page *opg; - int result = 0; - - LASSERT(qin->pl_nr > 0); - - /* Handle partial page cases */ - last_page = cl_page_list_last(qin); - if (oio->oi_lockless) { - page = cl_page_list_first(qin); - if (page == last_page) { - cl_page_clip(env, page, from, to); - } else { - if (from != 0) - cl_page_clip(env, page, from, PAGE_SIZE); - if (to != PAGE_SIZE) - cl_page_clip(env, last_page, 0, to); - } - } - - while (qin->pl_nr > 0) { - struct osc_async_page *oap; - - page = cl_page_list_first(qin); - opg = osc_cl_page_osc(page, osc); - oap = &opg->ops_oap; - - if (!list_empty(&oap->oap_rpc_item)) { - CDEBUG(D_CACHE, "Busy oap %p page %p for submit.\n", - oap, opg); - result = -EBUSY; - break; - } - - /* The page may be already in dirty cache. */ - if (list_empty(&oap->oap_pending_item)) { - result = osc_page_cache_add(env, &opg->ops_cl, io); - if (result != 0) - break; - } - - osc_page_touch_at(env, osc2cl(osc), osc_index(opg), - page == last_page ? to : PAGE_SIZE); - - cl_page_list_del(env, qin, page); - - (*cb)(env, io, page); - /* Can't access page any more. Page can be in transfer and - * complete at any time. - */ - } - - /* for sync write, kernel will wait for this page to be flushed before - * osc_io_end() is called, so release it earlier. - * for mkwrite(), it's known there is no further pages. 
- */ - if (cl_io_is_sync_write(io) && oio->oi_active) { - osc_extent_release(env, oio->oi_active); - oio->oi_active = NULL; - } - - CDEBUG(D_INFO, "%d %d\n", qin->pl_nr, result); - return result; -} - -static int osc_io_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct osc_object *osc = cl2osc(ios->cis_obj); - struct obd_import *imp = osc_cli(osc)->cl_import; - int rc = -EIO; - - spin_lock(&imp->imp_lock); - if (likely(!imp->imp_invalid)) { - struct osc_io *oio = osc_env_io(env); - - atomic_inc(&osc->oo_nr_ios); - oio->oi_is_active = 1; - rc = 0; - } - spin_unlock(&imp->imp_lock); - - return rc; -} - -static int osc_io_write_iter_init(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io = ios->cis_io; - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(ios->cis_obj); - unsigned long npages; - - if (cl_io_is_append(io)) - return osc_io_iter_init(env, ios); - - npages = io->u.ci_rw.crw_count >> PAGE_SHIFT; - if (io->u.ci_rw.crw_pos & ~PAGE_MASK) - ++npages; - - oio->oi_lru_reserved = osc_lru_reserve(osc_cli(osc), npages); - - return osc_io_iter_init(env, ios); -} - -static void osc_io_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct osc_io *oio = osc_env_io(env); - - if (oio->oi_is_active) { - struct osc_object *osc = cl2osc(ios->cis_obj); - - oio->oi_is_active = 0; - LASSERT(atomic_read(&osc->oo_nr_ios) > 0); - if (atomic_dec_and_test(&osc->oo_nr_ios)) - wake_up_all(&osc->oo_io_waitq); - } -} - -static void osc_io_write_iter_fini(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(ios->cis_obj); - - if (oio->oi_lru_reserved > 0) { - osc_lru_unreserve(osc_cli(osc), oio->oi_lru_reserved); - oio->oi_lru_reserved = 0; - } - oio->oi_write_osclock = NULL; - - osc_io_iter_fini(env, ios); -} - -static int osc_io_fault_start(const struct lu_env *env, - const struct cl_io_slice *ios) -{ - struct cl_io *io; - struct cl_fault_io *fio; - - io = ios->cis_io; - fio = &io->u.ci_fault; - CDEBUG(D_INFO, "%lu %d %zu\n", - fio->ft_index, fio->ft_writable, fio->ft_nob); - /* - * If mapping is writeable, adjust kms to cover this page, - * but do not extend kms beyond actual file size. - * See bug 10919. - */ - if (fio->ft_writable) - osc_page_touch_at(env, ios->cis_obj, - fio->ft_index, fio->ft_nob); - return 0; -} - -static int osc_async_upcall(void *a, int rc) -{ - struct osc_async_cbargs *args = a; - - args->opc_rc = rc; - complete(&args->opc_sync); - return 0; -} - -/** - * Checks that there are no pages being written in the extent being truncated. 
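For reference, the LRU estimate in osc_io_write_iter_init() above reserves one slot per full page of data plus one extra when the write does not begin on a page boundary. A standalone sketch of that computation, assuming 4 KiB pages; all names here are illustrative:

    #include <stdio.h>

    #define SKETCH_PAGE_SHIFT 12 /* assumption: 4 KiB pages */
    #define SKETCH_PAGE_MASK (~((1UL << SKETCH_PAGE_SHIFT) - 1))

    /* Mirrors the reservation arithmetic in osc_io_write_iter_init(). */
    static unsigned long lru_reserve_estimate(unsigned long pos,
                                              unsigned long count)
    {
            unsigned long npages = count >> SKETCH_PAGE_SHIFT;

            if (pos & ~SKETCH_PAGE_MASK) /* unaligned start costs a page */
                    ++npages;
            return npages;
    }

    int main(void)
    {
            printf("%lu\n", lru_reserve_estimate(0, 8192));   /* 2 pages */
            printf("%lu\n", lru_reserve_estimate(100, 8192)); /* 3 pages */
            return 0;
    }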
- */ -static int trunc_check_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct cl_page *page = ops->ops_cl.cpl_page; - struct osc_async_page *oap; - __u64 start = *(__u64 *)cbdata; - - oap = &ops->ops_oap; - if (oap->oap_cmd & OBD_BRW_WRITE && - !list_empty(&oap->oap_pending_item)) - CL_PAGE_DEBUG(D_ERROR, env, page, "exists %llu/%s.\n", - start, current->comm); - - if (PageLocked(page->cp_vmpage)) - CDEBUG(D_CACHE, "page %p index %lu locked for %d.\n", - ops, osc_index(ops), oap->oap_cmd & OBD_BRW_RWMASK); - - return CLP_GANG_OKAY; -} - -static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, - struct osc_io *oio, __u64 size) -{ - struct cl_object *clob; - int partial; - pgoff_t start; - - clob = oio->oi_cl.cis_obj; - start = cl_index(clob, size); - partial = cl_offset(clob, start) < size; - - /* - * Complain if there are pages in the truncated region. - */ - osc_page_gang_lookup(env, io, cl2osc(clob), - start + partial, CL_PAGE_EOF, - trunc_check_cb, (void *)&size); -} - -static int osc_io_setattr_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct osc_io *oio = cl2osc_io(env, slice); - struct cl_object *obj = slice->cis_obj; - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct obdo *oa = &oio->oi_oa; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - unsigned int ia_valid = io->u.ci_setattr.sa_valid; - int result = 0; - - /* truncate cache dirty pages first */ - if (cl_io_is_trunc(io)) - result = osc_cache_truncate_start(env, cl2osc(obj), size, - &oio->oi_trunc); - - if (result == 0 && oio->oi_lockless == 0) { - cl_object_attr_lock(obj); - result = cl_object_attr_get(env, obj, attr); - if (result == 0) { - struct ost_lvb *lvb = &io->u.ci_setattr.sa_attr; - unsigned int cl_valid = 0; - - if (ia_valid & ATTR_SIZE) { - attr->cat_size = size; - attr->cat_kms = size; - cl_valid = CAT_SIZE | CAT_KMS; - } - if (ia_valid & ATTR_MTIME_SET) { - attr->cat_mtime = lvb->lvb_mtime; - cl_valid |= CAT_MTIME; - } - if (ia_valid & ATTR_ATIME_SET) { - attr->cat_atime = lvb->lvb_atime; - cl_valid |= CAT_ATIME; - } - if (ia_valid & ATTR_CTIME_SET) { - attr->cat_ctime = lvb->lvb_ctime; - cl_valid |= CAT_CTIME; - } - result = cl_object_attr_update(env, obj, attr, - cl_valid); - } - cl_object_attr_unlock(obj); - } - memset(oa, 0, sizeof(*oa)); - if (result == 0) { - oa->o_oi = loi->loi_oi; - obdo_set_parent_fid(oa, io->u.ci_setattr.sa_parent_fid); - oa->o_stripe_idx = io->u.ci_setattr.sa_stripe_index; - oa->o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; - if (ia_valid & ATTR_CTIME) { - oa->o_valid |= OBD_MD_FLCTIME; - oa->o_ctime = attr->cat_ctime; - } - if (ia_valid & ATTR_ATIME) { - oa->o_valid |= OBD_MD_FLATIME; - oa->o_atime = attr->cat_atime; - } - if (ia_valid & ATTR_MTIME) { - oa->o_valid |= OBD_MD_FLMTIME; - oa->o_mtime = attr->cat_mtime; - } - if (ia_valid & ATTR_SIZE) { - oa->o_size = size; - oa->o_blocks = OBD_OBJECT_EOF; - oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - - if (oio->oi_lockless) { - oa->o_flags = OBD_FL_SRVLOCK; - oa->o_valid |= OBD_MD_FLFLAGS; - } - } else { - LASSERT(oio->oi_lockless == 0); - } - if (ia_valid & ATTR_ATTR_FLAG) { - oa->o_flags = io->u.ci_setattr.sa_attr_flags; - oa->o_valid |= OBD_MD_FLFLAGS; - } - - init_completion(&cbargs->opc_sync); - - if (ia_valid & ATTR_SIZE) - result = osc_punch_base(osc_export(cl2osc(obj)), - oa, 
osc_async_upcall, - cbargs, PTLRPCD_SET); - else - result = osc_setattr_async(osc_export(cl2osc(obj)), - oa, osc_async_upcall, - cbargs, PTLRPCD_SET); - cbargs->opc_rpc_sent = result == 0; - } - return result; -} - -static void osc_io_setattr_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct osc_io *oio = cl2osc_io(env, slice); - struct cl_object *obj = slice->cis_obj; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - int result = 0; - - if (cbargs->opc_rpc_sent) { - wait_for_completion(&cbargs->opc_sync); - result = cbargs->opc_rc; - io->ci_result = cbargs->opc_rc; - } - if (result == 0) { - if (oio->oi_lockless) { - /* lockless truncate */ - struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - - LASSERT(cl_io_is_trunc(io)); - /* XXX: Need a lock. */ - osd->od_stats.os_lockless_truncates++; - } - } - - if (cl_io_is_trunc(io)) { - __u64 size = io->u.ci_setattr.sa_attr.lvb_size; - - osc_trunc_check(env, io, oio, size); - osc_cache_truncate_end(env, oio->oi_trunc); - oio->oi_trunc = NULL; - } -} - -struct osc_data_version_args { - struct osc_io *dva_oio; -}; - -static int -osc_data_version_interpret(const struct lu_env *env, struct ptlrpc_request *req, - void *arg, int rc) -{ - struct osc_data_version_args *dva = arg; - struct osc_io *oio = dva->dva_oio; - const struct ost_body *body; - - if (rc < 0) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, &oio->oi_oa, - &body->oa); -out: - oio->oi_cbarg.opc_rc = rc; - complete(&oio->oi_cbarg.opc_sync); - - return 0; -} - -static int osc_io_data_version_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - struct osc_object *obj = cl2osc(slice->cis_obj); - struct obd_export *exp = osc_export(obj); - struct lov_oinfo *loi = obj->oo_oinfo; - struct osc_data_version_args *dva; - struct obdo *oa = &oio->oi_oa; - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - memset(oa, 0, sizeof(*oa)); - oa->o_oi = loi->loi_oi; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - if (dv->dv_flags & (LL_DV_RD_FLUSH | LL_DV_WR_FLUSH)) { - oa->o_valid |= OBD_MD_FLFLAGS; - oa->o_flags |= OBD_FL_SRVLOCK; - if (dv->dv_flags & LL_DV_WR_FLUSH) - oa->o_flags |= OBD_FL_FLUSH; - } - - init_completion(&cbargs->opc_sync); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); - if (rc < 0) { - ptlrpc_request_free(req); - return rc; - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - ptlrpc_request_set_replen(req); - req->rq_interpret_reply = osc_data_version_interpret; - BUILD_BUG_ON(sizeof(*dva) > sizeof(req->rq_async_args)); - dva = ptlrpc_req_async_args(req); - dva->dva_oio = oio; - - ptlrpcd_add_req(req); - - return 0; -} - -static void osc_io_data_version_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_data_version_io *dv = &slice->cis_io->u.ci_data_version; - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - - wait_for_completion(&cbargs->opc_sync); - - if (cbargs->opc_rc) { - 
slice->cis_io->ci_result = cbargs->opc_rc; - } else if (!(oio->oi_oa.o_valid & OBD_MD_FLDATAVERSION)) { - slice->cis_io->ci_result = -EOPNOTSUPP; - } else { - dv->dv_data_version = oio->oi_oa.o_data_version; - slice->cis_io->ci_result = 0; - } -} - -static int osc_io_read_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_object *obj = slice->cis_obj; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int rc = 0; - - if (!slice->cis_io->ci_noatime) { - cl_object_attr_lock(obj); - attr->cat_atime = ktime_get_real_seconds(); - rc = cl_object_attr_update(env, obj, attr, CAT_ATIME); - cl_object_attr_unlock(obj); - } - return rc; -} - -static int osc_io_write_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_object *obj = slice->cis_obj; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - int rc = 0; - - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_SETTIME, 1); - cl_object_attr_lock(obj); - attr->cat_ctime = ktime_get_real_seconds(); - attr->cat_mtime = attr->cat_ctime; - rc = cl_object_attr_update(env, obj, attr, CAT_MTIME | CAT_CTIME); - cl_object_attr_unlock(obj); - - return rc; -} - -static int osc_fsync_ost(const struct lu_env *env, struct osc_object *obj, - struct cl_fsync_io *fio) -{ - struct osc_io *oio = osc_env_io(env); - struct obdo *oa = &oio->oi_oa; - struct lov_oinfo *loi = obj->oo_oinfo; - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - int rc = 0; - - memset(oa, 0, sizeof(*oa)); - oa->o_oi = loi->loi_oi; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - /* reload size and blocks for start and end of sync range */ - oa->o_size = fio->fi_start; - oa->o_blocks = fio->fi_end; - oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - - obdo_set_parent_fid(oa, fio->fi_fid); - - init_completion(&cbargs->opc_sync); - - rc = osc_sync_base(obj, oa, osc_async_upcall, cbargs, PTLRPCD_SET); - return rc; -} - -static int osc_io_fsync_start(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_io *io = slice->cis_io; - struct cl_fsync_io *fio = &io->u.ci_fsync; - struct cl_object *obj = slice->cis_obj; - struct osc_object *osc = cl2osc(obj); - pgoff_t start = cl_index(obj, fio->fi_start); - pgoff_t end = cl_index(obj, fio->fi_end); - int result = 0; - - if (fio->fi_end == OBD_OBJECT_EOF) - end = CL_PAGE_EOF; - - result = osc_cache_writeback_range(env, osc, start, end, 0, - fio->fi_mode == CL_FSYNC_DISCARD); - if (result > 0) { - fio->fi_nr_written += result; - result = 0; - } - if (fio->fi_mode == CL_FSYNC_ALL) { - int rc; - - /* we have to wait for writeback to finish before we can - * send OST_SYNC RPC. This is bad because it causes extents - * to be written osc by osc. However, we usually start - * writeback before CL_FSYNC_ALL so this won't have any real - * problem. 
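The byte-to-page conversion in osc_io_fsync_start() maps the inclusive byte range [fi_start, fi_end] to inclusive page indices, with only the OBD_OBJECT_EOF sentinel special-cased to CL_PAGE_EOF. A quick standalone illustration, under the simplifying assumption that cl_index() reduces to offset >> PAGE_SHIFT with 4 KiB pages:

    #include <stdio.h>

    #define SKETCH_PAGE_SHIFT 12 /* assumption: 4 KiB pages */

    /* simplified stand-in for cl_index(): byte offset to page index */
    static unsigned long byte_to_index(unsigned long long off)
    {
            return (unsigned long)(off >> SKETCH_PAGE_SHIFT);
    }

    int main(void)
    {
            /* sync bytes [4096, 12287]: pages 1..2 inclusive */
            printf("start %lu end %lu\n",
                   byte_to_index(4096), byte_to_index(12287));
            /* fi_end == OBD_OBJECT_EOF maps to CL_PAGE_EOF instead */
            return 0;
    }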
- */ - rc = osc_cache_wait_range(env, osc, start, end); - if (result == 0) - result = rc; - rc = osc_fsync_ost(env, osc, fio); - if (result == 0) - result = rc; - } - - return result; -} - -static void osc_io_fsync_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct cl_fsync_io *fio = &slice->cis_io->u.ci_fsync; - struct cl_object *obj = slice->cis_obj; - pgoff_t start = cl_index(obj, fio->fi_start); - pgoff_t end = cl_index(obj, fio->fi_end); - int result = 0; - - if (fio->fi_mode == CL_FSYNC_LOCAL) { - result = osc_cache_wait_range(env, cl2osc(obj), start, end); - } else if (fio->fi_mode == CL_FSYNC_ALL) { - struct osc_io *oio = cl2osc_io(env, slice); - struct osc_async_cbargs *cbargs = &oio->oi_cbarg; - - wait_for_completion(&cbargs->opc_sync); - if (result == 0) - result = cbargs->opc_rc; - } - slice->cis_io->ci_result = result; -} - -static void osc_io_end(const struct lu_env *env, - const struct cl_io_slice *slice) -{ - struct osc_io *oio = cl2osc_io(env, slice); - - if (oio->oi_active) { - osc_extent_release(env, oio->oi_active); - oio->oi_active = NULL; - } -} - -static const struct cl_io_operations osc_io_ops = { - .op = { - [CIT_READ] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_read_start, - .cio_fini = osc_io_fini - }, - [CIT_WRITE] = { - .cio_iter_init = osc_io_write_iter_init, - .cio_iter_fini = osc_io_write_iter_fini, - .cio_start = osc_io_write_start, - .cio_end = osc_io_end, - .cio_fini = osc_io_fini - }, - [CIT_SETATTR] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_setattr_start, - .cio_end = osc_io_setattr_end - }, - [CIT_DATA_VERSION] = { - .cio_start = osc_io_data_version_start, - .cio_end = osc_io_data_version_end, - }, - [CIT_FAULT] = { - .cio_iter_init = osc_io_iter_init, - .cio_iter_fini = osc_io_iter_fini, - .cio_start = osc_io_fault_start, - .cio_end = osc_io_end, - .cio_fini = osc_io_fini - }, - [CIT_FSYNC] = { - .cio_start = osc_io_fsync_start, - .cio_end = osc_io_fsync_end, - .cio_fini = osc_io_fini - }, - [CIT_MISC] = { - .cio_fini = osc_io_fini - } - }, - .cio_read_ahead = osc_io_read_ahead, - .cio_submit = osc_io_submit, - .cio_commit_async = osc_io_commit_async -}; - -/***************************************************************************** - * - * Transfer operations. - * - */ - -int osc_io_init(const struct lu_env *env, - struct cl_object *obj, struct cl_io *io) -{ - struct osc_io *oio = osc_env_io(env); - - CL_IO_SLICE_CLEAN(oio, oi_cl); - cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); - return 0; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_lock.c b/drivers/staging/lustre/lustre/osc/osc_lock.c deleted file mode 100644 index d93d33dc8dc4d6234de829323de759f9cc464660..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_lock.c +++ /dev/null @@ -1,1230 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_lock for OSC layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_OSC - -/* fid_build_reg_res_name() */ -#include - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Type conversions. - * - */ - -static const struct cl_lock_operations osc_lock_ops; -static const struct cl_lock_operations osc_lock_lockless_ops; -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force); - -int osc_lock_is_lockless(const struct osc_lock *olck) -{ - return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); -} - -/** - * Returns a weak pointer to the ldlm lock identified by a handle. Returned - * pointer cannot be dereferenced, as lock is not protected from concurrent - * reclaim. This function is a helper for osc_lock_invariant(). - */ -static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) -{ - struct ldlm_lock *lock; - - lock = ldlm_handle2lock(handle); - if (lock) - LDLM_LOCK_PUT(lock); - return lock; -} - -/** - * Invariant that has to be true all of the time. - */ -static int osc_lock_invariant(struct osc_lock *ols) -{ - struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); - struct ldlm_lock *olock = ols->ols_dlmlock; - int handle_used = lustre_handle_is_used(&ols->ols_handle); - - if (ergo(osc_lock_is_lockless(ols), - ols->ols_locklessable && !ols->ols_dlmlock)) - return 1; - - /* - * If all the following "ergo"s are true, return 1, otherwise 0 - */ - if (!ergo(olock, handle_used)) - return 0; - - if (!ergo(olock, olock->l_handle.h_cookie == ols->ols_handle.cookie)) - return 0; - - if (!ergo(handle_used, - ergo(lock && olock, lock == olock) && - ergo(!lock, !olock))) - return 0; - /* - * Check that ->ols_handle and ->ols_dlmlock are consistent, but - * take into account that they are set at the different time. - */ - if (!ergo(ols->ols_state == OLS_CANCELLED, - !olock && !handle_used)) - return 0; - /* - * DLM lock is destroyed only after we have seen cancellation - * ast. - */ - if (!ergo(olock && ols->ols_state < OLS_CANCELLED, - !ldlm_is_destroyed(olock))) - return 0; - - if (!ergo(ols->ols_state == OLS_GRANTED, - olock && olock->l_req_mode == olock->l_granted_mode && - ols->ols_hold)) - return 0; - return 1; -} - -/***************************************************************************** - * - * Lock operations. 
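osc_lock_invariant() above is phrased almost entirely in the libcfs ergo()/equi() helpers, i.e. logical implication and equivalence. A self-contained restatement, with the macro bodies in their conventional form (reproduced from memory, so treat it as a sketch):

    #include <assert.h>

    /* "a implies b" and "a iff b", as used by osc_lock_invariant() */
    #define ergo(a, b) (!(a) || (b))
    #define equi(a, b) (!!(a) == !!(b))

    int main(void)
    {
            /* e.g. a used handle should imply a cached dlmlock */
            int handle_used = 0, have_dlmlock = 0;

            assert(ergo(handle_used, have_dlmlock)); /* vacuously true */
            assert(equi(handle_used, have_dlmlock));

            handle_used = 1;                         /* handle, no lock ... */
            assert(!ergo(handle_used, have_dlmlock)); /* ... breaks it */
            return 0;
    }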
- * - */ - -static void osc_lock_fini(const struct lu_env *env, - struct cl_lock_slice *slice) -{ - struct osc_lock *ols = cl2osc_lock(slice); - - LINVRNT(osc_lock_invariant(ols)); - LASSERT(!ols->ols_dlmlock); - - kmem_cache_free(osc_lock_kmem, ols); -} - -static void osc_lock_build_policy(const struct lu_env *env, - const struct cl_lock *lock, - union ldlm_policy_data *policy) -{ - const struct cl_lock_descr *d = &lock->cll_descr; - - osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); - policy->l_extent.gid = d->cld_gid; -} - -static __u64 osc_enq2ldlm_flags(__u32 enqflags) -{ - __u64 result = 0; - - LASSERT((enqflags & ~CEF_MASK) == 0); - - if (enqflags & CEF_NONBLOCK) - result |= LDLM_FL_BLOCK_NOWAIT; - if (enqflags & CEF_ASYNC) - result |= LDLM_FL_HAS_INTENT; - if (enqflags & CEF_DISCARD_DATA) - result |= LDLM_FL_AST_DISCARD_DATA; - if (enqflags & CEF_PEEK) - result |= LDLM_FL_TEST_LOCK; - if (enqflags & CEF_LOCK_MATCH) - result |= LDLM_FL_MATCH_LOCK; - return result; -} - -/** - * Updates object attributes from a lock value block (lvb) received together - * with the DLM lock reply from the server. Copy of osc_update_enqueue() - * logic. - * - * This can be optimized to not update attributes when lock is a result of a - * local match. - * - * Called under lock and resource spin-locks. - */ -static void osc_lock_lvb_update(const struct lu_env *env, - struct osc_object *osc, - struct ldlm_lock *dlmlock, - struct ost_lvb *lvb) -{ - struct cl_object *obj = osc2cl(osc); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned int valid; - - valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; - if (!lvb) - lvb = dlmlock->l_lvb_data; - - cl_lvb2attr(attr, lvb); - - cl_object_attr_lock(obj); - if (dlmlock) { - __u64 size; - - check_res_locked(dlmlock->l_resource); - LASSERT(lvb == dlmlock->l_lvb_data); - size = lvb->lvb_size; - - /* Extend KMS up to the end of this lock and no further - * A lock on [x,y] means a KMS of up to y + 1 bytes! - */ - if (size > dlmlock->l_policy_data.l_extent.end) - size = dlmlock->l_policy_data.l_extent.end + 1; - if (size >= oinfo->loi_kms) { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu, kms=%llu", - lvb->lvb_size, size); - valid |= CAT_KMS; - attr->cat_kms = size; - } else { - LDLM_DEBUG(dlmlock, "lock acquired, setting rss=%llu; leaving kms=%llu, end=%llu", - lvb->lvb_size, oinfo->loi_kms, - dlmlock->l_policy_data.l_extent.end); - } - ldlm_lock_allow_match_locked(dlmlock); - } - - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); -} - -static void osc_lock_granted(const struct lu_env *env, struct osc_lock *oscl, - struct lustre_handle *lockh, bool lvb_update) -{ - struct ldlm_lock *dlmlock; - - dlmlock = ldlm_handle2lock_long(lockh, 0); - LASSERT(dlmlock); - - /* lock reference taken by ldlm_handle2lock_long() is - * owned by osc_lock and released in osc_lock_detach() - */ - lu_ref_add(&dlmlock->l_reference, "osc_lock", oscl); - oscl->ols_has_ref = 1; - - LASSERT(!oscl->ols_dlmlock); - oscl->ols_dlmlock = dlmlock; - - /* This may be a matched lock for glimpse request, do not hold - * lock reference in that case. - */ - if (!oscl->ols_glimpse) { - /* hold a refc for non glimpse lock which will - * be released in osc_lock_cancel() - */ - lustre_handle_copy(&oscl->ols_handle, lockh); - ldlm_lock_addref(lockh, oscl->ols_einfo.ei_mode); - oscl->ols_hold = 1; - } - - /* Lock must have been granted. 
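The size clamp in osc_lock_lvb_update() encodes the rule its comment states: a lock on [x, y] only vouches for bytes up to y, so KMS may be extended to at most y + 1. A worked standalone example of the clamp; the helper name is invented:

    #include <stdio.h>

    /* a lock covering [ext_start, ext_end] can extend kms to at most
     * ext_end + 1 bytes, whatever size the server reports
     */
    static unsigned long long kms_under_lock(unsigned long long lvb_size,
                                             unsigned long long ext_end)
    {
            return lvb_size > ext_end ? ext_end + 1 : lvb_size;
    }

    int main(void)
    {
            /* server says 1 MiB, but the lock only covers the first 64 KiB */
            printf("%llu\n", kms_under_lock(1048576ULL, 65535ULL)); /* 65536 */
            /* lock covers more than the file holds: keep the real size */
            printf("%llu\n", kms_under_lock(4096ULL, 65535ULL));    /* 4096 */
            return 0;
    }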
*/ - lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { - struct ldlm_extent *ext = &dlmlock->l_policy_data.l_extent; - struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; - - /* extend the lock extent, otherwise it will have problem when - * we decide whether to grant a lockless lock. - */ - descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); - descr->cld_start = cl_index(descr->cld_obj, ext->start); - descr->cld_end = cl_index(descr->cld_obj, ext->end); - descr->cld_gid = ext->gid; - - /* no lvb update for matched lock */ - if (lvb_update) { - LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - osc_lock_lvb_update(env, cl2osc(oscl->ols_cl.cls_obj), - dlmlock, NULL); - } - LINVRNT(osc_lock_invariant(oscl)); - } - unlock_res_and_lock(dlmlock); - - LASSERT(oscl->ols_state != OLS_GRANTED); - oscl->ols_state = OLS_GRANTED; -} - -/** - * Lock upcall function that is executed either when a reply to ENQUEUE rpc is - * received from a server, or after osc_enqueue_base() matched a local DLM - * lock. - */ -static int osc_lock_upcall(void *cookie, struct lustre_handle *lockh, - int errcode) -{ - struct osc_lock *oscl = cookie; - struct cl_lock_slice *slice = &oscl->ols_cl; - struct lu_env *env; - int rc; - u16 refcheck; - - env = cl_env_get(&refcheck); - /* should never happen, similar to osc_ldlm_blocking_ast(). */ - LASSERT(!IS_ERR(env)); - - rc = ldlm_error2errno(errcode); - if (oscl->ols_state == OLS_ENQUEUED) { - oscl->ols_state = OLS_UPCALL_RECEIVED; - } else if (oscl->ols_state == OLS_CANCELLED) { - rc = -EIO; - } else { - CERROR("Impossible state: %d\n", oscl->ols_state); - LBUG(); - } - - if (rc == 0) - osc_lock_granted(env, oscl, lockh, errcode == ELDLM_OK); - - /* Error handling, some errors are tolerable. */ - if (oscl->ols_locklessable && rc == -EUSERS) { - /* This is a tolerable error, turn this lock into - * lockless lock. - */ - osc_object_set_contended(cl2osc(slice->cls_obj)); - LASSERT(slice->cls_ops == &osc_lock_ops); - - /* Change this lock to ldlmlock-less lock. */ - osc_lock_to_lockless(env, oscl, 1); - oscl->ols_state = OLS_GRANTED; - rc = 0; - } else if (oscl->ols_glimpse && rc == -ENAVAIL) { - LASSERT(oscl->ols_flags & LDLM_FL_LVB_READY); - osc_lock_lvb_update(env, cl2osc(slice->cls_obj), - NULL, &oscl->ols_lvb); - /* Hide the error. 
*/ - rc = 0; - } - - if (oscl->ols_owner) - cl_sync_io_note(env, oscl->ols_owner, rc); - cl_env_put(env, &refcheck); - - return rc; -} - -static int osc_lock_upcall_agl(void *cookie, struct lustre_handle *lockh, - int errcode) -{ - struct osc_object *osc = cookie; - struct ldlm_lock *dlmlock; - struct lu_env *env; - u16 refcheck; - - env = cl_env_get(&refcheck); - LASSERT(!IS_ERR(env)); - - if (errcode == ELDLM_LOCK_MATCHED) { - errcode = ELDLM_OK; - goto out; - } - - if (errcode != ELDLM_OK) - goto out; - - dlmlock = ldlm_handle2lock(lockh); - LASSERT(dlmlock); - - lock_res_and_lock(dlmlock); - LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); - - /* there is no osc_lock associated with AGL lock */ - osc_lock_lvb_update(env, osc, dlmlock, NULL); - - unlock_res_and_lock(dlmlock); - LDLM_LOCK_PUT(dlmlock); - -out: - cl_object_put(env, osc2cl(osc)); - cl_env_put(env, &refcheck); - return ldlm_error2errno(errcode); -} - -static int osc_lock_flush(struct osc_object *obj, pgoff_t start, pgoff_t end, - enum cl_lock_mode mode, int discard) -{ - struct lu_env *env; - u16 refcheck; - int rc = 0; - int rc2 = 0; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return PTR_ERR(env); - - if (mode == CLM_WRITE) { - rc = osc_cache_writeback_range(env, obj, start, end, 1, - discard); - CDEBUG(D_CACHE, "object %p: [%lu -> %lu] %d pages were %s.\n", - obj, start, end, rc, - discard ? "discarded" : "written back"); - if (rc > 0) - rc = 0; - } - - rc2 = osc_lock_discard_pages(env, obj, start, end, mode); - if (rc == 0 && rc2 < 0) - rc = rc2; - - cl_env_put(env, &refcheck); - return rc; -} - -/** - * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock - * and ldlm_lock caches. - */ -static int osc_dlm_blocking_ast0(const struct lu_env *env, - struct ldlm_lock *dlmlock, - void *data, int flag) -{ - struct cl_object *obj = NULL; - int result = 0; - int discard; - enum cl_lock_mode mode = CLM_READ; - - LASSERT(flag == LDLM_CB_CANCELING); - - lock_res_and_lock(dlmlock); - if (dlmlock->l_granted_mode != dlmlock->l_req_mode) { - dlmlock->l_ast_data = NULL; - unlock_res_and_lock(dlmlock); - return 0; - } - - discard = ldlm_is_discard_data(dlmlock); - if (dlmlock->l_granted_mode & (LCK_PW | LCK_GROUP)) - mode = CLM_WRITE; - - if (dlmlock->l_ast_data) { - obj = osc2cl(dlmlock->l_ast_data); - dlmlock->l_ast_data = NULL; - - cl_object_get(obj); - } - - unlock_res_and_lock(dlmlock); - - /* if l_ast_data is NULL, the dlmlock was enqueued by AGL or - * the object has been destroyed. - */ - if (obj) { - struct ldlm_extent *extent = &dlmlock->l_policy_data.l_extent; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - __u64 old_kms; - - /* Destroy pages covered by the extent of the DLM lock */ - result = osc_lock_flush(cl2osc(obj), - cl_index(obj, extent->start), - cl_index(obj, extent->end), - mode, discard); - - /* losing a lock, update kms */ - lock_res_and_lock(dlmlock); - cl_object_attr_lock(obj); - /* Must get the value under the lock to avoid race. */ - old_kms = cl2osc(obj)->oo_oinfo->loi_kms; - /* Update the kms. Need to loop all granted locks. - * Not a problem for the client - */ - attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); - - cl_object_attr_update(env, obj, attr, CAT_KMS); - cl_object_attr_unlock(obj); - unlock_res_and_lock(dlmlock); - - cl_object_put(env, obj); - } - return result; -} - -/** - * Blocking ast invoked by ldlm when dlm lock is either blocking progress of - * some other lock, or is canceled. 
This function is installed as a - * ldlm_lock::l_blocking_ast() for client extent locks. - * - * Control flow is tricky, because ldlm uses the same call-back - * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. - * - * \param dlmlock lock for which ast occurred. - * - * \param new description of a conflicting lock in case of blocking ast. - * - * \param data value of dlmlock->l_ast_data - * - * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish - * cancellation and blocking ast's. - * - * Possible use cases: - * - * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel - * lock due to lock lru pressure, or explicit user request to purge - * locks. - * - * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify - * us that dlmlock conflicts with another lock that some client is - * enqueuing. Lock is canceled. - * - * - cl_lock_cancel() is called. osc_lock_cancel() calls - * ldlm_cli_cancel() that calls - * - * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) - * - * recursively entering osc_ldlm_blocking_ast(). - * - * - client cancels lock voluntarily (e.g., as a part of early cancellation): - * - * cl_lock_cancel()-> - * osc_lock_cancel()-> - * ldlm_cli_cancel()-> - * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) - * - */ -static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, - struct ldlm_lock_desc *new, void *data, - int flag) -{ - int result = 0; - - switch (flag) { - case LDLM_CB_BLOCKING: { - struct lustre_handle lockh; - - ldlm_lock2handle(dlmlock, &lockh); - result = ldlm_cli_cancel(&lockh, LCF_ASYNC); - if (result == -ENODATA) - result = 0; - break; - } - case LDLM_CB_CANCELING: { - struct lu_env *env; - u16 refcheck; - - /* - * This can be called in the context of outer IO, e.g., - * - * osc_enqueue_base()->... - * ->ldlm_prep_elc_req()->... - * ->ldlm_cancel_callback()->... - * ->osc_ldlm_blocking_ast() - * - * a new environment has to be created to avoid corrupting the - * outer context. - */ - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - result = PTR_ERR(env); - break; - } - - result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); - cl_env_put(env, &refcheck); - break; - } - default: - LBUG(); - } - return result; -} - -static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) -{ - struct ptlrpc_request *req = data; - struct lu_env *env; - struct ost_lvb *lvb; - struct req_capsule *cap; - struct cl_object *obj = NULL; - int result; - u16 refcheck; - - LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) { - result = PTR_ERR(env); - goto out; - } - - lock_res_and_lock(dlmlock); - if (dlmlock->l_ast_data) { - obj = osc2cl(dlmlock->l_ast_data); - cl_object_get(obj); - } - unlock_res_and_lock(dlmlock); - - if (obj) { - /* Do not grab the mutex of cl_lock for glimpse. - * See LU-1274 for details. - * BTW, it's okay for cl_lock to be cancelled during - * this period because server can handle this race. - * See ldlm_server_glimpse_ast() for details. 
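The glimpse reply carries a lock value block (LVB) with the object's current attributes. A minimal standalone mock of the fields involved — the struct mirrors the fields osc_lvb_print() reports, while glimpse_fill() is only an illustrative stand-in for cl_object_glimpse() with made-up values:

    #include <stdio.h>

    /* the attributes a glimpse reply carries back to the server */
    struct sketch_lvb {
            unsigned long long lvb_size;
            unsigned long long lvb_mtime;
            unsigned long long lvb_atime;
            unsigned long long lvb_ctime;
            unsigned long long lvb_blocks;
    };

    /* stand-in for cl_object_glimpse(): copy cached attributes out */
    static void glimpse_fill(struct sketch_lvb *lvb)
    {
            lvb->lvb_size = 65536;                    /* made-up values */
            lvb->lvb_mtime = lvb->lvb_ctime = lvb->lvb_atime = 1522000000;
            lvb->lvb_blocks = 128;
    }

    int main(void)
    {
            struct sketch_lvb lvb;

            glimpse_fill(&lvb);
            printf("size: %llu blocks: %llu\n", lvb.lvb_size, lvb.lvb_blocks);
            return 0;
    }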
- * cl_lock_mutex_get(env, lock); - */ - cap = &req->rq_pill; - req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); - req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, - sizeof(*lvb)); - result = req_capsule_server_pack(cap); - if (result == 0) { - lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); - result = cl_object_glimpse(env, obj, lvb); - } - if (!exp_connect_lvb_type(req->rq_export)) { - req_capsule_shrink(&req->rq_pill, &RMF_DLM_LVB, - sizeof(struct ost_lvb_v1), - RCL_SERVER); - } - cl_object_put(env, obj); - } else { - /* - * These errors are normal races, so we don't want to - * fill the console with messages by calling - * ptlrpc_error() - */ - lustre_pack_reply(req, 1, NULL, NULL); - result = -ELDLM_NO_LOCK_DATA; - } - cl_env_put(env, &refcheck); - -out: - req->rq_status = result; - return result; -} - -static int weigh_cb(const struct lu_env *env, struct cl_io *io, - struct osc_page *ops, void *cbdata) -{ - struct cl_page *page = ops->ops_cl.cpl_page; - - if (cl_page_is_vmlocked(env, page) || - PageDirty(page->cp_vmpage) || PageWriteback(page->cp_vmpage) - ) - return CLP_GANG_ABORT; - - *(pgoff_t *)cbdata = osc_index(ops) + 1; - return CLP_GANG_OKAY; -} - -static unsigned long osc_lock_weight(const struct lu_env *env, - struct osc_object *oscobj, - struct ldlm_extent *extent) -{ - struct cl_io *io = &osc_env_info(env)->oti_io; - struct cl_object *obj = cl_object_top(&oscobj->oo_cl); - pgoff_t page_index; - int result; - - io->ci_obj = obj; - io->ci_ignore_layout = 1; - result = cl_io_init(env, io, CIT_MISC, io->ci_obj); - if (result != 0) - return result; - - page_index = cl_index(obj, extent->start); - do { - result = osc_page_gang_lookup(env, io, oscobj, - page_index, - cl_index(obj, extent->end), - weigh_cb, (void *)&page_index); - if (result == CLP_GANG_ABORT) - break; - if (result == CLP_GANG_RESCHED) - cond_resched(); - } while (result != CLP_GANG_OKAY); - cl_io_fini(env, io); - - return result == CLP_GANG_ABORT ? 1 : 0; -} - -/** - * Get the weight of dlm lock for early cancellation. - */ -unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) -{ - struct lu_env *env; - struct osc_object *obj; - struct osc_lock *oscl; - unsigned long weight; - bool found = false; - u16 refcheck; - - might_sleep(); - /* - * osc_ldlm_weigh_ast has a complex context since it might be called - * because of lock canceling, or from user's input. We have to make - * a new environment for it. Probably it is implementation safe to use - * the upper context because cl_lock_put don't modify environment - * variables. But just in case .. - */ - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - /* Mostly because lack of memory, do not eliminate this lock */ - return 1; - - LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); - obj = dlmlock->l_ast_data; - if (!obj) { - weight = 1; - goto out; - } - - spin_lock(&obj->oo_ol_spin); - list_for_each_entry(oscl, &obj->oo_ol_list, ols_nextlock_oscobj) { - if (oscl->ols_dlmlock && oscl->ols_dlmlock != dlmlock) - continue; - found = true; - } - spin_unlock(&obj->oo_ol_spin); - if (found) { - /* - * If the lock is being used by an IO, definitely not cancel it. 
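Behind the gang lookup, osc_lock_weight() reduces to a binary verdict: weight 1 when any covered page is vm-locked, dirty, or under writeback, so early cancellation keeps the lock; weight 0 when every page is idle and the lock is cheap to cancel. A compact standalone restatement of that decision, with page state mocked rather than read from real vm pages:

    #include <stdio.h>

    struct sketch_page {
            int locked;     /* vm page locked     */
            int dirty;      /* has unwritten data */
            int writeback;  /* under writeback    */
    };

    /* weight 1: early cancellation skips this lock; weight 0: cancel it */
    static unsigned long lock_weight(const struct sketch_page *pages, int n)
    {
            for (int i = 0; i < n; i++)
                    if (pages[i].locked || pages[i].dirty ||
                        pages[i].writeback)
                            return 1;
            return 0;
    }

    int main(void)
    {
            struct sketch_page clean[2] = { {0, 0, 0}, {0, 0, 0} };
            struct sketch_page busy[2] = { {0, 0, 0}, {0, 1, 0} };

            printf("%lu %lu\n", lock_weight(clean, 2), lock_weight(busy, 2));
            return 0;
    }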
- */ - weight = 1; - goto out; - } - - weight = osc_lock_weight(env, obj, &dlmlock->l_policy_data.l_extent); - -out: - cl_env_put(env, &refcheck); - return weight; -} - -static void osc_lock_build_einfo(const struct lu_env *env, - const struct cl_lock *lock, - struct osc_object *osc, - struct ldlm_enqueue_info *einfo) -{ - einfo->ei_type = LDLM_EXTENT; - einfo->ei_mode = osc_cl_lock2ldlm(lock->cll_descr.cld_mode); - einfo->ei_cb_bl = osc_ldlm_blocking_ast; - einfo->ei_cb_cp = ldlm_completion_ast; - einfo->ei_cb_gl = osc_ldlm_glimpse_ast; - einfo->ei_cbdata = osc; /* value to be put into ->l_ast_data */ -} - -/** - * Determine if the lock should be converted into a lockless lock. - * - * Steps to check: - * - whether the lock has an explicit requirement for a non-lockless lock; - * - what the io lock request type ci_lockreq allows; - * - send the enqueue rpc to ost to make the final decision; - * - treat truncate specially, since it may be served by a lockless lock - * - * Additional policy can be implemented here, e.g., never do lockless-io - * for large extents. - */ -static void osc_lock_to_lockless(const struct lu_env *env, - struct osc_lock *ols, int force) -{ - struct cl_lock_slice *slice = &ols->ols_cl; - - LASSERT(ols->ols_state == OLS_NEW || - ols->ols_state == OLS_UPCALL_RECEIVED); - - if (force) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } else { - struct osc_io *oio = osc_env_io(env); - struct cl_io *io = oio->oi_cl.cis_io; - struct cl_object *obj = slice->cls_obj; - struct osc_object *oob = cl2osc(obj); - const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); - struct obd_connect_data *ocd; - - LASSERT(io->ci_lockreq == CILR_MANDATORY || - io->ci_lockreq == CILR_MAYBE || - io->ci_lockreq == CILR_NEVER); - - ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; - ols->ols_locklessable = (io->ci_type != CIT_SETATTR) && - (io->ci_lockreq == CILR_MAYBE) && - (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); - if (io->ci_lockreq == CILR_NEVER || - /* lockless IO */ - (ols->ols_locklessable && osc_object_is_contended(oob)) || - /* lockless truncate */ - (cl_io_is_trunc(io) && - (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && - osd->od_lockless_truncate)) { - ols->ols_locklessable = 1; - slice->cls_ops = &osc_lock_lockless_ops; - } - } - LASSERT(ergo(ols->ols_glimpse, !osc_lock_is_lockless(ols))); -} - -static bool osc_lock_compatible(const struct osc_lock *qing, - const struct osc_lock *qed) -{ - struct cl_lock_descr *qed_descr = &qed->ols_cl.cls_lock->cll_descr; - struct cl_lock_descr *qing_descr = &qing->ols_cl.cls_lock->cll_descr; - - if (qed->ols_glimpse) - return true; - - if (qing_descr->cld_mode == CLM_READ && qed_descr->cld_mode == CLM_READ) - return true; - - if (qed->ols_state < OLS_GRANTED) - return true; - - if (qed_descr->cld_mode >= qing_descr->cld_mode && - qed_descr->cld_start <= qing_descr->cld_start && - qed_descr->cld_end >= qing_descr->cld_end) - return true; - - return false; -} - -static void osc_lock_wake_waiters(const struct lu_env *env, - struct osc_object *osc, - struct osc_lock *oscl) -{ - spin_lock(&osc->oo_ol_spin); - list_del_init(&oscl->ols_nextlock_oscobj); - spin_unlock(&osc->oo_ol_spin); - - spin_lock(&oscl->ols_lock); - while (!list_empty(&oscl->ols_waiting_list)) { - struct osc_lock *scan; - - scan = list_entry(oscl->ols_waiting_list.next, struct osc_lock, - ols_wait_entry); - list_del_init(&scan->ols_wait_entry); - - cl_sync_io_note(env, scan->ols_owner, 0); - } - spin_unlock(&oscl->ols_lock); -} - -static int osc_lock_enqueue_wait(const
struct lu_env *env, - struct osc_object *obj, - struct osc_lock *oscl) -{ - struct osc_lock *tmp_oscl; - struct cl_lock_descr *need = &oscl->ols_cl.cls_lock->cll_descr; - struct cl_sync_io *waiter = &osc_env_info(env)->oti_anchor; - int rc = 0; - - spin_lock(&obj->oo_ol_spin); - list_add_tail(&oscl->ols_nextlock_oscobj, &obj->oo_ol_list); - -restart: - list_for_each_entry(tmp_oscl, &obj->oo_ol_list, - ols_nextlock_oscobj) { - struct cl_lock_descr *descr; - - if (tmp_oscl == oscl) - break; - - descr = &tmp_oscl->ols_cl.cls_lock->cll_descr; - if (descr->cld_start > need->cld_end || - descr->cld_end < need->cld_start) - continue; - - /* We're not supposed to give up group lock */ - if (descr->cld_mode == CLM_GROUP) - break; - - if (!osc_lock_is_lockless(oscl) && - osc_lock_compatible(oscl, tmp_oscl)) - continue; - - /* wait for conflicting lock to be canceled */ - cl_sync_io_init(waiter, 1, cl_sync_io_end); - oscl->ols_owner = waiter; - - spin_lock(&tmp_oscl->ols_lock); - /* add oscl into tmp's ols_waiting list */ - list_add_tail(&oscl->ols_wait_entry, - &tmp_oscl->ols_waiting_list); - spin_unlock(&tmp_oscl->ols_lock); - - spin_unlock(&obj->oo_ol_spin); - rc = cl_sync_io_wait(env, waiter, 0); - spin_lock(&obj->oo_ol_spin); - if (rc < 0) - break; - - oscl->ols_owner = NULL; - goto restart; - } - spin_unlock(&obj->oo_ol_spin); - - return rc; -} - -/** - * Implementation of cl_lock_operations::clo_enqueue() method for osc - * layer. This initiates ldlm enqueue: - * - * - cancels conflicting locks early (osc_lock_enqueue_wait()); - * - * - calls osc_enqueue_base() to do actual enqueue. - * - * osc_enqueue_base() is supplied with an upcall function that is executed - * when lock is received either after a local cached ldlm lock is matched, or - * when a reply from the server is received. - * - * This function does not wait for the network communication to complete. - */ -static int osc_lock_enqueue(const struct lu_env *env, - const struct cl_lock_slice *slice, - struct cl_io *unused, struct cl_sync_io *anchor) -{ - struct osc_thread_info *info = osc_env_info(env); - struct osc_io *oio = osc_env_io(env); - struct osc_object *osc = cl2osc(slice->cls_obj); - struct osc_lock *oscl = cl2osc_lock(slice); - struct cl_lock *lock = slice->cls_lock; - struct ldlm_res_id *resname = &info->oti_resname; - union ldlm_policy_data *policy = &info->oti_policy; - osc_enqueue_upcall_f upcall = osc_lock_upcall; - void *cookie = oscl; - bool async = false; - int result; - - LASSERTF(ergo(oscl->ols_glimpse, lock->cll_descr.cld_mode <= CLM_READ), - "lock = %p, ols = %p\n", lock, oscl); - - if (oscl->ols_state == OLS_GRANTED) - return 0; - - if (oscl->ols_flags & LDLM_FL_TEST_LOCK) - goto enqueue_base; - - if (oscl->ols_glimpse) { - LASSERT(equi(oscl->ols_agl, !anchor)); - async = true; - goto enqueue_base; - } - - result = osc_lock_enqueue_wait(env, osc, oscl); - if (result < 0) - goto out; - - /* we can grant lockless lock right after all conflicting locks - * are canceled. - */ - if (osc_lock_is_lockless(oscl)) { - oscl->ols_state = OLS_GRANTED; - oio->oi_lockless = 1; - return 0; - } - -enqueue_base: - oscl->ols_state = OLS_ENQUEUED; - if (anchor) { - atomic_inc(&anchor->csi_sync_nr); - oscl->ols_owner = anchor; - } - - /** - * DLM lock's ast data must be osc_object; - * if glimpse or AGL lock, async of osc_enqueue_base() must be true, - * DLM's enqueue callback set to osc_lock_upcall() with cookie as - * osc_lock. 
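The conflict scan in osc_lock_enqueue_wait() first discards queued locks whose extents cannot overlap the one being enqueued: inclusive ranges [a, b] and [c, d] are disjoint exactly when a > d or b < c. A standalone check of that predicate:

    #include <assert.h>

    /* same overlap test as the cld_start/cld_end comparison in
     * osc_lock_enqueue_wait(): inclusive page-index ranges
     */
    static int ranges_overlap(unsigned long s1, unsigned long e1,
                              unsigned long s2, unsigned long e2)
    {
            return !(s1 > e2 || e1 < s2);
    }

    int main(void)
    {
            assert(ranges_overlap(0, 10, 10, 20));  /* touch at index 10 */
            assert(!ranges_overlap(0, 9, 10, 20));  /* adjacent, disjoint */
            assert(ranges_overlap(5, 30, 10, 20));  /* containment */
            return 0;
    }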
- */ - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); - osc_lock_build_policy(env, lock, policy); - if (oscl->ols_agl) { - oscl->ols_einfo.ei_cbdata = NULL; - /* hold a reference for callback */ - cl_object_get(osc2cl(osc)); - upcall = osc_lock_upcall_agl; - cookie = osc; - } - result = osc_enqueue_base(osc_export(osc), resname, &oscl->ols_flags, - policy, &oscl->ols_lvb, - osc->oo_oinfo->loi_kms_valid, - upcall, cookie, - &oscl->ols_einfo, PTLRPCD_SET, async, - oscl->ols_agl); - if (!result) { - if (osc_lock_is_lockless(oscl)) { - oio->oi_lockless = 1; - } else if (!async) { - LASSERT(oscl->ols_state == OLS_GRANTED); - LASSERT(oscl->ols_hold); - LASSERT(oscl->ols_dlmlock); - } - } else if (oscl->ols_agl) { - cl_object_put(env, osc2cl(osc)); - result = 0; - } - -out: - if (result < 0) { - oscl->ols_state = OLS_CANCELLED; - osc_lock_wake_waiters(env, osc, oscl); - - if (anchor) - cl_sync_io_note(env, anchor, result); - } - return result; -} - -/** - * Breaks a link between osc_lock and dlm_lock. - */ -static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) -{ - struct ldlm_lock *dlmlock; - - dlmlock = olck->ols_dlmlock; - if (!dlmlock) - return; - - if (olck->ols_hold) { - olck->ols_hold = 0; - ldlm_lock_decref(&olck->ols_handle, olck->ols_einfo.ei_mode); - olck->ols_handle.cookie = 0ULL; - } - - olck->ols_dlmlock = NULL; - - /* release a reference taken in osc_lock_upcall(). */ - LASSERT(olck->ols_has_ref); - lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); - LDLM_LOCK_RELEASE(dlmlock); - olck->ols_has_ref = 0; -} - -/** - * Implements cl_lock_operations::clo_cancel() method for osc layer. This is - * called (as part of cl_lock_cancel()) when lock is canceled either voluntary - * (LRU pressure, early cancellation, umount, etc.) or due to the conflict - * with some other lock some where in the cluster. This function does the - * following: - * - * - invalidates all pages protected by this lock (after sending dirty - * ones to the server, as necessary); - * - * - decref's underlying ldlm lock; - * - * - cancels ldlm lock (ldlm_cli_cancel()). 
- */ -static void osc_lock_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) -{ - struct osc_object *obj = cl2osc(slice->cls_obj); - struct osc_lock *oscl = cl2osc_lock(slice); - - LINVRNT(osc_lock_invariant(oscl)); - - osc_lock_detach(env, oscl); - oscl->ols_state = OLS_CANCELLED; - oscl->ols_flags &= ~LDLM_FL_LVB_READY; - - osc_lock_wake_waiters(env, obj, oscl); -} - -static int osc_lock_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct cl_lock_slice *slice) -{ - struct osc_lock *lock = cl2osc_lock(slice); - - (*p)(env, cookie, "%p %#16llx %#llx %d %p ", - lock->ols_dlmlock, lock->ols_flags, lock->ols_handle.cookie, - lock->ols_state, lock->ols_owner); - osc_lvb_print(env, cookie, p, &lock->ols_lvb); - return 0; -} - -static const struct cl_lock_operations osc_lock_ops = { - .clo_fini = osc_lock_fini, - .clo_enqueue = osc_lock_enqueue, - .clo_cancel = osc_lock_cancel, - .clo_print = osc_lock_print, -}; - -static void osc_lock_lockless_cancel(const struct lu_env *env, - const struct cl_lock_slice *slice) -{ - struct osc_lock *ols = cl2osc_lock(slice); - struct osc_object *osc = cl2osc(slice->cls_obj); - struct cl_lock_descr *descr = &slice->cls_lock->cll_descr; - int result; - - LASSERT(!ols->ols_dlmlock); - result = osc_lock_flush(osc, descr->cld_start, descr->cld_end, - descr->cld_mode, 0); - if (result) - CERROR("Pages for lockless lock %p were not purged(%d)\n", - ols, result); - - osc_lock_wake_waiters(env, osc, ols); -} - -static const struct cl_lock_operations osc_lock_lockless_ops = { - .clo_fini = osc_lock_fini, - .clo_enqueue = osc_lock_enqueue, - .clo_cancel = osc_lock_lockless_cancel, - .clo_print = osc_lock_print -}; - -static void osc_lock_set_writer(const struct lu_env *env, - const struct cl_io *io, - struct cl_object *obj, struct osc_lock *oscl) -{ - struct cl_lock_descr *descr = &oscl->ols_cl.cls_lock->cll_descr; - pgoff_t io_start; - pgoff_t io_end; - - if (!cl_object_same(io->ci_obj, obj)) - return; - - if (likely(io->ci_type == CIT_WRITE)) { - io_start = cl_index(obj, io->u.ci_rw.crw_pos); - io_end = cl_index(obj, io->u.ci_rw.crw_pos + - io->u.ci_rw.crw_count - 1); - if (cl_io_is_append(io)) { - io_start = 0; - io_end = CL_PAGE_EOF; - } - } else { - LASSERT(cl_io_is_mkwrite(io)); - io_start = io->u.ci_fault.ft_index; - io_end = io->u.ci_fault.ft_index; - } - - if (descr->cld_mode >= CLM_WRITE && - descr->cld_start <= io_start && descr->cld_end >= io_end) { - struct osc_io *oio = osc_env_io(env); - - /* There must be only one lock to match the write region */ - LASSERT(!oio->oi_write_osclock); - oio->oi_write_osclock = oscl; - } -} - -int osc_lock_init(const struct lu_env *env, - struct cl_object *obj, struct cl_lock *lock, - const struct cl_io *io) -{ - struct osc_lock *oscl; - __u32 enqflags = lock->cll_descr.cld_enq_flags; - - oscl = kmem_cache_zalloc(osc_lock_kmem, GFP_NOFS); - if (!oscl) - return -ENOMEM; - - oscl->ols_state = OLS_NEW; - spin_lock_init(&oscl->ols_lock); - INIT_LIST_HEAD(&oscl->ols_waiting_list); - INIT_LIST_HEAD(&oscl->ols_wait_entry); - INIT_LIST_HEAD(&oscl->ols_nextlock_oscobj); - - oscl->ols_flags = osc_enq2ldlm_flags(enqflags); - oscl->ols_agl = !!(enqflags & CEF_AGL); - if (oscl->ols_agl) - oscl->ols_flags |= LDLM_FL_BLOCK_NOWAIT; - if (oscl->ols_flags & LDLM_FL_HAS_INTENT) { - oscl->ols_flags |= LDLM_FL_BLOCK_GRANTED; - oscl->ols_glimpse = 1; - } - osc_lock_build_einfo(env, lock, cl2osc(obj), &oscl->ols_einfo); - - cl_lock_slice_add(lock, &oscl->ols_cl, obj, &osc_lock_ops); - - if (!(enqflags & 
CEF_MUST)) - /* try to convert this lock to a lockless lock */ - osc_lock_to_lockless(env, oscl, (enqflags & CEF_NEVER)); - if (oscl->ols_locklessable && !(enqflags & CEF_DISCARD_DATA)) - oscl->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; - - if (io->ci_type == CIT_WRITE || cl_io_is_mkwrite(io)) - osc_lock_set_writer(env, io, obj, oscl); - - - LDLM_DEBUG_NOLOCK("lock %p, osc lock %p, flags %llx", - lock, oscl, oscl->ols_flags); - - return 0; -} - -/** - * Finds an existing lock covering given index and optionally different from a - * given \a except lock. - */ -struct ldlm_lock *osc_dlmlock_at_pgoff(const struct lu_env *env, - struct osc_object *obj, pgoff_t index, - enum osc_dap_flags dap_flags) -{ - struct osc_thread_info *info = osc_env_info(env); - struct ldlm_res_id *resname = &info->oti_resname; - union ldlm_policy_data *policy = &info->oti_policy; - struct lustre_handle lockh; - struct ldlm_lock *lock = NULL; - enum ldlm_mode mode; - __u64 flags; - - ostid_build_res_name(&obj->oo_oinfo->loi_oi, resname); - osc_index2policy(policy, osc2cl(obj), index, index); - policy->l_extent.gid = LDLM_GID_ANY; - - flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING; - if (dap_flags & OSC_DAP_FL_TEST_LOCK) - flags |= LDLM_FL_TEST_LOCK; - - /* - * It is fine to match any group lock since there could be only one - * with a uniq gid and it conflicts with all other lock modes too - */ -again: - mode = osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, - LCK_PR | LCK_PW | LCK_GROUP, &flags, obj, &lockh, - dap_flags & OSC_DAP_FL_CANCELING); - if (mode != 0) { - lock = ldlm_handle2lock(&lockh); - /* RACE: the lock is cancelled so let's try again */ - if (unlikely(!lock)) - goto again; - } - return lock; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_object.c b/drivers/staging/lustre/lustre/osc/osc_object.c deleted file mode 100644 index 84240181c7ea3035ae25d8cd87eab5c2d78662f2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_object.c +++ /dev/null @@ -1,473 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_object for OSC layer. - * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include "osc_cl_internal.h" - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Type conversions. 
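The type converters that open the next hunk are instances of the container_of() pattern: a pointer to an embedded member is mapped back to its enclosing structure. A minimal standalone illustration, using the textbook offsetof-based definition and invented struct names:

    #include <assert.h>
    #include <stddef.h>

    /* textbook container_of(): recover the enclosing struct from a member */
    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct lu_obj { int dummy; };

    struct osc_obj {
            int oo_id;
            struct lu_obj oo_lu;    /* embedded base object */
    };

    int main(void)
    {
            struct osc_obj osc = { .oo_id = 42 };
            struct lu_obj *lu = &osc.oo_lu;

            /* same shape as lu2osc(): embedded member back to container */
            assert(container_of(lu, struct osc_obj, oo_lu)->oo_id == 42);
            return 0;
    }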
- * - */ - -static struct lu_object *osc2lu(struct osc_object *osc) -{ - return &osc->oo_cl.co_lu; -} - -static struct osc_object *lu2osc(const struct lu_object *obj) -{ - LINVRNT(osc_is_object(obj)); - return container_of(obj, struct osc_object, oo_cl.co_lu); -} - -/***************************************************************************** - * - * Object operations. - * - */ - -static int osc_object_init(const struct lu_env *env, struct lu_object *obj, - const struct lu_object_conf *conf) -{ - struct osc_object *osc = lu2osc(obj); - const struct cl_object_conf *cconf = lu2cl_conf(conf); - - osc->oo_oinfo = cconf->u.coc_oinfo; - INIT_LIST_HEAD(&osc->oo_ready_item); - INIT_LIST_HEAD(&osc->oo_hp_ready_item); - INIT_LIST_HEAD(&osc->oo_write_item); - INIT_LIST_HEAD(&osc->oo_read_item); - - atomic_set(&osc->oo_nr_ios, 0); - init_waitqueue_head(&osc->oo_io_waitq); - - osc->oo_root.rb_node = NULL; - INIT_LIST_HEAD(&osc->oo_hp_exts); - INIT_LIST_HEAD(&osc->oo_urgent_exts); - INIT_LIST_HEAD(&osc->oo_rpc_exts); - INIT_LIST_HEAD(&osc->oo_reading_exts); - atomic_set(&osc->oo_nr_reads, 0); - atomic_set(&osc->oo_nr_writes, 0); - spin_lock_init(&osc->oo_lock); - spin_lock_init(&osc->oo_tree_lock); - spin_lock_init(&osc->oo_ol_spin); - INIT_LIST_HEAD(&osc->oo_ol_list); - - cl_object_page_init(lu2cl(obj), sizeof(struct osc_page)); - - return 0; -} - -static void osc_object_free(const struct lu_env *env, struct lu_object *obj) -{ - struct osc_object *osc = lu2osc(obj); - - LASSERT(list_empty(&osc->oo_ready_item)); - LASSERT(list_empty(&osc->oo_hp_ready_item)); - LASSERT(list_empty(&osc->oo_write_item)); - LASSERT(list_empty(&osc->oo_read_item)); - - LASSERT(!osc->oo_root.rb_node); - LASSERT(list_empty(&osc->oo_hp_exts)); - LASSERT(list_empty(&osc->oo_urgent_exts)); - LASSERT(list_empty(&osc->oo_rpc_exts)); - LASSERT(list_empty(&osc->oo_reading_exts)); - LASSERT(atomic_read(&osc->oo_nr_reads) == 0); - LASSERT(atomic_read(&osc->oo_nr_writes) == 0); - LASSERT(list_empty(&osc->oo_ol_list)); - LASSERT(!atomic_read(&osc->oo_nr_ios)); - - lu_object_fini(obj); - kmem_cache_free(osc_object_kmem, osc); -} - -int osc_lvb_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct ost_lvb *lvb) -{ - return (*p)(env, cookie, "size: %llu mtime: %llu atime: %llu ctime: %llu blocks: %llu", - lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, - lvb->lvb_ctime, lvb->lvb_blocks); -} - -static int osc_object_print(const struct lu_env *env, void *cookie, - lu_printer_t p, const struct lu_object *obj) -{ - struct osc_object *osc = lu2osc(obj); - struct lov_oinfo *oinfo = osc->oo_oinfo; - struct osc_async_rc *ar = &oinfo->loi_ar; - - (*p)(env, cookie, "id: " DOSTID " idx: %d gen: %d kms_valid: %u kms %llu rc: %d force_sync: %d min_xid: %llu ", - POSTID(&oinfo->loi_oi), oinfo->loi_ost_idx, - oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, - ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); - osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); - return 0; -} - -static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, - struct cl_attr *attr) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - - cl_lvb2attr(attr, &oinfo->loi_lvb); - attr->cat_kms = oinfo->loi_kms_valid ? 
oinfo->loi_kms : 0; - return 0; -} - -static int osc_attr_update(const struct lu_env *env, struct cl_object *obj, - const struct cl_attr *attr, unsigned int valid) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - struct ost_lvb *lvb = &oinfo->loi_lvb; - - if (valid & CAT_SIZE) - lvb->lvb_size = attr->cat_size; - if (valid & CAT_MTIME) - lvb->lvb_mtime = attr->cat_mtime; - if (valid & CAT_ATIME) - lvb->lvb_atime = attr->cat_atime; - if (valid & CAT_CTIME) - lvb->lvb_ctime = attr->cat_ctime; - if (valid & CAT_BLOCKS) - lvb->lvb_blocks = attr->cat_blocks; - if (valid & CAT_KMS) { - CDEBUG(D_CACHE, "set kms from %llu to %llu\n", - oinfo->loi_kms, (__u64)attr->cat_kms); - loi_kms_set(oinfo, attr->cat_kms); - } - return 0; -} - -static int osc_object_glimpse(const struct lu_env *env, - const struct cl_object *obj, struct ost_lvb *lvb) -{ - struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; - - lvb->lvb_size = oinfo->loi_kms; - lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; - return 0; -} - -static int osc_object_ast_clear(struct ldlm_lock *lock, void *data) -{ - if (lock->l_ast_data == data) - lock->l_ast_data = NULL; - return LDLM_ITER_CONTINUE; -} - -static int osc_object_prune(const struct lu_env *env, struct cl_object *obj) -{ - struct osc_object *osc = cl2osc(obj); - struct ldlm_res_id *resname = &osc_env_info(env)->oti_resname; - - /* DLM locks don't hold a reference of osc_object so we have to - * clear it before the object is being destroyed. - */ - ostid_build_res_name(&osc->oo_oinfo->loi_oi, resname); - ldlm_resource_iterate(osc_export(osc)->exp_obd->obd_namespace, resname, - osc_object_ast_clear, osc); - return 0; -} - -static int osc_object_fiemap(const struct lu_env *env, struct cl_object *obj, - struct ll_fiemap_info_key *fmkey, - struct fiemap *fiemap, size_t *buflen) -{ - struct obd_export *exp = osc_export(cl2osc(obj)); - union ldlm_policy_data policy; - struct ptlrpc_request *req; - struct lustre_handle lockh; - struct ldlm_res_id resid; - enum ldlm_mode mode = 0; - struct fiemap *reply; - char *tmp; - int rc; - - fmkey->lfik_oa.o_oi = cl2osc(obj)->oo_oinfo->loi_oi; - if (!(fmkey->lfik_fiemap.fm_flags & FIEMAP_FLAG_SYNC)) - goto skip_locking; - - policy.l_extent.start = fmkey->lfik_fiemap.fm_start & PAGE_MASK; - - if (OBD_OBJECT_EOF - fmkey->lfik_fiemap.fm_length <= - fmkey->lfik_fiemap.fm_start + PAGE_SIZE - 1) - policy.l_extent.end = OBD_OBJECT_EOF; - else - policy.l_extent.end = (fmkey->lfik_fiemap.fm_start + - fmkey->lfik_fiemap.fm_length + - PAGE_SIZE - 1) & PAGE_MASK; - - ostid_build_res_name(&fmkey->lfik_oa.o_oi, &resid); - mode = ldlm_lock_match(exp->exp_obd->obd_namespace, - LDLM_FL_BLOCK_GRANTED | LDLM_FL_LVB_READY, - &resid, LDLM_EXTENT, &policy, - LCK_PR | LCK_PW, &lockh, 0); - if (mode) { /* lock is cached on client */ - if (mode != LCK_PR) { - ldlm_lock_addref(&lockh, LCK_PR); - ldlm_lock_decref(&lockh, LCK_PW); - } - } else { /* no cached lock, needs acquire lock on server side */ - fmkey->lfik_oa.o_valid |= OBD_MD_FLFLAGS; - fmkey->lfik_oa.o_flags |= OBD_FL_SRVLOCK; - } - -skip_locking: - req = ptlrpc_request_alloc(class_exp2cliimp(exp), - &RQF_OST_GET_INFO_FIEMAP); - if (!req) { - rc = -ENOMEM; - goto drop_lock; - } - - req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_KEY, RCL_CLIENT, - sizeof(*fmkey)); - req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_CLIENT, - *buflen); - req_capsule_set_size(&req->rq_pill, &RMF_FIEMAP_VAL, RCL_SERVER, - *buflen); - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GET_INFO); - if (rc) { - 
ptlrpc_request_free(req); - goto drop_lock; - } - tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_KEY); - memcpy(tmp, fmkey, sizeof(*fmkey)); - tmp = req_capsule_client_get(&req->rq_pill, &RMF_FIEMAP_VAL); - memcpy(tmp, fiemap, *buflen); - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto fini_req; - - reply = req_capsule_server_get(&req->rq_pill, &RMF_FIEMAP_VAL); - if (!reply) { - rc = -EPROTO; - goto fini_req; - } - - memcpy(fiemap, reply, *buflen); -fini_req: - ptlrpc_req_finished(req); -drop_lock: - if (mode) - ldlm_lock_decref(&lockh, LCK_PR); - return rc; -} - -void osc_object_set_contended(struct osc_object *obj) -{ - obj->oo_contention_time = jiffies; - /* mb(); */ - obj->oo_contended = 1; -} - -void osc_object_clear_contended(struct osc_object *obj) -{ - obj->oo_contended = 0; -} - -int osc_object_is_contended(struct osc_object *obj) -{ - struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); - int osc_contention_time = dev->od_contention_time; - unsigned long cur_time = jiffies; - unsigned long retry_time; - - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) - return 1; - - if (!obj->oo_contended) - return 0; - - /* - * I like copy-paste. the code is copied from - * ll_file_is_contended. - */ - retry_time = obj->oo_contention_time + osc_contention_time * HZ; - if (time_after(cur_time, retry_time)) { - osc_object_clear_contended(obj); - return 0; - } - return 1; -} - -/** - * Implementation of struct cl_object_operations::coo_req_attr_set() for osc - * layer. osc is responsible for struct obdo::o_id and struct obdo::o_seq - * fields. - */ -static void osc_req_attr_set(const struct lu_env *env, struct cl_object *obj, - struct cl_req_attr *attr) -{ - u64 flags = attr->cra_flags; - struct lov_oinfo *oinfo; - struct ost_lvb *lvb; - struct obdo *oa; - - oinfo = cl2osc(obj)->oo_oinfo; - lvb = &oinfo->loi_lvb; - oa = attr->cra_oa; - - if (flags & OBD_MD_FLMTIME) { - oa->o_mtime = lvb->lvb_mtime; - oa->o_valid |= OBD_MD_FLMTIME; - } - if (flags & OBD_MD_FLATIME) { - oa->o_atime = lvb->lvb_atime; - oa->o_valid |= OBD_MD_FLATIME; - } - if (flags & OBD_MD_FLCTIME) { - oa->o_ctime = lvb->lvb_ctime; - oa->o_valid |= OBD_MD_FLCTIME; - } - if (flags & OBD_MD_FLGROUP) { - ostid_set_seq(&oa->o_oi, ostid_seq(&oinfo->loi_oi)); - oa->o_valid |= OBD_MD_FLGROUP; - } - if (flags & OBD_MD_FLID) { - int rc; - - rc = ostid_set_id(&oa->o_oi, ostid_id(&oinfo->loi_oi)); - if (rc) { - CERROR("Bad %llu to set " DOSTID " : rc %d\n", - (unsigned long long)ostid_id(&oinfo->loi_oi), - POSTID(&oa->o_oi), rc); - } - oa->o_valid |= OBD_MD_FLID; - } - if (flags & OBD_MD_FLHANDLE) { - struct ldlm_lock *lock; - struct osc_page *opg; - - opg = osc_cl_page_osc(attr->cra_page, cl2osc(obj)); - lock = osc_dlmlock_at_pgoff(env, cl2osc(obj), osc_index(opg), - OSC_DAP_FL_TEST_LOCK | OSC_DAP_FL_CANCELING); - if (!lock && !opg->ops_srvlock) { - struct ldlm_resource *res; - struct ldlm_res_id *resname; - - CL_PAGE_DEBUG(D_ERROR, env, attr->cra_page, - "uncovered page!\n"); - - resname = &osc_env_info(env)->oti_resname; - ostid_build_res_name(&oinfo->loi_oi, resname); - res = ldlm_resource_get( - osc_export(cl2osc(obj))->exp_obd->obd_namespace, - NULL, resname, LDLM_EXTENT, 0); - ldlm_resource_dump(D_ERROR, res); - - LBUG(); - } - - /* check for lockless io. 
*/ - if (lock) { - oa->o_handle = lock->l_remote_handle; - oa->o_valid |= OBD_MD_FLHANDLE; - LDLM_LOCK_PUT(lock); - } - } -} - -static const struct cl_object_operations osc_ops = { - .coo_page_init = osc_page_init, - .coo_lock_init = osc_lock_init, - .coo_io_init = osc_io_init, - .coo_attr_get = osc_attr_get, - .coo_attr_update = osc_attr_update, - .coo_glimpse = osc_object_glimpse, - .coo_prune = osc_object_prune, - .coo_fiemap = osc_object_fiemap, - .coo_req_attr_set = osc_req_attr_set -}; - -static const struct lu_object_operations osc_lu_obj_ops = { - .loo_object_init = osc_object_init, - .loo_object_release = NULL, - .loo_object_free = osc_object_free, - .loo_object_print = osc_object_print, - .loo_object_invariant = NULL -}; - -struct lu_object *osc_object_alloc(const struct lu_env *env, - const struct lu_object_header *unused, - struct lu_device *dev) -{ - struct osc_object *osc; - struct lu_object *obj; - - osc = kmem_cache_zalloc(osc_object_kmem, GFP_NOFS); - if (osc) { - obj = osc2lu(osc); - lu_object_init(obj, NULL, dev); - osc->oo_cl.co_ops = &osc_ops; - obj->lo_ops = &osc_lu_obj_ops; - } else { - obj = NULL; - } - return obj; -} - -int osc_object_invalidate(const struct lu_env *env, struct osc_object *osc) -{ - CDEBUG(D_INODE, "Invalidate osc object: %p, # of active IOs: %d\n", - osc, atomic_read(&osc->oo_nr_ios)); - - wait_event_idle(osc->oo_io_waitq, !atomic_read(&osc->oo_nr_ios)); - - /* Discard all dirty pages of this object. */ - osc_cache_truncate_start(env, osc, 0, NULL); - - /* Discard all caching pages */ - osc_lock_discard_pages(env, osc, 0, CL_PAGE_EOF, CLM_WRITE); - - /* Clear ast data of dlm lock. Do this after discarding all pages */ - osc_object_prune(env, osc2cl(osc)); - - return 0; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_page.c b/drivers/staging/lustre/lustre/osc/osc_page.c deleted file mode 100644 index 20c553ef3a5ec7c525f8b7d1f469fdb8e2253011..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_page.c +++ /dev/null @@ -1,1094 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * Implementation of cl_page for OSC layer. 
- * - * Author: Nikita Danilov - * Author: Jinshan Xiong - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include -#include "osc_cl_internal.h" - -static void osc_lru_del(struct client_obd *cli, struct osc_page *opg); -static void osc_lru_use(struct client_obd *cli, struct osc_page *opg); -static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, - struct osc_page *opg); - -/** \addtogroup osc - * @{ - */ - -/***************************************************************************** - * - * Page operations. - * - */ -static void osc_page_transfer_get(struct osc_page *opg, const char *label) -{ - struct cl_page *page = opg->ops_cl.cpl_page; - - LASSERT(!opg->ops_transfer_pinned); - cl_page_get(page); - lu_ref_add_atomic(&page->cp_reference, label, page); - opg->ops_transfer_pinned = 1; -} - -static void osc_page_transfer_put(const struct lu_env *env, - struct osc_page *opg) -{ - struct cl_page *page = opg->ops_cl.cpl_page; - - if (opg->ops_transfer_pinned) { - opg->ops_transfer_pinned = 0; - lu_ref_del(&page->cp_reference, "transfer", page); - cl_page_put(env, page); - } -} - -/** - * This is called once for every page when it is submitted for a transfer - * either opportunistic (osc_page_cache_add()), or immediate - * (osc_page_submit()). - */ -static void osc_page_transfer_add(const struct lu_env *env, - struct osc_page *opg, enum cl_req_type crt) -{ - struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); - - osc_lru_use(osc_cli(obj), opg); -} - -int osc_page_cache_add(const struct lu_env *env, - const struct cl_page_slice *slice, struct cl_io *io) -{ - struct osc_page *opg = cl2osc_page(slice); - int result; - - osc_page_transfer_get(opg, "transfer\0cache"); - result = osc_queue_async_io(env, io, opg); - if (result != 0) - osc_page_transfer_put(env, opg); - else - osc_page_transfer_add(env, opg, CRT_WRITE); - - return result; -} - -void osc_index2policy(union ldlm_policy_data *policy, - const struct cl_object *obj, - pgoff_t start, pgoff_t end) -{ - memset(policy, 0, sizeof(*policy)); - policy->l_extent.start = cl_offset(obj, start); - policy->l_extent.end = cl_offset(obj, end + 1) - 1; -} - -static const char *osc_list(struct list_head *head) -{ - return list_empty(head) ? 
"-" : "+"; -} - -static inline unsigned long osc_submit_duration(struct osc_page *opg) -{ - if (opg->ops_submit_time == 0) - return 0; - - return (jiffies - opg->ops_submit_time); -} - -static int osc_page_print(const struct lu_env *env, - const struct cl_page_slice *slice, - void *cookie, lu_printer_t printer) -{ - struct osc_page *opg = cl2osc_page(slice); - struct osc_async_page *oap = &opg->ops_oap; - struct osc_object *obj = cl2osc(slice->cpl_obj); - struct client_obd *cli = &osc_export(obj)->exp_obd->u.cli; - - return (*printer)(env, cookie, LUSTRE_OSC_NAME "-page@%p %lu: 1< %#x %d %u %s %s > 2< %llu %u %u %#x %#x | %p %p %p > 3< %d %lu %d > 4< %d %d %d %lu %s | %s %s %s %s > 5< %s %s %s %s | %d %s | %d %s %s>\n", - opg, osc_index(opg), - /* 1 */ - oap->oap_magic, oap->oap_cmd, - oap->oap_interrupted, - osc_list(&oap->oap_pending_item), - osc_list(&oap->oap_rpc_item), - /* 2 */ - oap->oap_obj_off, oap->oap_page_off, oap->oap_count, - oap->oap_async_flags, oap->oap_brw_flags, - oap->oap_request, oap->oap_cli, obj, - /* 3 */ - opg->ops_transfer_pinned, - osc_submit_duration(opg), opg->ops_srvlock, - /* 4 */ - cli->cl_r_in_flight, cli->cl_w_in_flight, - cli->cl_max_rpcs_in_flight, - cli->cl_avail_grant, - osc_list(&cli->cl_cache_waiters), - osc_list(&cli->cl_loi_ready_list), - osc_list(&cli->cl_loi_hp_ready_list), - osc_list(&cli->cl_loi_write_list), - osc_list(&cli->cl_loi_read_list), - /* 5 */ - osc_list(&obj->oo_ready_item), - osc_list(&obj->oo_hp_ready_item), - osc_list(&obj->oo_write_item), - osc_list(&obj->oo_read_item), - atomic_read(&obj->oo_nr_reads), - osc_list(&obj->oo_reading_exts), - atomic_read(&obj->oo_nr_writes), - osc_list(&obj->oo_hp_exts), - osc_list(&obj->oo_urgent_exts)); -} - -static void osc_page_delete(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct osc_page *opg = cl2osc_page(slice); - struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); - int rc; - - CDEBUG(D_TRACE, "%p\n", opg); - osc_page_transfer_put(env, opg); - rc = osc_teardown_async_page(env, obj, opg); - if (rc) { - CL_PAGE_DEBUG(D_ERROR, env, slice->cpl_page, - "Trying to teardown failed: %d\n", rc); - LASSERT(0); - } - - osc_lru_del(osc_cli(obj), opg); - - if (slice->cpl_page->cp_type == CPT_CACHEABLE) { - void *value; - - spin_lock(&obj->oo_tree_lock); - value = radix_tree_delete(&obj->oo_tree, osc_index(opg)); - if (value) - --obj->oo_npages; - spin_unlock(&obj->oo_tree_lock); - - LASSERT(ergo(value, value == opg)); - } -} - -static void osc_page_clip(const struct lu_env *env, - const struct cl_page_slice *slice, int from, int to) -{ - struct osc_page *opg = cl2osc_page(slice); - struct osc_async_page *oap = &opg->ops_oap; - - opg->ops_from = from; - opg->ops_to = to; - spin_lock(&oap->oap_lock); - oap->oap_async_flags |= ASYNC_COUNT_STABLE; - spin_unlock(&oap->oap_lock); -} - -static int osc_page_cancel(const struct lu_env *env, - const struct cl_page_slice *slice) -{ - struct osc_page *opg = cl2osc_page(slice); - int rc = 0; - - /* Check if the transferring against this page - * is completed, or not even queued. - */ - if (opg->ops_transfer_pinned) - /* FIXME: may not be interrupted.. 
*/ - rc = osc_cancel_async_page(env, opg); - LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); - return rc; -} - -static int osc_page_flush(const struct lu_env *env, - const struct cl_page_slice *slice, - struct cl_io *io) -{ - struct osc_page *opg = cl2osc_page(slice); - int rc; - - rc = osc_flush_async_page(env, io, opg); - return rc; -} - -static const struct cl_page_operations osc_page_ops = { - .cpo_print = osc_page_print, - .cpo_delete = osc_page_delete, - .cpo_clip = osc_page_clip, - .cpo_cancel = osc_page_cancel, - .cpo_flush = osc_page_flush -}; - -int osc_page_init(const struct lu_env *env, struct cl_object *obj, - struct cl_page *page, pgoff_t index) -{ - struct osc_object *osc = cl2osc(obj); - struct osc_page *opg = cl_object_page_slice(obj, page); - int result; - - opg->ops_from = 0; - opg->ops_to = PAGE_SIZE; - - result = osc_prep_async_page(osc, opg, page->cp_vmpage, - cl_offset(obj, index)); - if (result == 0) { - struct osc_io *oio = osc_env_io(env); - - opg->ops_srvlock = osc_io_srvlock(oio); - cl_page_slice_add(page, &opg->ops_cl, obj, index, - &osc_page_ops); - } - INIT_LIST_HEAD(&opg->ops_lru); - - /* reserve an LRU space for this page */ - if (page->cp_type == CPT_CACHEABLE && result == 0) { - result = osc_lru_alloc(env, osc_cli(osc), opg); - if (result == 0) { - spin_lock(&osc->oo_tree_lock); - result = radix_tree_insert(&osc->oo_tree, index, opg); - if (result == 0) - ++osc->oo_npages; - spin_unlock(&osc->oo_tree_lock); - LASSERT(result == 0); - } - } - - return result; -} - -/** - * Helper function called by osc_io_submit() for every page in an immediate - * transfer (i.e., transferred synchronously). - */ -void osc_page_submit(const struct lu_env *env, struct osc_page *opg, - enum cl_req_type crt, int brw_flags) -{ - struct osc_async_page *oap = &opg->ops_oap; - - LASSERTF(oap->oap_magic == OAP_MAGIC, "Bad oap magic: oap %p, magic 0x%x\n", - oap, oap->oap_magic); - LASSERT(oap->oap_async_flags & ASYNC_READY); - LASSERT(oap->oap_async_flags & ASYNC_COUNT_STABLE); - - oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; - oap->oap_page_off = opg->ops_from; - oap->oap_count = opg->ops_to - opg->ops_from; - oap->oap_brw_flags = brw_flags | OBD_BRW_SYNC; - - if (capable(CAP_SYS_RESOURCE)) { - oap->oap_brw_flags |= OBD_BRW_NOQUOTA; - oap->oap_cmd |= OBD_BRW_NOQUOTA; - } - - opg->ops_submit_time = jiffies; - osc_page_transfer_get(opg, "transfer\0imm"); - osc_page_transfer_add(env, opg, crt); -} - -/* --------------- LRU page management ------------------ */ - -/* OSC is a natural place to manage LRU pages as applications are specialized - * to write OSC by OSC. Ideally, if one OSC is used more frequently it should - * occupy more LRU slots. On the other hand, we should avoid using up all LRU - * slots (client_obd::cl_lru_left) otherwise process has to be put into sleep - * for free LRU slots - this will be very bad so the algorithm requires each - * OSC to free slots voluntarily to maintain a reasonable number of free slots - * at any time. - */ -static DECLARE_WAIT_QUEUE_HEAD(osc_lru_waitq); - -/** - * LRU pages are freed in batch mode. OSC should at least free this - * number of pages to avoid running out of LRU slots. - */ -static inline int lru_shrink_min(struct client_obd *cli) -{ - return cli->cl_max_pages_per_rpc * 2; -} - -/** - * free this number at most otherwise it will take too long time to finish. 
- */ -static inline int lru_shrink_max(struct client_obd *cli) -{ - return cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; -} - -/** - * Check if we can free LRU slots from this OSC. If there are LRU waiters, - * we should free slots aggressively. In this way, slots are freed at a steady - * pace to maintain fairness among OSCs. - * - * Return how many LRU pages should be freed. - */ -static int osc_cache_too_much(struct client_obd *cli) -{ - struct cl_client_cache *cache = cli->cl_cache; - long pages = atomic_long_read(&cli->cl_lru_in_list); - unsigned long budget; - - budget = cache->ccc_lru_max / (atomic_read(&cache->ccc_users) - 2); - - /* if it's going to run out of LRU slots, we should free some, but not - * too much to maintain fairness among OSCs. - */ - if (atomic_long_read(cli->cl_lru_left) < cache->ccc_lru_max >> 2) { - if (pages >= budget) - return lru_shrink_max(cli); - else if (pages >= budget / 2) - return lru_shrink_min(cli); - } else { - time64_t duration = ktime_get_real_seconds(); - long timediff; - - /* knock out pages by duration of no IO activity */ - duration -= cli->cl_lru_last_used; - /* - * The difference shouldn't be more than 70 years - * so we can safely cast to a long. Round to - * approximately 1 minute. - */ - timediff = (long)(duration >> 6); - if (timediff > 0 && pages >= budget / timediff) - return lru_shrink_min(cli); - } - return 0; -} - -int lru_queue_work(const struct lu_env *env, void *data) -{ - struct client_obd *cli = data; - int count; - - CDEBUG(D_CACHE, "%s: run LRU work for client obd\n", cli_name(cli)); - - count = osc_cache_too_much(cli); - if (count > 0) { - int rc = osc_lru_shrink(env, cli, count, false); - - CDEBUG(D_CACHE, "%s: shrank %d/%d pages from client obd\n", - cli_name(cli), rc, count); - if (rc >= count) { - CDEBUG(D_CACHE, "%s: queue again\n", cli_name(cli)); - ptlrpcd_queue_work(cli->cl_lru_work); - } - } - - return 0; -} - -void osc_lru_add_batch(struct client_obd *cli, struct list_head *plist) -{ - LIST_HEAD(lru); - struct osc_async_page *oap; - long npages = 0; - - list_for_each_entry(oap, plist, oap_pending_item) { - struct osc_page *opg = oap2osc_page(oap); - - if (!opg->ops_in_lru) - continue; - - ++npages; - LASSERT(list_empty(&opg->ops_lru)); - list_add(&opg->ops_lru, &lru); - } - - if (npages > 0) { - spin_lock(&cli->cl_lru_list_lock); - list_splice_tail(&lru, &cli->cl_lru_list); - atomic_long_sub(npages, &cli->cl_lru_busy); - atomic_long_add(npages, &cli->cl_lru_in_list); - cli->cl_lru_last_used = ktime_get_real_seconds(); - spin_unlock(&cli->cl_lru_list_lock); - - if (waitqueue_active(&osc_lru_waitq)) - (void)ptlrpcd_queue_work(cli->cl_lru_work); - } -} - -static void __osc_lru_del(struct client_obd *cli, struct osc_page *opg) -{ - LASSERT(atomic_long_read(&cli->cl_lru_in_list) > 0); - list_del_init(&opg->ops_lru); - atomic_long_dec(&cli->cl_lru_in_list); -} - -/** - * Page is being destroyed. The page may not be in the LRU list if the - * transfer never finished (an error occurred). - */ -static void osc_lru_del(struct client_obd *cli, struct osc_page *opg) -{ - if (opg->ops_in_lru) { - spin_lock(&cli->cl_lru_list_lock); - if (!list_empty(&opg->ops_lru)) { - __osc_lru_del(cli, opg); - } else { - LASSERT(atomic_long_read(&cli->cl_lru_busy) > 0); - atomic_long_dec(&cli->cl_lru_busy); - } - spin_unlock(&cli->cl_lru_list_lock); - - atomic_long_inc(cli->cl_lru_left); - /* this is a great place to release more LRU pages if - * this osc occupies too many LRU pages and the kernel is - * stealing one of them.
- */ - if (osc_cache_too_much(cli)) { - CDEBUG(D_CACHE, "%s: queue LRU work\n", cli_name(cli)); - (void)ptlrpcd_queue_work(cli->cl_lru_work); - } - wake_up(&osc_lru_waitq); - } else { - LASSERT(list_empty(&opg->ops_lru)); - } -} - -/** - * Delete page from LRUlist for redirty. - */ -static void osc_lru_use(struct client_obd *cli, struct osc_page *opg) -{ - /* If page is being transferred for the first time, - * ops_lru should be empty - */ - if (opg->ops_in_lru && !list_empty(&opg->ops_lru)) { - spin_lock(&cli->cl_lru_list_lock); - __osc_lru_del(cli, opg); - spin_unlock(&cli->cl_lru_list_lock); - atomic_long_inc(&cli->cl_lru_busy); - } -} - -static void discard_pagevec(const struct lu_env *env, struct cl_io *io, - struct cl_page **pvec, int max_index) -{ - int i; - - for (i = 0; i < max_index; i++) { - struct cl_page *page = pvec[i]; - - LASSERT(cl_page_is_owned(page, io)); - cl_page_delete(env, page); - cl_page_discard(env, io, page); - cl_page_disown(env, io, page); - cl_page_put(env, page); - - pvec[i] = NULL; - } -} - -/** - * Check if a cl_page can be released, i.e, it's not being used. - * - * If unstable account is turned on, bulk transfer may hold one refcount - * for recovery so we need to check vmpage refcount as well; otherwise, - * even we can destroy cl_page but the corresponding vmpage can't be reused. - */ -static inline bool lru_page_busy(struct client_obd *cli, struct cl_page *page) -{ - if (cl_page_in_use_noref(page)) - return true; - - if (cli->cl_cache->ccc_unstable_check) { - struct page *vmpage = cl_page_vmpage(page); - - /* vmpage have two known users: cl_page and VM page cache */ - if (page_count(vmpage) - page_mapcount(vmpage) > 2) - return true; - } - return false; -} - -/** - * Drop @target of pages from LRU at most. - */ -long osc_lru_shrink(const struct lu_env *env, struct client_obd *cli, - long target, bool force) -{ - struct cl_io *io; - struct cl_object *clobj = NULL; - struct cl_page **pvec; - struct osc_page *opg; - int maxscan = 0; - long count = 0; - int index = 0; - int rc = 0; - - LASSERT(atomic_long_read(&cli->cl_lru_in_list) >= 0); - if (atomic_long_read(&cli->cl_lru_in_list) == 0 || target <= 0) - return 0; - - CDEBUG(D_CACHE, "%s: shrinkers: %d, force: %d\n", - cli_name(cli), atomic_read(&cli->cl_lru_shrinkers), force); - if (!force) { - if (atomic_read(&cli->cl_lru_shrinkers) > 0) - return -EBUSY; - - if (atomic_inc_return(&cli->cl_lru_shrinkers) > 1) { - atomic_dec(&cli->cl_lru_shrinkers); - return -EBUSY; - } - } else { - atomic_inc(&cli->cl_lru_shrinkers); - } - - pvec = (struct cl_page **)osc_env_info(env)->oti_pvec; - io = &osc_env_info(env)->oti_io; - - spin_lock(&cli->cl_lru_list_lock); - if (force) - cli->cl_lru_reclaim++; - maxscan = min(target << 1, atomic_long_read(&cli->cl_lru_in_list)); - while (!list_empty(&cli->cl_lru_list)) { - struct cl_page *page; - bool will_free = false; - - if (!force && atomic_read(&cli->cl_lru_shrinkers) > 1) - break; - - if (--maxscan < 0) - break; - - opg = list_entry(cli->cl_lru_list.next, struct osc_page, - ops_lru); - page = opg->ops_cl.cpl_page; - if (lru_page_busy(cli, page)) { - list_move_tail(&opg->ops_lru, &cli->cl_lru_list); - continue; - } - - LASSERT(page->cp_obj); - if (clobj != page->cp_obj) { - struct cl_object *tmp = page->cp_obj; - - cl_object_get(tmp); - spin_unlock(&cli->cl_lru_list_lock); - - if (clobj) { - discard_pagevec(env, io, pvec, index); - index = 0; - - cl_io_fini(env, io); - cl_object_put(env, clobj); - clobj = NULL; - } - - clobj = tmp; - io->ci_obj = clobj; - 
io->ci_ignore_layout = 1; - rc = cl_io_init(env, io, CIT_MISC, clobj); - - spin_lock(&cli->cl_lru_list_lock); - - if (rc != 0) - break; - - ++maxscan; - continue; - } - - if (cl_page_own_try(env, io, page) == 0) { - if (!lru_page_busy(cli, page)) { - /* remove it from lru list earlier to avoid - * lock contention - */ - __osc_lru_del(cli, opg); - opg->ops_in_lru = 0; /* will be discarded */ - - cl_page_get(page); - will_free = true; - } else { - cl_page_disown(env, io, page); - } - } - - if (!will_free) { - list_move_tail(&opg->ops_lru, &cli->cl_lru_list); - continue; - } - - /* Don't discard and free the page with cl_lru_list held */ - pvec[index++] = page; - if (unlikely(index == OTI_PVEC_SIZE)) { - spin_unlock(&cli->cl_lru_list_lock); - discard_pagevec(env, io, pvec, index); - index = 0; - - spin_lock(&cli->cl_lru_list_lock); - } - - if (++count >= target) - break; - } - spin_unlock(&cli->cl_lru_list_lock); - - if (clobj) { - discard_pagevec(env, io, pvec, index); - - cl_io_fini(env, io); - cl_object_put(env, clobj); - } - - atomic_dec(&cli->cl_lru_shrinkers); - if (count > 0) { - atomic_long_add(count, cli->cl_lru_left); - wake_up_all(&osc_lru_waitq); - } - return count > 0 ? count : rc; -} - -/** - * Reclaim LRU pages by an IO thread. The caller wants to reclaim at least - * \@npages of LRU slots. For performance consideration, it's better to drop - * LRU pages in batch. Therefore, the actual number is adjusted at least - * max_pages_per_rpc. - */ -static long osc_lru_reclaim(struct client_obd *cli, unsigned long npages) -{ - struct lu_env *env; - struct cl_client_cache *cache = cli->cl_cache; - int max_scans; - u16 refcheck; - long rc = 0; - - LASSERT(cache); - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return 0; - - npages = max_t(int, npages, cli->cl_max_pages_per_rpc); - CDEBUG(D_CACHE, "%s: start to reclaim %ld pages from LRU\n", - cli_name(cli), npages); - rc = osc_lru_shrink(env, cli, npages, true); - if (rc >= npages) { - CDEBUG(D_CACHE, "%s: reclaimed %ld/%ld pages from LRU\n", - cli_name(cli), rc, npages); - if (osc_cache_too_much(cli) > 0) - ptlrpcd_queue_work(cli->cl_lru_work); - goto out; - } else if (rc > 0) { - npages -= rc; - } - - CDEBUG(D_CACHE, "%s: cli %p no free slots, pages: %ld/%ld, want: %ld\n", - cli_name(cli), cli, atomic_long_read(&cli->cl_lru_in_list), - atomic_long_read(&cli->cl_lru_busy), npages); - - /* Reclaim LRU slots from other client_obd as it can't free enough - * from its own. This should rarely happen. - */ - spin_lock(&cache->ccc_lru_lock); - LASSERT(!list_empty(&cache->ccc_lru)); - - cache->ccc_lru_shrinkers++; - list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); - - max_scans = atomic_read(&cache->ccc_users) - 2; - while (--max_scans > 0 && !list_empty(&cache->ccc_lru)) { - cli = list_entry(cache->ccc_lru.next, struct client_obd, - cl_lru_osc); - - CDEBUG(D_CACHE, "%s: cli %p LRU pages: %ld, busy: %ld.\n", - cli_name(cli), cli, - atomic_long_read(&cli->cl_lru_in_list), - atomic_long_read(&cli->cl_lru_busy)); - - list_move_tail(&cli->cl_lru_osc, &cache->ccc_lru); - if (osc_cache_too_much(cli) > 0) { - spin_unlock(&cache->ccc_lru_lock); - - rc = osc_lru_shrink(env, cli, npages, true); - spin_lock(&cache->ccc_lru_lock); - if (rc >= npages) - break; - if (rc > 0) - npages -= rc; - } - } - spin_unlock(&cache->ccc_lru_lock); - -out: - cl_env_put(env, &refcheck); - CDEBUG(D_CACHE, "%s: cli %p freed %ld pages.\n", - cli_name(cli), cli, rc); - return rc; -} - -/** - * osc_lru_alloc() is called to reserve an LRU slot for a cl_page. 
- * - * Usually the LRU slots are reserved in osc_io_iter_rw_init(); only when - * LRU slots are in extreme shortage may an IO lack reserved slots, in - * which case this function reserves a slot directly. - */ -static int osc_lru_alloc(const struct lu_env *env, struct client_obd *cli, - struct osc_page *opg) -{ - struct osc_io *oio = osc_env_io(env); - int rc = 0; - - if (!cli->cl_cache) /* shall not be in LRU */ - return 0; - - if (oio->oi_lru_reserved > 0) { - --oio->oi_lru_reserved; - goto out; - } - - LASSERT(atomic_long_read(cli->cl_lru_left) >= 0); - while (!atomic_long_add_unless(cli->cl_lru_left, -1, 0)) { - /* ran out of LRU slots, try to drop some by itself */ - rc = osc_lru_reclaim(cli, 1); - if (rc < 0) - break; - if (rc > 0) - continue; - - cond_resched(); - - rc = l_wait_event_abortable(osc_lru_waitq, - atomic_long_read(cli->cl_lru_left) > 0); - - if (rc < 0) - break; - } - -out: - if (rc >= 0) { - atomic_long_inc(&cli->cl_lru_busy); - opg->ops_in_lru = 1; - rc = 0; - } - - return rc; -} - -/** - * osc_lru_reserve() is called to reserve enough LRU slots for I/O. - * - * The benefit of doing this is to reduce contention against the atomic counter - * cl_lru_left by changing it from per-page access to per-IO access. - */ -unsigned long osc_lru_reserve(struct client_obd *cli, unsigned long npages) -{ - unsigned long reserved = 0; - unsigned long max_pages; - unsigned long c; - - /* - * reserve at most a full RPC window to prevent a thread from - * accidentally consuming too many LRU slots - */ - max_pages = cli->cl_max_pages_per_rpc * cli->cl_max_rpcs_in_flight; - if (npages > max_pages) - npages = max_pages; - - c = atomic_long_read(cli->cl_lru_left); - if (c < npages && osc_lru_reclaim(cli, npages) > 0) - c = atomic_long_read(cli->cl_lru_left); - while (c >= npages) { - if (c == atomic_long_cmpxchg(cli->cl_lru_left, c, c - npages)) { - reserved = npages; - break; - } - c = atomic_long_read(cli->cl_lru_left); - } - if (atomic_long_read(cli->cl_lru_left) < max_pages) { - /* - * If there aren't enough pages in the per-OSC LRU then - * wake up the LRU thread to try and clear out space, so - * we don't block if pages are being dirtied quickly. - */ - CDEBUG(D_CACHE, "%s: queue LRU, left: %lu/%ld.\n", - cli_name(cli), atomic_long_read(cli->cl_lru_left), - max_pages); - (void)ptlrpcd_queue_work(cli->cl_lru_work); - } - - return reserved; -} - -/** - * osc_lru_unreserve() is called to unreserve LRU slots. - * - * LRU slots reserved by osc_lru_reserve() may be left unused for several - * reasons, such as the page already existing or an I/O error. Those reserved - * slots should be freed by calling this function. - */ -void osc_lru_unreserve(struct client_obd *cli, unsigned long npages) -{ - atomic_long_add(npages, cli->cl_lru_left); - wake_up_all(&osc_lru_waitq); -} - -/** - * Atomic operations are expensive. We accumulate the accounting for pages - * in the same pgdat to get better performance. - * In practice this can work pretty well because the pages in the same RPC - * are likely from the same page zone.
- */ -static inline void unstable_page_accounting(struct ptlrpc_bulk_desc *desc, - int factor) -{ - int page_count = desc->bd_iov_count; - pg_data_t *last = NULL; - int count = 0; - int i; - - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - - for (i = 0; i < page_count; i++) { - pg_data_t *pgdat = page_pgdat(BD_GET_KIOV(desc, i).bv_page); - - if (likely(pgdat == last)) { - ++count; - continue; - } - - if (count > 0) { - mod_node_page_state(pgdat, NR_UNSTABLE_NFS, - factor * count); - count = 0; - } - last = pgdat; - ++count; - } - if (count > 0) - mod_node_page_state(last, NR_UNSTABLE_NFS, factor * count); -} - -static inline void add_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) -{ - unstable_page_accounting(desc, 1); -} - -static inline void dec_unstable_page_accounting(struct ptlrpc_bulk_desc *desc) -{ - unstable_page_accounting(desc, -1); -} - -/** - * Performs "unstable" page accounting. This function balances the - * increment operations performed in osc_inc_unstable_pages. It is - * registered as the RPC request callback, and is executed when the - * bulk RPC is committed on the server. Thus at this point, the pages - * involved in the bulk transfer are no longer considered unstable. - * - * If this function is called, the request should have been committed - * or req:rq_unstable must have been set; it implies that the unstable - * statistics have been added. - */ -void osc_dec_unstable_pages(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - int page_count = desc->bd_iov_count; - long unstable_count; - - LASSERT(page_count >= 0); - dec_unstable_page_accounting(desc); - - unstable_count = atomic_long_sub_return(page_count, - &cli->cl_unstable_count); - LASSERT(unstable_count >= 0); - - unstable_count = atomic_long_sub_return(page_count, - &cli->cl_cache->ccc_unstable_nr); - LASSERT(unstable_count >= 0); - if (!unstable_count) - wake_up_all(&cli->cl_cache->ccc_unstable_waitq); - - if (waitqueue_active(&osc_lru_waitq)) - (void)ptlrpcd_queue_work(cli->cl_lru_work); -} - -/** - * "unstable" page accounting. See: osc_dec_unstable_pages. - */ -void osc_inc_unstable_pages(struct ptlrpc_request *req) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - long page_count = desc->bd_iov_count; - - /* No unstable page tracking */ - if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check) - return; - - add_unstable_page_accounting(desc); - atomic_long_add(page_count, &cli->cl_unstable_count); - atomic_long_add(page_count, &cli->cl_cache->ccc_unstable_nr); - - /* - * If the request has already been committed (i.e. brw_commit - * called via rq_commit_cb), we need to undo the unstable page - * increments we just performed because rq_commit_cb won't be - * called again. - */ - spin_lock(&req->rq_lock); - if (unlikely(req->rq_committed)) { - spin_unlock(&req->rq_lock); - - osc_dec_unstable_pages(req); - } else { - req->rq_unstable = 1; - spin_unlock(&req->rq_lock); - } -} - -/** - * Check if a SOFT_SYNC flag should be piggybacked to the OST from this OSC. - * This function will be called by every BRW RPC, so it's critical - * to make this function fast.
- */ -bool osc_over_unstable_soft_limit(struct client_obd *cli) -{ - long unstable_nr, osc_unstable_count; - - /* Can't check cli->cl_unstable_count, therefore, no soft limit */ - if (!cli->cl_cache || !cli->cl_cache->ccc_unstable_check) - return false; - - osc_unstable_count = atomic_long_read(&cli->cl_unstable_count); - unstable_nr = atomic_long_read(&cli->cl_cache->ccc_unstable_nr); - - CDEBUG(D_CACHE, - "%s: cli: %p unstable pages: %lu, osc unstable pages: %lu\n", - cli_name(cli), cli, unstable_nr, osc_unstable_count); - - /* - * If the LRU slots are in shortage (< 25% remaining) AND this OSC - * has one full RPC window of unstable pages, it's a good time - * to piggyback a SOFT_SYNC flag. - * Note that the OST won't respond immediately to the SOFT_SYNC - * request, so active OSCs will have more chances to carry the flag; - * this is reasonable. - */ - return unstable_nr > cli->cl_cache->ccc_lru_max >> 2 && - osc_unstable_count > cli->cl_max_pages_per_rpc * - cli->cl_max_rpcs_in_flight; -} - -/** - * Return how many LRU pages are cached across all OSC devices - * - * Return: # of cached LRU pages times reclamation tendency - * SHRINK_STOP if it cannot do any scanning at this time - */ -unsigned long osc_cache_shrink_count(struct shrinker *sk, - struct shrink_control *sc) -{ - struct client_obd *cli; - unsigned long cached = 0; - - spin_lock(&osc_shrink_lock); - list_for_each_entry(cli, &osc_shrink_list, cl_shrink_list) - cached += atomic_long_read(&cli->cl_lru_in_list); - spin_unlock(&osc_shrink_lock); - - return (cached * sysctl_vfs_cache_pressure) / 100; -} - -/** - * Scan and try to reclaim sc->nr_to_scan cached LRU pages - * - * Return: number of cached LRU pages reclaimed - * SHRINK_STOP if it cannot do any scanning at this time - * - * The Linux kernel will call this shrinker scan routine in a loop with - * sc->nr_to_scan = SHRINK_BATCH (128 for now) until it has reclaimed - * enough memory. - * - * If sc->nr_to_scan is 0, the VM is querying the cache size; we don't need - * to scan and try to reclaim LRU pages, just return 0 and - * osc_cache_shrink_count() will report the LRU page count. - */ -unsigned long osc_cache_shrink_scan(struct shrinker *sk, - struct shrink_control *sc) -{ - struct client_obd *stop_anchor = NULL; - struct client_obd *cli; - struct lu_env *env; - long shrank = 0; - u16 refcheck; - int rc; - - if (!sc->nr_to_scan) - return 0; - - if (!(sc->gfp_mask & __GFP_FS)) - return SHRINK_STOP; - - env = cl_env_get(&refcheck); - if (IS_ERR(env)) - return SHRINK_STOP; - - spin_lock(&osc_shrink_lock); - while (!list_empty(&osc_shrink_list)) { - cli = list_entry(osc_shrink_list.next, struct client_obd, - cl_shrink_list); - - if (!stop_anchor) - stop_anchor = cli; - else if (cli == stop_anchor) - break; - - list_move_tail(&cli->cl_shrink_list, &osc_shrink_list); - spin_unlock(&osc_shrink_lock); - - /* shrink no more than max_pages_per_rpc for an OSC */ - rc = osc_lru_shrink(env, cli, (sc->nr_to_scan - shrank) > - cli->cl_max_pages_per_rpc ?
- cli->cl_max_pages_per_rpc : - sc->nr_to_scan - shrank, true); - if (rc > 0) - shrank += rc; - - if (shrank >= sc->nr_to_scan) - goto out; - - spin_lock(&osc_shrink_lock); - } - spin_unlock(&osc_shrink_lock); - -out: - cl_env_put(env, &refcheck); - - return shrank; -} - -/** @} osc */ diff --git a/drivers/staging/lustre/lustre/osc/osc_quota.c b/drivers/staging/lustre/lustre/osc/osc_quota.c deleted file mode 100644 index 723ec2fb18bf0b0cf5cc729dc630cec94c5bca16..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/osc/osc_quota.c +++ /dev/null @@ -1,236 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * GPL HEADER END - */ -/* - * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * - * Copyright (c) 2011, 2015, Intel Corporation. - * - * Code originally extracted from quota directory - */ - -#include -#include "osc_internal.h" - -static const struct rhashtable_params quota_hash_params = { - .key_len = sizeof(u32), - .key_offset = offsetof(struct osc_quota_info, oqi_id), - .head_offset = offsetof(struct osc_quota_info, oqi_hash), - .automatic_shrinking = true, -}; - -static inline struct osc_quota_info *osc_oqi_alloc(u32 id) -{ - struct osc_quota_info *oqi; - - oqi = kmem_cache_zalloc(osc_quota_kmem, GFP_NOFS); - if (oqi) - oqi->oqi_id = id; - - return oqi; -} - -int osc_quota_chkdq(struct client_obd *cli, const unsigned int qid[]) -{ - int type; - - for (type = 0; type < MAXQUOTAS; type++) { - struct osc_quota_info *oqi; - - oqi = rhashtable_lookup_fast(&cli->cl_quota_hash[type], &qid[type], - quota_hash_params); - if (oqi) { - /* Must not access oqi here, it could have been - * freed by osc_quota_setdq() - */ - - /* the slot is busy, the user is about to run out of - * quota space on this OST - */ - CDEBUG(D_QUOTA, "chkdq found noquota for %s %d\n", - type == USRQUOTA ? "user" : "group", qid[type]); - return NO_QUOTA; - } - } - - return QUOTA_OK; -} - -static void osc_quota_free(struct rcu_head *head) -{ - struct osc_quota_info *oqi = container_of(head, struct osc_quota_info, rcu); - - kmem_cache_free(osc_quota_kmem, oqi); -} - - -#define MD_QUOTA_FLAG(type) ((type == USRQUOTA) ? OBD_MD_FLUSRQUOTA \ - : OBD_MD_FLGRPQUOTA) -#define FL_QUOTA_FLAG(type) ((type == USRQUOTA) ?
OBD_FL_NO_USRQUOTA \ - : OBD_FL_NO_GRPQUOTA) - -int osc_quota_setdq(struct client_obd *cli, const unsigned int qid[], - u32 valid, u32 flags) -{ - int type; - int rc = 0; - - if ((valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) == 0) - return 0; - - for (type = 0; type < MAXQUOTAS; type++) { - struct osc_quota_info *oqi; - - if ((valid & MD_QUOTA_FLAG(type)) == 0) - continue; - - /* lookup the ID in the per-type hash table */ - rcu_read_lock(); - oqi = rhashtable_lookup_fast(&cli->cl_quota_hash[type], &qid[type], - quota_hash_params); - if ((flags & FL_QUOTA_FLAG(type)) != 0) { - /* This ID is getting close to its quota limit, let's - * switch to sync I/O - */ - rcu_read_unlock(); - if (oqi) - continue; - - oqi = osc_oqi_alloc(qid[type]); - if (!oqi) { - rc = -ENOMEM; - break; - } - - rc = rhashtable_lookup_insert_fast(&cli->cl_quota_hash[type], - &oqi->oqi_hash, quota_hash_params); - /* race with others? */ - if (rc) { - kmem_cache_free(osc_quota_kmem, oqi); - if (rc != -EEXIST) { - rc = -ENOMEM; - break; - } - rc = 0; - } - - CDEBUG(D_QUOTA, "%s: setdq to insert for %s %d (%d)\n", - cli_name(cli), - type == USRQUOTA ? "user" : "group", - qid[type], rc); - } else { - /* This ID is now off the hook, let's remove it from - * the hash table - */ - if (!oqi) { - rcu_read_unlock(); - continue; - } - if (rhashtable_remove_fast(&cli->cl_quota_hash[type], - &oqi->oqi_hash, quota_hash_params) == 0) - call_rcu(&oqi->rcu, osc_quota_free); - rcu_read_unlock(); - CDEBUG(D_QUOTA, "%s: setdq to remove for %s %d (%p)\n", - cli_name(cli), - type == USRQUOTA ? "user" : "group", - qid[type], oqi); - } - } - - return rc; -} - -static void -oqi_exit(void *vquota, void *data) -{ - struct osc_quota_info *oqi = vquota; - - osc_quota_free(&oqi->rcu); -} - -int osc_quota_setup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - int i, type; - - for (type = 0; type < MAXQUOTAS; type++) { - if (rhashtable_init(&cli->cl_quota_hash[type], "a_hash_params) != 0) - break; - } - - if (type == MAXQUOTAS) - return 0; - - for (i = 0; i < type; i++) - rhashtable_destroy(&cli->cl_quota_hash[i]); - - return -ENOMEM; -} - -int osc_quota_cleanup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - int type; - - for (type = 0; type < MAXQUOTAS; type++) - rhashtable_free_and_destroy(&cli->cl_quota_hash[type], - oqi_exit, NULL); - - return 0; -} - -int osc_quotactl(struct obd_device *unused, struct obd_export *exp, - struct obd_quotactl *oqctl) -{ - struct ptlrpc_request *req; - struct obd_quotactl *oqc; - int rc; - - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_OST_QUOTACTL, LUSTRE_OST_VERSION, - OST_QUOTACTL); - if (!req) - return -ENOMEM; - - oqc = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); - *oqc = *oqctl; - - ptlrpc_request_set_replen(req); - ptlrpc_at_set_req_timeout(req); - req->rq_no_resend = 1; - - rc = ptlrpc_queue_wait(req); - if (rc) - CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); - - if (req->rq_repmsg) { - oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); - if (oqc) { - *oqctl = *oqc; - } else if (!rc) { - CERROR("Can't unpack obd_quotactl\n"); - rc = -EPROTO; - } - } else if (!rc) { - CERROR("Can't unpack obd_quotactl\n"); - rc = -EPROTO; - } - ptlrpc_req_finished(req); - - return rc; -} diff --git a/drivers/staging/lustre/lustre/osc/osc_request.c b/drivers/staging/lustre/lustre/osc/osc_request.c deleted file mode 100644 index 61ef6c8d7a121d577e9e7160e97be13a3b70e118..0000000000000000000000000000000000000000 --- 
a/drivers/staging/lustre/lustre/osc/osc_request.c +++ /dev/null @@ -1,2907 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_OSC - -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "osc_internal.h" -#include "osc_cl_internal.h" - -atomic_t osc_pool_req_count; -unsigned int osc_reqpool_maxreqcount; -struct ptlrpc_request_pool *osc_rq_pool; - -/* max memory used for request pool, unit is MB */ -static unsigned int osc_reqpool_mem_max = 5; -module_param(osc_reqpool_mem_max, uint, 0444); - -struct osc_brw_async_args { - struct obdo *aa_oa; - int aa_requested_nob; - int aa_nio_count; - u32 aa_page_count; - int aa_resends; - struct brw_page **aa_ppga; - struct client_obd *aa_cli; - struct list_head aa_oaps; - struct list_head aa_exts; -}; - -struct osc_async_args { - struct obd_info *aa_oi; -}; - -struct osc_setattr_args { - struct obdo *sa_oa; - obd_enqueue_update_f sa_upcall; - void *sa_cookie; -}; - -struct osc_fsync_args { - struct osc_object *fa_obj; - struct obdo *fa_oa; - obd_enqueue_update_f fa_upcall; - void *fa_cookie; -}; - -struct osc_enqueue_args { - struct obd_export *oa_exp; - enum ldlm_type oa_type; - enum ldlm_mode oa_mode; - __u64 *oa_flags; - osc_enqueue_upcall_f oa_upcall; - void *oa_cookie; - struct ost_lvb *oa_lvb; - struct lustre_handle oa_lockh; - unsigned int oa_agl:1; -}; - -static void osc_release_ppga(struct brw_page **ppga, u32 count); -static int brw_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc); - -static inline void osc_pack_req_body(struct ptlrpc_request *req, - struct obdo *oa) -{ - struct ost_body *body; - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); -} - -static int osc_getattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_GETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_GETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - osc_pack_req_body(req, oa); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = 
req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - CDEBUG(D_INODE, "mode: %o\n", body->oa.o_mode); - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, - &body->oa); - - oa->o_blksize = cli_brw_size(exp->exp_obd); - oa->o_valid |= OBD_MD_FLBLKSZ; - - out: - ptlrpc_req_finished(req); - return rc; -} - -static int osc_setattr(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - LASSERT(oa->o_valid & OBD_MD_FLGROUP); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - osc_pack_req_body(req, oa); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, - &body->oa); - -out: - ptlrpc_req_finished(req); - return rc; -} - -static int osc_setattr_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_setattr_args *sa, int rc) -{ - struct ost_body *body; - - if (rc != 0) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out; - } - - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, sa->sa_oa, - &body->oa); -out: - rc = sa->sa_upcall(sa->sa_cookie, rc); - return rc; -} - -int osc_setattr_async(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) -{ - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SETATTR); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - osc_pack_req_body(req, oa); - - ptlrpc_request_set_replen(req); - - /* do mds to ost setattr asynchronously */ - if (!rqset) { - /* Do not wait for response. 
*/ - ptlrpcd_add_req(req); - } else { - req->rq_interpret_reply = - (ptlrpc_interpterer_t)osc_setattr_interpret; - - BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args)); - sa = ptlrpc_req_async_args(req); - sa->sa_oa = oa; - sa->sa_upcall = upcall; - sa->sa_cookie = cookie; - - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - } - - return 0; -} - -static int osc_create(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct ptlrpc_request *req; - struct ost_body *body; - int rc; - - LASSERT(oa); - LASSERT(oa->o_valid & OBD_MD_FLGROUP); - LASSERT(fid_seq_is_echo(ostid_seq(&oa->o_oi))); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_CREATE); - if (!req) { - rc = -ENOMEM; - goto out; - } - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_CREATE); - if (rc) { - ptlrpc_request_free(req); - goto out; - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out_req; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - rc = -EPROTO; - goto out_req; - } - - CDEBUG(D_INFO, "oa flags %x\n", oa->o_flags); - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, oa, &body->oa); - - oa->o_blksize = cli_brw_size(exp->exp_obd); - oa->o_valid |= OBD_MD_FLBLKSZ; - - CDEBUG(D_HA, "transno: %lld\n", - lustre_msg_get_transno(req->rq_repmsg)); -out_req: - ptlrpc_req_finished(req); -out: - return rc; -} - -int osc_punch_base(struct obd_export *exp, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) -{ - struct ptlrpc_request *req; - struct osc_setattr_args *sa; - struct ost_body *body; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, - oa); - - ptlrpc_request_set_replen(req); - - req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; - BUILD_BUG_ON(sizeof(*sa) > sizeof(req->rq_async_args)); - sa = ptlrpc_req_async_args(req); - sa->sa_oa = oa; - sa->sa_upcall = upcall; - sa->sa_cookie = cookie; - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - - return 0; -} - -static int osc_sync_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *arg, int rc) -{ - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - struct osc_fsync_args *fa = arg; - unsigned long valid = 0; - struct ost_body *body; - struct cl_object *obj; - - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - CERROR("can't unpack ost_body\n"); - rc = -EPROTO; - goto out; - } - - *fa->fa_oa = body->oa; - obj = osc2cl(fa->fa_obj); - - /* Update osc object's blocks attribute */ - cl_object_attr_lock(obj); - if (body->oa.o_valid & OBD_MD_FLBLOCKS) { - attr->cat_blocks = body->oa.o_blocks; - valid |= CAT_BLOCKS; - } - - if (valid) - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); - -out: - rc = fa->fa_upcall(fa->fa_cookie, rc); 
- return rc; -} - -int osc_sync_base(struct osc_object *obj, struct obdo *oa, - obd_enqueue_update_f upcall, void *cookie, - struct ptlrpc_request_set *rqset) -{ - struct obd_export *exp = osc_export(obj); - struct ptlrpc_request *req; - struct ost_body *body; - struct osc_fsync_args *fa; - int rc; - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SYNC); - if (!req) - return -ENOMEM; - - rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SYNC); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - /* overload the size and blocks fields in the oa with start/end */ - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, - oa); - - ptlrpc_request_set_replen(req); - req->rq_interpret_reply = osc_sync_interpret; - - BUILD_BUG_ON(sizeof(*fa) > sizeof(req->rq_async_args)); - fa = ptlrpc_req_async_args(req); - fa->fa_obj = obj; - fa->fa_oa = oa; - fa->fa_upcall = upcall; - fa->fa_cookie = cookie; - - if (rqset == PTLRPCD_SET) - ptlrpcd_add_req(req); - else - ptlrpc_set_add_req(rqset, req); - - return 0; -} - -/* Find and cancel locally locks matched by @mode in the resource found by - * @objid. Found locks are added into @cancel list. Returns the amount of - * locks added to @cancels list. - */ -static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, - struct list_head *cancels, - enum ldlm_mode mode, __u64 lock_flags) -{ - struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - struct ldlm_res_id res_id; - struct ldlm_resource *res; - int count; - - /* Return, i.e. cancel nothing, only if ELC is supported (flag in - * export) but disabled through procfs (flag in NS). - * - * This distinguishes from a case when ELC is not supported originally, - * when we still want to cancel locks in advance and just cancel them - * locally, without sending any RPC. - */ - if (exp_connect_cancelset(exp) && !ns_connect_cancelset(ns)) - return 0; - - ostid_build_res_name(&oa->o_oi, &res_id); - res = ldlm_resource_get(ns, NULL, &res_id, 0, 0); - if (IS_ERR(res)) - return 0; - - LDLM_RESOURCE_ADDREF(res); - count = ldlm_cancel_resource_local(res, cancels, NULL, mode, - lock_flags, 0, NULL); - LDLM_RESOURCE_DELREF(res); - ldlm_resource_putref(res); - return count; -} - -static int osc_destroy_interpret(const struct lu_env *env, - struct ptlrpc_request *req, void *data, - int rc) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - - atomic_dec(&cli->cl_destroy_in_flight); - wake_up(&cli->cl_destroy_waitq); - return 0; -} - -static int osc_can_send_destroy(struct client_obd *cli) -{ - if (atomic_inc_return(&cli->cl_destroy_in_flight) <= - cli->cl_max_rpcs_in_flight) { - /* The destroy request can be sent */ - return 1; - } - if (atomic_dec_return(&cli->cl_destroy_in_flight) < - cli->cl_max_rpcs_in_flight) { - /* - * The counter has been modified between the two atomic - * operations. 
- */ - wake_up(&cli->cl_destroy_waitq); - } - return 0; -} - -static int osc_destroy(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - struct ptlrpc_request *req; - struct ost_body *body; - LIST_HEAD(cancels); - int rc, count; - - if (!oa) { - CDEBUG(D_INFO, "oa NULL\n"); - return -EINVAL; - } - - count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, - LDLM_FL_DISCARD_DATA); - - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_DESTROY); - if (!req) { - ldlm_lock_list_put(&cancels, l_bl_ast, count); - return -ENOMEM; - } - - rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, - 0, &cancels, count); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ - ptlrpc_at_set_req_timeout(req); - - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa); - - ptlrpc_request_set_replen(req); - - req->rq_interpret_reply = osc_destroy_interpret; - if (!osc_can_send_destroy(cli)) { - /* - * Wait until the number of on-going destroy RPCs drops - * under max_rpc_in_flight - */ - l_wait_event_abortable_exclusive(cli->cl_destroy_waitq, - osc_can_send_destroy(cli)); - } - - /* Do not wait for response */ - ptlrpcd_add_req(req); - return 0; -} - -static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, - long writing_bytes) -{ - u32 bits = OBD_MD_FLBLOCKS | OBD_MD_FLGRANT; - - LASSERT(!(oa->o_valid & bits)); - - oa->o_valid |= bits; - spin_lock(&cli->cl_loi_list_lock); - oa->o_dirty = cli->cl_dirty_pages << PAGE_SHIFT; - if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > - cli->cl_dirty_max_pages)) { - CERROR("dirty %lu - %lu > dirty_max %lu\n", - cli->cl_dirty_pages, cli->cl_dirty_transit, - cli->cl_dirty_max_pages); - oa->o_undirty = 0; - } else if (unlikely(atomic_long_read(&obd_dirty_pages) - - atomic_long_read(&obd_dirty_transit_pages) > - (long)(obd_max_dirty_pages + 1))) { - /* The atomic_read() allowing the atomic_inc() are - * not covered by a lock thus they may safely race and trip - * this CERROR() unless we add in a small fudge factor (+1). 
- */ - CERROR("%s: dirty %ld + %ld > system dirty_max %ld\n", - cli_name(cli), atomic_long_read(&obd_dirty_pages), - atomic_long_read(&obd_dirty_transit_pages), - obd_max_dirty_pages); - oa->o_undirty = 0; - } else if (unlikely(cli->cl_dirty_max_pages - cli->cl_dirty_pages > - 0x7fffffff)) { - CERROR("dirty %lu - dirty_max %lu too big???\n", - cli->cl_dirty_pages, cli->cl_dirty_max_pages); - oa->o_undirty = 0; - } else { - unsigned long max_in_flight; - - max_in_flight = (cli->cl_max_pages_per_rpc << PAGE_SHIFT) * - (cli->cl_max_rpcs_in_flight + 1); - oa->o_undirty = max(cli->cl_dirty_max_pages << PAGE_SHIFT, - max_in_flight); - } - oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; - oa->o_dropped = cli->cl_lost_grant; - cli->cl_lost_grant = 0; - spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "dirty: %llu undirty: %u dropped %u grant: %llu\n", - oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); -} - -void osc_update_next_shrink(struct client_obd *cli) -{ - cli->cl_next_shrink_grant = - jiffies + cli->cl_grant_shrink_interval * HZ; - CDEBUG(D_CACHE, "next time %ld to shrink grant\n", - cli->cl_next_shrink_grant); -} - -static void __osc_update_grant(struct client_obd *cli, u64 grant) -{ - spin_lock(&cli->cl_loi_list_lock); - cli->cl_avail_grant += grant; - spin_unlock(&cli->cl_loi_list_lock); -} - -static void osc_update_grant(struct client_obd *cli, struct ost_body *body) -{ - if (body->oa.o_valid & OBD_MD_FLGRANT) { - CDEBUG(D_CACHE, "got %llu extra grant\n", body->oa.o_grant); - __osc_update_grant(cli, body->oa.o_grant); - } -} - -static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp, - u32 keylen, void *key, u32 vallen, - void *val, struct ptlrpc_request_set *set); - -static int osc_shrink_grant_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *aa, int rc) -{ - struct client_obd *cli = &req->rq_import->imp_obd->u.cli; - struct obdo *oa = ((struct osc_brw_async_args *)aa)->aa_oa; - struct ost_body *body; - - if (rc != 0) { - __osc_update_grant(cli, oa->o_grant); - goto out; - } - - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - LASSERT(body); - osc_update_grant(cli, body); -out: - kmem_cache_free(obdo_cachep, oa); - return rc; -} - -static void osc_shrink_grant_local(struct client_obd *cli, struct obdo *oa) -{ - spin_lock(&cli->cl_loi_list_lock); - oa->o_grant = cli->cl_avail_grant / 4; - cli->cl_avail_grant -= oa->o_grant; - spin_unlock(&cli->cl_loi_list_lock); - if (!(oa->o_valid & OBD_MD_FLFLAGS)) { - oa->o_valid |= OBD_MD_FLFLAGS; - oa->o_flags = 0; - } - oa->o_flags |= OBD_FL_SHRINK_GRANT; - osc_update_next_shrink(cli); -} - -/* Shrink the current grant, either from some large amount to enough for a - * full set of in-flight RPCs, or if we have already shrunk to that limit - * then to enough for a single RPC. This avoids keeping more grant than - * needed, and avoids shrinking the grant piecemeal. 
- */ -static int osc_shrink_grant(struct client_obd *cli) -{ - __u64 target_bytes = (cli->cl_max_rpcs_in_flight + 1) * - (cli->cl_max_pages_per_rpc << PAGE_SHIFT); - - spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_avail_grant <= target_bytes) - target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; - spin_unlock(&cli->cl_loi_list_lock); - - return osc_shrink_grant_to_target(cli, target_bytes); -} - -int osc_shrink_grant_to_target(struct client_obd *cli, __u64 target_bytes) -{ - int rc = 0; - struct ost_body *body; - - spin_lock(&cli->cl_loi_list_lock); - /* Don't shrink if we are already above or below the desired limit - * We don't want to shrink below a single RPC, as that will negatively - * impact block allocation and long-term performance. - */ - if (target_bytes < cli->cl_max_pages_per_rpc << PAGE_SHIFT) - target_bytes = cli->cl_max_pages_per_rpc << PAGE_SHIFT; - - if (target_bytes >= cli->cl_avail_grant) { - spin_unlock(&cli->cl_loi_list_lock); - return 0; - } - spin_unlock(&cli->cl_loi_list_lock); - - body = kzalloc(sizeof(*body), GFP_NOFS); - if (!body) - return -ENOMEM; - - osc_announce_cached(cli, &body->oa, 0); - - spin_lock(&cli->cl_loi_list_lock); - body->oa.o_grant = cli->cl_avail_grant - target_bytes; - cli->cl_avail_grant = target_bytes; - spin_unlock(&cli->cl_loi_list_lock); - if (!(body->oa.o_valid & OBD_MD_FLFLAGS)) { - body->oa.o_valid |= OBD_MD_FLFLAGS; - body->oa.o_flags = 0; - } - body->oa.o_flags |= OBD_FL_SHRINK_GRANT; - osc_update_next_shrink(cli); - - rc = osc_set_info_async(NULL, cli->cl_import->imp_obd->obd_self_export, - sizeof(KEY_GRANT_SHRINK), KEY_GRANT_SHRINK, - sizeof(*body), body, NULL); - if (rc != 0) - __osc_update_grant(cli, body->oa.o_grant); - kfree(body); - return rc; -} - -static int osc_should_shrink_grant(struct client_obd *client) -{ - unsigned long time = jiffies; - unsigned long next_shrink = client->cl_next_shrink_grant; - - if ((client->cl_import->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_GRANT_SHRINK) == 0) - return 0; - - if (time_after_eq(time, next_shrink - 5)) { - /* Get the current RPC size directly, instead of going via: - * cli_brw_size(obd->u.cli.cl_import->imp_obd->obd_self_export) - * Keep comment here so that it can be found by searching. 
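- * (both forms should evaluate to cl_max_pages_per_rpc << PAGE_SHIFT;
- * the helper is skipped here only to avoid the export indirection)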
- */ - int brw_size = client->cl_max_pages_per_rpc << PAGE_SHIFT; - - if (client->cl_import->imp_state == LUSTRE_IMP_FULL && - client->cl_avail_grant > brw_size) - return 1; - - osc_update_next_shrink(client); - } - return 0; -} - -static int osc_grant_shrink_grant_cb(struct timeout_item *item, void *data) -{ - struct client_obd *client; - - list_for_each_entry(client, &item->ti_obd_list, cl_grant_shrink_list) { - if (osc_should_shrink_grant(client)) - osc_shrink_grant(client); - } - return 0; -} - -static int osc_add_shrink_grant(struct client_obd *client) -{ - int rc; - - rc = ptlrpc_add_timeout_client(client->cl_grant_shrink_interval, - TIMEOUT_GRANT, - osc_grant_shrink_grant_cb, NULL, - &client->cl_grant_shrink_list); - if (rc) { - CERROR("add grant client %s error %d\n", cli_name(client), rc); - return rc; - } - CDEBUG(D_CACHE, "add grant client %s\n", cli_name(client)); - osc_update_next_shrink(client); - return 0; -} - -static int osc_del_shrink_grant(struct client_obd *client) -{ - return ptlrpc_del_timeout_client(&client->cl_grant_shrink_list, - TIMEOUT_GRANT); -} - -static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) -{ - /* - * ocd_grant is the total grant amount we're expect to hold: if we've - * been evicted, it's the new avail_grant amount, cl_dirty_pages will - * drop to 0 as inflight RPCs fail out; otherwise, it's avail_grant + - * dirty. - * - * race is tolerable here: if we're evicted, but imp_state already - * left EVICTED state, then cl_dirty_pages must be 0 already. - */ - spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) - cli->cl_avail_grant = ocd->ocd_grant; - else - cli->cl_avail_grant = ocd->ocd_grant - - (cli->cl_dirty_pages << PAGE_SHIFT); - - /* determine the appropriate chunk size used by osc_extent. */ - cli->cl_chunkbits = max_t(int, PAGE_SHIFT, ocd->ocd_blocksize); - spin_unlock(&cli->cl_loi_list_lock); - - CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld chunk bits: %d\n", - cli_name(cli), cli->cl_avail_grant, cli->cl_lost_grant, - cli->cl_chunkbits); - - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && - list_empty(&cli->cl_grant_shrink_list)) - osc_add_shrink_grant(cli); -} - -/* We assume that the reason this OSC got a short read is because it read - * beyond the end of a stripe file; i.e. lustre is reading a sparse file - * via the LOV, and it _knows_ it's reading inside the file, it's just that - * this stripe never got written at or beyond this stripe offset yet. 
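- * For example, assuming 4 KiB pages, a three-page read that returns
- * only 5000 bytes keeps page 0 intact, zeroes page 1 from offset 904
- * onwards, and zeroes page 2 entirely.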
- */ -static void handle_short_read(int nob_read, u32 page_count, - struct brw_page **pga) -{ - char *ptr; - int i = 0; - - /* skip bytes read OK */ - while (nob_read > 0) { - LASSERT(page_count > 0); - - if (pga[i]->count > nob_read) { - /* EOF inside this page */ - ptr = kmap(pga[i]->pg) + - (pga[i]->off & ~PAGE_MASK); - memset(ptr + nob_read, 0, pga[i]->count - nob_read); - kunmap(pga[i]->pg); - page_count--; - i++; - break; - } - - nob_read -= pga[i]->count; - page_count--; - i++; - } - - /* zero remaining pages */ - while (page_count-- > 0) { - ptr = kmap(pga[i]->pg) + (pga[i]->off & ~PAGE_MASK); - memset(ptr, 0, pga[i]->count); - kunmap(pga[i]->pg); - i++; - } -} - -static int check_write_rcs(struct ptlrpc_request *req, - int requested_nob, int niocount, - u32 page_count, struct brw_page **pga) -{ - int i; - __u32 *remote_rcs; - - remote_rcs = req_capsule_server_sized_get(&req->rq_pill, &RMF_RCS, - sizeof(*remote_rcs) * - niocount); - if (!remote_rcs) { - CDEBUG(D_INFO, "Missing/short RC vector on BRW_WRITE reply\n"); - return -EPROTO; - } - - /* return error if any niobuf was in error */ - for (i = 0; i < niocount; i++) { - if ((int)remote_rcs[i] < 0) - return remote_rcs[i]; - - if (remote_rcs[i] != 0) { - CDEBUG(D_INFO, "rc[%d] invalid (%d) req %p\n", - i, remote_rcs[i], req); - return -EPROTO; - } - } - - if (req->rq_bulk->bd_nob_transferred != requested_nob) { - CERROR("Unexpected # bytes transferred: %d (requested %d)\n", - req->rq_bulk->bd_nob_transferred, requested_nob); - return -EPROTO; - } - - return 0; -} - -static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) -{ - if (p1->flag != p2->flag) { - unsigned int mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | - OBD_BRW_SYNC | OBD_BRW_ASYNC | - OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); - - /* warn if we try to combine flags that we don't know to be - * safe to combine - */ - if (unlikely((p1->flag & mask) != (p2->flag & mask))) { - CWARN("Saw flags 0x%x and 0x%x in the same brw, please report this at http://bugs.whamcloud.com/\n", - p1->flag, p2->flag); - } - return 0; - } - - return (p1->off + p1->count == p2->off); -} - -static u32 osc_checksum_bulk(int nob, u32 pg_count, - struct brw_page **pga, int opc, - enum cksum_type cksum_type) -{ - __u32 cksum; - int i = 0; - struct ahash_request *hdesc; - unsigned int bufsize; - unsigned char cfs_alg = cksum_obd2cfs(cksum_type); - - LASSERT(pg_count > 0); - - hdesc = cfs_crypto_hash_init(cfs_alg, NULL, 0); - if (IS_ERR(hdesc)) { - CERROR("Unable to initialize checksum hash %s\n", - cfs_crypto_hash_name(cfs_alg)); - return PTR_ERR(hdesc); - } - - while (nob > 0 && pg_count > 0) { - unsigned int count = pga[i]->count > nob ? 
nob : pga[i]->count;
-
- /* corrupt the data before we compute the checksum, to
- * simulate an OST->client data error
- */
- if (i == 0 && opc == OST_READ &&
- OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) {
- unsigned char *ptr = kmap(pga[i]->pg);
- int off = pga[i]->off & ~PAGE_MASK;
-
- memcpy(ptr + off, "bad1", min_t(typeof(nob), 4, nob));
- kunmap(pga[i]->pg);
- }
- cfs_crypto_hash_update_page(hdesc, pga[i]->pg,
- pga[i]->off & ~PAGE_MASK,
- count);
- CDEBUG(D_PAGE,
- "page %p map %p index %lu flags %lx count %u priv %0lx: off %d\n",
- pga[i]->pg, pga[i]->pg->mapping, pga[i]->pg->index,
- (long)pga[i]->pg->flags, page_count(pga[i]->pg),
- page_private(pga[i]->pg),
- (int)(pga[i]->off & ~PAGE_MASK));
-
- nob -= pga[i]->count;
- pg_count--;
- i++;
- }
-
- bufsize = sizeof(cksum);
- cfs_crypto_hash_final(hdesc, (unsigned char *)&cksum, &bufsize);
-
- /* For sends we only compute a wrong checksum, instead of
- * corrupting the data, so it is still correct on a redo
- */
- if (opc == OST_WRITE && OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND))
- cksum++;
-
- return cksum;
-}
-
-static int osc_brw_prep_request(int cmd, struct client_obd *cli,
- struct obdo *oa, u32 page_count,
- struct brw_page **pga,
- struct ptlrpc_request **reqp,
- int reserve,
- int resend)
-{
- struct ptlrpc_request *req;
- struct ptlrpc_bulk_desc *desc;
- struct ost_body *body;
- struct obd_ioobj *ioobj;
- struct niobuf_remote *niobuf;
- int niocount, i, requested_nob, opc, rc;
- struct osc_brw_async_args *aa;
- struct req_capsule *pill;
- struct brw_page *pg_prev;
-
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ))
- return -ENOMEM; /* Recoverable */
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_BRW_PREP_REQ2))
- return -EINVAL; /* Fatal */
-
- if ((cmd & OBD_BRW_WRITE) != 0) {
- opc = OST_WRITE;
- req = ptlrpc_request_alloc_pool(cli->cl_import,
- osc_rq_pool,
- &RQF_OST_BRW_WRITE);
- } else {
- opc = OST_READ;
- req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW_READ);
- }
- if (!req)
- return -ENOMEM;
-
- for (niocount = i = 1; i < page_count; i++) {
- if (!can_merge_pages(pga[i - 1], pga[i]))
- niocount++;
- }
-
- pill = &req->rq_pill;
- req_capsule_set_size(pill, &RMF_OBD_IOOBJ, RCL_CLIENT,
- sizeof(*ioobj));
- req_capsule_set_size(pill, &RMF_NIOBUF_REMOTE, RCL_CLIENT,
- niocount * sizeof(*niobuf));
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, opc);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
- ptlrpc_at_set_req_timeout(req);
- /* ask ptlrpc not to resend on EINPROGRESS since BRWs have their own
- * retry logic
- */
- req->rq_no_retry_einprogress = 1;
-
- desc = ptlrpc_prep_bulk_imp(req, page_count,
- cli->cl_import->imp_connect_data.ocd_brw_size >> LNET_MTU_BITS,
- (opc == OST_WRITE ? PTLRPC_BULK_GET_SOURCE :
- PTLRPC_BULK_PUT_SINK) | PTLRPC_BULK_BUF_KIOV, OST_BULK_PORTAL,
- &ptlrpc_bulk_kiov_pin_ops);
-
- if (!desc) {
- rc = -ENOMEM;
- goto out;
- }
- /* NB request now owns desc and will free it when it gets freed */
-
- body = req_capsule_client_get(pill, &RMF_OST_BODY);
- ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
- niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE);
- LASSERT(body && ioobj && niobuf);
-
- lustre_set_wire_obdo(&req->rq_import->imp_connect_data, &body->oa, oa);
-
- obdo_to_ioobj(oa, ioobj);
- ioobj->ioo_bufcnt = niocount;
- /* The high bits of ioo_max_brw tell the server the _maximum_ number
- * of bulks that might be sent for this request. The actual number is
- * decided when the RPC is finally sent in ptlrpc_register_bulk(). It
- * sends "max - 1" both for compatibility with old clients that send
- * "0" and so that the actual maximum is a power-of-two number, not
- * one less. LU-1431
- */
- ioobj_max_brw_set(ioobj, desc->bd_md_max_brw);
- LASSERT(page_count > 0);
- pg_prev = pga[0];
- for (requested_nob = i = 0; i < page_count; i++, niobuf++) {
- struct brw_page *pg = pga[i];
- int poff = pg->off & ~PAGE_MASK;
-
- LASSERT(pg->count > 0);
- /* make sure there is no gap in the middle of the page array */
- LASSERTF(page_count == 1 ||
- (ergo(i == 0, poff + pg->count == PAGE_SIZE) &&
- ergo(i > 0 && i < page_count - 1,
- poff == 0 && pg->count == PAGE_SIZE) &&
- ergo(i == page_count - 1, poff == 0)),
- "i: %d/%d pg: %p off: %llu, count: %u\n",
- i, page_count, pg, pg->off, pg->count);
- LASSERTF(i == 0 || pg->off > pg_prev->off,
- "i %d p_c %u pg %p [pri %lu ind %lu] off %llu prev_pg %p [pri %lu ind %lu] off %llu\n",
- i, page_count,
- pg->pg, page_private(pg->pg), pg->pg->index, pg->off,
- pg_prev->pg, page_private(pg_prev->pg),
- pg_prev->pg->index, pg_prev->off);
- LASSERT((pga[0]->flag & OBD_BRW_SRVLOCK) ==
- (pg->flag & OBD_BRW_SRVLOCK));
-
- desc->bd_frag_ops->add_kiov_frag(desc, pg->pg, poff, pg->count);
- requested_nob += pg->count;
-
- if (i > 0 && can_merge_pages(pg_prev, pg)) {
- niobuf--;
- niobuf->rnb_len += pg->count;
- } else {
- niobuf->rnb_offset = pg->off;
- niobuf->rnb_len = pg->count;
- niobuf->rnb_flags = pg->flag;
- }
- pg_prev = pg;
- }
-
- LASSERTF((void *)(niobuf - niocount) ==
- req_capsule_client_get(&req->rq_pill, &RMF_NIOBUF_REMOTE),
- "want %p - real %p\n", req_capsule_client_get(&req->rq_pill,
- &RMF_NIOBUF_REMOTE), (void *)(niobuf - niocount));
-
- osc_announce_cached(cli, &body->oa, opc == OST_WRITE ? requested_nob:0);
- if (resend) {
- if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
- body->oa.o_valid |= OBD_MD_FLFLAGS;
- body->oa.o_flags = 0;
- }
- body->oa.o_flags |= OBD_FL_RECOV_RESEND;
- }
-
- if (osc_should_shrink_grant(cli))
- osc_shrink_grant_local(cli, &body->oa);
-
- /* size[REQ_REC_OFF] still sizeof (*body) */
- if (opc == OST_WRITE) {
- if (cli->cl_checksum &&
- !sptlrpc_flavor_has_bulk(&req->rq_flvr)) {
- /* store cl_cksum_type in a local variable since
- * it can be changed via lprocfs
- */
- enum cksum_type cksum_type = cli->cl_cksum_type;
-
- if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) {
- oa->o_flags &= OBD_FL_LOCAL_MASK;
- body->oa.o_flags = 0;
- }
- body->oa.o_flags |= cksum_type_pack(cksum_type);
- body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- body->oa.o_cksum = osc_checksum_bulk(requested_nob,
- page_count, pga,
- OST_WRITE,
- cksum_type);
- CDEBUG(D_PAGE, "checksum at write origin: %x\n",
- body->oa.o_cksum);
- /* save this in 'oa', too, for later checking */
- oa->o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS;
- oa->o_flags |= cksum_type_pack(cksum_type);
- } else {
- /* clear out the checksum flag, in case this is a
- * resend but cl_checksum is no longer set. 
b=11238 - */ - oa->o_valid &= ~OBD_MD_FLCKSUM; - } - oa->o_cksum = body->oa.o_cksum; - /* 1 RC per niobuf */ - req_capsule_set_size(pill, &RMF_RCS, RCL_SERVER, - sizeof(__u32) * niocount); - } else { - if (cli->cl_checksum && - !sptlrpc_flavor_has_bulk(&req->rq_flvr)) { - if ((body->oa.o_valid & OBD_MD_FLFLAGS) == 0) - body->oa.o_flags = 0; - body->oa.o_flags |= cksum_type_pack(cli->cl_cksum_type); - body->oa.o_valid |= OBD_MD_FLCKSUM | OBD_MD_FLFLAGS; - } - } - ptlrpc_request_set_replen(req); - - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - aa->aa_oa = oa; - aa->aa_requested_nob = requested_nob; - aa->aa_nio_count = niocount; - aa->aa_page_count = page_count; - aa->aa_resends = 0; - aa->aa_ppga = pga; - aa->aa_cli = cli; - INIT_LIST_HEAD(&aa->aa_oaps); - - *reqp = req; - niobuf = req_capsule_client_get(pill, &RMF_NIOBUF_REMOTE); - CDEBUG(D_RPCTRACE, "brw rpc %p - object " DOSTID " offset %lld<>%lld\n", - req, POSTID(&oa->o_oi), niobuf[0].rnb_offset, - niobuf[niocount - 1].rnb_offset + niobuf[niocount - 1].rnb_len); - - return 0; - - out: - ptlrpc_req_finished(req); - return rc; -} - -static int check_write_checksum(struct obdo *oa, - const struct lnet_process_id *peer, - __u32 client_cksum, __u32 server_cksum, int nob, - u32 page_count, struct brw_page **pga, - enum cksum_type client_cksum_type) -{ - __u32 new_cksum; - char *msg; - enum cksum_type cksum_type; - - if (server_cksum == client_cksum) { - CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); - return 0; - } - - cksum_type = cksum_type_unpack(oa->o_valid & OBD_MD_FLFLAGS ? - oa->o_flags : 0); - new_cksum = osc_checksum_bulk(nob, page_count, pga, OST_WRITE, - cksum_type); - - if (cksum_type != client_cksum_type) - msg = "the server did not use the checksum type specified in the original request - likely a protocol problem" - ; - else if (new_cksum == server_cksum) - msg = "changed on the client after we checksummed it - likely false positive due to mmap IO (bug 11742)" - ; - else if (new_cksum == client_cksum) - msg = "changed in transit before arrival at OST"; - else - msg = "changed in transit AND doesn't match the original - likely false positive due to mmap IO (bug 11742)" - ; - - LCONSOLE_ERROR_MSG(0x132, "BAD WRITE CHECKSUM: %s: from %s inode " DFID " object " DOSTID " extent [%llu-%llu]\n", - msg, libcfs_nid2str(peer->nid), - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_seq : (__u64)0, - oa->o_valid & OBD_MD_FLFID ? oa->o_parent_oid : 0, - oa->o_valid & OBD_MD_FLFID ? 
oa->o_parent_ver : 0, - POSTID(&oa->o_oi), pga[0]->off, - pga[page_count - 1]->off + - pga[page_count - 1]->count - 1); - CERROR("original client csum %x (type %x), server csum %x (type %x), client csum now %x\n", - client_cksum, client_cksum_type, - server_cksum, cksum_type, new_cksum); - return 1; -} - -/* Note rc enters this function as number of bytes transferred */ -static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) -{ - struct osc_brw_async_args *aa = (void *)&req->rq_async_args; - const struct lnet_process_id *peer = - &req->rq_import->imp_connection->c_peer; - struct client_obd *cli = aa->aa_cli; - struct ost_body *body; - __u32 client_cksum = 0; - - if (rc < 0 && rc != -EDQUOT) { - DEBUG_REQ(D_INFO, req, "Failed request with rc = %d\n", rc); - return rc; - } - - LASSERTF(req->rq_repmsg, "rc = %d\n", rc); - body = req_capsule_server_get(&req->rq_pill, &RMF_OST_BODY); - if (!body) { - DEBUG_REQ(D_INFO, req, "Can't unpack body\n"); - return -EPROTO; - } - - /* set/clear over quota flag for a uid/gid */ - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && - body->oa.o_valid & (OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA)) { - unsigned int qid[MAXQUOTAS] = { body->oa.o_uid, body->oa.o_gid }; - - CDEBUG(D_QUOTA, "setdq for [%u %u] with valid %#llx, flags %x\n", - body->oa.o_uid, body->oa.o_gid, body->oa.o_valid, - body->oa.o_flags); - osc_quota_setdq(cli, qid, body->oa.o_valid, body->oa.o_flags); - } - - osc_update_grant(cli, body); - - if (rc < 0) - return rc; - - if (aa->aa_oa->o_valid & OBD_MD_FLCKSUM) - client_cksum = aa->aa_oa->o_cksum; /* save for later */ - - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { - if (rc > 0) { - CERROR("Unexpected +ve rc %d\n", rc); - return -EPROTO; - } - LASSERT(req->rq_bulk->bd_nob == aa->aa_requested_nob); - - if (sptlrpc_cli_unwrap_bulk_write(req, req->rq_bulk)) - return -EAGAIN; - - if ((aa->aa_oa->o_valid & OBD_MD_FLCKSUM) && client_cksum && - check_write_checksum(&body->oa, peer, client_cksum, - body->oa.o_cksum, aa->aa_requested_nob, - aa->aa_page_count, aa->aa_ppga, - cksum_type_unpack(aa->aa_oa->o_flags))) - return -EAGAIN; - - rc = check_write_rcs(req, aa->aa_requested_nob, - aa->aa_nio_count, - aa->aa_page_count, aa->aa_ppga); - goto out; - } - - /* The rest of this function executes only for OST_READs */ - - /* if unwrap_bulk failed, return -EAGAIN to retry */ - rc = sptlrpc_cli_unwrap_bulk_read(req, req->rq_bulk, rc); - if (rc < 0) { - rc = -EAGAIN; - goto out; - } - - if (rc > aa->aa_requested_nob) { - CERROR("Unexpected rc %d (%d requested)\n", rc, - aa->aa_requested_nob); - return -EPROTO; - } - - if (rc != req->rq_bulk->bd_nob_transferred) { - CERROR("Unexpected rc %d (%d transferred)\n", - rc, req->rq_bulk->bd_nob_transferred); - return -EPROTO; - } - - if (rc < aa->aa_requested_nob) - handle_short_read(rc, aa->aa_page_count, aa->aa_ppga); - - if (body->oa.o_valid & OBD_MD_FLCKSUM) { - static int cksum_counter; - __u32 server_cksum = body->oa.o_cksum; - char *via = ""; - char *router = ""; - enum cksum_type cksum_type; - - cksum_type = cksum_type_unpack(body->oa.o_valid & - OBD_MD_FLFLAGS ? 
- body->oa.o_flags : 0); - client_cksum = osc_checksum_bulk(rc, aa->aa_page_count, - aa->aa_ppga, OST_READ, - cksum_type); - - if (peer->nid != req->rq_bulk->bd_sender) { - via = " via "; - router = libcfs_nid2str(req->rq_bulk->bd_sender); - } - - if (server_cksum != client_cksum) { - LCONSOLE_ERROR_MSG(0x133, "%s: BAD READ CHECKSUM: from %s%s%s inode " DFID " object " DOSTID " extent [%llu-%llu]\n", - req->rq_import->imp_obd->obd_name, - libcfs_nid2str(peer->nid), - via, router, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_seq : (__u64)0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_oid : 0, - body->oa.o_valid & OBD_MD_FLFID ? - body->oa.o_parent_ver : 0, - POSTID(&body->oa.o_oi), - aa->aa_ppga[0]->off, - aa->aa_ppga[aa->aa_page_count-1]->off + - aa->aa_ppga[aa->aa_page_count-1]->count - - 1); - CERROR("client %x, server %x, cksum_type %x\n", - client_cksum, server_cksum, cksum_type); - cksum_counter = 0; - aa->aa_oa->o_cksum = client_cksum; - rc = -EAGAIN; - } else { - cksum_counter++; - CDEBUG(D_PAGE, "checksum %x confirmed\n", client_cksum); - rc = 0; - } - } else if (unlikely(client_cksum)) { - static int cksum_missed; - - cksum_missed++; - if ((cksum_missed & (-cksum_missed)) == cksum_missed) - CERROR("Checksum %u requested from %s but not sent\n", - cksum_missed, libcfs_nid2str(peer->nid)); - } else { - rc = 0; - } -out: - if (rc >= 0) - lustre_get_wire_obdo(&req->rq_import->imp_connect_data, - aa->aa_oa, &body->oa); - - return rc; -} - -static int osc_brw_redo_request(struct ptlrpc_request *request, - struct osc_brw_async_args *aa, int rc) -{ - struct ptlrpc_request *new_req; - struct osc_brw_async_args *new_aa; - struct osc_async_page *oap; - - DEBUG_REQ(rc == -EINPROGRESS ? D_RPCTRACE : D_ERROR, request, - "redo for recoverable error %d", rc); - - rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == - OST_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, - aa->aa_cli, aa->aa_oa, - aa->aa_page_count, aa->aa_ppga, - &new_req, 0, 1); - if (rc) - return rc; - - list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { - if (oap->oap_request) { - LASSERTF(request == oap->oap_request, - "request %p != oap_request %p\n", - request, oap->oap_request); - if (oap->oap_interrupted) { - ptlrpc_req_finished(new_req); - return -EINTR; - } - } - } - /* New request takes over pga and oaps from old request. - * Note that copying a list_head doesn't work, need to move it... 
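- * (hence the list_splice_init() calls below; a plain struct copy of
- * rq_async_args would leave the aa_oaps/aa_exts list heads pointing
- * into the old request)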
- */
- aa->aa_resends++;
- new_req->rq_interpret_reply = request->rq_interpret_reply;
- new_req->rq_async_args = request->rq_async_args;
- new_req->rq_commit_cb = request->rq_commit_cb;
- /* cap resend delay to the current request timeout; this is similar to
- * what ptlrpc does (see after_reply())
- */
- if (aa->aa_resends > new_req->rq_timeout)
- new_req->rq_sent = ktime_get_real_seconds() + new_req->rq_timeout;
- else
- new_req->rq_sent = ktime_get_real_seconds() + aa->aa_resends;
- new_req->rq_generation_set = 1;
- new_req->rq_import_generation = request->rq_import_generation;
-
- new_aa = ptlrpc_req_async_args(new_req);
-
- INIT_LIST_HEAD(&new_aa->aa_oaps);
- list_splice_init(&aa->aa_oaps, &new_aa->aa_oaps);
- INIT_LIST_HEAD(&new_aa->aa_exts);
- list_splice_init(&aa->aa_exts, &new_aa->aa_exts);
- new_aa->aa_resends = aa->aa_resends;
-
- list_for_each_entry(oap, &new_aa->aa_oaps, oap_rpc_item) {
- if (oap->oap_request) {
- ptlrpc_req_finished(oap->oap_request);
- oap->oap_request = ptlrpc_request_addref(new_req);
- }
- }
-
- /* XXX: This code will run into problems if we ever support adding
- * a series of BRW RPCs into a self-defined ptlrpc_request_set and
- * waiting for all of them to finish. We should inherit the request
- * set from the old request.
- */
- ptlrpcd_add_req(new_req);
-
- DEBUG_REQ(D_INFO, new_req, "new request");
- return 0;
-}
-
-/*
- * ugh, we want disk allocation on the target to happen in offset order. we'll
- * follow Sedgewick's advice and stick to the dead simple shellsort -- it'll do
- * fine for our small page arrays and doesn't require allocation. it's an
- * insertion sort that swaps elements that are strides apart, shrinking the
- * stride down until it's 1 and the array is sorted.
- */
-static void sort_brw_pages(struct brw_page **array, int num)
-{
- int stride, i, j;
- struct brw_page *tmp;
-
- if (num == 1)
- return;
- for (stride = 1; stride < num ; stride = (stride * 3) + 1)
- ;
-
- do {
- stride /= 3;
- for (i = stride ; i < num ; i++) {
- tmp = array[i];
- j = i;
- while (j >= stride && array[j - stride]->off > tmp->off) {
- array[j] = array[j - stride];
- j -= stride;
- }
- array[j] = tmp;
- }
- } while (stride > 1);
-}
-
-static void osc_release_ppga(struct brw_page **ppga, u32 count)
-{
- LASSERT(ppga);
- kfree(ppga);
-}
-
-static int brw_interpret(const struct lu_env *env,
- struct ptlrpc_request *req, void *data, int rc)
-{
- struct osc_brw_async_args *aa = data;
- struct osc_extent *ext;
- struct osc_extent *tmp;
- struct client_obd *cli = aa->aa_cli;
-
- rc = osc_brw_fini_request(req, rc);
- CDEBUG(D_INODE, "request %p aa %p rc %d\n", req, aa, rc);
- /* When the server returns -EINPROGRESS, the client should always
- * retry, regardless of the number of times the bulk was already
- * resent. 
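- * Other recoverable errors remain bounded by client_should_resend()
- * below.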
- */ - if (osc_recoverable_error(rc)) { - if (req->rq_import_generation != - req->rq_import->imp_generation) { - CDEBUG(D_HA, "%s: resend cross eviction for object: " DOSTID ", rc = %d.\n", - req->rq_import->imp_obd->obd_name, - POSTID(&aa->aa_oa->o_oi), rc); - } else if (rc == -EINPROGRESS || - client_should_resend(aa->aa_resends, aa->aa_cli)) { - rc = osc_brw_redo_request(req, aa, rc); - } else { - CERROR("%s: too many resent retries for object: %llu:%llu, rc = %d.\n", - req->rq_import->imp_obd->obd_name, - POSTID(&aa->aa_oa->o_oi), rc); - } - - if (rc == 0) - return 0; - else if (rc == -EAGAIN || rc == -EINPROGRESS) - rc = -EIO; - } - - if (rc == 0) { - struct obdo *oa = aa->aa_oa; - struct cl_attr *attr = &osc_env_info(env)->oti_attr; - unsigned long valid = 0; - struct cl_object *obj; - struct osc_async_page *last; - - last = brw_page2oap(aa->aa_ppga[aa->aa_page_count - 1]); - obj = osc2cl(last->oap_obj); - - cl_object_attr_lock(obj); - if (oa->o_valid & OBD_MD_FLBLOCKS) { - attr->cat_blocks = oa->o_blocks; - valid |= CAT_BLOCKS; - } - if (oa->o_valid & OBD_MD_FLMTIME) { - attr->cat_mtime = oa->o_mtime; - valid |= CAT_MTIME; - } - if (oa->o_valid & OBD_MD_FLATIME) { - attr->cat_atime = oa->o_atime; - valid |= CAT_ATIME; - } - if (oa->o_valid & OBD_MD_FLCTIME) { - attr->cat_ctime = oa->o_ctime; - valid |= CAT_CTIME; - } - - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) { - struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; - loff_t last_off = last->oap_count + last->oap_obj_off + - last->oap_page_off; - - /* Change file size if this is an out of quota or - * direct IO write and it extends the file size - */ - if (loi->loi_lvb.lvb_size < last_off) { - attr->cat_size = last_off; - valid |= CAT_SIZE; - } - /* Extend KMS if it's not a lockless write */ - if (loi->loi_kms < last_off && - oap2osc_page(last)->ops_srvlock == 0) { - attr->cat_kms = last_off; - valid |= CAT_KMS; - } - } - - if (valid != 0) - cl_object_attr_update(env, obj, attr, valid); - cl_object_attr_unlock(obj); - } - kmem_cache_free(obdo_cachep, aa->aa_oa); - - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE && rc == 0) - osc_inc_unstable_pages(req); - - list_for_each_entry_safe(ext, tmp, &aa->aa_exts, oe_link) { - list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 1, rc); - } - LASSERT(list_empty(&aa->aa_exts)); - LASSERT(list_empty(&aa->aa_oaps)); - - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - ptlrpc_lprocfs_brw(req, req->rq_bulk->bd_nob_transferred); - - spin_lock(&cli->cl_loi_list_lock); - /* We need to decrement before osc_ap_completion->osc_wake_cache_waiters - * is called so we know whether to go to sync BRWs or wait for more - * RPCs to complete - */ - if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE) - cli->cl_w_in_flight--; - else - cli->cl_r_in_flight--; - osc_wake_cache_waiters(cli); - spin_unlock(&cli->cl_loi_list_lock); - - osc_io_unplug(env, cli, NULL); - return rc; -} - -static void brw_commit(struct ptlrpc_request *req) -{ - /* - * If osc_inc_unstable_pages (via osc_extent_finish) races with - * this called via the rq_commit_cb, I need to ensure - * osc_dec_unstable_pages is still called. Otherwise unstable - * pages may be leaked. - */ - spin_lock(&req->rq_lock); - if (unlikely(req->rq_unstable)) { - req->rq_unstable = 0; - spin_unlock(&req->rq_lock); - osc_dec_unstable_pages(req); - } else { - req->rq_committed = 1; - spin_unlock(&req->rq_lock); - } -} - -/** - * Build an RPC by the list of extent @ext_list. 
The caller must ensure - * that the total pages in this list are NOT over max pages per RPC. - * Extents in the list must be in OES_RPC state. - */ -int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, - struct list_head *ext_list, int cmd) -{ - struct ptlrpc_request *req = NULL; - struct osc_extent *ext; - struct brw_page **pga = NULL; - struct osc_brw_async_args *aa = NULL; - struct obdo *oa = NULL; - struct osc_async_page *oap; - struct osc_object *obj = NULL; - struct cl_req_attr *crattr = NULL; - u64 starting_offset = OBD_OBJECT_EOF; - u64 ending_offset = 0; - unsigned int mpflag = 0; - int mem_tight = 0; - int page_count = 0; - bool soft_sync = false; - bool interrupted = false; - int i; - int rc; - struct ost_body *body; - LIST_HEAD(rpc_list); - - LASSERT(!list_empty(ext_list)); - - /* add pages into rpc_list to build BRW rpc */ - list_for_each_entry(ext, ext_list, oe_link) { - LASSERT(ext->oe_state == OES_RPC); - mem_tight |= ext->oe_memalloc; - page_count += ext->oe_nr_pages; - if (!obj) - obj = ext->oe_obj; - } - - soft_sync = osc_over_unstable_soft_limit(cli); - if (mem_tight) - mpflag = memalloc_noreclaim_save(); - - pga = kcalloc(page_count, sizeof(*pga), GFP_NOFS); - if (!pga) { - rc = -ENOMEM; - goto out; - } - - oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS); - if (!oa) { - rc = -ENOMEM; - goto out; - } - - i = 0; - list_for_each_entry(ext, ext_list, oe_link) { - list_for_each_entry(oap, &ext->oe_pages, oap_pending_item) { - if (mem_tight) - oap->oap_brw_flags |= OBD_BRW_MEMALLOC; - if (soft_sync) - oap->oap_brw_flags |= OBD_BRW_SOFT_SYNC; - pga[i] = &oap->oap_brw_page; - pga[i]->off = oap->oap_obj_off + oap->oap_page_off; - i++; - - list_add_tail(&oap->oap_rpc_item, &rpc_list); - if (starting_offset == OBD_OBJECT_EOF || - starting_offset > oap->oap_obj_off) - starting_offset = oap->oap_obj_off; - else - LASSERT(!oap->oap_page_off); - if (ending_offset < oap->oap_obj_off + oap->oap_count) - ending_offset = oap->oap_obj_off + - oap->oap_count; - else - LASSERT(oap->oap_page_off + oap->oap_count == - PAGE_SIZE); - if (oap->oap_interrupted) - interrupted = true; - } - } - - /* first page in the list */ - oap = list_entry(rpc_list.next, typeof(*oap), oap_rpc_item); - - crattr = &osc_env_info(env)->oti_req_attr; - memset(crattr, 0, sizeof(*crattr)); - crattr->cra_type = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ; - crattr->cra_flags = ~0ULL; - crattr->cra_page = oap2cl_page(oap); - crattr->cra_oa = oa; - cl_req_attr_set(env, osc2cl(obj), crattr); - - sort_brw_pages(pga, page_count); - rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 1, 0); - if (rc != 0) { - CERROR("prep_req failed: %d\n", rc); - goto out; - } - - req->rq_commit_cb = brw_commit; - req->rq_interpret_reply = brw_interpret; - - req->rq_memalloc = mem_tight != 0; - oap->oap_request = ptlrpc_request_addref(req); - if (interrupted && !req->rq_intr) - ptlrpc_mark_interrupted(req); - - /* Need to update the timestamps after the request is built in case - * we race with setattr (locally or in queue at OST). If OST gets - * later setattr before earlier BRW (as determined by the request xid), - * the OST will not use BRW timestamps. Sadly, there is no obvious - * way to do this in a single call. 
bug 10150 - */ - body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); - crattr->cra_oa = &body->oa; - crattr->cra_flags = OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME; - cl_req_attr_set(env, osc2cl(obj), crattr); - lustre_msg_set_jobid(req->rq_reqmsg, crattr->cra_jobid); - - BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - INIT_LIST_HEAD(&aa->aa_oaps); - list_splice_init(&rpc_list, &aa->aa_oaps); - INIT_LIST_HEAD(&aa->aa_exts); - list_splice_init(ext_list, &aa->aa_exts); - - spin_lock(&cli->cl_loi_list_lock); - starting_offset >>= PAGE_SHIFT; - if (cmd == OBD_BRW_READ) { - cli->cl_r_in_flight++; - lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); - lprocfs_oh_tally_log2(&cli->cl_read_offset_hist, - starting_offset + 1); - } else { - cli->cl_w_in_flight++; - lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_write_rpc_hist, cli->cl_w_in_flight); - lprocfs_oh_tally_log2(&cli->cl_write_offset_hist, - starting_offset + 1); - } - spin_unlock(&cli->cl_loi_list_lock); - - DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %ur/%dw in flight", - page_count, aa, cli->cl_r_in_flight, - cli->cl_w_in_flight); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DELAY_IO, cfs_fail_val); - - ptlrpcd_add_req(req); - rc = 0; - -out: - if (mem_tight != 0) - memalloc_noreclaim_restore(mpflag); - - if (rc != 0) { - LASSERT(!req); - - if (oa) - kmem_cache_free(obdo_cachep, oa); - kfree(pga); - /* this should happen rarely and is pretty bad, it makes the - * pending list not follow the dirty order - */ - while (!list_empty(ext_list)) { - ext = list_entry(ext_list->next, struct osc_extent, - oe_link); - list_del_init(&ext->oe_link); - osc_extent_finish(env, ext, 0, rc); - } - } - return rc; -} - -static int osc_set_lock_data(struct ldlm_lock *lock, void *data) -{ - int set = 0; - - LASSERT(lock); - - lock_res_and_lock(lock); - - if (!lock->l_ast_data) - lock->l_ast_data = data; - if (lock->l_ast_data == data) - set = 1; - - unlock_res_and_lock(lock); - - return set; -} - -static int osc_enqueue_fini(struct ptlrpc_request *req, - osc_enqueue_upcall_f upcall, void *cookie, - struct lustre_handle *lockh, enum ldlm_mode mode, - __u64 *flags, int agl, int errcode) -{ - bool intent = *flags & LDLM_FL_HAS_INTENT; - int rc; - - /* The request was created before ldlm_cli_enqueue call. */ - if (intent && errcode == ELDLM_LOCK_ABORTED) { - struct ldlm_reply *rep; - - rep = req_capsule_server_get(&req->rq_pill, &RMF_DLM_REP); - - rep->lock_policy_res1 = - ptlrpc_status_ntoh(rep->lock_policy_res1); - if (rep->lock_policy_res1) - errcode = rep->lock_policy_res1; - if (!agl) - *flags |= LDLM_FL_LVB_READY; - } else if (errcode == ELDLM_OK) { - *flags |= LDLM_FL_LVB_READY; - } - - /* Call the update callback. */ - rc = (*upcall)(cookie, lockh, errcode); - /* release the reference taken in ldlm_cli_enqueue() */ - if (errcode == ELDLM_LOCK_MATCHED) - errcode = ELDLM_OK; - if (errcode == ELDLM_OK && lustre_handle_is_used(lockh)) - ldlm_lock_decref(lockh, mode); - - return rc; -} - -static int osc_enqueue_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - struct osc_enqueue_args *aa, int rc) -{ - struct ldlm_lock *lock; - struct lustre_handle *lockh = &aa->oa_lockh; - enum ldlm_mode mode = aa->oa_mode; - struct ost_lvb *lvb = aa->oa_lvb; - __u32 lvb_len = sizeof(*lvb); - __u64 flags = 0; - - - /* ldlm_cli_enqueue is holding a reference on the lock, so it must - * be valid. 
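- * (if the client had been evicted, the handle lookup below would
- * return NULL, which is what the LASSERTF guards against)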
- */
- lock = ldlm_handle2lock(lockh);
- LASSERTF(lock, "lockh %llx, req %p, aa %p - client evicted?\n",
- lockh->cookie, req, aa);
-
- /* Take an additional reference so that a blocking AST that
- * ldlm_cli_enqueue_fini() might post for a failed lock is guaranteed
- * to arrive after an upcall has been executed by
- * osc_enqueue_fini().
- */
- ldlm_lock_addref(lockh, mode);
-
- /* Let cl_lock_state_wait fail with -ERESTARTSYS to unuse sublocks. */
- OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_HANG, 2);
-
- /* Let the CP AST grant the lock first. */
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_ENQ_RACE, 1);
-
- if (aa->oa_agl) {
- LASSERT(!aa->oa_lvb);
- LASSERT(!aa->oa_flags);
- aa->oa_flags = &flags;
- }
-
- /* Complete the procedure of obtaining the lock. */
- rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_type, 1,
- aa->oa_mode, aa->oa_flags, lvb, lvb_len,
- lockh, rc);
- /* Complete osc stuff. */
- rc = osc_enqueue_fini(req, aa->oa_upcall, aa->oa_cookie, lockh, mode,
- aa->oa_flags, aa->oa_agl, rc);
-
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_CP_CANCEL_RACE, 10);
-
- ldlm_lock_decref(lockh, mode);
- LDLM_LOCK_PUT(lock);
- return rc;
-}
-
-struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
-
-/* When enqueuing asynchronously, locks are not ordered, so we can obtain a
- * lock from the 2nd OSC before a lock from the 1st one. This does not deadlock
- * with other synchronous requests; however, keeping some locks and trying to
- * obtain others may take a considerable amount of time in case of OST failure,
- * and when other sync requests do not get a released lock from a client, the
- * client is evicted from the cluster -- such scenarios make life difficult, so
- * release locks just after they are obtained.
- */
-int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
- __u64 *flags, union ldlm_policy_data *policy,
- struct ost_lvb *lvb, int kms_valid,
- osc_enqueue_upcall_f upcall, void *cookie,
- struct ldlm_enqueue_info *einfo,
- struct ptlrpc_request_set *rqset, int async, int agl)
-{
- struct obd_device *obd = exp->exp_obd;
- struct lustre_handle lockh = { 0 };
- struct ptlrpc_request *req = NULL;
- int intent = *flags & LDLM_FL_HAS_INTENT;
- __u64 match_flags = *flags;
- enum ldlm_mode mode;
- int rc;
-
- /* Filesystem lock extents are extended to page boundaries so that
- * dealing with the page cache is a little smoother.
- */
- policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
- policy->l_extent.end |= ~PAGE_MASK;
-
- /*
- * kms is not valid when either object is completely fresh (so that no
- * locks are cached), or object was evicted. In the latter case cached
- * lock cannot be used, because it would prime inode state with
- * potentially stale LVB.
- */
- if (!kms_valid)
- goto no_match;
-
- /* Next, search for already existing extent locks that will cover us */
- /* If we're trying to read, we also search for an existing PW lock. The
- * VFS and page cache already protect us locally, so lots of readers/
- * writers can share a single PW lock.
- *
- * There are problems with conversion deadlocks, so instead of
- * converting a read lock to a write lock, we'll just enqueue a new
- * one.
- *
- * At some point we should cancel the read lock instead of making them
- * send us a blocking callback, but there are problems with canceling
- * locks out from other users right now, too.
- */
- mode = einfo->ei_mode;
- if (einfo->ei_mode == LCK_PR)
- mode |= LCK_PW;
- if (agl == 0)
- match_flags |= LDLM_FL_LVB_READY;
- if (intent != 0)
- match_flags |= LDLM_FL_BLOCK_GRANTED;
- mode = ldlm_lock_match(obd->obd_namespace, match_flags, res_id,
- einfo->ei_type, policy, mode, &lockh, 0);
- if (mode) {
- struct ldlm_lock *matched;
-
- if (*flags & LDLM_FL_TEST_LOCK)
- return ELDLM_OK;
-
- matched = ldlm_handle2lock(&lockh);
- if (agl) {
- /* AGL enqueues DLM locks speculatively. Therefore if
- * a DLM lock already exists, it will just inform the
- * caller to cancel the AGL process for this stripe.
- */
- ldlm_lock_decref(&lockh, mode);
- LDLM_LOCK_PUT(matched);
- return -ECANCELED;
- } else if (osc_set_lock_data(matched, einfo->ei_cbdata)) {
- *flags |= LDLM_FL_LVB_READY;
- /* We already have a lock, and it's referenced. */
- (*upcall)(cookie, &lockh, ELDLM_LOCK_MATCHED);
-
- ldlm_lock_decref(&lockh, mode);
- LDLM_LOCK_PUT(matched);
- return ELDLM_OK;
- } else {
- ldlm_lock_decref(&lockh, mode);
- LDLM_LOCK_PUT(matched);
- }
- }
-
-no_match:
- if (*flags & (LDLM_FL_TEST_LOCK | LDLM_FL_MATCH_LOCK))
- return -ENOLCK;
- if (intent) {
- req = ptlrpc_request_alloc(class_exp2cliimp(exp),
- &RQF_LDLM_ENQUEUE_LVB);
- if (!req)
- return -ENOMEM;
-
- rc = ldlm_prep_enqueue_req(exp, req, NULL, 0);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
- sizeof(*lvb));
- ptlrpc_request_set_replen(req);
- }
-
- /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
- *flags &= ~LDLM_FL_BLOCK_GRANTED;
-
- rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
- sizeof(*lvb), LVB_T_OST, &lockh, async);
- if (async) {
- if (!rc) {
- struct osc_enqueue_args *aa;
-
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->oa_exp = exp;
- aa->oa_mode = einfo->ei_mode;
- aa->oa_type = einfo->ei_type;
- lustre_handle_copy(&aa->oa_lockh, &lockh);
- aa->oa_upcall = upcall;
- aa->oa_cookie = cookie;
- aa->oa_agl = !!agl;
- if (!agl) {
- aa->oa_flags = flags;
- aa->oa_lvb = lvb;
- } else {
- /* AGL essentially enqueues a DLM lock
- * in advance, so we don't care about the
- * result of the AGL enqueue.
- */
- aa->oa_lvb = NULL;
- aa->oa_flags = NULL;
- }
-
- req->rq_interpret_reply =
- (ptlrpc_interpterer_t)osc_enqueue_interpret;
- if (rqset == PTLRPCD_SET)
- ptlrpcd_add_req(req);
- else
- ptlrpc_set_add_req(rqset, req);
- } else if (intent) {
- ptlrpc_req_finished(req);
- }
- return rc;
- }
-
- rc = osc_enqueue_fini(req, upcall, cookie, &lockh, einfo->ei_mode,
- flags, agl, rc);
- if (intent)
- ptlrpc_req_finished(req);
-
- return rc;
-}
-
-int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
- enum ldlm_type type, union ldlm_policy_data *policy,
- enum ldlm_mode mode, __u64 *flags, void *data,
- struct lustre_handle *lockh, int unref)
-{
- struct obd_device *obd = exp->exp_obd;
- __u64 lflags = *flags;
- enum ldlm_mode rc;
-
- if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
- return -EIO;
-
- /* Filesystem lock extents are extended to page boundaries so that
- * dealing with the page cache is a little smoother
- */
- policy->l_extent.start -= policy->l_extent.start & ~PAGE_MASK;
- policy->l_extent.end |= ~PAGE_MASK;
-
- /* Next, search for already existing extent locks that will cover us */
- /* If we're trying to read, we also search for an existing PW lock. The
- * VFS and page cache already protect us locally, so lots of readers/
- * writers can share a single PW lock.
- */
- rc = mode;
- if (mode == LCK_PR)
- rc |= LCK_PW;
- rc = ldlm_lock_match(obd->obd_namespace, lflags,
- res_id, type, policy, rc, lockh, unref);
- if (!rc || lflags & LDLM_FL_TEST_LOCK)
- return rc;
-
- if (data) {
- struct ldlm_lock *lock = ldlm_handle2lock(lockh);
-
- LASSERT(lock);
- if (!osc_set_lock_data(lock, data)) {
- ldlm_lock_decref(lockh, rc);
- rc = 0;
- }
- LDLM_LOCK_PUT(lock);
- }
- return rc;
-}
-
-static int osc_statfs_interpret(const struct lu_env *env,
- struct ptlrpc_request *req,
- struct osc_async_args *aa, int rc)
-{
- struct obd_statfs *msfs;
-
- if (rc == -EBADR)
- /* The request has in fact never been sent
- * due to issues at a higher level (LOV).
- * Exit immediately since the caller is
- * aware of the problem and takes care
- * of the cleanup
- */
- return rc;
-
- if ((rc == -ENOTCONN || rc == -EAGAIN) &&
- (aa->aa_oi->oi_flags & OBD_STATFS_NODELAY)) {
- rc = 0;
- goto out;
- }
-
- if (rc != 0)
- goto out;
-
- msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
- if (!msfs) {
- rc = -EPROTO;
- goto out;
- }
-
- *aa->aa_oi->oi_osfs = *msfs;
-out:
- rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
- return rc;
-}
-
-static int osc_statfs_async(struct obd_export *exp,
- struct obd_info *oinfo, __u64 max_age,
- struct ptlrpc_request_set *rqset)
-{
- struct obd_device *obd = class_exp2obd(exp);
- struct ptlrpc_request *req;
- struct osc_async_args *aa;
- int rc;
-
- /* We could possibly pass max_age in the request (as an absolute
- * timestamp or a "seconds.usec ago") so the target can avoid doing
- * extra calls into the filesystem if that isn't necessary (e.g.
- * during mount that would help a bit). Having relative timestamps
- * is not so great if request processing is slow, while absolute
- * timestamps are not ideal because they need time synchronization.
- */
- req = ptlrpc_request_alloc(obd->u.cli.cl_import, &RQF_OST_STATFS);
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- ptlrpc_request_set_replen(req);
- req->rq_request_portal = OST_CREATE_PORTAL;
- ptlrpc_at_set_req_timeout(req);
-
- if (oinfo->oi_flags & OBD_STATFS_NODELAY) {
- /* procfs requests should not wait for stat, to avoid deadlock */
- req->rq_no_resend = 1;
- req->rq_no_delay = 1;
- }
-
- req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_statfs_interpret;
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- aa->aa_oi = oinfo;
-
- ptlrpc_set_add_req(rqset, req);
- return 0;
-}
-
-static int osc_statfs(const struct lu_env *env, struct obd_export *exp,
- struct obd_statfs *osfs, __u64 max_age, __u32 flags)
-{
- struct obd_device *obd = class_exp2obd(exp);
- struct obd_statfs *msfs;
- struct ptlrpc_request *req;
- struct obd_import *imp = NULL;
- int rc;
-
- /* The request might also come from lprocfs, so we need to sync
- * this with client_disconnect_export() (bug 15684)
- */
- down_read(&obd->u.cli.cl_sem);
- if (obd->u.cli.cl_import)
- imp = class_import_get(obd->u.cli.cl_import);
- up_read(&obd->u.cli.cl_sem);
- if (!imp)
- return -ENODEV;
-
- /* We could possibly pass max_age in the request (as an absolute
- * timestamp or a "seconds.usec ago") so the target can avoid doing
- * extra calls into the filesystem if that isn't necessary (e.g.
- * during mount that would help a bit). Having relative timestamps
- * is not so great if request processing is slow, while absolute
- * timestamps are not ideal because they need time synchronization.
- */
- req = ptlrpc_request_alloc(imp, &RQF_OST_STATFS);
-
- class_import_put(imp);
-
- if (!req)
- return -ENOMEM;
-
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_STATFS);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
- ptlrpc_request_set_replen(req);
- req->rq_request_portal = OST_CREATE_PORTAL;
- ptlrpc_at_set_req_timeout(req);
-
- if (flags & OBD_STATFS_NODELAY) {
- /* procfs requests should not wait for stat, to avoid deadlock */
- req->rq_no_resend = 1;
- req->rq_no_delay = 1;
- }
-
- rc = ptlrpc_queue_wait(req);
- if (rc)
- goto out;
-
- msfs = req_capsule_server_get(&req->rq_pill, &RMF_OBD_STATFS);
- if (!msfs) {
- rc = -EPROTO;
- goto out;
- }
-
- *osfs = *msfs;
-
- out:
- ptlrpc_req_finished(req);
- return rc;
-}
-
-static int osc_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
- void *karg, void __user *uarg)
-{
- struct obd_device *obd = exp->exp_obd;
- struct obd_ioctl_data *data = karg;
- int err = 0;
-
- if (!try_module_get(THIS_MODULE)) {
- CERROR("%s: cannot get module '%s'\n", obd->obd_name,
- module_name(THIS_MODULE));
- return -EINVAL;
- }
- switch (cmd) {
- case OBD_IOC_CLIENT_RECOVER:
- err = ptlrpc_recover_import(obd->u.cli.cl_import,
- data->ioc_inlbuf1, 0);
- if (err > 0)
- err = 0;
- goto out;
- case IOC_OSC_SET_ACTIVE:
- err = ptlrpc_set_import_active(obd->u.cli.cl_import,
- data->ioc_offset);
- goto out;
- case OBD_IOC_PING_TARGET:
- err = ptlrpc_obd_ping(obd);
- goto out;
- default:
- CDEBUG(D_INODE, "unrecognised ioctl %#x by %s\n",
- cmd, current->comm);
- err = -ENOTTY;
- goto out;
- }
-out:
- module_put(THIS_MODULE);
- return err;
-}
-
-static int osc_set_info_async(const struct lu_env *env, struct obd_export *exp,
- u32 keylen, void *key, u32 vallen,
- void *val, struct ptlrpc_request_set *set)
-{
- struct ptlrpc_request *req;
- struct obd_device *obd = exp->exp_obd;
- struct obd_import *imp = class_exp2cliimp(exp);
- char *tmp;
- int rc;
-
- OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_SHUTDOWN, 10);
-
- if (KEY_IS(KEY_CHECKSUM)) {
- if (vallen != sizeof(int))
- return -EINVAL;
- exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 1 : 0;
- return 0;
- }
-
- if (KEY_IS(KEY_SPTLRPC_CONF)) {
- sptlrpc_conf_client_adapt(obd);
- return 0;
- }
-
- if (KEY_IS(KEY_FLUSH_CTX)) {
- sptlrpc_import_flush_my_ctx(imp);
- return 0;
- }
-
- if (KEY_IS(KEY_CACHE_SET)) {
- struct client_obd *cli = &obd->u.cli;
-
- LASSERT(!cli->cl_cache); /* only once */
- cli->cl_cache = val;
- cl_cache_incref(cli->cl_cache);
- cli->cl_lru_left = &cli->cl_cache->ccc_lru_left;
-
- /* add this osc into entity list */
- LASSERT(list_empty(&cli->cl_lru_osc));
- spin_lock(&cli->cl_cache->ccc_lru_lock);
- list_add(&cli->cl_lru_osc, &cli->cl_cache->ccc_lru);
- spin_unlock(&cli->cl_cache->ccc_lru_lock);
-
- return 0;
- }
-
- if (KEY_IS(KEY_CACHE_LRU_SHRINK)) {
- struct client_obd *cli = &obd->u.cli;
- long nr = atomic_long_read(&cli->cl_lru_in_list) >> 1;
- long target = *(long *)val;
-
- nr = osc_lru_shrink(env, cli, min(nr, target), true);
- *(long *)val -= nr;
- return 0;
- }
-
- if (!set && !KEY_IS(KEY_GRANT_SHRINK))
- return -EINVAL;
-
- /* We pass all other commands directly to OST. Since nobody calls osc
- * methods directly and everybody is supposed to go through LOV, we
- * assume lov checked invalid values for us.
- * The only recognised values so far are evict_by_nid and mds_conn.
- * Even if something bad goes through, we'd get a -EINVAL from OST
- * anyway.
- */
-
- req = ptlrpc_request_alloc(imp, KEY_IS(KEY_GRANT_SHRINK) ?
- &RQF_OST_SET_GRANT_INFO :
- &RQF_OBD_SET_INFO);
- if (!req)
- return -ENOMEM;
-
- req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY,
- RCL_CLIENT, keylen);
- if (!KEY_IS(KEY_GRANT_SHRINK))
- req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL,
- RCL_CLIENT, vallen);
- rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_SET_INFO);
- if (rc) {
- ptlrpc_request_free(req);
- return rc;
- }
-
- tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY);
- memcpy(tmp, key, keylen);
- tmp = req_capsule_client_get(&req->rq_pill, KEY_IS(KEY_GRANT_SHRINK) ?
- &RMF_OST_BODY :
- &RMF_SETINFO_VAL);
- memcpy(tmp, val, vallen);
-
- if (KEY_IS(KEY_GRANT_SHRINK)) {
- struct osc_brw_async_args *aa;
- struct obdo *oa;
-
- BUILD_BUG_ON(sizeof(*aa) > sizeof(req->rq_async_args));
- aa = ptlrpc_req_async_args(req);
- oa = kmem_cache_zalloc(obdo_cachep, GFP_NOFS);
- if (!oa) {
- ptlrpc_req_finished(req);
- return -ENOMEM;
- }
- *oa = ((struct ost_body *)val)->oa;
- aa->aa_oa = oa;
- req->rq_interpret_reply = osc_shrink_grant_interpret;
- }
-
- ptlrpc_request_set_replen(req);
- if (!KEY_IS(KEY_GRANT_SHRINK)) {
- LASSERT(set);
- ptlrpc_set_add_req(set, req);
- ptlrpc_check_set(NULL, set);
- } else {
- ptlrpcd_add_req(req);
- }
-
- return 0;
-}
-
-static int osc_reconnect(const struct lu_env *env,
- struct obd_export *exp, struct obd_device *obd,
- struct obd_uuid *cluuid,
- struct obd_connect_data *data,
- void *localdata)
-{
- struct client_obd *cli = &obd->u.cli;
-
- if (data && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
- long lost_grant;
-
- spin_lock(&cli->cl_loi_list_lock);
- data->ocd_grant = (cli->cl_avail_grant +
- (cli->cl_dirty_pages << PAGE_SHIFT)) ?:
- 2 * cli_brw_size(obd);
- lost_grant = cli->cl_lost_grant;
- cli->cl_lost_grant = 0;
- spin_unlock(&cli->cl_loi_list_lock);
-
- CDEBUG(D_RPCTRACE, "ocd_connect_flags: %#llx ocd_version: %d ocd_grant: %d, lost: %ld.\n",
- data->ocd_connect_flags,
- data->ocd_version, data->ocd_grant, lost_grant);
- }
-
- return 0;
-}
-
-static int osc_disconnect(struct obd_export *exp)
-{
- struct obd_device *obd = class_exp2obd(exp);
- int rc;
-
- rc = client_disconnect_export(exp);
- /**
- * Initially we put del_shrink_grant before disconnect_export, but it
- * causes the following problem if setup (connect) and cleanup
- * (disconnect) are tangled together.
- * connect p1 disconnect p2
- * ptlrpc_connect_import
- * ............... class_manual_cleanup
- * osc_disconnect
- * del_shrink_grant
- * ptlrpc_connect_interrupt
- * init_grant_shrink
- * add this client to shrink list
- * cleanup_osc
- * Bang! The pinger triggers the shrink.
- * So the osc should be disconnected from the shrink list only after
- * we are sure the import has been destroyed. 
BUG18662 - */ - if (!obd->u.cli.cl_import) - osc_del_shrink_grant(&obd->u.cli); - return rc; -} - -static int osc_ldlm_resource_invalidate(struct cfs_hash *hs, - struct cfs_hash_bd *bd, - struct hlist_node *hnode, void *arg) -{ - struct ldlm_resource *res = cfs_hash_object(hs, hnode); - struct osc_object *osc = NULL; - struct lu_env *env = arg; - struct ldlm_lock *lock; - - lock_res(res); - list_for_each_entry(lock, &res->lr_granted, l_res_link) { - if (lock->l_ast_data && !osc) { - osc = lock->l_ast_data; - cl_object_get(osc2cl(osc)); - } - - /* - * clear LDLM_FL_CLEANED flag to make sure it will be canceled - * by the 2nd round of ldlm_namespace_clean() call in - * osc_import_event(). - */ - ldlm_clear_cleaned(lock); - } - unlock_res(res); - - if (osc) { - osc_object_invalidate(env, osc); - cl_object_put(env, osc2cl(osc)); - } - - return 0; -} - -static int osc_import_event(struct obd_device *obd, - struct obd_import *imp, - enum obd_import_event event) -{ - struct client_obd *cli; - int rc = 0; - - LASSERT(imp->imp_obd == obd); - - switch (event) { - case IMP_EVENT_DISCON: { - cli = &obd->u.cli; - spin_lock(&cli->cl_loi_list_lock); - cli->cl_avail_grant = 0; - cli->cl_lost_grant = 0; - spin_unlock(&cli->cl_loi_list_lock); - break; - } - case IMP_EVENT_INACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_INACTIVE, NULL); - break; - } - case IMP_EVENT_INVALIDATE: { - struct ldlm_namespace *ns = obd->obd_namespace; - struct lu_env *env; - u16 refcheck; - - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - - env = cl_env_get(&refcheck); - if (!IS_ERR(env)) { - osc_io_unplug(env, &obd->u.cli, NULL); - - cfs_hash_for_each_nolock(ns->ns_rs_hash, - osc_ldlm_resource_invalidate, - env, 0); - cl_env_put(env, &refcheck); - - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - } else { - rc = PTR_ERR(env); - } - break; - } - case IMP_EVENT_ACTIVE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); - break; - } - case IMP_EVENT_OCD: { - struct obd_connect_data *ocd = &imp->imp_connect_data; - - if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT) - osc_init_grant(&obd->u.cli, ocd); - - /* See bug 7198 */ - if (ocd->ocd_connect_flags & OBD_CONNECT_REQPORTAL) - imp->imp_client->cli_request_portal = OST_REQUEST_PORTAL; - - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_OCD, NULL); - break; - } - case IMP_EVENT_DEACTIVATE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_DEACTIVATE, NULL); - break; - } - case IMP_EVENT_ACTIVATE: { - rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVATE, NULL); - break; - } - default: - CERROR("Unknown import event %d\n", event); - LBUG(); - } - return rc; -} - -/** - * Determine whether the lock can be canceled before replaying the lock - * during recovery, see bug16774 for detailed information. - * - * \retval zero the lock can't be canceled - * \retval other ok to cancel - */ -static int osc_cancel_weight(struct ldlm_lock *lock) -{ - /* - * Cancel all unused and granted extent lock. 
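- * That is, granted LDLM_EXTENT locks whose granted mode matches the
- * requested one and which osc_ldlm_weigh_ast() reports as covering
- * no cached pages.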
- */ - if (lock->l_resource->lr_type == LDLM_EXTENT && - lock->l_granted_mode == lock->l_req_mode && - osc_ldlm_weigh_ast(lock) == 0) - return 1; - - return 0; -} - -static int brw_queue_work(const struct lu_env *env, void *data) -{ - struct client_obd *cli = data; - - CDEBUG(D_CACHE, "Run writeback work for client obd %p.\n", cli); - - osc_io_unplug(env, cli, NULL); - return 0; -} - -int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lprocfs_static_vars lvars = { NULL }; - struct client_obd *cli = &obd->u.cli; - void *handler; - int rc; - int adding; - int added; - int req_count; - - rc = ptlrpcd_addref(); - if (rc) - return rc; - - rc = client_obd_setup(obd, lcfg); - if (rc) - goto out_ptlrpcd; - - handler = ptlrpcd_alloc_work(cli->cl_import, brw_queue_work, cli); - if (IS_ERR(handler)) { - rc = PTR_ERR(handler); - goto out_client_setup; - } - cli->cl_writeback_work = handler; - - handler = ptlrpcd_alloc_work(cli->cl_import, lru_queue_work, cli); - if (IS_ERR(handler)) { - rc = PTR_ERR(handler); - goto out_ptlrpcd_work; - } - - cli->cl_lru_work = handler; - - rc = osc_quota_setup(obd); - if (rc) - goto out_ptlrpcd_work; - - cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; - lprocfs_osc_init_vars(&lvars); - if (lprocfs_obd_setup(obd, lvars.obd_vars, lvars.sysfs_vars) == 0) { - lproc_osc_attach_seqstat(obd); - sptlrpc_lprocfs_cliobd_attach(obd); - ptlrpc_lprocfs_register_obd(obd); - } - - /* - * We try to control the total number of requests with a upper limit - * osc_reqpool_maxreqcount. There might be some race which will cause - * over-limit allocation, but it is fine. - */ - req_count = atomic_read(&osc_pool_req_count); - if (req_count < osc_reqpool_maxreqcount) { - adding = cli->cl_max_rpcs_in_flight + 2; - if (req_count + adding > osc_reqpool_maxreqcount) - adding = osc_reqpool_maxreqcount - req_count; - - added = ptlrpc_add_rqs_to_pool(osc_rq_pool, adding); - atomic_add(added, &osc_pool_req_count); - } - - INIT_LIST_HEAD(&cli->cl_grant_shrink_list); - ns_register_cancel(obd->obd_namespace, osc_cancel_weight); - - spin_lock(&osc_shrink_lock); - list_add_tail(&cli->cl_shrink_list, &osc_shrink_list); - spin_unlock(&osc_shrink_lock); - - return rc; - -out_ptlrpcd_work: - if (cli->cl_writeback_work) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - if (cli->cl_lru_work) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; - } -out_client_setup: - client_obd_cleanup(obd); -out_ptlrpcd: - ptlrpcd_decref(); - return rc; -} - -static int osc_precleanup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - - /* LU-464 - * for echo client, export may be on zombie list, wait for - * zombie thread to cull it, because cli.cl_import will be - * cleared in client_disconnect_export(): - * class_export_destroy() -> obd_cleanup() -> - * echo_device_free() -> echo_client_cleanup() -> - * obd_disconnect() -> osc_disconnect() -> - * client_disconnect_export() - */ - obd_zombie_barrier(); - if (cli->cl_writeback_work) { - ptlrpcd_destroy_work(cli->cl_writeback_work); - cli->cl_writeback_work = NULL; - } - - if (cli->cl_lru_work) { - ptlrpcd_destroy_work(cli->cl_lru_work); - cli->cl_lru_work = NULL; - } - - obd_cleanup_client_import(obd); - ptlrpc_lprocfs_unregister_obd(obd); - lprocfs_obd_cleanup(obd); - return 0; -} - -static int osc_cleanup(struct obd_device *obd) -{ - struct client_obd *cli = &obd->u.cli; - int rc; - - spin_lock(&osc_shrink_lock); - list_del(&cli->cl_shrink_list); - 
spin_unlock(&osc_shrink_lock); - - /* lru cleanup */ - if (cli->cl_cache) { - LASSERT(atomic_read(&cli->cl_cache->ccc_users) > 0); - spin_lock(&cli->cl_cache->ccc_lru_lock); - list_del_init(&cli->cl_lru_osc); - spin_unlock(&cli->cl_cache->ccc_lru_lock); - cli->cl_lru_left = NULL; - cl_cache_decref(cli->cl_cache); - cli->cl_cache = NULL; - } - - /* free memory of osc quota cache */ - osc_quota_cleanup(obd); - - rc = client_obd_cleanup(obd); - - ptlrpcd_decref(); - return rc; -} - -int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) -{ - struct lprocfs_static_vars lvars = { NULL }; - int rc = 0; - - lprocfs_osc_init_vars(&lvars); - - switch (lcfg->lcfg_command) { - default: - rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, - lcfg, obd); - if (rc > 0) - rc = 0; - break; - } - - return rc; -} - -static int osc_process_config(struct obd_device *obd, u32 len, void *buf) -{ - return osc_process_config_base(obd, buf); -} - -static struct obd_ops osc_obd_ops = { - .owner = THIS_MODULE, - .setup = osc_setup, - .precleanup = osc_precleanup, - .cleanup = osc_cleanup, - .add_conn = client_import_add_conn, - .del_conn = client_import_del_conn, - .connect = client_connect_import, - .reconnect = osc_reconnect, - .disconnect = osc_disconnect, - .statfs = osc_statfs, - .statfs_async = osc_statfs_async, - .create = osc_create, - .destroy = osc_destroy, - .getattr = osc_getattr, - .setattr = osc_setattr, - .iocontrol = osc_iocontrol, - .set_info_async = osc_set_info_async, - .import_event = osc_import_event, - .process_config = osc_process_config, - .quotactl = osc_quotactl, -}; - -struct list_head osc_shrink_list = LIST_HEAD_INIT(osc_shrink_list); -DEFINE_SPINLOCK(osc_shrink_lock); - -static struct shrinker osc_cache_shrinker = { - .count_objects = osc_cache_shrink_count, - .scan_objects = osc_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - -static int __init osc_init(void) -{ - struct lprocfs_static_vars lvars = { NULL }; - unsigned int reqpool_size; - unsigned int reqsize; - int rc; - - /* print an address of _any_ initialized kernel symbol from this - * module, to allow debugging with gdb that doesn't support data - * symbols from modules. - */ - CDEBUG(D_INFO, "Lustre OSC module (%p).\n", &osc_caches); - - rc = libcfs_setup(); - if (rc) - return rc; - - rc = lu_kmem_init(osc_caches); - if (rc) - return rc; - - lprocfs_osc_init_vars(&lvars); - - rc = register_shrinker(&osc_cache_shrinker); - if (rc) - goto err; - - /* This is obviously too much memory, only prevent overflow here */ - if (osc_reqpool_mem_max >= 1 << 12 || osc_reqpool_mem_max == 0) { - rc = -EINVAL; - goto err; - } - - reqpool_size = osc_reqpool_mem_max << 20; - - reqsize = 1; - while (reqsize < OST_MAXREQSIZE) - reqsize = reqsize << 1; - - /* - * We don't enlarge the request count in OSC pool according to - * cl_max_rpcs_in_flight. The allocation from the pool will only be - * tried after normal allocation failed. So a small OSC pool won't - * cause much performance degression in most of cases. 
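The arithmetic behind the request-pool cap above, restated as a standalone sketch. It assumes, as the surrounding osc_init() code does, that osc_reqpool_mem_max is a megabyte count (hence the "<< 20") and that the per-request size is rounded up to the next power of two before dividing; example_reqpool_maxcount() is a hypothetical name, not part of the original file.

	static unsigned int example_reqpool_maxcount(unsigned int mem_max_mb,
						     unsigned int max_reqsize)
	{
		unsigned int pool_bytes = mem_max_mb << 20; /* MiB -> bytes */
		unsigned int reqsize = 1;

		while (reqsize < max_reqsize)	/* next power of two */
			reqsize <<= 1;

		return pool_bytes / reqsize;	/* cap on pooled requests */
	}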
- */ - osc_reqpool_maxreqcount = reqpool_size / reqsize; - - atomic_set(&osc_pool_req_count, 0); - osc_rq_pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE, - ptlrpc_add_rqs_to_pool); - - rc = -ENOMEM; - - if (!osc_rq_pool) - goto err; - - rc = class_register_type(&osc_obd_ops, NULL, - LUSTRE_OSC_NAME, &osc_device_type); - if (rc) - goto err; - - return rc; - -err: - if (osc_rq_pool) - ptlrpc_free_rq_pool(osc_rq_pool); - unregister_shrinker(&osc_cache_shrinker); - lu_kmem_fini(osc_caches); - return rc; -} - -static void /*__exit*/ osc_exit(void) -{ - unregister_shrinker(&osc_cache_shrinker); - class_unregister_type(LUSTRE_OSC_NAME); - lu_kmem_fini(osc_caches); - ptlrpc_free_rq_pool(osc_rq_pool); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Object Storage Client (OSC)"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(LUSTRE_VERSION_STRING); - -module_init(osc_init); -module_exit(osc_exit); diff --git a/drivers/staging/lustre/lustre/ptlrpc/Makefile b/drivers/staging/lustre/lustre/ptlrpc/Makefile deleted file mode 100644 index 1deb1971b39ee5bede8718a1061ec3ed5edfb773..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/include -subdir-ccflags-y += -I$(srctree)/drivers/staging/lustre/lustre/include - -obj-$(CONFIG_LUSTRE_FS) += ptlrpc.o -LDLM := ../../lustre/ldlm/ - -ldlm_objs := $(LDLM)l_lock.o $(LDLM)ldlm_lock.o -ldlm_objs += $(LDLM)ldlm_resource.o $(LDLM)ldlm_lib.o -ldlm_objs += $(LDLM)ldlm_plain.o $(LDLM)ldlm_extent.o -ldlm_objs += $(LDLM)ldlm_request.o $(LDLM)ldlm_lockd.o -ldlm_objs += $(LDLM)ldlm_flock.o $(LDLM)ldlm_inodebits.o -ldlm_objs += $(LDLM)ldlm_pool.o -ldlm_objs += $(LDLM)interval_tree.o -ptlrpc_objs := client.o recover.o connection.o niobuf.o pack_generic.o -ptlrpc_objs += events.o ptlrpc_module.o service.o pinger.o -ptlrpc_objs += llog_net.o llog_client.o import.o ptlrpcd.o -ptlrpc_objs += pers.o lproc_ptlrpc.o wiretest.o layout.o -ptlrpc_objs += sec.o sec_bulk.o sec_gc.o sec_config.o -ptlrpc_objs += sec_null.o sec_plain.o nrs.o nrs_fifo.o - -ptlrpc-y := $(ldlm_objs) $(ptlrpc_objs) sec_lproc.o -ptlrpc-$(CONFIG_LUSTRE_TRANSLATE_ERRNOS) += errno.o diff --git a/drivers/staging/lustre/lustre/ptlrpc/client.c b/drivers/staging/lustre/lustre/ptlrpc/client.c deleted file mode 100644 index c1b82bf20f0894d2e69fe96b0152aabf98cee21c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/client.c +++ /dev/null @@ -1,3271 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. 
- * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -/** Implementation of client-side PortalRPC interfaces */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_pin_ops = { - .add_kiov_frag = ptlrpc_prep_bulk_page_pin, - .release_frags = ptlrpc_release_bulk_page_pin, -}; -EXPORT_SYMBOL(ptlrpc_bulk_kiov_pin_ops); - -const struct ptlrpc_bulk_frag_ops ptlrpc_bulk_kiov_nopin_ops = { - .add_kiov_frag = ptlrpc_prep_bulk_page_nopin, - .release_frags = NULL, -}; -EXPORT_SYMBOL(ptlrpc_bulk_kiov_nopin_ops); - -static int ptlrpc_send_new_req(struct ptlrpc_request *req); -static int ptlrpcd_check_work(struct ptlrpc_request *req); -static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async); - -/** - * Initialize passed in client structure \a cl. - */ -void ptlrpc_init_client(int req_portal, int rep_portal, char *name, - struct ptlrpc_client *cl) -{ - cl->cli_request_portal = req_portal; - cl->cli_reply_portal = rep_portal; - cl->cli_name = name; -} -EXPORT_SYMBOL(ptlrpc_init_client); - -/** - * Return PortalRPC connection for remote uud \a uuid - */ -struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid) -{ - struct ptlrpc_connection *c; - lnet_nid_t self; - struct lnet_process_id peer; - int err; - - /* - * ptlrpc_uuid_to_peer() initializes its 2nd parameter - * before accessing its values. - * coverity[uninit_use_in_call] - */ - err = ptlrpc_uuid_to_peer(uuid, &peer, &self); - if (err != 0) { - CNETERR("cannot find peer %s!\n", uuid->uuid); - return NULL; - } - - c = ptlrpc_connection_get(peer, self, uuid); - if (c) { - memcpy(c->c_remote_uuid.uuid, - uuid->uuid, sizeof(c->c_remote_uuid.uuid)); - } - - CDEBUG(D_INFO, "%s -> %p\n", uuid->uuid, c); - - return c; -} - -/** - * Allocate and initialize new bulk descriptor on the sender. - * Returns pointer to the descriptor or NULL on error. - */ -struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags, - unsigned int max_brw, - enum ptlrpc_bulk_op_type type, - unsigned int portal, - const struct ptlrpc_bulk_frag_ops *ops) -{ - struct ptlrpc_bulk_desc *desc; - int i; - - /* ensure that only one of KIOV or IOVEC is set but not both */ - LASSERT((ptlrpc_is_bulk_desc_kiov(type) && ops->add_kiov_frag) || - (ptlrpc_is_bulk_desc_kvec(type) && ops->add_iov_frag)); - - desc = kzalloc(sizeof(*desc), GFP_NOFS); - if (!desc) - return NULL; - - if (type & PTLRPC_BULK_BUF_KIOV) { - GET_KIOV(desc) = kcalloc(nfrags, sizeof(*GET_KIOV(desc)), - GFP_NOFS); - if (!GET_KIOV(desc)) - goto free_desc; - } else { - GET_KVEC(desc) = kcalloc(nfrags, sizeof(*GET_KVEC(desc)), - GFP_NOFS); - if (!GET_KVEC(desc)) - goto free_desc; - } - - spin_lock_init(&desc->bd_lock); - init_waitqueue_head(&desc->bd_waitq); - desc->bd_max_iov = nfrags; - desc->bd_iov_count = 0; - desc->bd_portal = portal; - desc->bd_type = type; - desc->bd_md_count = 0; - desc->bd_frag_ops = (struct ptlrpc_bulk_frag_ops *)ops; - LASSERT(max_brw > 0); - desc->bd_md_max_brw = min(max_brw, PTLRPC_BULK_OPS_COUNT); - /* - * PTLRPC_BULK_OPS_COUNT is the compile-time transfer limit for this - * node. Negotiated ocd_brw_size will always be <= this number. 
- */ - for (i = 0; i < PTLRPC_BULK_OPS_COUNT; i++) - LNetInvalidateMDHandle(&desc->bd_mds[i]); - - return desc; -free_desc: - kfree(desc); - return NULL; -} - -/** - * Prepare bulk descriptor for specified outgoing request \a req that - * can fit \a nfrags * pages. \a type is bulk type. \a portal is where - * the bulk to be sent. Used on client-side. - * Returns pointer to newly allocated initialized bulk descriptor or NULL on - * error. - */ -struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, - unsigned int nfrags, - unsigned int max_brw, - unsigned int type, - unsigned int portal, - const struct ptlrpc_bulk_frag_ops *ops) -{ - struct obd_import *imp = req->rq_import; - struct ptlrpc_bulk_desc *desc; - - LASSERT(ptlrpc_is_bulk_op_passive(type)); - - desc = ptlrpc_new_bulk(nfrags, max_brw, type, portal, ops); - if (!desc) - return NULL; - - desc->bd_import_generation = req->rq_import_generation; - desc->bd_import = class_import_get(imp); - desc->bd_req = req; - - desc->bd_cbid.cbid_fn = client_bulk_callback; - desc->bd_cbid.cbid_arg = desc; - - /* This makes req own desc, and free it when she frees herself */ - req->rq_bulk = desc; - - return desc; -} -EXPORT_SYMBOL(ptlrpc_prep_bulk_imp); - -void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, - struct page *page, int pageoffset, int len, int pin) -{ - struct bio_vec *kiov; - - LASSERT(desc->bd_iov_count < desc->bd_max_iov); - LASSERT(page); - LASSERT(pageoffset >= 0); - LASSERT(len > 0); - LASSERT(pageoffset + len <= PAGE_SIZE); - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - - kiov = &BD_GET_KIOV(desc, desc->bd_iov_count); - - desc->bd_nob += len; - - if (pin) - get_page(page); - - kiov->bv_page = page; - kiov->bv_offset = pageoffset; - kiov->bv_len = len; - - desc->bd_iov_count++; -} -EXPORT_SYMBOL(__ptlrpc_prep_bulk_page); - -int ptlrpc_prep_bulk_frag(struct ptlrpc_bulk_desc *desc, - void *frag, int len) -{ - struct kvec *iovec; - - LASSERT(desc->bd_iov_count < desc->bd_max_iov); - LASSERT(frag); - LASSERT(len > 0); - LASSERT(ptlrpc_is_bulk_desc_kvec(desc->bd_type)); - - iovec = &BD_GET_KVEC(desc, desc->bd_iov_count); - - desc->bd_nob += len; - - iovec->iov_base = frag; - iovec->iov_len = len; - - desc->bd_iov_count++; - - return desc->bd_nob; -} -EXPORT_SYMBOL(ptlrpc_prep_bulk_frag); - -void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) -{ - LASSERT(desc->bd_iov_count != LI_POISON); /* not freed already */ - LASSERT(desc->bd_md_count == 0); /* network hands off */ - LASSERT((desc->bd_export != NULL) ^ (desc->bd_import != NULL)); - LASSERT(desc->bd_frag_ops); - - if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) - sptlrpc_enc_pool_put_pages(desc); - - if (desc->bd_export) - class_export_put(desc->bd_export); - else - class_import_put(desc->bd_import); - - if (desc->bd_frag_ops->release_frags) - desc->bd_frag_ops->release_frags(desc); - - if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) - kfree(GET_KIOV(desc)); - else - kfree(GET_KVEC(desc)); - - kfree(desc); -} -EXPORT_SYMBOL(ptlrpc_free_bulk); - -/** - * Set server timelimit for this req, i.e. how long are we willing to wait - * for reply before timing out this request. 
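A hypothetical client-side usage sketch for the bulk helpers above: allocate a sink descriptor for a read, then attach pinned pages through the frag ops. The flag and portal values (PTLRPC_BULK_PUT_SINK, PTLRPC_BULK_BUF_KIOV, OST_BULK_PORTAL) are the ones OSC reads pair with ptlrpc_bulk_kiov_pin_ops elsewhere in the tree; error handling is elided and the helper name is invented.

	static int example_prep_read_bulk(struct ptlrpc_request *req,
					  struct page **pages, int npages)
	{
		struct ptlrpc_bulk_desc *desc;
		int i;

		desc = ptlrpc_prep_bulk_imp(req, npages, 1,
					    PTLRPC_BULK_PUT_SINK |
					    PTLRPC_BULK_BUF_KIOV,
					    OST_BULK_PORTAL,
					    &ptlrpc_bulk_kiov_pin_ops);
		if (!desc)
			return -ENOMEM;

		for (i = 0; i < npages; i++)
			desc->bd_frag_ops->add_kiov_frag(desc, pages[i],
							 0, PAGE_SIZE);

		/* req now owns desc and frees it with the request */
		return 0;
	}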
- */ -void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) -{ - __u32 serv_est; - int idx; - struct imp_at *at; - - LASSERT(req->rq_import); - - if (AT_OFF) { - /* - * non-AT settings - * - * \a imp_server_timeout means this is reverse import and - * we send (currently only) ASTs to the client and cannot afford - * to wait too long for the reply, otherwise the other client - * (because of which we are sending this request) would - * timeout waiting for us - */ - req->rq_timeout = req->rq_import->imp_server_timeout ? - obd_timeout / 2 : obd_timeout; - } else { - at = &req->rq_import->imp_at; - idx = import_at_get_index(req->rq_import, - req->rq_request_portal); - serv_est = at_get(&at->iat_service_estimate[idx]); - req->rq_timeout = at_est2timeout(serv_est); - } - /* - * We could get even fancier here, using history to predict increased - * loading... - */ - - /* - * Let the server know what this RPC timeout is by putting it in the - * reqmsg - */ - lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); -} -EXPORT_SYMBOL(ptlrpc_at_set_req_timeout); - -/* Adjust max service estimate based on server value */ -static void ptlrpc_at_adj_service(struct ptlrpc_request *req, - unsigned int serv_est) -{ - int idx; - unsigned int oldse; - struct imp_at *at; - - LASSERT(req->rq_import); - at = &req->rq_import->imp_at; - - idx = import_at_get_index(req->rq_import, req->rq_request_portal); - /* - * max service estimates are tracked on the server side, - * so just keep minimal history here - */ - oldse = at_measured(&at->iat_service_estimate[idx], serv_est); - if (oldse != 0) - CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d has changed from %d to %d\n", - req->rq_import->imp_obd->obd_name, req->rq_request_portal, - oldse, at_get(&at->iat_service_estimate[idx])); -} - -/* Expected network latency per remote node (secs) */ -int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) -{ - return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); -} - -/* Adjust expected network latency */ -void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time) -{ - unsigned int nl, oldnl; - struct imp_at *at; - time64_t now = ktime_get_real_seconds(); - - LASSERT(req->rq_import); - - if (service_time > now - req->rq_sent + 3) { - /* - * bz16408, however, this can also happen if early reply - * is lost and client RPC is expired and resent, early reply - * or reply of original RPC can still be fit in reply buffer - * of resent RPC, now client is measuring time from the - * resent time, but server sent back service time of original - * RPC. - */ - CDEBUG((lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) ? 
- D_ADAPTTO : D_WARNING, - "Reported service time %u > total measured time %lld\n", - service_time, now - req->rq_sent); - return; - } - - /* Network latency is total time less server processing time */ - nl = max_t(int, now - req->rq_sent - - service_time, 0) + 1; /* st rounding */ - at = &req->rq_import->imp_at; - - oldnl = at_measured(&at->iat_net_latency, nl); - if (oldnl != 0) - CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s) has changed from %d to %d\n", - req->rq_import->imp_obd->obd_name, - obd_uuid2str( - &req->rq_import->imp_connection->c_remote_uuid), - oldnl, at_get(&at->iat_net_latency)); -} - -static int unpack_reply(struct ptlrpc_request *req) -{ - int rc; - - if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { - rc = ptlrpc_unpack_rep_msg(req, req->rq_replen); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); - return -EPROTO; - } - } - - rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); - return -EPROTO; - } - return 0; -} - -/** - * Handle an early reply message, called with the rq_lock held. - * If anything goes wrong just ignore it - same as if it never happened - */ -static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) - __must_hold(&req->rq_lock) -{ - struct ptlrpc_request *early_req; - time64_t olddl; - int rc; - - req->rq_early = 0; - spin_unlock(&req->rq_lock); - - rc = sptlrpc_cli_unwrap_early_reply(req, &early_req); - if (rc) { - spin_lock(&req->rq_lock); - return rc; - } - - rc = unpack_reply(early_req); - if (rc) { - sptlrpc_cli_finish_early_reply(early_req); - spin_lock(&req->rq_lock); - return rc; - } - - /* - * Use new timeout value just to adjust the local value for this - * request, don't include it into at_history. It is unclear yet why - * service time increased and should it be counted or skipped, e.g. - * that can be recovery case or some error or server, the real reply - * will add all new data if it is worth to add. - */ - req->rq_timeout = lustre_msg_get_timeout(early_req->rq_repmsg); - lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); - - /* Network latency can be adjusted, it is pure network delays */ - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(early_req->rq_repmsg)); - - sptlrpc_cli_finish_early_reply(early_req); - - spin_lock(&req->rq_lock); - olddl = req->rq_deadline; - /* - * server assumes it now has rq_timeout from when the request - * arrived, so the client should give it at least that long. - * since we don't know the arrival time we'll use the original - * sent time - */ - req->rq_deadline = req->rq_sent + req->rq_timeout + - ptlrpc_at_get_net_latency(req); - - DEBUG_REQ(D_ADAPTTO, req, - "Early reply #%d, new deadline in %lds (%lds)", - req->rq_early_count, - (long)(req->rq_deadline - ktime_get_real_seconds()), - (long)(req->rq_deadline - olddl)); - - return rc; -} - -static struct kmem_cache *request_cache; - -int ptlrpc_request_cache_init(void) -{ - request_cache = kmem_cache_create("ptlrpc_cache", - sizeof(struct ptlrpc_request), - 0, SLAB_HWCACHE_ALIGN, NULL); - return !request_cache ? 
-ENOMEM : 0; -} - -void ptlrpc_request_cache_fini(void) -{ - kmem_cache_destroy(request_cache); -} - -struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags) -{ - struct ptlrpc_request *req; - - req = kmem_cache_zalloc(request_cache, flags); - return req; -} - -void ptlrpc_request_cache_free(struct ptlrpc_request *req) -{ - kmem_cache_free(request_cache, req); -} - -/** - * Wind down request pool \a pool. - * Frees all requests from the pool too - */ -void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) -{ - struct ptlrpc_request *req; - - while ((req = list_first_entry_or_null(&pool->prp_req_list, - struct ptlrpc_request, rq_list))) { - list_del(&req->rq_list); - LASSERT(req->rq_reqbuf); - LASSERT(req->rq_reqbuf_len == pool->prp_rq_size); - kvfree(req->rq_reqbuf); - ptlrpc_request_cache_free(req); - } - kfree(pool); -} -EXPORT_SYMBOL(ptlrpc_free_rq_pool); - -/** - * Allocates, initializes and adds \a num_rq requests to the pool \a pool - */ -int ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) -{ - int i; - int size = 1; - - while (size < pool->prp_rq_size) - size <<= 1; - - LASSERTF(list_empty(&pool->prp_req_list) || - size == pool->prp_rq_size, - "Trying to change pool size with nonempty pool from %d to %d bytes\n", - pool->prp_rq_size, size); - - spin_lock(&pool->prp_lock); - pool->prp_rq_size = size; - for (i = 0; i < num_rq; i++) { - struct ptlrpc_request *req; - struct lustre_msg *msg; - - spin_unlock(&pool->prp_lock); - req = ptlrpc_request_cache_alloc(GFP_KERNEL); - if (!req) - return i; - msg = kvzalloc(size, GFP_KERNEL); - if (!msg) { - ptlrpc_request_cache_free(req); - return i; - } - req->rq_reqbuf = msg; - req->rq_reqbuf_len = size; - req->rq_pool = pool; - spin_lock(&pool->prp_lock); - list_add_tail(&req->rq_list, &pool->prp_req_list); - } - spin_unlock(&pool->prp_lock); - return num_rq; -} -EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool); - -/** - * Create and initialize new request pool with given attributes: - * \a num_rq - initial number of requests to create for the pool - * \a msgsize - maximum message size possible for requests in thid pool - * \a populate_pool - function to be called when more requests need to be added - * to the pool - * Returns pointer to newly created pool or NULL on error. - */ -struct ptlrpc_request_pool * -ptlrpc_init_rq_pool(int num_rq, int msgsize, - int (*populate_pool)(struct ptlrpc_request_pool *, int)) -{ - struct ptlrpc_request_pool *pool; - - pool = kzalloc(sizeof(struct ptlrpc_request_pool), GFP_NOFS); - if (!pool) - return NULL; - - /* - * Request next power of two for the allocation, because internally - * kernel would do exactly this - */ - - spin_lock_init(&pool->prp_lock); - INIT_LIST_HEAD(&pool->prp_req_list); - pool->prp_rq_size = msgsize + SPTLRPC_MAX_PAYLOAD; - pool->prp_populate = populate_pool; - - populate_pool(pool, num_rq); - - return pool; -} -EXPORT_SYMBOL(ptlrpc_init_rq_pool); - -/** - * Fetches one request from pool \a pool - */ -static struct ptlrpc_request * -ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) -{ - struct ptlrpc_request *request; - struct lustre_msg *reqbuf; - - if (!pool) - return NULL; - - spin_lock(&pool->prp_lock); - - /* - * See if we have anything in a pool, and bail out if nothing, - * in writeout path, where this matters, this is safe to do, because - * nothing is lost in this case, and when some in-flight requests - * complete, this code will be called again. 
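A hedged sketch of the pool lifecycle defined above, mirroring how osc_init() wires ptlrpc_add_rqs_to_pool() in as the populate callback; the pre-population count of 8 is arbitrary and example_make_pool() is not a real function.

	static struct ptlrpc_request_pool *example_make_pool(void)
	{
		struct ptlrpc_request_pool *pool;

		/* msgsize is padded with SPTLRPC_MAX_PAYLOAD internally */
		pool = ptlrpc_init_rq_pool(0, OST_MAXREQSIZE,
					   ptlrpc_add_rqs_to_pool);
		if (pool)
			ptlrpc_add_rqs_to_pool(pool, 8); /* pre-populate */
		return pool;
	}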
- */ - if (unlikely(list_empty(&pool->prp_req_list))) { - spin_unlock(&pool->prp_lock); - return NULL; - } - - request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, - rq_list); - list_del_init(&request->rq_list); - spin_unlock(&pool->prp_lock); - - LASSERT(request->rq_reqbuf); - LASSERT(request->rq_pool); - - reqbuf = request->rq_reqbuf; - memset(request, 0, sizeof(*request)); - request->rq_reqbuf = reqbuf; - request->rq_reqbuf_len = pool->prp_rq_size; - request->rq_pool = pool; - - return request; -} - -/** - * Returns freed \a request to pool. - */ -static void __ptlrpc_free_req_to_pool(struct ptlrpc_request *request) -{ - struct ptlrpc_request_pool *pool = request->rq_pool; - - spin_lock(&pool->prp_lock); - LASSERT(list_empty(&request->rq_list)); - LASSERT(!request->rq_receiving_reply); - list_add_tail(&request->rq_list, &pool->prp_req_list); - spin_unlock(&pool->prp_lock); -} - -void ptlrpc_add_unreplied(struct ptlrpc_request *req) -{ - struct obd_import *imp = req->rq_import; - struct ptlrpc_request *iter; - - assert_spin_locked(&imp->imp_lock); - LASSERT(list_empty(&req->rq_unreplied_list)); - - /* unreplied list is sorted by xid in ascending order */ - list_for_each_entry_reverse(iter, &imp->imp_unreplied_list, rq_unreplied_list) { - - LASSERT(req->rq_xid != iter->rq_xid); - if (req->rq_xid < iter->rq_xid) - continue; - list_add(&req->rq_unreplied_list, &iter->rq_unreplied_list); - return; - } - list_add(&req->rq_unreplied_list, &imp->imp_unreplied_list); -} - -void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req) -{ - req->rq_xid = ptlrpc_next_xid(); - ptlrpc_add_unreplied(req); -} - -static inline void ptlrpc_assign_next_xid(struct ptlrpc_request *req) -{ - spin_lock(&req->rq_import->imp_lock); - ptlrpc_assign_next_xid_nolock(req); - spin_unlock(&req->rq_import->imp_lock); -} - -int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, - __u32 version, int opcode, char **bufs, - struct ptlrpc_cli_ctx *ctx) -{ - int count; - struct obd_import *imp; - __u32 *lengths; - int rc; - - count = req_capsule_filled_sizes(&request->rq_pill, RCL_CLIENT); - imp = request->rq_import; - lengths = request->rq_pill.rc_area[RCL_CLIENT]; - - if (unlikely(ctx)) { - request->rq_cli_ctx = sptlrpc_cli_ctx_get(ctx); - } else { - rc = sptlrpc_req_get_ctx(request); - if (rc) - goto out_free; - } - sptlrpc_req_set_flavor(request, opcode); - - rc = lustre_pack_request(request, imp->imp_msg_magic, count, - lengths, bufs); - if (rc) - goto out_ctx; - - lustre_msg_add_version(request->rq_reqmsg, version); - request->rq_send_state = LUSTRE_IMP_FULL; - request->rq_type = PTL_RPC_MSG_REQUEST; - - request->rq_req_cbid.cbid_fn = request_out_callback; - request->rq_req_cbid.cbid_arg = request; - - request->rq_reply_cbid.cbid_fn = reply_in_callback; - request->rq_reply_cbid.cbid_arg = request; - - request->rq_reply_deadline = 0; - request->rq_bulk_deadline = 0; - request->rq_req_deadline = 0; - request->rq_phase = RQ_PHASE_NEW; - request->rq_next_phase = RQ_PHASE_UNDEFINED; - - request->rq_request_portal = imp->imp_client->cli_request_portal; - request->rq_reply_portal = imp->imp_client->cli_reply_portal; - - ptlrpc_at_set_req_timeout(request); - - lustre_msg_set_opc(request->rq_reqmsg, opcode); - ptlrpc_assign_next_xid(request); - - /* Let's setup deadline for req/reply/bulk unlink for opcode. 
*/ - if (cfs_fail_val == opcode) { - time64_t *fail_t = NULL, *fail2_t = NULL; - - if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { - fail_t = &request->rq_bulk_deadline; - } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { - fail_t = &request->rq_reply_deadline; - } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) { - fail_t = &request->rq_req_deadline; - } else if (CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BOTH_UNLINK)) { - fail_t = &request->rq_reply_deadline; - fail2_t = &request->rq_bulk_deadline; - } - - if (fail_t) { - *fail_t = ktime_get_real_seconds() + LONG_UNLINK; - - if (fail2_t) - *fail2_t = ktime_get_real_seconds() + - LONG_UNLINK; - - /* The RPC is infected, let the test change the - * fail_loc - */ - set_current_state(TASK_UNINTERRUPTIBLE); - schedule_timeout(2 * HZ); - set_current_state(TASK_RUNNING); - } - } - - return 0; - -out_ctx: - LASSERT(!request->rq_pool); - sptlrpc_cli_ctx_put(request->rq_cli_ctx, 1); -out_free: - class_import_put(imp); - return rc; -} -EXPORT_SYMBOL(ptlrpc_request_bufs_pack); - -/** - * Pack request buffers for network transfer, performing necessary encryption - * steps if necessary. - */ -int ptlrpc_request_pack(struct ptlrpc_request *request, - __u32 version, int opcode) -{ - int rc; - - rc = ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL); - if (rc) - return rc; - - /* - * For some old 1.8 clients (< 1.8.7), they will LASSERT the size of - * ptlrpc_body sent from server equal to local ptlrpc_body size, so we - * have to send old ptlrpc_body to keep interoperability with these - * clients. - * - * Only three kinds of server->client RPCs so far: - * - LDLM_BL_CALLBACK - * - LDLM_CP_CALLBACK - * - LDLM_GL_CALLBACK - * - * XXX This should be removed whenever we drop the interoperability with - * the these old clients. - */ - if (opcode == LDLM_BL_CALLBACK || opcode == LDLM_CP_CALLBACK || - opcode == LDLM_GL_CALLBACK) - req_capsule_shrink(&request->rq_pill, &RMF_PTLRPC_BODY, - sizeof(struct ptlrpc_body_v2), RCL_CLIENT); - - return rc; -} -EXPORT_SYMBOL(ptlrpc_request_pack); - -/** - * Helper function to allocate new request on import \a imp - * and possibly using existing request from pool \a pool if provided. - * Returns allocated request structure with import field filled or - * NULL on error. - */ -static inline -struct ptlrpc_request *__ptlrpc_request_alloc(struct obd_import *imp, - struct ptlrpc_request_pool *pool) -{ - struct ptlrpc_request *request; - - request = ptlrpc_request_cache_alloc(GFP_NOFS); - - if (!request && pool) - request = ptlrpc_prep_req_from_pool(pool); - - if (request) { - ptlrpc_cli_req_init(request); - - LASSERTF((unsigned long)imp > 0x1000, "%p", imp); - LASSERT(imp != LP_POISON); - LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p\n", - imp->imp_client); - LASSERT(imp->imp_client != LP_POISON); - - request->rq_import = class_import_get(imp); - } else { - CERROR("request allocation out of memory\n"); - } - - return request; -} - -/** - * Helper function for creating a request. - * Calls __ptlrpc_request_alloc to allocate new request structure and inits - * buffer structures according to capsule template \a format. - * Returns allocated request structure pointer or NULL on error. 
- */ -static struct ptlrpc_request * -ptlrpc_request_alloc_internal(struct obd_import *imp, - struct ptlrpc_request_pool *pool, - const struct req_format *format) -{ - struct ptlrpc_request *request; - - request = __ptlrpc_request_alloc(imp, pool); - if (!request) - return NULL; - - req_capsule_init(&request->rq_pill, request, RCL_CLIENT); - req_capsule_set(&request->rq_pill, format); - return request; -} - -/** - * Allocate new request structure for import \a imp and initialize its - * buffer structure according to capsule template \a format. - */ -struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, - const struct req_format *format) -{ - return ptlrpc_request_alloc_internal(imp, NULL, format); -} -EXPORT_SYMBOL(ptlrpc_request_alloc); - -/** - * Allocate new request structure for import \a imp from pool \a pool and - * initialize its buffer structure according to capsule template \a format. - */ -struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, - struct ptlrpc_request_pool *pool, - const struct req_format *format) -{ - return ptlrpc_request_alloc_internal(imp, pool, format); -} -EXPORT_SYMBOL(ptlrpc_request_alloc_pool); - -/** - * For requests not from pool, free memory of the request structure. - * For requests obtained from a pool earlier, return request back to pool. - */ -void ptlrpc_request_free(struct ptlrpc_request *request) -{ - if (request->rq_pool) - __ptlrpc_free_req_to_pool(request); - else - ptlrpc_request_cache_free(request); -} -EXPORT_SYMBOL(ptlrpc_request_free); - -/** - * Allocate new request for operation \a opcode and immediately pack it for - * network transfer. - * Only used for simple requests like OBD_PING where the only important - * part of the request is operation itself. - * Returns allocated request or NULL on error. - */ -struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, - const struct req_format *format, - __u32 version, int opcode) -{ - struct ptlrpc_request *req = ptlrpc_request_alloc(imp, format); - int rc; - - if (req) { - rc = ptlrpc_request_pack(req, version, opcode); - if (rc) { - ptlrpc_request_free(req); - req = NULL; - } - } - return req; -} -EXPORT_SYMBOL(ptlrpc_request_alloc_pack); - -/** - * Allocate and initialize new request set structure on the current CPT. - * Returns a pointer to the newly allocated set structure or NULL on error. - */ -struct ptlrpc_request_set *ptlrpc_prep_set(void) -{ - struct ptlrpc_request_set *set; - int cpt; - - cpt = cfs_cpt_current(cfs_cpt_tab, 0); - set = kzalloc_node(sizeof(*set), GFP_NOFS, - cfs_cpt_spread_node(cfs_cpt_tab, cpt)); - if (!set) - return NULL; - atomic_set(&set->set_refcount, 1); - INIT_LIST_HEAD(&set->set_requests); - init_waitqueue_head(&set->set_waitq); - atomic_set(&set->set_new_count, 0); - atomic_set(&set->set_remaining, 0); - spin_lock_init(&set->set_new_req_lock); - INIT_LIST_HEAD(&set->set_new_requests); - INIT_LIST_HEAD(&set->set_cblist); - set->set_max_inflight = UINT_MAX; - set->set_producer = NULL; - set->set_producer_arg = NULL; - set->set_rc = 0; - - return set; -} -EXPORT_SYMBOL(ptlrpc_prep_set); - -/** - * Allocate and initialize new request set structure with flow control - * extension. This extension allows to control the number of requests in-flight - * for the whole set. A callback function to generate requests must be provided - * and the request set will keep the number of requests sent over the wire to - * @max_inflight. - * Returns a pointer to the newly allocated set structure or NULL on error. 
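For trivial RPCs the alloc-then-pack pattern above collapses to one call; a minimal sketch using the RQF_OBD_PING format and LUSTRE_OBD_VERSION constants that the pinger code pairs with OBD_PING. example_ping() is a hypothetical wrapper, not part of the original file.

	static struct ptlrpc_request *example_ping(struct obd_import *imp)
	{
		/* returns a packed request ready to send, or NULL */
		return ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING,
						 LUSTRE_OBD_VERSION, OBD_PING);
	}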
- */
-struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func,
- void *arg)
-{
- struct ptlrpc_request_set *set;
-
- set = ptlrpc_prep_set();
- if (!set)
- return NULL;
-
- set->set_max_inflight = max;
- set->set_producer = func;
- set->set_producer_arg = arg;
-
- return set;
-}
-
-/**
- * Wind down and free request set structure previously allocated with
- * ptlrpc_prep_set.
- * Ensures that all requests on the set have completed and removes
- * all requests from the request list in a set.
- * If any unsent requests happen to be on the list, pretends that they got
- * an error in flight and calls their completion handler.
- */
-void ptlrpc_set_destroy(struct ptlrpc_request_set *set)
-{
- struct ptlrpc_request *req;
- int expected_phase;
- int n = 0;
-
- /* Requests on the set should either all be completed, or all be new */
- expected_phase = (atomic_read(&set->set_remaining) == 0) ?
- RQ_PHASE_COMPLETE : RQ_PHASE_NEW;
- list_for_each_entry(req, &set->set_requests, rq_set_chain) {
- LASSERT(req->rq_phase == expected_phase);
- n++;
- }
-
- LASSERTF(atomic_read(&set->set_remaining) == 0 ||
- atomic_read(&set->set_remaining) == n, "%d / %d\n",
- atomic_read(&set->set_remaining), n);
-
- while ((req = list_first_entry_or_null(&set->set_requests,
- struct ptlrpc_request,
- rq_set_chain))) {
- list_del_init(&req->rq_set_chain);
-
- LASSERT(req->rq_phase == expected_phase);
-
- if (req->rq_phase == RQ_PHASE_NEW) {
- ptlrpc_req_interpret(NULL, req, -EBADR);
- atomic_dec(&set->set_remaining);
- }
-
- spin_lock(&req->rq_lock);
- req->rq_set = NULL;
- req->rq_invalid_rqset = 0;
- spin_unlock(&req->rq_lock);
-
- ptlrpc_req_finished(req);
- }
-
- LASSERT(atomic_read(&set->set_remaining) == 0);
-
- ptlrpc_reqset_put(set);
-}
-EXPORT_SYMBOL(ptlrpc_set_destroy);
-
-/**
- * Add a new request to the general purpose request set.
- * Assumes request reference from the caller.
- */
-void ptlrpc_set_add_req(struct ptlrpc_request_set *set,
- struct ptlrpc_request *req)
-{
- LASSERT(list_empty(&req->rq_set_chain));
-
- /* The set takes over the caller's request reference */
- list_add_tail(&req->rq_set_chain, &set->set_requests);
- req->rq_set = set;
- atomic_inc(&set->set_remaining);
- req->rq_queued_time = jiffies;
-
- if (req->rq_reqmsg)
- lustre_msg_set_jobid(req->rq_reqmsg, NULL);
-
- if (set->set_producer)
- /*
- * If the request set has a producer callback, the RPC must be
- * sent straight away
- */
- ptlrpc_send_new_req(req);
-}
-EXPORT_SYMBOL(ptlrpc_set_add_req);
-
-/**
- * Add a request to a request set with a dedicated server thread
- * and wake the thread to perform any necessary processing.
- * Currently only used for ptlrpcd.
- */
-void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
- struct ptlrpc_request *req)
-{
- struct ptlrpc_request_set *set = pc->pc_set;
- int count, i;
-
- LASSERT(!req->rq_set);
- LASSERT(test_bit(LIOD_STOP, &pc->pc_flags) == 0);
-
- spin_lock(&set->set_new_req_lock);
- /* The set takes over the caller's request reference. */
- req->rq_set = set;
- req->rq_queued_time = jiffies;
- list_add_tail(&req->rq_set_chain, &set->set_new_requests);
- count = atomic_inc_return(&set->set_new_count);
- spin_unlock(&set->set_new_req_lock);
-
- /* Only need to call wakeup once for the first entry. */
- if (count == 1) {
- wake_up(&set->set_waitq);
-
- /*
- * XXX: It may be unnecessary to wake up all the partners. But to
- * guarantee the async RPC can be processed ASAP, we have
- * no better choice. This may be fixed in the future.
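A minimal sketch of the request-set lifecycle built from the helpers above, assuming ptlrpc_set_wait() (the synchronous waiter defined further down in the original file) drives the set to completion; example_send_batch() is an invented name.

	static int example_send_batch(struct ptlrpc_request **reqs, int n)
	{
		struct ptlrpc_request_set *set;
		int i, rc;

		set = ptlrpc_prep_set();
		if (!set)
			return -ENOMEM;

		for (i = 0; i < n; i++)
			ptlrpc_set_add_req(set, reqs[i]); /* set takes refs */

		rc = ptlrpc_set_wait(set);	/* runs ptlrpc_check_set() */
		ptlrpc_set_destroy(set);
		return rc;
	}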
- */ - for (i = 0; i < pc->pc_npartners; i++) - wake_up(&pc->pc_partners[i]->pc_set->set_waitq); - } -} - -/** - * Based on the current state of the import, determine if the request - * can be sent, is an error, or should be delayed. - * - * Returns true if this request should be delayed. If false, and - * *status is set, then the request can not be sent and *status is the - * error code. If false and status is 0, then request can be sent. - * - * The imp->imp_lock must be held. - */ -static int ptlrpc_import_delay_req(struct obd_import *imp, - struct ptlrpc_request *req, int *status) -{ - int delay = 0; - - *status = 0; - - if (req->rq_ctx_init || req->rq_ctx_fini) { - /* always allow ctx init/fini rpc go through */ - } else if (imp->imp_state == LUSTRE_IMP_NEW) { - DEBUG_REQ(D_ERROR, req, "Uninitialized import."); - *status = -EIO; - } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { - /* pings may safely race with umount */ - DEBUG_REQ(lustre_msg_get_opc(req->rq_reqmsg) == OBD_PING ? - D_HA : D_ERROR, req, "IMP_CLOSED "); - *status = -EIO; - } else if (ptlrpc_send_limit_expired(req)) { - /* probably doesn't need to be a D_ERROR after initial testing */ - DEBUG_REQ(D_HA, req, "send limit expired "); - *status = -ETIMEDOUT; - } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && - imp->imp_state == LUSTRE_IMP_CONNECTING) { - /* allow CONNECT even if import is invalid */ - if (atomic_read(&imp->imp_inval_count) != 0) { - DEBUG_REQ(D_ERROR, req, "invalidate in flight"); - *status = -EIO; - } - } else if (imp->imp_invalid || imp->imp_obd->obd_no_recov) { - if (!imp->imp_deactive) - DEBUG_REQ(D_NET, req, "IMP_INVALID"); - *status = -ESHUTDOWN; /* bz 12940 */ - } else if (req->rq_import_generation != imp->imp_generation) { - DEBUG_REQ(D_ERROR, req, "req wrong generation:"); - *status = -EIO; - } else if (req->rq_send_state != imp->imp_state) { - /* invalidate in progress - any requests should be drop */ - if (atomic_read(&imp->imp_inval_count) != 0) { - DEBUG_REQ(D_ERROR, req, "invalidate in flight"); - *status = -EIO; - } else if (req->rq_no_delay) { - *status = -EWOULDBLOCK; - } else if (req->rq_allow_replay && - (imp->imp_state == LUSTRE_IMP_REPLAY || - imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS || - imp->imp_state == LUSTRE_IMP_REPLAY_WAIT || - imp->imp_state == LUSTRE_IMP_RECOVER)) { - DEBUG_REQ(D_HA, req, "allow during recovery.\n"); - } else { - delay = 1; - } - } - - return delay; -} - -/** - * Decide if the error message should be printed to the console or not. - * Makes its decision based on request type, status, and failure frequency. - * - * \param[in] req request that failed and may need a console message - * - * \retval false if no message should be printed - * \retval true if console message should be printed - */ -static bool ptlrpc_console_allow(struct ptlrpc_request *req) -{ - __u32 opc; - - LASSERT(req->rq_reqmsg); - opc = lustre_msg_get_opc(req->rq_reqmsg); - - /* Suppress particular reconnect errors which are to be expected. */ - if (opc == OST_CONNECT || opc == MDS_CONNECT || opc == MGS_CONNECT) { - int err; - - /* Suppress timed out reconnect requests */ - if (lustre_handle_is_used(&req->rq_import->imp_remote_handle) || - req->rq_timedout) - return false; - - /* - * Suppress most unavailable/again reconnect requests, but - * print occasionally so it is clear client is trying to - * connect to a server where no target is running. 
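The suppression heuristic completed just below, as a hypothetical standalone predicate: repeated -ENODEV/-EAGAIN reconnect failures are muted, letting roughly one in every thirty connection attempts reach the console.

	static bool example_mute_reconnect_error(int err, __u32 conn_cnt)
	{
		return (err == -ENODEV || err == -EAGAIN) &&
		       conn_cnt % 30 != 20;
	}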
- */ - err = lustre_msg_get_status(req->rq_repmsg); - if ((err == -ENODEV || err == -EAGAIN) && - req->rq_import->imp_conn_cnt % 30 != 20) - return false; - } - - return true; -} - -/** - * Check request processing status. - * Returns the status. - */ -static int ptlrpc_check_status(struct ptlrpc_request *req) -{ - int err; - - err = lustre_msg_get_status(req->rq_repmsg); - if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR) { - struct obd_import *imp = req->rq_import; - lnet_nid_t nid = imp->imp_connection->c_peer.nid; - __u32 opc = lustre_msg_get_opc(req->rq_reqmsg); - - /* -EAGAIN is normal when using POSIX flocks */ - if (ptlrpc_console_allow(req) && - !(opc == LDLM_ENQUEUE && err == -EAGAIN)) - LCONSOLE_ERROR_MSG(0x011, "%s: operation %s to node %s failed: rc = %d\n", - imp->imp_obd->obd_name, - ll_opcode2str(opc), - libcfs_nid2str(nid), err); - return err < 0 ? err : -EINVAL; - } - - if (err < 0) - DEBUG_REQ(D_INFO, req, "status is %d", err); - else if (err > 0) - /* XXX: translate this error from net to host */ - DEBUG_REQ(D_INFO, req, "status is %d", err); - - return err; -} - -/** - * save pre-versions of objects into request for replay. - * Versions are obtained from server reply. - * used for VBR. - */ -static void ptlrpc_save_versions(struct ptlrpc_request *req) -{ - struct lustre_msg *repmsg = req->rq_repmsg; - struct lustre_msg *reqmsg = req->rq_reqmsg; - __u64 *versions = lustre_msg_get_versions(repmsg); - - if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) - return; - - LASSERT(versions); - lustre_msg_set_versions(reqmsg, versions); - CDEBUG(D_INFO, "Client save versions [%#llx/%#llx]\n", - versions[0], versions[1]); -} - -__u64 ptlrpc_known_replied_xid(struct obd_import *imp) -{ - struct ptlrpc_request *req; - - assert_spin_locked(&imp->imp_lock); - if (list_empty(&imp->imp_unreplied_list)) - return 0; - - req = list_entry(imp->imp_unreplied_list.next, struct ptlrpc_request, - rq_unreplied_list); - LASSERTF(req->rq_xid >= 1, "XID:%llu\n", req->rq_xid); - - if (imp->imp_known_replied_xid < req->rq_xid - 1) - imp->imp_known_replied_xid = req->rq_xid - 1; - - return req->rq_xid - 1; -} - -/** - * Callback function called when client receives RPC reply for \a req. - * Returns 0 on success or error code. - * The return value would be assigned to req->rq_status by the caller - * as request processing status. - * This function also decides if the request needs to be saved for later replay. - */ -static int after_reply(struct ptlrpc_request *req) -{ - struct obd_import *imp = req->rq_import; - struct obd_device *obd = req->rq_import->imp_obd; - int rc; - struct timespec64 work_start; - long timediff; - u64 committed; - - LASSERT(obd); - /* repbuf must be unlinked */ - LASSERT(!req->rq_receiving_reply && req->rq_reply_unlinked); - - if (req->rq_reply_truncated) { - if (ptlrpc_no_resend(req)) { - DEBUG_REQ(D_ERROR, req, "reply buffer overflow, expected: %d, actual size: %d", - req->rq_nob_received, req->rq_repbuf_len); - return -EOVERFLOW; - } - - sptlrpc_cli_free_repbuf(req); - /* - * Pass the required reply buffer size (include space for early - * reply). 
NB: no need to round up because alloc_repbuf will - * round it up - */ - req->rq_replen = req->rq_nob_received; - req->rq_nob_received = 0; - spin_lock(&req->rq_lock); - req->rq_resend = 1; - spin_unlock(&req->rq_lock); - return 0; - } - - ktime_get_real_ts64(&work_start); - timediff = (work_start.tv_sec - req->rq_sent_tv.tv_sec) * USEC_PER_SEC + - (work_start.tv_nsec - req->rq_sent_tv.tv_nsec) / - NSEC_PER_USEC; - /* - * NB Until this point, the whole of the incoming message, - * including buflens, status etc is in the sender's byte order. - */ - rc = sptlrpc_cli_unwrap_reply(req); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unwrap reply failed (%d):", rc); - return rc; - } - - /* Security layer unwrap might ask resend this request. */ - if (req->rq_resend) - return 0; - - rc = unpack_reply(req); - if (rc) - return rc; - - /* retry indefinitely on EINPROGRESS */ - if (lustre_msg_get_status(req->rq_repmsg) == -EINPROGRESS && - ptlrpc_no_resend(req) == 0 && !req->rq_no_retry_einprogress) { - time64_t now = ktime_get_real_seconds(); - - DEBUG_REQ(D_RPCTRACE, req, "Resending request on EINPROGRESS"); - spin_lock(&req->rq_lock); - req->rq_resend = 1; - spin_unlock(&req->rq_lock); - req->rq_nr_resend++; - - /* Readjust the timeout for current conditions */ - ptlrpc_at_set_req_timeout(req); - /* - * delay resend to give a chance to the server to get ready. - * The delay is increased by 1s on every resend and is capped to - * the current request timeout (i.e. obd_timeout if AT is off, - * or AT service time x 125% + 5s, see at_est2timeout) - */ - if (req->rq_nr_resend > req->rq_timeout) - req->rq_sent = now + req->rq_timeout; - else - req->rq_sent = now + req->rq_nr_resend; - - /* Resend for EINPROGRESS will use a new XID */ - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_unreplied_list); - spin_unlock(&imp->imp_lock); - - return 0; - } - - if (obd->obd_svc_stats) { - lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, - timediff); - ptlrpc_lprocfs_rpc_sent(req, timediff); - } - - if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && - lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { - DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)", - lustre_msg_get_type(req->rq_repmsg)); - return -EPROTO; - } - - if (lustre_msg_get_opc(req->rq_reqmsg) != OBD_PING) - CFS_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, cfs_fail_val); - ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg)); - ptlrpc_at_adj_net_latency(req, - lustre_msg_get_service_time(req->rq_repmsg)); - - rc = ptlrpc_check_status(req); - imp->imp_connect_error = rc; - - if (rc) { - /* - * Either we've been evicted, or the server has failed for - * some reason. Try to reconnect, and if that fails, punt to - * the upcall. - */ - if (ptlrpc_recoverable_error(rc)) { - if (req->rq_send_state != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { - return rc; - } - ptlrpc_request_handle_notconn(req); - return rc; - } - } else { - /* - * Let's look if server sent slv. Do it only for RPC with - * rc == 0. - */ - ldlm_cli_update_pool(req); - } - - /* Store transno in reqmsg for replay. */ - if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) { - req->rq_transno = lustre_msg_get_transno(req->rq_repmsg); - lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno); - } - - if (imp->imp_replayable) { - spin_lock(&imp->imp_lock); - /* - * No point in adding already-committed requests to the replay - * list, we will just remove them immediately. 
b=9829 - */ - if (req->rq_transno != 0 && - (req->rq_transno > - lustre_msg_get_last_committed(req->rq_repmsg) || - req->rq_replay)) { - /* version recovery */ - ptlrpc_save_versions(req); - ptlrpc_retain_replayable_request(req, imp); - } else if (req->rq_commit_cb && - list_empty(&req->rq_replay_list)) { - /* - * NB: don't call rq_commit_cb if it's already on - * rq_replay_list, ptlrpc_free_committed() will call - * it later, see LU-3618 for details - */ - spin_unlock(&imp->imp_lock); - req->rq_commit_cb(req); - spin_lock(&imp->imp_lock); - } - - /* Replay-enabled imports return commit-status information. */ - committed = lustre_msg_get_last_committed(req->rq_repmsg); - if (likely(committed > imp->imp_peer_committed_transno)) - imp->imp_peer_committed_transno = committed; - - ptlrpc_free_committed(imp); - - if (!list_empty(&imp->imp_replay_list)) { - struct ptlrpc_request *last; - - last = list_entry(imp->imp_replay_list.prev, - struct ptlrpc_request, - rq_replay_list); - /* - * Requests with rq_replay stay on the list even if no - * commit is expected. - */ - if (last->rq_transno > imp->imp_peer_committed_transno) - ptlrpc_pinger_commit_expected(imp); - } - - spin_unlock(&imp->imp_lock); - } - - return rc; -} - -/** - * Helper function to send request \a req over the network for the first time - * Also adjusts request phase. - * Returns 0 on success or error code. - */ -static int ptlrpc_send_new_req(struct ptlrpc_request *req) -{ - struct obd_import *imp = req->rq_import; - u64 min_xid = 0; - int rc; - - LASSERT(req->rq_phase == RQ_PHASE_NEW); - - /* do not try to go further if there is not enough memory in enc_pool */ - if (req->rq_sent && req->rq_bulk) - if (req->rq_bulk->bd_iov_count > get_free_pages_in_pool() && - pool_is_at_full_capacity()) - return -ENOMEM; - - if (req->rq_sent && (req->rq_sent > ktime_get_real_seconds()) && - (!req->rq_generation_set || - req->rq_import_generation == imp->imp_generation)) - return 0; - - ptlrpc_rqphase_move(req, RQ_PHASE_RPC); - - spin_lock(&imp->imp_lock); - - LASSERT(req->rq_xid); - LASSERT(!list_empty(&req->rq_unreplied_list)); - - if (!req->rq_generation_set) - req->rq_import_generation = imp->imp_generation; - - if (ptlrpc_import_delay_req(imp, req, &rc)) { - spin_lock(&req->rq_lock); - req->rq_waiting = 1; - spin_unlock(&req->rq_lock); - - DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: (%s != %s)", - lustre_msg_get_status(req->rq_reqmsg), - ptlrpc_import_state_name(req->rq_send_state), - ptlrpc_import_state_name(imp->imp_state)); - LASSERT(list_empty(&req->rq_list)); - list_add_tail(&req->rq_list, &imp->imp_delayed_list); - atomic_inc(&req->rq_import->imp_inflight); - spin_unlock(&imp->imp_lock); - return 0; - } - - if (rc != 0) { - spin_unlock(&imp->imp_lock); - req->rq_status = rc; - ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - return rc; - } - - LASSERT(list_empty(&req->rq_list)); - list_add_tail(&req->rq_list, &imp->imp_sending_list); - atomic_inc(&req->rq_import->imp_inflight); - - /* find the known replied XID from the unreplied list, CONNECT - * and DISCONNECT requests are skipped to make the sanity check - * on server side happy. see process_req_last_xid(). - * - * For CONNECT: Because replay requests have lower XID, it'll - * break the sanity check if CONNECT bump the exp_last_xid on - * server. - * - * For DISCONNECT: Since client will abort inflight RPC before - * sending DISCONNECT, DISCONNECT may carry an XID which higher - * than the inflight RPC. 
- */ - if (!ptlrpc_req_is_connect(req) && !ptlrpc_req_is_disconnect(req)) - min_xid = ptlrpc_known_replied_xid(imp); - spin_unlock(&imp->imp_lock); - - lustre_msg_set_last_xid(req->rq_reqmsg, min_xid); - - lustre_msg_set_status(req->rq_reqmsg, current->pid); - - rc = sptlrpc_req_refresh_ctx(req, -1); - if (rc) { - if (req->rq_err) { - req->rq_status = rc; - return 1; - } - spin_lock(&req->rq_lock); - req->rq_wait_ctx = 1; - spin_unlock(&req->rq_lock); - return 0; - } - - CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", - current->comm, - imp->imp_obd->obd_uuid.uuid, - lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - libcfs_nid2str(imp->imp_connection->c_peer.nid), - lustre_msg_get_opc(req->rq_reqmsg)); - - rc = ptl_send_rpc(req, 0); - if (rc == -ENOMEM) { - spin_lock(&imp->imp_lock); - if (!list_empty(&req->rq_list)) { - list_del_init(&req->rq_list); - if (atomic_dec_and_test(&req->rq_import->imp_inflight)) - wake_up_all(&req->rq_import->imp_recovery_waitq); - } - spin_unlock(&imp->imp_lock); - ptlrpc_rqphase_move(req, RQ_PHASE_NEW); - return rc; - } - if (rc) { - DEBUG_REQ(D_HA, req, "send failed (%d); expect timeout", rc); - spin_lock(&req->rq_lock); - req->rq_net_err = 1; - spin_unlock(&req->rq_lock); - return rc; - } - return 0; -} - -static inline int ptlrpc_set_producer(struct ptlrpc_request_set *set) -{ - int remaining, rc; - - LASSERT(set->set_producer); - - remaining = atomic_read(&set->set_remaining); - - /* - * populate the ->set_requests list with requests until we - * reach the maximum number of RPCs in flight for this set - */ - while (atomic_read(&set->set_remaining) < set->set_max_inflight) { - rc = set->set_producer(set, set->set_producer_arg); - if (rc == -ENOENT) { - /* no more RPC to produce */ - set->set_producer = NULL; - set->set_producer_arg = NULL; - return 0; - } - } - - return (atomic_read(&set->set_remaining) - remaining); -} - -/** - * this sends any unsent RPCs in \a set and returns 1 if all are sent - * and no more replies are expected. - * (it is possible to get less replies than requests sent e.g. due to timed out - * requests or requests that we had trouble to send out) - * - * NOTE: This function contains a potential schedule point (cond_resched()). - */ -int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req, *next; - struct list_head comp_reqs; - int force_timer_recalc = 0; - - if (atomic_read(&set->set_remaining) == 0) - return 1; - - INIT_LIST_HEAD(&comp_reqs); - list_for_each_entry_safe(req, next, &set->set_requests, rq_set_chain) { - struct obd_import *imp = req->rq_import; - int unregistered = 0; - int rc = 0; - - /* - * This schedule point is mainly for the ptlrpcd caller of this - * function. Most ptlrpc sets are not long-lived and unbounded - * in length, but at the least the set used by the ptlrpcd is. - * Since the processing time is unbounded, we need to insert an - * explicit schedule point to make the thread well-behaved. 
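A hypothetical producer callback for ptlrpc_prep_fcset(), matching the contract ptlrpc_set_producer() assumes above: queue one new request per call and return -ENOENT once the stream is exhausted. struct example_stream and example_next_req() are stand-ins, not real API.

	static int example_producer(struct ptlrpc_request_set *set, void *arg)
	{
		struct example_stream *s = arg;		/* hypothetical cursor */
		struct ptlrpc_request *req = example_next_req(s);

		if (!req)
			return -ENOENT;	/* no more RPCs to produce */

		ptlrpc_set_add_req(set, req);
		return 0;
	}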
- */ - cond_resched(); - - if (req->rq_phase == RQ_PHASE_NEW && - ptlrpc_send_new_req(req)) { - force_timer_recalc = 1; - } - - /* delayed send - skip */ - if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) - continue; - - /* delayed resend - skip */ - if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend && - req->rq_sent > ktime_get_real_seconds()) - continue; - - if (!(req->rq_phase == RQ_PHASE_RPC || - req->rq_phase == RQ_PHASE_BULK || - req->rq_phase == RQ_PHASE_INTERPRET || - req->rq_phase == RQ_PHASE_UNREG_RPC || - req->rq_phase == RQ_PHASE_UNREG_BULK || - req->rq_phase == RQ_PHASE_COMPLETE)) { - DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); - LBUG(); - } - - if (req->rq_phase == RQ_PHASE_UNREG_RPC || - req->rq_phase == RQ_PHASE_UNREG_BULK) { - LASSERT(req->rq_next_phase != req->rq_phase); - LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); - - if (req->rq_req_deadline && - !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REQ_UNLINK)) - req->rq_req_deadline = 0; - if (req->rq_reply_deadline && - !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) - req->rq_reply_deadline = 0; - if (req->rq_bulk_deadline && - !OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) - req->rq_bulk_deadline = 0; - - /* - * Skip processing until reply is unlinked. We - * can't return to pool before that and we can't - * call interpret before that. We need to make - * sure that all rdma transfers finished and will - * not corrupt any data. - */ - if (req->rq_phase == RQ_PHASE_UNREG_RPC && - ptlrpc_client_recv_or_unlink(req)) - continue; - if (req->rq_phase == RQ_PHASE_UNREG_BULK && - ptlrpc_client_bulk_active(req)) - continue; - - /* - * Turn fail_loc off to prevent it from looping - * forever. - */ - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { - OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, - OBD_FAIL_ONCE); - } - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { - OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, - OBD_FAIL_ONCE); - } - - /* Move to next phase if reply was successfully - * unlinked. - */ - ptlrpc_rqphase_move(req, req->rq_next_phase); - } - - if (req->rq_phase == RQ_PHASE_COMPLETE) { - list_move_tail(&req->rq_set_chain, &comp_reqs); - continue; - } - - if (req->rq_phase == RQ_PHASE_INTERPRET) - goto interpret; - - /* Note that this also will start async reply unlink. */ - if (req->rq_net_err && !req->rq_timedout) { - ptlrpc_expire_one_request(req, 1); - - /* Check if we still need to wait for unlink. */ - if (ptlrpc_client_recv_or_unlink(req) || - ptlrpc_client_bulk_active(req)) - continue; - /* If there is no need to resend, fail it now. */ - if (req->rq_no_resend) { - if (req->rq_status == 0) - req->rq_status = -EIO; - ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - goto interpret; - } else { - continue; - } - } - - if (req->rq_err) { - spin_lock(&req->rq_lock); - req->rq_replied = 0; - spin_unlock(&req->rq_lock); - if (req->rq_status == 0) - req->rq_status = -EIO; - ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - goto interpret; - } - - /* - * ptlrpc_set_wait allow signal to abort the timeout - * so it sets rq_intr regardless of individual rpc - * timeouts. The synchronous IO waiting path sets - * rq_intr irrespective of whether ptlrpcd - * has seen a timeout. Our policy is to only interpret - * interrupted rpcs after they have timed out, so we - * need to enforce that here. 
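The interrupt policy described above, condensed into a hypothetical predicate mirroring the test that follows: an interrupted RPC is only failed with -EINTR once it has also timed out, is waiting on recovery, or is waiting on a security context.

	static bool example_intr_ready(struct ptlrpc_request *req)
	{
		return req->rq_intr &&
		       (req->rq_timedout || req->rq_waiting ||
			req->rq_wait_ctx);
	}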
- */ - - if (req->rq_intr && (req->rq_timedout || req->rq_waiting || - req->rq_wait_ctx)) { - req->rq_status = -EINTR; - ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - goto interpret; - } - - if (req->rq_phase == RQ_PHASE_RPC) { - if (req->rq_timedout || req->rq_resend || - req->rq_waiting || req->rq_wait_ctx) { - int status; - - if (!ptlrpc_unregister_reply(req, 1)) { - ptlrpc_unregister_bulk(req, 1); - continue; - } - - spin_lock(&imp->imp_lock); - if (ptlrpc_import_delay_req(imp, req, - &status)) { - /* - * put on delay list - only if we wait - * recovery finished - before send - */ - list_del_init(&req->rq_list); - list_add_tail(&req->rq_list, - &imp->imp_delayed_list); - spin_unlock(&imp->imp_lock); - continue; - } - - if (status != 0) { - req->rq_status = status; - ptlrpc_rqphase_move(req, - RQ_PHASE_INTERPRET); - spin_unlock(&imp->imp_lock); - goto interpret; - } - if (ptlrpc_no_resend(req) && - !req->rq_wait_ctx) { - req->rq_status = -ENOTCONN; - ptlrpc_rqphase_move(req, - RQ_PHASE_INTERPRET); - spin_unlock(&imp->imp_lock); - goto interpret; - } - - list_del_init(&req->rq_list); - list_add_tail(&req->rq_list, - &imp->imp_sending_list); - - spin_unlock(&imp->imp_lock); - - spin_lock(&req->rq_lock); - req->rq_waiting = 0; - spin_unlock(&req->rq_lock); - - if (req->rq_timedout || req->rq_resend) { - /* This is re-sending anyway, let's mark req as resend. */ - spin_lock(&req->rq_lock); - req->rq_resend = 1; - spin_unlock(&req->rq_lock); - if (req->rq_bulk && - !ptlrpc_unregister_bulk(req, 1)) - continue; - } - /* - * rq_wait_ctx is only touched by ptlrpcd, - * so no lock is needed here. - */ - status = sptlrpc_req_refresh_ctx(req, -1); - if (status) { - if (req->rq_err) { - req->rq_status = status; - spin_lock(&req->rq_lock); - req->rq_wait_ctx = 0; - spin_unlock(&req->rq_lock); - force_timer_recalc = 1; - } else { - spin_lock(&req->rq_lock); - req->rq_wait_ctx = 1; - spin_unlock(&req->rq_lock); - } - - continue; - } else { - spin_lock(&req->rq_lock); - req->rq_wait_ctx = 0; - spin_unlock(&req->rq_lock); - } - - rc = ptl_send_rpc(req, 0); - if (rc == -ENOMEM) { - spin_lock(&imp->imp_lock); - if (!list_empty(&req->rq_list)) - list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); - ptlrpc_rqphase_move(req, RQ_PHASE_NEW); - continue; - } - if (rc) { - DEBUG_REQ(D_HA, req, - "send failed: rc = %d", rc); - force_timer_recalc = 1; - spin_lock(&req->rq_lock); - req->rq_net_err = 1; - spin_unlock(&req->rq_lock); - continue; - } - /* need to reset the timeout */ - force_timer_recalc = 1; - } - - spin_lock(&req->rq_lock); - - if (ptlrpc_client_early(req)) { - ptlrpc_at_recv_early_reply(req); - spin_unlock(&req->rq_lock); - continue; - } - - /* Still waiting for a reply? */ - if (ptlrpc_client_recv(req)) { - spin_unlock(&req->rq_lock); - continue; - } - - /* Did we actually receive a reply? */ - if (!ptlrpc_client_replied(req)) { - spin_unlock(&req->rq_lock); - continue; - } - - spin_unlock(&req->rq_lock); - - /* - * unlink from net because we are going to - * swab in-place of reply buffer - */ - unregistered = ptlrpc_unregister_reply(req, 1); - if (!unregistered) - continue; - - req->rq_status = after_reply(req); - if (req->rq_resend) - continue; - - /* - * If there is no bulk associated with this request, - * then we're done and should let the interpreter - * process the reply. Similarly if the RPC returned - * an error, and therefore the bulk will never arrive. 
- */ - if (!req->rq_bulk || req->rq_status < 0) { - ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - goto interpret; - } - - ptlrpc_rqphase_move(req, RQ_PHASE_BULK); - } - - LASSERT(req->rq_phase == RQ_PHASE_BULK); - if (ptlrpc_client_bulk_active(req)) - continue; - - if (req->rq_bulk->bd_failure) { - /* - * The RPC reply arrived OK, but the bulk screwed - * up! Dead weird since the server told us the RPC - * was good after getting the REPLY for her GET or - * the ACK for her PUT. - */ - DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); - req->rq_status = -EIO; - } - - ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); - -interpret: - LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); - - /* - * This moves to "unregistering" phase we need to wait for - * reply unlink. - */ - if (!unregistered && !ptlrpc_unregister_reply(req, 1)) { - /* start async bulk unlink too */ - ptlrpc_unregister_bulk(req, 1); - continue; - } - - if (!ptlrpc_unregister_bulk(req, 1)) - continue; - - /* When calling interpret receive should already be finished. */ - LASSERT(!req->rq_receiving_reply); - - ptlrpc_req_interpret(env, req, req->rq_status); - - if (ptlrpcd_check_work(req)) { - atomic_dec(&set->set_remaining); - continue; - } - ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); - - CDEBUG(req->rq_reqmsg ? D_RPCTRACE : 0, - "Completed RPC pname:cluuid:pid:xid:nid:opc %s:%s:%d:%llu:%s:%d\n", - current->comm, imp->imp_obd->obd_uuid.uuid, - lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, - libcfs_nid2str(imp->imp_connection->c_peer.nid), - lustre_msg_get_opc(req->rq_reqmsg)); - - spin_lock(&imp->imp_lock); - /* - * Request already may be not on sending or delaying list. This - * may happen in the case of marking it erroneous for the case - * ptlrpc_import_delay_req(req, status) find it impossible to - * allow sending this rpc and returns *status != 0. - */ - if (!list_empty(&req->rq_list)) { - list_del_init(&req->rq_list); - atomic_dec(&imp->imp_inflight); - } - list_del_init(&req->rq_unreplied_list); - spin_unlock(&imp->imp_lock); - - atomic_dec(&set->set_remaining); - wake_up_all(&imp->imp_recovery_waitq); - - if (set->set_producer) { - /* produce a new request if possible */ - if (ptlrpc_set_producer(set) > 0) - force_timer_recalc = 1; - - /* - * free the request that has just been completed - * in order not to pollute set->set_requests - */ - list_del_init(&req->rq_set_chain); - spin_lock(&req->rq_lock); - req->rq_set = NULL; - req->rq_invalid_rqset = 0; - spin_unlock(&req->rq_lock); - - /* record rq_status to compute the final status later */ - if (req->rq_status != 0) - set->set_rc = req->rq_status; - ptlrpc_req_finished(req); - } else { - list_move_tail(&req->rq_set_chain, &comp_reqs); - } - } - - /* - * move completed request at the head of list so it's easier for - * caller to find them - */ - list_splice(&comp_reqs, &set->set_requests); - - /* If we hit an error, we want to recover promptly. */ - return atomic_read(&set->set_remaining) == 0 || force_timer_recalc; -} -EXPORT_SYMBOL(ptlrpc_check_set); - -/** - * Time out request \a req. is \a async_unlink is set, that means do not wait - * until LNet actually confirms network buffer unlinking. - * Return 1 if we should give up further retrying attempts or 0 otherwise. 
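- *
- * A representative caller is ptlrpc_expired_set() below, which expires
- * requests asynchronously so the ptlrpcd thread is never blocked waiting
- * for LNet to confirm the unlink:
- *
- *	ptlrpc_expire_one_request(req, 1);	<- async_unlink set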
- */ -int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) -{ - struct obd_import *imp = req->rq_import; - int rc = 0; - - spin_lock(&req->rq_lock); - req->rq_timedout = 1; - spin_unlock(&req->rq_lock); - - DEBUG_REQ(D_WARNING, req, "Request sent has %s: [sent %lld/real %lld]", - req->rq_net_err ? "failed due to network error" : - ((req->rq_real_sent == 0 || - req->rq_real_sent < req->rq_sent || - req->rq_real_sent >= req->rq_deadline) ? - "timed out for sent delay" : "timed out for slow reply"), - (s64)req->rq_sent, (s64)req->rq_real_sent); - - if (imp && obd_debug_peer_on_timeout) - LNetDebugPeer(imp->imp_connection->c_peer); - - ptlrpc_unregister_reply(req, async_unlink); - ptlrpc_unregister_bulk(req, async_unlink); - - if (obd_dump_on_timeout) - libcfs_debug_dumplog(); - - if (!imp) { - DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); - return 1; - } - - atomic_inc(&imp->imp_timeouts); - - /* The DLM server doesn't want recovery run on its imports. */ - if (imp->imp_dlm_fake) - return 1; - - /* - * If this request is for recovery or other primordial tasks, - * then error it out here. - */ - if (req->rq_ctx_init || req->rq_ctx_fini || - req->rq_send_state != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov) { - DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", - ptlrpc_import_state_name(req->rq_send_state), - ptlrpc_import_state_name(imp->imp_state)); - spin_lock(&req->rq_lock); - req->rq_status = -ETIMEDOUT; - req->rq_err = 1; - spin_unlock(&req->rq_lock); - return 1; - } - - /* - * if a request can't be resent we can't wait for an answer after - * the timeout - */ - if (ptlrpc_no_resend(req)) { - DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); - rc = 1; - } - - ptlrpc_fail_import(imp, lustre_msg_get_conn_cnt(req->rq_reqmsg)); - - return rc; -} - -/** - * Time out all uncompleted requests in request set pointed by \a data - * Called when wait_event_idle_timeout times out. - */ -void ptlrpc_expired_set(struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req; - time64_t now = ktime_get_real_seconds(); - - /* A timeout expired. See which reqs it applies to... */ - list_for_each_entry(req, &set->set_requests, rq_set_chain) { - - /* don't expire request waiting for context */ - if (req->rq_wait_ctx) - continue; - - /* Request in-flight? */ - if (!((req->rq_phase == RQ_PHASE_RPC && - !req->rq_waiting && !req->rq_resend) || - (req->rq_phase == RQ_PHASE_BULK))) - continue; - - if (req->rq_timedout || /* already dealt with */ - req->rq_deadline > now) /* not expired */ - continue; - - /* - * Deal with this guy. Do it asynchronously to not block - * ptlrpcd thread. - */ - ptlrpc_expire_one_request(req, 1); - } -} - -/** - * Sets rq_intr flag in \a req under spinlock. - */ -void ptlrpc_mark_interrupted(struct ptlrpc_request *req) -{ - spin_lock(&req->rq_lock); - req->rq_intr = 1; - spin_unlock(&req->rq_lock); -} -EXPORT_SYMBOL(ptlrpc_mark_interrupted); - -/** - * Interrupts (sets interrupted flag) all uncompleted requests in - * a set \a data. Called when l_wait_event_abortable_timeout receives signal. - */ -static void ptlrpc_interrupted_set(struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req; - CDEBUG(D_RPCTRACE, "INTERRUPTED SET %p\n", set); - - list_for_each_entry(req, &set->set_requests, rq_set_chain) { - - if (req->rq_phase != RQ_PHASE_RPC && - req->rq_phase != RQ_PHASE_UNREG_RPC) - continue; - - ptlrpc_mark_interrupted(req); - } -} - -/** - * Get the smallest timeout in the set; this does NOT set a timeout. 
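- *
- * Worked example (made-up numbers): with now = 1000, a request sent at
- * 990 with rq_timeout = 25 has deadline 1015, and one sent at 998 with
- * rq_timeout = 5 has deadline 1003, so the smallest remaining time is
- * min(15, 3) = 3 seconds. Any request whose deadline has already passed
- * forces a return value of 1, i.e. "expire ASAP".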
- */
-int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set)
-{
-	time64_t now = ktime_get_real_seconds();
-	int timeout = 0;
-	struct ptlrpc_request *req;
-	time64_t deadline;
-
-	list_for_each_entry(req, &set->set_requests, rq_set_chain) {
-
-		/* Request in-flight? */
-		if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) ||
-		      (req->rq_phase == RQ_PHASE_BULK) ||
-		      (req->rq_phase == RQ_PHASE_NEW)))
-			continue;
-
-		/* Already timed out. */
-		if (req->rq_timedout)
-			continue;
-
-		/* Waiting for ctx. */
-		if (req->rq_wait_ctx)
-			continue;
-
-		if (req->rq_phase == RQ_PHASE_NEW)
-			deadline = req->rq_sent;
-		else if (req->rq_phase == RQ_PHASE_RPC && req->rq_resend)
-			deadline = req->rq_sent;
-		else
-			deadline = req->rq_sent + req->rq_timeout;
-
-		if (deadline <= now)	/* actually expired already */
-			timeout = 1;	/* ASAP */
-		else if (timeout == 0 || timeout > deadline - now)
-			timeout = deadline - now;
-	}
-	return timeout;
-}
-
-/**
- * Send all unsent requests from the set and then wait until all
- * requests in the set complete (either get a reply, time out, get an
- * error or otherwise be interrupted).
- * Returns 0 on success or error code otherwise.
- */
-int ptlrpc_set_wait(struct ptlrpc_request_set *set)
-{
-	struct ptlrpc_request *req;
-	int rc, timeout;
-
-	if (set->set_producer)
-		(void)ptlrpc_set_producer(set);
-	else
-		list_for_each_entry(req, &set->set_requests, rq_set_chain) {
-			if (req->rq_phase == RQ_PHASE_NEW)
-				(void)ptlrpc_send_new_req(req);
-		}
-
-	if (list_empty(&set->set_requests))
-		return 0;
-
-	do {
-		timeout = ptlrpc_set_next_timeout(set);
-
-		/*
-		 * wait until all complete, interrupted, or an in-flight
-		 * req times out
-		 */
-		CDEBUG(D_RPCTRACE, "set %p going to sleep for %d seconds\n",
-		       set, timeout);
-
-		if (timeout == 0 && !signal_pending(current)) {
-			/*
-			 * No requests are in-flight (either timed out
-			 * or delayed), so we can allow interrupts.
-			 * We still want to block for a limited time,
-			 * so we allow interrupts during the timeout.
-			 */
-			rc = l_wait_event_abortable_timeout(set->set_waitq,
-							    ptlrpc_check_set(NULL, set),
-							    HZ);
-			if (rc == 0) {
-				rc = -ETIMEDOUT;
-				ptlrpc_expired_set(set);
-			} else if (rc < 0) {
-				rc = -EINTR;
-				ptlrpc_interrupted_set(set);
-			} else
-				rc = 0;
-		} else {
-			/*
-			 * At least one request is in flight, so no
-			 * interrupts are allowed. Wait until all
-			 * complete, or an in-flight req times out.
-			 */
-			rc = wait_event_idle_timeout(set->set_waitq,
-						     ptlrpc_check_set(NULL, set),
-						     (timeout ? timeout : 1) * HZ);
-			if (rc == 0) {
-				ptlrpc_expired_set(set);
-				rc = -ETIMEDOUT;
-				/*
-				 * LU-769 - if we ignored the signal
-				 * because it was already pending when
-				 * we started, we need to handle it
-				 * now or we risk it being ignored
-				 * forever
-				 */
-				if (l_fatal_signal_pending(current))
-					ptlrpc_interrupted_set(set);
-			} else
-				rc = 0;
-		}
-
-		LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
-
-		/*
-		 * -EINTR => all requests have been flagged rq_intr so next
-		 * check completes.
-		 * -ETIMEDOUT => someone timed out. When all reqs have
-		 * timed out, signals are enabled allowing completion with
-		 * EINTR.
-		 * I don't really care if we go once more round the loop in
-		 * the error cases -eeb.
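-		 *
-		 * Schematically, the wait policy above is (hedged
-		 * paraphrase, not original text):
-		 *
-		 *	timeout == 0: abortable 1-second waits; a signal
-		 *	              yields -EINTR, expiry yields -ETIMEDOUT
-		 *	timeout  > 0: non-interruptible wait of `timeout'
-		 *	              seconds; expiry expires the whole set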
- */ - if (rc == 0 && atomic_read(&set->set_remaining) == 0) { - list_for_each_entry(req, &set->set_requests, rq_set_chain) { - spin_lock(&req->rq_lock); - req->rq_invalid_rqset = 1; - spin_unlock(&req->rq_lock); - } - } - } while (rc != 0 || atomic_read(&set->set_remaining) != 0); - - LASSERT(atomic_read(&set->set_remaining) == 0); - - rc = set->set_rc; /* rq_status of already freed requests if any */ - list_for_each_entry(req, &set->set_requests, rq_set_chain) { - LASSERT(req->rq_phase == RQ_PHASE_COMPLETE); - if (req->rq_status != 0) - rc = req->rq_status; - } - - if (set->set_interpret) { - int (*interpreter)(struct ptlrpc_request_set *set, void *, int) = - set->set_interpret; - rc = interpreter(set, set->set_arg, rc); - } else { - struct ptlrpc_set_cbdata *cbdata, *n; - int err; - - list_for_each_entry_safe(cbdata, n, - &set->set_cblist, psc_item) { - list_del_init(&cbdata->psc_item); - err = cbdata->psc_interpret(set, cbdata->psc_data, rc); - if (err && !rc) - rc = err; - kfree(cbdata); - } - } - - return rc; -} -EXPORT_SYMBOL(ptlrpc_set_wait); - -/** - * Helper function for request freeing. - * Called when request count reached zero and request needs to be freed. - * Removes request from all sorts of sending/replay lists it might be on, - * frees network buffers if any are present. - * If \a locked is set, that means caller is already holding import imp_lock - * and so we no longer need to reobtain it (for certain lists manipulations) - */ -static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) -{ - if (!request) - return; - LASSERT(!request->rq_srv_req); - LASSERT(!request->rq_export); - LASSERTF(!request->rq_receiving_reply, "req %p\n", request); - LASSERTF(list_empty(&request->rq_list), "req %p\n", request); - LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); - LASSERTF(!request->rq_replay, "req %p\n", request); - - req_capsule_fini(&request->rq_pill); - - /* - * We must take it off the imp_replay_list first. Otherwise, we'll set - * request->rq_reqmsg to NULL while osc_close is dereferencing it. - */ - if (request->rq_import) { - if (!locked) - spin_lock(&request->rq_import->imp_lock); - list_del_init(&request->rq_replay_list); - list_del_init(&request->rq_unreplied_list); - if (!locked) - spin_unlock(&request->rq_import->imp_lock); - } - LASSERTF(list_empty(&request->rq_replay_list), "req %p\n", request); - - if (atomic_read(&request->rq_refcount) != 0) { - DEBUG_REQ(D_ERROR, request, - "freeing request with nonzero refcount"); - LBUG(); - } - - if (request->rq_repbuf) - sptlrpc_cli_free_repbuf(request); - - if (request->rq_import) { - class_import_put(request->rq_import); - request->rq_import = NULL; - } - if (request->rq_bulk) - ptlrpc_free_bulk(request->rq_bulk); - - if (request->rq_reqbuf || request->rq_clrbuf) - sptlrpc_cli_free_reqbuf(request); - - if (request->rq_cli_ctx) - sptlrpc_req_put_ctx(request, !locked); - - if (request->rq_pool) - __ptlrpc_free_req_to_pool(request); - else - ptlrpc_request_cache_free(request); -} - -/** - * Helper function - * Drops one reference count for request \a request. - * \a locked set indicates that caller holds import imp_lock. - * Frees the request when reference count reaches zero. 
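- *
- * Pairs with ptlrpc_request_addref(); a minimal (hypothetical) usage
- * sketch of the public wrapper:
- *
- *	req = ptlrpc_request_addref(req);	<- refcount + 1
- *	... hand the request to another context ...
- *	ptlrpc_req_finished(req);		<- drop our reference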
- */ -static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) -{ - if (!request) - return 1; - - if (request == LP_POISON || - request->rq_reqmsg == LP_POISON) { - CERROR("dereferencing freed request (bug 575)\n"); - LBUG(); - return 1; - } - - DEBUG_REQ(D_INFO, request, "refcount now %u", - atomic_read(&request->rq_refcount) - 1); - - if (atomic_dec_and_test(&request->rq_refcount)) { - __ptlrpc_free_req(request, locked); - return 1; - } - - return 0; -} - -/** - * Drops one reference count for a request. - */ -void ptlrpc_req_finished(struct ptlrpc_request *request) -{ - __ptlrpc_req_finished(request, 0); -} -EXPORT_SYMBOL(ptlrpc_req_finished); - -/** - * Returns xid of a \a request - */ -__u64 ptlrpc_req_xid(struct ptlrpc_request *request) -{ - return request->rq_xid; -} -EXPORT_SYMBOL(ptlrpc_req_xid); - -/** - * Disengage the client's reply buffer from the network - * NB does _NOT_ unregister any client-side bulk. - * IDEMPOTENT, but _not_ safe against concurrent callers. - * The request owner (i.e. the thread doing the I/O) must call... - * Returns 0 on success or 1 if unregistering cannot be made. - */ -static int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) -{ - int rc; - wait_queue_head_t *wq; - - /* Might sleep. */ - LASSERT(!in_interrupt()); - - /* Let's setup deadline for reply unlink. */ - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && - async && request->rq_reply_deadline == 0 && cfs_fail_val == 0) - request->rq_reply_deadline = - ktime_get_real_seconds() + LONG_UNLINK; - - /* Nothing left to do. */ - if (!ptlrpc_client_recv_or_unlink(request)) - return 1; - - LNetMDUnlink(request->rq_reply_md_h); - - /* Let's check it once again. */ - if (!ptlrpc_client_recv_or_unlink(request)) - return 1; - - /* Move to "Unregistering" phase as reply was not unlinked yet. */ - ptlrpc_rqphase_move(request, RQ_PHASE_UNREG_RPC); - - /* Do not wait for unlink to finish. */ - if (async) - return 0; - - /* - * We have to wait_event_idle_timeout() whatever the result, to give liblustre - * a chance to run reply_in_callback(), and to make sure we've - * unlinked before returning a req to the pool. 
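-	 *
-	 * Callers that must not block pass async = 1 and simply retry on
-	 * the next pass instead, as ptlrpc_check_set() does above:
-	 *
-	 *	if (!ptlrpc_unregister_reply(req, 1))
-	 *		continue;	<- revisit this request later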
- */ - if (request->rq_set) - wq = &request->rq_set->set_waitq; - else - wq = &request->rq_reply_waitq; - - for (;;) { - /* - * Network access will complete in finite time but the HUGE - * timeout lets us CWARN for visibility of sluggish NALs - */ - int cnt = 0; - while (cnt < LONG_UNLINK && - (rc = wait_event_idle_timeout(*wq, - !ptlrpc_client_recv_or_unlink(request), - HZ)) == 0) - cnt += 1; - if (rc > 0) { - ptlrpc_rqphase_move(request, request->rq_next_phase); - return 1; - } - - DEBUG_REQ(D_WARNING, request, - "Unexpectedly long timeout receiving_reply=%d req_unlinked=%d reply_unlinked=%d", - request->rq_receiving_reply, - request->rq_req_unlinked, - request->rq_reply_unlinked); - } - return 0; -} - -static void ptlrpc_free_request(struct ptlrpc_request *req) -{ - spin_lock(&req->rq_lock); - req->rq_replay = 0; - spin_unlock(&req->rq_lock); - - if (req->rq_commit_cb) - req->rq_commit_cb(req); - list_del_init(&req->rq_replay_list); - - __ptlrpc_req_finished(req, 1); -} - -/** - * the request is committed and dropped from the replay list of its import - */ -void ptlrpc_request_committed(struct ptlrpc_request *req, int force) -{ - struct obd_import *imp = req->rq_import; - - spin_lock(&imp->imp_lock); - if (list_empty(&req->rq_replay_list)) { - spin_unlock(&imp->imp_lock); - return; - } - - if (force || req->rq_transno <= imp->imp_peer_committed_transno) - ptlrpc_free_request(req); - - spin_unlock(&imp->imp_lock); -} -EXPORT_SYMBOL(ptlrpc_request_committed); - -/** - * Iterates through replay_list on import and prunes - * all requests have transno smaller than last_committed for the - * import and don't have rq_replay set. - * Since requests are sorted in transno order, stops when meeting first - * transno bigger than last_committed. - * caller must hold imp->imp_lock - */ -void ptlrpc_free_committed(struct obd_import *imp) -{ - struct ptlrpc_request *req, *saved; - struct ptlrpc_request *last_req = NULL; /* temporary fire escape */ - bool skip_committed_list = true; - - assert_spin_locked(&imp->imp_lock); - - if (imp->imp_peer_committed_transno == imp->imp_last_transno_checked && - imp->imp_generation == imp->imp_last_generation_checked) { - CDEBUG(D_INFO, "%s: skip recheck: last_committed %llu\n", - imp->imp_obd->obd_name, imp->imp_peer_committed_transno); - return; - } - CDEBUG(D_RPCTRACE, "%s: committing for last_committed %llu gen %d\n", - imp->imp_obd->obd_name, imp->imp_peer_committed_transno, - imp->imp_generation); - - if (imp->imp_generation != imp->imp_last_generation_checked || - !imp->imp_last_transno_checked) - skip_committed_list = false; - - imp->imp_last_transno_checked = imp->imp_peer_committed_transno; - imp->imp_last_generation_checked = imp->imp_generation; - - list_for_each_entry_safe(req, saved, &imp->imp_replay_list, - rq_replay_list) { - /* XXX ok to remove when 1357 resolved - rread 05/29/03 */ - LASSERT(req != last_req); - last_req = req; - - if (req->rq_transno == 0) { - DEBUG_REQ(D_EMERG, req, "zero transno during replay"); - LBUG(); - } - if (req->rq_import_generation < imp->imp_generation) { - DEBUG_REQ(D_RPCTRACE, req, "free request with old gen"); - goto free_req; - } - - /* not yet committed */ - if (req->rq_transno > imp->imp_peer_committed_transno) { - DEBUG_REQ(D_RPCTRACE, req, "stopping search"); - break; - } - - if (req->rq_replay) { - DEBUG_REQ(D_RPCTRACE, req, "keeping (FL_REPLAY)"); - list_move_tail(&req->rq_replay_list, - &imp->imp_committed_list); - continue; - } - - DEBUG_REQ(D_INFO, req, "commit (last_committed %llu)", - 
imp->imp_peer_committed_transno); -free_req: - ptlrpc_free_request(req); - } - if (skip_committed_list) - return; - - list_for_each_entry_safe(req, saved, &imp->imp_committed_list, - rq_replay_list) { - LASSERT(req->rq_transno != 0); - if (req->rq_import_generation < imp->imp_generation || - !req->rq_replay) { - DEBUG_REQ(D_RPCTRACE, req, "free %s open request", - req->rq_import_generation < - imp->imp_generation ? "stale" : "closed"); - - if (imp->imp_replay_cursor == &req->rq_replay_list) - imp->imp_replay_cursor = - req->rq_replay_list.next; - - ptlrpc_free_request(req); - } - } -} - -/** - * Schedule previously sent request for resend. - * For bulk requests we assign new xid (to avoid problems with - * lost replies and therefore several transfers landing into same buffer - * from different sending attempts). - */ -void ptlrpc_resend_req(struct ptlrpc_request *req) -{ - DEBUG_REQ(D_HA, req, "going to resend"); - spin_lock(&req->rq_lock); - - /* - * Request got reply but linked to the import list still. - * Let ptlrpc_check_set() to process it. - */ - if (ptlrpc_client_replied(req)) { - spin_unlock(&req->rq_lock); - DEBUG_REQ(D_HA, req, "it has reply, so skip it"); - return; - } - - lustre_msg_set_handle(req->rq_reqmsg, &(struct lustre_handle){ 0 }); - req->rq_status = -EAGAIN; - - req->rq_resend = 1; - req->rq_net_err = 0; - req->rq_timedout = 0; - ptlrpc_client_wake_req(req); - spin_unlock(&req->rq_lock); -} - -/** - * Grab additional reference on a request \a req - */ -struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req) -{ - atomic_inc(&req->rq_refcount); - return req; -} -EXPORT_SYMBOL(ptlrpc_request_addref); - -/** - * Add a request to import replay_list. - * Must be called under imp_lock - */ -void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, - struct obd_import *imp) -{ - struct ptlrpc_request *iter; - assert_spin_locked(&imp->imp_lock); - - if (req->rq_transno == 0) { - DEBUG_REQ(D_EMERG, req, "saving request with zero transno"); - LBUG(); - } - - /* - * clear this for new requests that were resent as well - * as resent replayed requests. - */ - lustre_msg_clear_flags(req->rq_reqmsg, MSG_RESENT); - - /* don't re-add requests that have been replayed */ - if (!list_empty(&req->rq_replay_list)) - return; - - lustre_msg_add_flags(req->rq_reqmsg, MSG_REPLAY); - - spin_lock(&req->rq_lock); - req->rq_resend = 0; - spin_unlock(&req->rq_lock); - - LASSERT(imp->imp_replayable); - /* Balanced in ptlrpc_free_committed, usually. */ - ptlrpc_request_addref(req); - list_for_each_entry_reverse(iter, &imp->imp_replay_list, rq_replay_list) { - /* - * We may have duplicate transnos if we create and then - * open a file, or for closes retained if to match creating - * opens, so use req->rq_xid as a secondary key. - * (See bugs 684, 685, and 428.) - * XXX no longer needed, but all opens need transnos! - */ - if (iter->rq_transno > req->rq_transno) - continue; - - if (iter->rq_transno == req->rq_transno) { - LASSERT(iter->rq_xid != req->rq_xid); - if (iter->rq_xid > req->rq_xid) - continue; - } - - list_add(&req->rq_replay_list, &iter->rq_replay_list); - return; - } - - list_add(&req->rq_replay_list, &imp->imp_replay_list); -} - -/** - * Send request and wait until it completes. - * Returns request processing status. 
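- *
- * A minimal (hypothetical) caller sketch, assuming the request was
- * allocated and packed elsewhere and with error handling elided:
- *
- *	rc = ptlrpc_queue_wait(req);
- *	if (rc == 0)
- *		... inspect req->rq_repmsg ...
- *	ptlrpc_req_finished(req);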
- */ -int ptlrpc_queue_wait(struct ptlrpc_request *req) -{ - struct ptlrpc_request_set *set; - int rc; - - LASSERT(!req->rq_set); - LASSERT(!req->rq_receiving_reply); - - set = ptlrpc_prep_set(); - if (!set) { - CERROR("cannot allocate ptlrpc set: rc = %d\n", -ENOMEM); - return -ENOMEM; - } - - /* for distributed debugging */ - lustre_msg_set_status(req->rq_reqmsg, current->pid); - - /* add a ref for the set (see comment in ptlrpc_set_add_req) */ - ptlrpc_request_addref(req); - ptlrpc_set_add_req(set, req); - rc = ptlrpc_set_wait(set); - ptlrpc_set_destroy(set); - - return rc; -} -EXPORT_SYMBOL(ptlrpc_queue_wait); - -/** - * Callback used for replayed requests reply processing. - * In case of successful reply calls registered request replay callback. - * In case of error restart replay process. - */ -static int ptlrpc_replay_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *data, int rc) -{ - struct ptlrpc_replay_async_args *aa = data; - struct obd_import *imp = req->rq_import; - - atomic_dec(&imp->imp_replay_inflight); - - /* - * Note: if it is bulk replay (MDS-MDS replay), then even if - * server got the request, but bulk transfer timeout, let's - * replay the bulk req again - */ - if (!ptlrpc_client_replied(req) || - (req->rq_bulk && - lustre_msg_get_status(req->rq_repmsg) == -ETIMEDOUT)) { - DEBUG_REQ(D_ERROR, req, "request replay timed out.\n"); - rc = -ETIMEDOUT; - goto out; - } - - if (lustre_msg_get_type(req->rq_repmsg) == PTL_RPC_MSG_ERR && - (lustre_msg_get_status(req->rq_repmsg) == -ENOTCONN || - lustre_msg_get_status(req->rq_repmsg) == -ENODEV)) { - rc = lustre_msg_get_status(req->rq_repmsg); - goto out; - } - - /** VBR: check version failure */ - if (lustre_msg_get_status(req->rq_repmsg) == -EOVERFLOW) { - /** replay was failed due to version mismatch */ - DEBUG_REQ(D_WARNING, req, "Version mismatch during replay\n"); - spin_lock(&imp->imp_lock); - imp->imp_vbr_failed = 1; - imp->imp_no_lock_replay = 1; - spin_unlock(&imp->imp_lock); - lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); - } else { - /** The transno had better not change over replay. */ - LASSERTF(lustre_msg_get_transno(req->rq_reqmsg) == - lustre_msg_get_transno(req->rq_repmsg) || - lustre_msg_get_transno(req->rq_repmsg) == 0, - "%#llx/%#llx\n", - lustre_msg_get_transno(req->rq_reqmsg), - lustre_msg_get_transno(req->rq_repmsg)); - } - - spin_lock(&imp->imp_lock); - /** if replays by version then gap occur on server, no trust to locks */ - if (lustre_msg_get_flags(req->rq_repmsg) & MSG_VERSION_REPLAY) - imp->imp_no_lock_replay = 1; - imp->imp_last_replay_transno = lustre_msg_get_transno(req->rq_reqmsg); - spin_unlock(&imp->imp_lock); - LASSERT(imp->imp_last_replay_transno); - - /* transaction number shouldn't be bigger than the latest replayed */ - if (req->rq_transno > lustre_msg_get_transno(req->rq_reqmsg)) { - DEBUG_REQ(D_ERROR, req, - "Reported transno %llu is bigger than the replayed one: %llu", - req->rq_transno, - lustre_msg_get_transno(req->rq_reqmsg)); - rc = -EINVAL; - goto out; - } - - DEBUG_REQ(D_HA, req, "got rep"); - - /* let the callback do fixups, possibly including in the request */ - if (req->rq_replay_cb) - req->rq_replay_cb(req); - - if (ptlrpc_client_replied(req) && - lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { - DEBUG_REQ(D_ERROR, req, "status %d, old was %d", - lustre_msg_get_status(req->rq_repmsg), - aa->praa_old_status); - } else { - /* Put it back for re-replay. 
*/ - lustre_msg_set_status(req->rq_repmsg, aa->praa_old_status); - } - - /* - * Errors while replay can set transno to 0, but - * imp_last_replay_transno shouldn't be set to 0 anyway - */ - if (req->rq_transno == 0) - CERROR("Transno is 0 during replay!\n"); - - /* continue with recovery */ - rc = ptlrpc_import_recovery_state_machine(imp); - out: - req->rq_send_state = aa->praa_old_state; - - if (rc != 0) - /* this replay failed, so restart recovery */ - ptlrpc_connect_import(imp); - - return rc; -} - -/** - * Prepares and queues request for replay. - * Adds it to ptlrpcd queue for actual sending. - * Returns 0 on success. - */ -int ptlrpc_replay_req(struct ptlrpc_request *req) -{ - struct ptlrpc_replay_async_args *aa; - - LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY); - - LASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = ptlrpc_req_async_args(req); - memset(aa, 0, sizeof(*aa)); - - /* Prepare request to be resent with ptlrpcd */ - aa->praa_old_state = req->rq_send_state; - req->rq_send_state = LUSTRE_IMP_REPLAY; - req->rq_phase = RQ_PHASE_NEW; - req->rq_next_phase = RQ_PHASE_UNDEFINED; - if (req->rq_repmsg) - aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); - req->rq_status = 0; - req->rq_interpret_reply = ptlrpc_replay_interpret; - /* Readjust the timeout for current conditions */ - ptlrpc_at_set_req_timeout(req); - - /* - * Tell server the net_latency, so the server can calculate how long - * it should wait for next replay - */ - lustre_msg_set_service_time(req->rq_reqmsg, - ptlrpc_at_get_net_latency(req)); - DEBUG_REQ(D_HA, req, "REPLAY"); - - atomic_inc(&req->rq_import->imp_replay_inflight); - ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ - - ptlrpcd_add_req(req); - return 0; -} - -/** - * Aborts all in-flight request on import \a imp sending and delayed lists - */ -void ptlrpc_abort_inflight(struct obd_import *imp) -{ - struct ptlrpc_request *req, *n; - - /* - * Make sure that no new requests get processed for this import. - * ptlrpc_{queue,set}_wait must (and does) hold imp_lock while testing - * this flag and then putting requests on sending_list or delayed_list. - */ - spin_lock(&imp->imp_lock); - - /* - * XXX locking? Maybe we should remove each request with the list - * locked? Also, how do we know if the requests on the list are - * being freed at this time? - */ - list_for_each_entry_safe(req, n, &imp->imp_sending_list, rq_list) { - DEBUG_REQ(D_RPCTRACE, req, "inflight"); - - spin_lock(&req->rq_lock); - if (req->rq_import_generation < imp->imp_generation) { - req->rq_err = 1; - req->rq_status = -EIO; - ptlrpc_client_wake_req(req); - } - spin_unlock(&req->rq_lock); - } - - list_for_each_entry_safe(req, n, &imp->imp_delayed_list, rq_list) { - DEBUG_REQ(D_RPCTRACE, req, "aborting waiting req"); - - spin_lock(&req->rq_lock); - if (req->rq_import_generation < imp->imp_generation) { - req->rq_err = 1; - req->rq_status = -EIO; - ptlrpc_client_wake_req(req); - } - spin_unlock(&req->rq_lock); - } - - /* - * Last chance to free reqs left on the replay list, but we - * will still leak reqs that haven't committed. 
- */ - if (imp->imp_replayable) - ptlrpc_free_committed(imp); - - spin_unlock(&imp->imp_lock); -} - -/** - * Abort all uncompleted requests in request set \a set - */ -void ptlrpc_abort_set(struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req, *tmp; - - list_for_each_entry_safe(req, tmp, &set->set_requests, rq_set_chain) { - spin_lock(&req->rq_lock); - if (req->rq_phase != RQ_PHASE_RPC) { - spin_unlock(&req->rq_lock); - continue; - } - - req->rq_err = 1; - req->rq_status = -EINTR; - ptlrpc_client_wake_req(req); - spin_unlock(&req->rq_lock); - } -} - -static __u64 ptlrpc_last_xid; -static spinlock_t ptlrpc_last_xid_lock; - -/** - * Initialize the XID for the node. This is common among all requests on - * this node, and only requires the property that it is monotonically - * increasing. It does not need to be sequential. Since this is also used - * as the RDMA match bits, it is important that a single client NOT have - * the same match bits for two different in-flight requests, hence we do - * NOT want to have an XID per target or similar. - * - * To avoid an unlikely collision between match bits after a client reboot - * (which would deliver old data into the wrong RDMA buffer) initialize - * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s. - * If the time is clearly incorrect, we instead use a 62-bit random number. - * In the worst case the random number will overflow 1M RPCs per second in - * 9133 years, or permutations thereof. - */ -#define YEAR_2004 (1ULL << 30) -void ptlrpc_init_xid(void) -{ - time64_t now = ktime_get_real_seconds(); - - spin_lock_init(&ptlrpc_last_xid_lock); - if (now < YEAR_2004) { - get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid)); - ptlrpc_last_xid >>= 2; - ptlrpc_last_xid |= (1ULL << 61); - } else { - ptlrpc_last_xid = (__u64)now << 20; - } - - /* Always need to be aligned to a power-of-two for multi-bulk BRW */ - BUILD_BUG_ON(((PTLRPC_BULK_OPS_COUNT - 1) & PTLRPC_BULK_OPS_COUNT) != 0); - ptlrpc_last_xid &= PTLRPC_BULK_OPS_MASK; -} - -/** - * Increase xid and returns resulting new value to the caller. - * - * Multi-bulk BRW RPCs consume multiple XIDs for each bulk transfer, starting - * at the returned xid, up to xid + PTLRPC_BULK_OPS_COUNT - 1. The BRW RPC - * itself uses the last bulk xid needed, so the server can determine the - * the number of bulk transfers from the RPC XID and a bitmask. The starting - * xid must align to a power-of-two value. - * - * This is assumed to be true due to the initial ptlrpc_last_xid - * value also being initialized to a power-of-two value. LU-1431 - */ -__u64 ptlrpc_next_xid(void) -{ - __u64 next; - - spin_lock(&ptlrpc_last_xid_lock); - next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT; - ptlrpc_last_xid = next; - spin_unlock(&ptlrpc_last_xid_lock); - - return next; -} - -/** - * If request has a new allocated XID (new request or EINPROGRESS resend), - * use this XID as matchbits of bulk, otherwise allocate a new matchbits for - * request to ensure previous bulk fails and avoid problems with lost replies - * and therefore several transfers landing into the same buffer from different - * sending attempts. - */ -void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req) -{ - struct ptlrpc_bulk_desc *bd = req->rq_bulk; - - LASSERT(bd); - - /* - * Generate new matchbits for all resend requests, including - * resend replay. 
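-	 *
-	 * Worked example (illustrative numbers): with PTLRPC_BULK_OPS_COUNT
-	 * of 16, ptlrpc_next_xid() hands out values 16 apart, e.g. 0x500
-	 * then 0x510. A descriptor with bd_iov_count = 3 * LNET_MAX_IOV
-	 * starting at mbits 0x500 covers 0x500..0x502, and the adjustment
-	 * at the end of this function stores the last value, 0x502, in
-	 * rq_mbits so the server can recover the bulk count.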
- */
-	if (req->rq_resend) {
-		u64 old_mbits = req->rq_mbits;
-
-		/*
-		 * First time resend on -EINPROGRESS will generate a new xid,
-		 * so we can actually use the rq_xid as rq_mbits in that case,
-		 * however, it's a bit hard to distinguish such a resend from
-		 * a 'resend for the -EINPROGRESS resend'. To keep it simple,
-		 * we opt to generate mbits for all resend cases.
-		 */
-		if ((bd->bd_import->imp_connect_data.ocd_connect_flags &
-		     OBD_CONNECT_BULK_MBITS)) {
-			req->rq_mbits = ptlrpc_next_xid();
-		} else {
-			/* old version transfers rq_xid to peer as matchbits */
-			spin_lock(&req->rq_import->imp_lock);
-			list_del_init(&req->rq_unreplied_list);
-			ptlrpc_assign_next_xid_nolock(req);
-			spin_unlock(&req->rq_import->imp_lock);
-			req->rq_mbits = req->rq_xid;
-		}
-
-		CDEBUG(D_HA, "resend bulk old x%llu new x%llu\n",
-		       old_mbits, req->rq_mbits);
-	} else if (!(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY)) {
-		/* Request being sent for the first time, use xid as matchbits. */
-		req->rq_mbits = req->rq_xid;
-	} else {
-		/*
-		 * Replay request, xid and matchbits have already been
-		 * correctly assigned.
-		 */
-		return;
-	}
-
-	/*
-	 * For multi-bulk RPCs, rq_mbits is the last mbits needed for bulks so
-	 * that the server can infer the number of bulks that were prepared,
-	 * see LU-1431
-	 */
-	req->rq_mbits += DIV_ROUND_UP(bd->bd_iov_count, LNET_MAX_IOV) - 1;
-}
-
-/**
- * Get a glimpse at what the next xid value might be.
- * Returns the possible next xid.
- */
-__u64 ptlrpc_sample_next_xid(void)
-{
-#if BITS_PER_LONG == 32
-	/* need to avoid possible word tearing on 32-bit systems */
-	__u64 next;
-
-	spin_lock(&ptlrpc_last_xid_lock);
-	next = ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
-	spin_unlock(&ptlrpc_last_xid_lock);
-
-	return next;
-#else
-	/* No need to lock, since the returned value is racy anyway */
-	return ptlrpc_last_xid + PTLRPC_BULK_OPS_COUNT;
-#endif
-}
-EXPORT_SYMBOL(ptlrpc_sample_next_xid);
-
-/**
- * Functions for operating ptlrpc workers.
- *
- * A ptlrpc work is a function which runs inside ptlrpc context. The
- * callback shouldn't sleep, otherwise it will block that ptlrpcd thread.
- *
- * 1. after a work item is created, it can be used many times, that is:
- *    handler = ptlrpcd_alloc_work();
- *    ptlrpcd_queue_work();
- *
- *    queue it again when necessary:
- *    ptlrpcd_queue_work();
- *    ptlrpcd_destroy_work();
- * 2. ptlrpcd_queue_work() can be called by multiple processes concurrently,
- *    but it will only be queued once at any time. Also, as its name implies,
- *    there may be a delay before it actually runs in a ptlrpcd thread.
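- *
- * A fuller (hypothetical) usage sketch; sync_cb, do_quick_work and
- * cb_data are caller-supplied names, not part of this API:
- *
- *	static int sync_cb(const struct lu_env *env, void *data)
- *	{
- *		return do_quick_work(data);	<- must not sleep
- *	}
- *
- *	handler = ptlrpcd_alloc_work(imp, sync_cb, cb_data);
- *	if (!IS_ERR(handler))
- *		ptlrpcd_queue_work(handler);
- *	...
- *	ptlrpcd_destroy_work(handler);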
- */ -struct ptlrpc_work_async_args { - int (*cb)(const struct lu_env *, void *); - void *cbdata; -}; - -static void ptlrpcd_add_work_req(struct ptlrpc_request *req) -{ - /* re-initialize the req */ - req->rq_timeout = obd_timeout; - req->rq_sent = ktime_get_real_seconds(); - req->rq_deadline = req->rq_sent + req->rq_timeout; - req->rq_phase = RQ_PHASE_INTERPRET; - req->rq_next_phase = RQ_PHASE_COMPLETE; - req->rq_xid = ptlrpc_next_xid(); - req->rq_import_generation = req->rq_import->imp_generation; - - ptlrpcd_add_req(req); -} - -static int work_interpreter(const struct lu_env *env, - struct ptlrpc_request *req, void *data, int rc) -{ - struct ptlrpc_work_async_args *arg = data; - - LASSERT(ptlrpcd_check_work(req)); - - rc = arg->cb(env, arg->cbdata); - - list_del_init(&req->rq_set_chain); - req->rq_set = NULL; - - if (atomic_dec_return(&req->rq_refcount) > 1) { - atomic_set(&req->rq_refcount, 2); - ptlrpcd_add_work_req(req); - } - return rc; -} - -static int worker_format; - -static int ptlrpcd_check_work(struct ptlrpc_request *req) -{ - return req->rq_pill.rc_fmt == (void *)&worker_format; -} - -/** - * Create a work for ptlrpc. - */ -void *ptlrpcd_alloc_work(struct obd_import *imp, - int (*cb)(const struct lu_env *, void *), void *cbdata) -{ - struct ptlrpc_request *req = NULL; - struct ptlrpc_work_async_args *args; - - might_sleep(); - - if (!cb) - return ERR_PTR(-EINVAL); - - /* copy some code from deprecated fakereq. */ - req = ptlrpc_request_cache_alloc(GFP_NOFS); - if (!req) { - CERROR("ptlrpc: run out of memory!\n"); - return ERR_PTR(-ENOMEM); - } - - ptlrpc_cli_req_init(req); - - req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_type = PTL_RPC_MSG_REQUEST; - req->rq_import = class_import_get(imp); - req->rq_interpret_reply = work_interpreter; - /* don't want reply */ - req->rq_no_delay = 1; - req->rq_no_resend = 1; - req->rq_pill.rc_fmt = (void *)&worker_format; - - BUILD_BUG_ON(sizeof(*args) > sizeof(req->rq_async_args)); - args = ptlrpc_req_async_args(req); - args->cb = cb; - args->cbdata = cbdata; - - return req; -} -EXPORT_SYMBOL(ptlrpcd_alloc_work); - -void ptlrpcd_destroy_work(void *handler) -{ - struct ptlrpc_request *req = handler; - - if (req) - ptlrpc_req_finished(req); -} -EXPORT_SYMBOL(ptlrpcd_destroy_work); - -int ptlrpcd_queue_work(void *handler) -{ - struct ptlrpc_request *req = handler; - - /* - * Check if the req is already being queued. - * - * Here comes a trick: it lacks a way of checking if a req is being - * processed reliably in ptlrpc. Here I have to use refcount of req - * for this purpose. This is okay because the caller should use this - * req as opaque data. - Jinshan - */ - LASSERT(atomic_read(&req->rq_refcount) > 0); - if (atomic_inc_return(&req->rq_refcount) == 2) - ptlrpcd_add_work_req(req); - return 0; -} -EXPORT_SYMBOL(ptlrpcd_queue_work); diff --git a/drivers/staging/lustre/lustre/ptlrpc/connection.c b/drivers/staging/lustre/lustre/ptlrpc/connection.c deleted file mode 100644 index fb35a89ca6c60755f2f6fbfac340b61f68f6a258..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/connection.c +++ /dev/null @@ -1,192 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC -#include -#include -#include - -#include "ptlrpc_internal.h" - -static struct rhashtable conn_hash; - -/* - * struct lnet_process_id may contain unassigned bytes which might not - * be zero, so we cannot just hash and compare bytes. - */ - -static u32 lnet_process_id_hash(const void *data, u32 len, u32 seed) -{ - const struct lnet_process_id *lpi = data; - - seed = hash_32(seed ^ lpi->pid, 32); - seed ^= hash_64(lpi->nid, 32); - return seed; -} - -static int lnet_process_id_cmp(struct rhashtable_compare_arg *arg, - const void *obj) -{ - const struct lnet_process_id *lpi = arg->key; - const struct ptlrpc_connection *con = obj; - - if (lpi->nid == con->c_peer.nid && - lpi->pid == con->c_peer.pid) - return 0; - return -ESRCH; -} - -static const struct rhashtable_params conn_hash_params = { - .key_len = 1, /* actually variable-length */ - .key_offset = offsetof(struct ptlrpc_connection, c_peer), - .head_offset = offsetof(struct ptlrpc_connection, c_hash), - .hashfn = lnet_process_id_hash, - .obj_cmpfn = lnet_process_id_cmp, -}; - -struct ptlrpc_connection * -ptlrpc_connection_get(struct lnet_process_id peer, lnet_nid_t self, - struct obd_uuid *uuid) -{ - struct ptlrpc_connection *conn, *conn2; - - conn = rhashtable_lookup_fast(&conn_hash, &peer, conn_hash_params); - if (conn) { - ptlrpc_connection_addref(conn); - goto out; - } - - conn = kzalloc(sizeof(*conn), GFP_NOFS); - if (!conn) - return NULL; - - conn->c_peer = peer; - conn->c_self = self; - atomic_set(&conn->c_refcount, 1); - if (uuid) - obd_str2uuid(&conn->c_remote_uuid, uuid->uuid); - - /* - * Add the newly created conn to the hash, on key collision we - * lost a racing addition and must destroy our newly allocated - * connection. The object which exists in the hash will be - * returned, otherwise NULL is returned on success. - */ - conn2 = rhashtable_lookup_get_insert_fast(&conn_hash, &conn->c_hash, - conn_hash_params); - if (conn2 != NULL) { - /* insertion failed */ - kfree(conn); - if (IS_ERR(conn2)) - return NULL; - conn = conn2; - ptlrpc_connection_addref(conn); - } -out: - CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", - conn, atomic_read(&conn->c_refcount), - libcfs_nid2str(conn->c_peer.nid)); - return conn; -} - -int ptlrpc_connection_put(struct ptlrpc_connection *conn) -{ - int rc = 0; - - if (!conn) - return rc; - - LASSERT(atomic_read(&conn->c_refcount) > 0); - - /* - * We do not remove connection from hashtable and - * do not free it even if last caller released ref, - * as we want to have it cached for the case it is - * needed again. - * - * Deallocating it and later creating new connection - * again would be wastful. 
This way we also avoid - * expensive locking to protect things from get/put - * race when found cached connection is freed by - * ptlrpc_connection_put(). - * - * It will be freed later in module unload time, - * when ptlrpc_connection_fini()->lh_exit->conn_exit() - * path is called. - */ - if (atomic_dec_return(&conn->c_refcount) == 0) - rc = 1; - - CDEBUG(D_INFO, "PUT conn=%p refcount %d to %s\n", - conn, atomic_read(&conn->c_refcount), - libcfs_nid2str(conn->c_peer.nid)); - - return rc; -} - -struct ptlrpc_connection * -ptlrpc_connection_addref(struct ptlrpc_connection *conn) -{ - atomic_inc(&conn->c_refcount); - CDEBUG(D_INFO, "conn=%p refcount %d to %s\n", - conn, atomic_read(&conn->c_refcount), - libcfs_nid2str(conn->c_peer.nid)); - - return conn; -} - -static void -conn_exit(void *vconn, void *data) -{ - struct ptlrpc_connection *conn = vconn; - - /* - * Nothing should be left. Connection user put it and - * connection also was deleted from table by this time - * so we should have 0 refs. - */ - LASSERTF(atomic_read(&conn->c_refcount) == 0, - "Busy connection with %d refs\n", - atomic_read(&conn->c_refcount)); - kfree(conn); -} - -int ptlrpc_connection_init(void) -{ - return rhashtable_init(&conn_hash, &conn_hash_params); -} - -void ptlrpc_connection_fini(void) -{ - rhashtable_free_and_destroy(&conn_hash, conn_exit, NULL); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/errno.c b/drivers/staging/lustre/lustre/ptlrpc/errno.c deleted file mode 100644 index b904524fc1c6d085e06b27530012cbd468612fc5..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/errno.c +++ /dev/null @@ -1,383 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.txt - * - * GPL HEADER END - */ -/* - * Copyright (C) 2011 FUJITSU LIMITED. All rights reserved. - * - * Copyright (c) 2013, Intel Corporation. - */ - -#include -#include -#include -#include - -/* - * The two translation tables below must define a one-to-one mapping between - * host and network errnos. - * - * EWOULDBLOCK is equal to EAGAIN on all architectures except for parisc, which - * appears irrelevant. Thus, existing references to EWOULDBLOCK are fine. - * - * EDEADLOCK is equal to EDEADLK on x86 but not on sparc, at least. A sparc - * host has no context-free way to determine if a LUSTRE_EDEADLK represents an - * EDEADLK or an EDEADLOCK. Therefore, all existing references to EDEADLOCK - * that need to be transferred on wire have been replaced with EDEADLK. 
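- *
- * Illustrative round trip (values per the tables below): ENOTCONN is
- * sent on the wire as LUSTRE_ENOTCONN and mapped back to ENOTCONN by
- * the receiving host; an errno with no slot in either table falls back
- * to LUSTRE_EIO / EIO, since a generic error is safer than a number
- * that may mean something different on the remote host.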
- */ -static int lustre_errno_hton_mapping[] = { - [EPERM] = LUSTRE_EPERM, - [ENOENT] = LUSTRE_ENOENT, - [ESRCH] = LUSTRE_ESRCH, - [EINTR] = LUSTRE_EINTR, - [EIO] = LUSTRE_EIO, - [ENXIO] = LUSTRE_ENXIO, - [E2BIG] = LUSTRE_E2BIG, - [ENOEXEC] = LUSTRE_ENOEXEC, - [EBADF] = LUSTRE_EBADF, - [ECHILD] = LUSTRE_ECHILD, - [EAGAIN] = LUSTRE_EAGAIN, - [ENOMEM] = LUSTRE_ENOMEM, - [EACCES] = LUSTRE_EACCES, - [EFAULT] = LUSTRE_EFAULT, - [ENOTBLK] = LUSTRE_ENOTBLK, - [EBUSY] = LUSTRE_EBUSY, - [EEXIST] = LUSTRE_EEXIST, - [EXDEV] = LUSTRE_EXDEV, - [ENODEV] = LUSTRE_ENODEV, - [ENOTDIR] = LUSTRE_ENOTDIR, - [EISDIR] = LUSTRE_EISDIR, - [EINVAL] = LUSTRE_EINVAL, - [ENFILE] = LUSTRE_ENFILE, - [EMFILE] = LUSTRE_EMFILE, - [ENOTTY] = LUSTRE_ENOTTY, - [ETXTBSY] = LUSTRE_ETXTBSY, - [EFBIG] = LUSTRE_EFBIG, - [ENOSPC] = LUSTRE_ENOSPC, - [ESPIPE] = LUSTRE_ESPIPE, - [EROFS] = LUSTRE_EROFS, - [EMLINK] = LUSTRE_EMLINK, - [EPIPE] = LUSTRE_EPIPE, - [EDOM] = LUSTRE_EDOM, - [ERANGE] = LUSTRE_ERANGE, - [EDEADLK] = LUSTRE_EDEADLK, - [ENAMETOOLONG] = LUSTRE_ENAMETOOLONG, - [ENOLCK] = LUSTRE_ENOLCK, - [ENOSYS] = LUSTRE_ENOSYS, - [ENOTEMPTY] = LUSTRE_ENOTEMPTY, - [ELOOP] = LUSTRE_ELOOP, - [ENOMSG] = LUSTRE_ENOMSG, - [EIDRM] = LUSTRE_EIDRM, - [ECHRNG] = LUSTRE_ECHRNG, - [EL2NSYNC] = LUSTRE_EL2NSYNC, - [EL3HLT] = LUSTRE_EL3HLT, - [EL3RST] = LUSTRE_EL3RST, - [ELNRNG] = LUSTRE_ELNRNG, - [EUNATCH] = LUSTRE_EUNATCH, - [ENOCSI] = LUSTRE_ENOCSI, - [EL2HLT] = LUSTRE_EL2HLT, - [EBADE] = LUSTRE_EBADE, - [EBADR] = LUSTRE_EBADR, - [EXFULL] = LUSTRE_EXFULL, - [ENOANO] = LUSTRE_ENOANO, - [EBADRQC] = LUSTRE_EBADRQC, - [EBADSLT] = LUSTRE_EBADSLT, - [EBFONT] = LUSTRE_EBFONT, - [ENOSTR] = LUSTRE_ENOSTR, - [ENODATA] = LUSTRE_ENODATA, - [ETIME] = LUSTRE_ETIME, - [ENOSR] = LUSTRE_ENOSR, - [ENONET] = LUSTRE_ENONET, - [ENOPKG] = LUSTRE_ENOPKG, - [EREMOTE] = LUSTRE_EREMOTE, - [ENOLINK] = LUSTRE_ENOLINK, - [EADV] = LUSTRE_EADV, - [ESRMNT] = LUSTRE_ESRMNT, - [ECOMM] = LUSTRE_ECOMM, - [EPROTO] = LUSTRE_EPROTO, - [EMULTIHOP] = LUSTRE_EMULTIHOP, - [EDOTDOT] = LUSTRE_EDOTDOT, - [EBADMSG] = LUSTRE_EBADMSG, - [EOVERFLOW] = LUSTRE_EOVERFLOW, - [ENOTUNIQ] = LUSTRE_ENOTUNIQ, - [EBADFD] = LUSTRE_EBADFD, - [EREMCHG] = LUSTRE_EREMCHG, - [ELIBACC] = LUSTRE_ELIBACC, - [ELIBBAD] = LUSTRE_ELIBBAD, - [ELIBSCN] = LUSTRE_ELIBSCN, - [ELIBMAX] = LUSTRE_ELIBMAX, - [ELIBEXEC] = LUSTRE_ELIBEXEC, - [EILSEQ] = LUSTRE_EILSEQ, - [ERESTART] = LUSTRE_ERESTART, - [ESTRPIPE] = LUSTRE_ESTRPIPE, - [EUSERS] = LUSTRE_EUSERS, - [ENOTSOCK] = LUSTRE_ENOTSOCK, - [EDESTADDRREQ] = LUSTRE_EDESTADDRREQ, - [EMSGSIZE] = LUSTRE_EMSGSIZE, - [EPROTOTYPE] = LUSTRE_EPROTOTYPE, - [ENOPROTOOPT] = LUSTRE_ENOPROTOOPT, - [EPROTONOSUPPORT] = LUSTRE_EPROTONOSUPPORT, - [ESOCKTNOSUPPORT] = LUSTRE_ESOCKTNOSUPPORT, - [EOPNOTSUPP] = LUSTRE_EOPNOTSUPP, - [EPFNOSUPPORT] = LUSTRE_EPFNOSUPPORT, - [EAFNOSUPPORT] = LUSTRE_EAFNOSUPPORT, - [EADDRINUSE] = LUSTRE_EADDRINUSE, - [EADDRNOTAVAIL] = LUSTRE_EADDRNOTAVAIL, - [ENETDOWN] = LUSTRE_ENETDOWN, - [ENETUNREACH] = LUSTRE_ENETUNREACH, - [ENETRESET] = LUSTRE_ENETRESET, - [ECONNABORTED] = LUSTRE_ECONNABORTED, - [ECONNRESET] = LUSTRE_ECONNRESET, - [ENOBUFS] = LUSTRE_ENOBUFS, - [EISCONN] = LUSTRE_EISCONN, - [ENOTCONN] = LUSTRE_ENOTCONN, - [ESHUTDOWN] = LUSTRE_ESHUTDOWN, - [ETOOMANYREFS] = LUSTRE_ETOOMANYREFS, - [ETIMEDOUT] = LUSTRE_ETIMEDOUT, - [ECONNREFUSED] = LUSTRE_ECONNREFUSED, - [EHOSTDOWN] = LUSTRE_EHOSTDOWN, - [EHOSTUNREACH] = LUSTRE_EHOSTUNREACH, - [EALREADY] = LUSTRE_EALREADY, - [EINPROGRESS] = LUSTRE_EINPROGRESS, - [ESTALE] = LUSTRE_ESTALE, - [EUCLEAN] = LUSTRE_EUCLEAN, - 
[ENOTNAM] = LUSTRE_ENOTNAM, - [ENAVAIL] = LUSTRE_ENAVAIL, - [EISNAM] = LUSTRE_EISNAM, - [EREMOTEIO] = LUSTRE_EREMOTEIO, - [EDQUOT] = LUSTRE_EDQUOT, - [ENOMEDIUM] = LUSTRE_ENOMEDIUM, - [EMEDIUMTYPE] = LUSTRE_EMEDIUMTYPE, - [ECANCELED] = LUSTRE_ECANCELED, - [ENOKEY] = LUSTRE_ENOKEY, - [EKEYEXPIRED] = LUSTRE_EKEYEXPIRED, - [EKEYREVOKED] = LUSTRE_EKEYREVOKED, - [EKEYREJECTED] = LUSTRE_EKEYREJECTED, - [EOWNERDEAD] = LUSTRE_EOWNERDEAD, - [ENOTRECOVERABLE] = LUSTRE_ENOTRECOVERABLE, - [ERESTARTSYS] = LUSTRE_ERESTARTSYS, - [ERESTARTNOINTR] = LUSTRE_ERESTARTNOINTR, - [ERESTARTNOHAND] = LUSTRE_ERESTARTNOHAND, - [ENOIOCTLCMD] = LUSTRE_ENOIOCTLCMD, - [ERESTART_RESTARTBLOCK] = LUSTRE_ERESTART_RESTARTBLOCK, - [EBADHANDLE] = LUSTRE_EBADHANDLE, - [ENOTSYNC] = LUSTRE_ENOTSYNC, - [EBADCOOKIE] = LUSTRE_EBADCOOKIE, - [ENOTSUPP] = LUSTRE_ENOTSUPP, - [ETOOSMALL] = LUSTRE_ETOOSMALL, - [ESERVERFAULT] = LUSTRE_ESERVERFAULT, - [EBADTYPE] = LUSTRE_EBADTYPE, - [EJUKEBOX] = LUSTRE_EJUKEBOX, - [EIOCBQUEUED] = LUSTRE_EIOCBQUEUED, -}; - -static int lustre_errno_ntoh_mapping[] = { - [LUSTRE_EPERM] = EPERM, - [LUSTRE_ENOENT] = ENOENT, - [LUSTRE_ESRCH] = ESRCH, - [LUSTRE_EINTR] = EINTR, - [LUSTRE_EIO] = EIO, - [LUSTRE_ENXIO] = ENXIO, - [LUSTRE_E2BIG] = E2BIG, - [LUSTRE_ENOEXEC] = ENOEXEC, - [LUSTRE_EBADF] = EBADF, - [LUSTRE_ECHILD] = ECHILD, - [LUSTRE_EAGAIN] = EAGAIN, - [LUSTRE_ENOMEM] = ENOMEM, - [LUSTRE_EACCES] = EACCES, - [LUSTRE_EFAULT] = EFAULT, - [LUSTRE_ENOTBLK] = ENOTBLK, - [LUSTRE_EBUSY] = EBUSY, - [LUSTRE_EEXIST] = EEXIST, - [LUSTRE_EXDEV] = EXDEV, - [LUSTRE_ENODEV] = ENODEV, - [LUSTRE_ENOTDIR] = ENOTDIR, - [LUSTRE_EISDIR] = EISDIR, - [LUSTRE_EINVAL] = EINVAL, - [LUSTRE_ENFILE] = ENFILE, - [LUSTRE_EMFILE] = EMFILE, - [LUSTRE_ENOTTY] = ENOTTY, - [LUSTRE_ETXTBSY] = ETXTBSY, - [LUSTRE_EFBIG] = EFBIG, - [LUSTRE_ENOSPC] = ENOSPC, - [LUSTRE_ESPIPE] = ESPIPE, - [LUSTRE_EROFS] = EROFS, - [LUSTRE_EMLINK] = EMLINK, - [LUSTRE_EPIPE] = EPIPE, - [LUSTRE_EDOM] = EDOM, - [LUSTRE_ERANGE] = ERANGE, - [LUSTRE_EDEADLK] = EDEADLK, - [LUSTRE_ENAMETOOLONG] = ENAMETOOLONG, - [LUSTRE_ENOLCK] = ENOLCK, - [LUSTRE_ENOSYS] = ENOSYS, - [LUSTRE_ENOTEMPTY] = ENOTEMPTY, - [LUSTRE_ELOOP] = ELOOP, - [LUSTRE_ENOMSG] = ENOMSG, - [LUSTRE_EIDRM] = EIDRM, - [LUSTRE_ECHRNG] = ECHRNG, - [LUSTRE_EL2NSYNC] = EL2NSYNC, - [LUSTRE_EL3HLT] = EL3HLT, - [LUSTRE_EL3RST] = EL3RST, - [LUSTRE_ELNRNG] = ELNRNG, - [LUSTRE_EUNATCH] = EUNATCH, - [LUSTRE_ENOCSI] = ENOCSI, - [LUSTRE_EL2HLT] = EL2HLT, - [LUSTRE_EBADE] = EBADE, - [LUSTRE_EBADR] = EBADR, - [LUSTRE_EXFULL] = EXFULL, - [LUSTRE_ENOANO] = ENOANO, - [LUSTRE_EBADRQC] = EBADRQC, - [LUSTRE_EBADSLT] = EBADSLT, - [LUSTRE_EBFONT] = EBFONT, - [LUSTRE_ENOSTR] = ENOSTR, - [LUSTRE_ENODATA] = ENODATA, - [LUSTRE_ETIME] = ETIME, - [LUSTRE_ENOSR] = ENOSR, - [LUSTRE_ENONET] = ENONET, - [LUSTRE_ENOPKG] = ENOPKG, - [LUSTRE_EREMOTE] = EREMOTE, - [LUSTRE_ENOLINK] = ENOLINK, - [LUSTRE_EADV] = EADV, - [LUSTRE_ESRMNT] = ESRMNT, - [LUSTRE_ECOMM] = ECOMM, - [LUSTRE_EPROTO] = EPROTO, - [LUSTRE_EMULTIHOP] = EMULTIHOP, - [LUSTRE_EDOTDOT] = EDOTDOT, - [LUSTRE_EBADMSG] = EBADMSG, - [LUSTRE_EOVERFLOW] = EOVERFLOW, - [LUSTRE_ENOTUNIQ] = ENOTUNIQ, - [LUSTRE_EBADFD] = EBADFD, - [LUSTRE_EREMCHG] = EREMCHG, - [LUSTRE_ELIBACC] = ELIBACC, - [LUSTRE_ELIBBAD] = ELIBBAD, - [LUSTRE_ELIBSCN] = ELIBSCN, - [LUSTRE_ELIBMAX] = ELIBMAX, - [LUSTRE_ELIBEXEC] = ELIBEXEC, - [LUSTRE_EILSEQ] = EILSEQ, - [LUSTRE_ERESTART] = ERESTART, - [LUSTRE_ESTRPIPE] = ESTRPIPE, - [LUSTRE_EUSERS] = EUSERS, - [LUSTRE_ENOTSOCK] = ENOTSOCK, - [LUSTRE_EDESTADDRREQ] = EDESTADDRREQ, - 
[LUSTRE_EMSGSIZE] = EMSGSIZE, - [LUSTRE_EPROTOTYPE] = EPROTOTYPE, - [LUSTRE_ENOPROTOOPT] = ENOPROTOOPT, - [LUSTRE_EPROTONOSUPPORT] = EPROTONOSUPPORT, - [LUSTRE_ESOCKTNOSUPPORT] = ESOCKTNOSUPPORT, - [LUSTRE_EOPNOTSUPP] = EOPNOTSUPP, - [LUSTRE_EPFNOSUPPORT] = EPFNOSUPPORT, - [LUSTRE_EAFNOSUPPORT] = EAFNOSUPPORT, - [LUSTRE_EADDRINUSE] = EADDRINUSE, - [LUSTRE_EADDRNOTAVAIL] = EADDRNOTAVAIL, - [LUSTRE_ENETDOWN] = ENETDOWN, - [LUSTRE_ENETUNREACH] = ENETUNREACH, - [LUSTRE_ENETRESET] = ENETRESET, - [LUSTRE_ECONNABORTED] = ECONNABORTED, - [LUSTRE_ECONNRESET] = ECONNRESET, - [LUSTRE_ENOBUFS] = ENOBUFS, - [LUSTRE_EISCONN] = EISCONN, - [LUSTRE_ENOTCONN] = ENOTCONN, - [LUSTRE_ESHUTDOWN] = ESHUTDOWN, - [LUSTRE_ETOOMANYREFS] = ETOOMANYREFS, - [LUSTRE_ETIMEDOUT] = ETIMEDOUT, - [LUSTRE_ECONNREFUSED] = ECONNREFUSED, - [LUSTRE_EHOSTDOWN] = EHOSTDOWN, - [LUSTRE_EHOSTUNREACH] = EHOSTUNREACH, - [LUSTRE_EALREADY] = EALREADY, - [LUSTRE_EINPROGRESS] = EINPROGRESS, - [LUSTRE_ESTALE] = ESTALE, - [LUSTRE_EUCLEAN] = EUCLEAN, - [LUSTRE_ENOTNAM] = ENOTNAM, - [LUSTRE_ENAVAIL] = ENAVAIL, - [LUSTRE_EISNAM] = EISNAM, - [LUSTRE_EREMOTEIO] = EREMOTEIO, - [LUSTRE_EDQUOT] = EDQUOT, - [LUSTRE_ENOMEDIUM] = ENOMEDIUM, - [LUSTRE_EMEDIUMTYPE] = EMEDIUMTYPE, - [LUSTRE_ECANCELED] = ECANCELED, - [LUSTRE_ENOKEY] = ENOKEY, - [LUSTRE_EKEYEXPIRED] = EKEYEXPIRED, - [LUSTRE_EKEYREVOKED] = EKEYREVOKED, - [LUSTRE_EKEYREJECTED] = EKEYREJECTED, - [LUSTRE_EOWNERDEAD] = EOWNERDEAD, - [LUSTRE_ENOTRECOVERABLE] = ENOTRECOVERABLE, - [LUSTRE_ERESTARTSYS] = ERESTARTSYS, - [LUSTRE_ERESTARTNOINTR] = ERESTARTNOINTR, - [LUSTRE_ERESTARTNOHAND] = ERESTARTNOHAND, - [LUSTRE_ENOIOCTLCMD] = ENOIOCTLCMD, - [LUSTRE_ERESTART_RESTARTBLOCK] = ERESTART_RESTARTBLOCK, - [LUSTRE_EBADHANDLE] = EBADHANDLE, - [LUSTRE_ENOTSYNC] = ENOTSYNC, - [LUSTRE_EBADCOOKIE] = EBADCOOKIE, - [LUSTRE_ENOTSUPP] = ENOTSUPP, - [LUSTRE_ETOOSMALL] = ETOOSMALL, - [LUSTRE_ESERVERFAULT] = ESERVERFAULT, - [LUSTRE_EBADTYPE] = EBADTYPE, - [LUSTRE_EJUKEBOX] = EJUKEBOX, - [LUSTRE_EIOCBQUEUED] = EIOCBQUEUED, -}; - -unsigned int lustre_errno_hton(unsigned int h) -{ - unsigned int n; - - if (h == 0) { - n = 0; - } else if (h < ARRAY_SIZE(lustre_errno_hton_mapping)) { - n = lustre_errno_hton_mapping[h]; - if (n == 0) - goto generic; - } else { -generic: - /* - * A generic errno is better than the unknown one that could - * mean anything to a different host. - */ - n = LUSTRE_EIO; - } - - return n; -} -EXPORT_SYMBOL(lustre_errno_hton); - -unsigned int lustre_errno_ntoh(unsigned int n) -{ - unsigned int h; - - if (n == 0) { - h = 0; - } else if (n < ARRAY_SIZE(lustre_errno_ntoh_mapping)) { - h = lustre_errno_ntoh_mapping[n]; - if (h == 0) - goto generic; - } else { -generic: - /* - * Similar to the situation in lustre_errno_hton(), an unknown - * network errno could coincide with anything. Hence, it is - * better to return a generic errno. - */ - h = EIO; - } - - return h; -} -EXPORT_SYMBOL(lustre_errno_ntoh); diff --git a/drivers/staging/lustre/lustre/ptlrpc/events.c b/drivers/staging/lustre/lustre/ptlrpc/events.c deleted file mode 100644 index 130bacc2c891b77112d80b68b25e5aca2e635e14..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/events.c +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015 Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -# ifdef __mips64__ -# include -# endif - -#include -#include -#include -#include "ptlrpc_internal.h" - -struct lnet_handle_eq ptlrpc_eq_h; - -/* - * Client's outgoing request callback - */ -void request_out_callback(struct lnet_event *ev) -{ - struct ptlrpc_cb_id *cbid = ev->md.user_ptr; - struct ptlrpc_request *req = cbid->cbid_arg; - bool wakeup = false; - - LASSERT(ev->type == LNET_EVENT_SEND || ev->type == LNET_EVENT_UNLINK); - LASSERT(ev->unlinked); - - DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); - - sptlrpc_request_out_callback(req); - - spin_lock(&req->rq_lock); - req->rq_real_sent = ktime_get_real_seconds(); - req->rq_req_unlinked = 1; - /* reply_in_callback happened before request_out_callback? */ - if (req->rq_reply_unlinked) - wakeup = true; - - if (ev->type == LNET_EVENT_UNLINK || ev->status != 0) { - /* Failed send: make it seem like the reply timed out, just - * like failing sends in client.c does currently... - */ - req->rq_net_err = 1; - wakeup = true; - } - - if (wakeup) - ptlrpc_client_wake_req(req); - - spin_unlock(&req->rq_lock); - - ptlrpc_req_finished(req); -} - -/* - * Client's incoming reply callback - */ -void reply_in_callback(struct lnet_event *ev) -{ - struct ptlrpc_cb_id *cbid = ev->md.user_ptr; - struct ptlrpc_request *req = cbid->cbid_arg; - - DEBUG_REQ(D_NET, req, "type %d, status %d", ev->type, ev->status); - - LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); - LASSERT(ev->md.start == req->rq_repbuf); - LASSERT(ev->offset + ev->mlength <= req->rq_repbuf_len); - /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests - * for adaptive timeouts' early reply. 
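-	 *
-	 * With adaptive timeouts, an early reply is recognized just below
-	 * by a pair of conditions (sketch of the test that follows; `early'
-	 * is a hypothetical local):
-	 *
-	 *	early = ev->offset == 0 &&
-	 *		(lustre_msghdr_get_flags(req->rq_reqmsg) &
-	 *		 MSGHDR_AT_SUPPORT);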
- */ - LASSERT((ev->md.options & LNET_MD_MANAGE_REMOTE) != 0); - - spin_lock(&req->rq_lock); - - req->rq_receiving_reply = 0; - req->rq_early = 0; - if (ev->unlinked) - req->rq_reply_unlinked = 1; - - if (ev->status) - goto out_wake; - - if (ev->type == LNET_EVENT_UNLINK) { - LASSERT(ev->unlinked); - DEBUG_REQ(D_NET, req, "unlink"); - goto out_wake; - } - - if (ev->mlength < ev->rlength) { - CDEBUG(D_RPCTRACE, "truncate req %p rpc %d - %d+%d\n", req, - req->rq_replen, ev->rlength, ev->offset); - req->rq_reply_truncated = 1; - req->rq_replied = 1; - req->rq_status = -EOVERFLOW; - req->rq_nob_received = ev->rlength + ev->offset; - goto out_wake; - } - - if ((ev->offset == 0) && - ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT))) { - /* Early reply */ - DEBUG_REQ(D_ADAPTTO, req, - "Early reply received: mlen=%u offset=%d replen=%d replied=%d unlinked=%d", - ev->mlength, ev->offset, - req->rq_replen, req->rq_replied, ev->unlinked); - - req->rq_early_count++; /* number received, client side */ - - /* already got the real reply or buffers are already unlinked */ - if (req->rq_replied || req->rq_reply_unlinked == 1) - goto out_wake; - - req->rq_early = 1; - req->rq_reply_off = ev->offset; - req->rq_nob_received = ev->mlength; - /* And we're still receiving */ - req->rq_receiving_reply = 1; - } else { - /* Real reply */ - req->rq_rep_swab_mask = 0; - req->rq_replied = 1; - /* Got reply, no resend required */ - req->rq_resend = 0; - req->rq_reply_off = ev->offset; - req->rq_nob_received = ev->mlength; - /* LNetMDUnlink can't be called under the LNET_LOCK, - * so we must unlink in ptlrpc_unregister_reply - */ - DEBUG_REQ(D_INFO, req, - "reply in flags=%x mlen=%u offset=%d replen=%d", - lustre_msg_get_flags(req->rq_reqmsg), - ev->mlength, ev->offset, req->rq_replen); - } - - req->rq_import->imp_last_reply_time = ktime_get_real_seconds(); - -out_wake: - /* NB don't unlock till after wakeup; req can disappear under us - * since we don't have our own ref - */ - ptlrpc_client_wake_req(req); - spin_unlock(&req->rq_lock); -} - -/* - * Client's bulk has been written/read - */ -void client_bulk_callback(struct lnet_event *ev) -{ - struct ptlrpc_cb_id *cbid = ev->md.user_ptr; - struct ptlrpc_bulk_desc *desc = cbid->cbid_arg; - struct ptlrpc_request *req; - - LASSERT((ptlrpc_is_bulk_put_sink(desc->bd_type) && - ev->type == LNET_EVENT_PUT) || - (ptlrpc_is_bulk_get_source(desc->bd_type) && - ev->type == LNET_EVENT_GET) || - ev->type == LNET_EVENT_UNLINK); - LASSERT(ev->unlinked); - - if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE)) - ev->status = -EIO; - - if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2, - CFS_FAIL_ONCE)) - ev->status = -EIO; - - CDEBUG((ev->status == 0) ? 
D_NET : D_ERROR, - "event type %d, status %d, desc %p\n", - ev->type, ev->status, desc); - - spin_lock(&desc->bd_lock); - req = desc->bd_req; - LASSERT(desc->bd_md_count > 0); - desc->bd_md_count--; - - if (ev->type != LNET_EVENT_UNLINK && ev->status == 0) { - desc->bd_nob_transferred += ev->mlength; - desc->bd_sender = ev->sender; - } else { - /* start reconnect and resend if network error hit */ - spin_lock(&req->rq_lock); - req->rq_net_err = 1; - spin_unlock(&req->rq_lock); - } - - if (ev->status != 0) - desc->bd_failure = 1; - - /* NB don't unlock till after wakeup; desc can disappear under us - * otherwise - */ - if (desc->bd_md_count == 0) - ptlrpc_client_wake_req(desc->bd_req); - - spin_unlock(&desc->bd_lock); -} - -/* - * We will have percpt request history list for ptlrpc service in upcoming - * patches because we don't want to be serialized by current per-service - * history operations. So we require that the history ID can (somehow) show - * arrival order without grabbing a global lock, and users can sort them in - * userspace. - * - * This is how we generate history ID for ptlrpc_request: - * ---------------------------------------------------- - * | 32 bits | 16 bits | (16 - X)bits | X bits | - * ---------------------------------------------------- - * | seconds | usec / 16 | sequence | CPT id | - * ---------------------------------------------------- - * - * it might not be precise but should be good enough. - */ - -#define REQS_CPT_BITS(svcpt) ((svcpt)->scp_service->srv_cpt_bits) - -#define REQS_SEC_SHIFT 32 -#define REQS_USEC_SHIFT 16 -#define REQS_SEQ_SHIFT(svcpt) REQS_CPT_BITS(svcpt) - -static void ptlrpc_req_add_history(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req) -{ - __u64 sec = req->rq_arrival_time.tv_sec; - __u32 usec = req->rq_arrival_time.tv_nsec / NSEC_PER_USEC / 16; /* usec / 16 */ - __u64 new_seq; - - /* set sequence ID for request and add it to history list, - * it must be called while holding svcpt::scp_lock - */ - - new_seq = (sec << REQS_SEC_SHIFT) | - (usec << REQS_USEC_SHIFT) | - (svcpt->scp_cpt < 0 ? 0 : svcpt->scp_cpt); - - if (new_seq > svcpt->scp_hist_seq) { - /* This handles the initial case of scp_hist_seq == 0 or - * we just jumped into a new time window - */ - svcpt->scp_hist_seq = new_seq; - } else { - LASSERT(REQS_SEQ_SHIFT(svcpt) < REQS_USEC_SHIFT); - /* NB: increase sequence number in current usec bucket, - * however, it's possible that we used up all bits for - * sequence and jumped into the next usec bucket (future time), - * then we hope there will be fewer RPCs per bucket at some - * point, and sequence will catch up again - */ - svcpt->scp_hist_seq += (1ULL << REQS_SEQ_SHIFT(svcpt)); - new_seq = svcpt->scp_hist_seq; - } - - req->rq_history_seq = new_seq; - - list_add_tail(&req->rq_history_list, &svcpt->scp_hist_reqs); -} - -/* - * Server's incoming request callback - */ -void request_in_callback(struct lnet_event *ev) -{ - struct ptlrpc_cb_id *cbid = ev->md.user_ptr; - struct ptlrpc_request_buffer_desc *rqbd = cbid->cbid_arg; - struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; - struct ptlrpc_service *service = svcpt->scp_service; - struct ptlrpc_request *req; - - LASSERT(ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT((char *)ev->md.start >= rqbd->rqbd_buffer); - LASSERT((char *)ev->md.start + ev->offset + ev->mlength <= - rqbd->rqbd_buffer + service->srv_buf_size); - - CDEBUG((ev->status == 0) ?
D_NET : D_ERROR, - "event type %d, status %d, service %s\n", - ev->type, ev->status, service->srv_name); - - if (ev->unlinked) { - /* If this is the last request message to fit in the - * request buffer we can use the request object embedded in - * rqbd. Note that if we failed to allocate a request, - * we'd have to re-post the rqbd, which we can't do in this - * context. - */ - req = &rqbd->rqbd_req; - memset(req, 0, sizeof(*req)); - } else { - LASSERT(ev->type == LNET_EVENT_PUT); - if (ev->status != 0) { - /* We moaned above already... */ - return; - } - req = ptlrpc_request_cache_alloc(GFP_ATOMIC); - if (!req) { - CERROR("Can't allocate incoming request descriptor: Dropping %s RPC from %s\n", - service->srv_name, - libcfs_id2str(ev->initiator)); - return; - } - } - - ptlrpc_srv_req_init(req); - /* NB we ABSOLUTELY RELY on req being zeroed, so pointers are NULL, - * flags are reset and scalars are zero. We only set the message - * size to non-zero if this was a successful receive. - */ - req->rq_xid = ev->match_bits; - req->rq_reqbuf = ev->md.start + ev->offset; - if (ev->type == LNET_EVENT_PUT && ev->status == 0) - req->rq_reqdata_len = ev->mlength; - ktime_get_real_ts64(&req->rq_arrival_time); - req->rq_peer = ev->initiator; - req->rq_self = ev->target.nid; - req->rq_rqbd = rqbd; - req->rq_phase = RQ_PHASE_NEW; - if (ev->type == LNET_EVENT_PUT) - CDEBUG(D_INFO, "incoming req@%p x%llu msgsize %u\n", - req, req->rq_xid, ev->mlength); - - CDEBUG(D_RPCTRACE, "peer: %s\n", libcfs_id2str(req->rq_peer)); - - spin_lock(&svcpt->scp_lock); - - ptlrpc_req_add_history(svcpt, req); - - if (ev->unlinked) { - svcpt->scp_nrqbds_posted--; - CDEBUG(D_INFO, "Buffer complete: %d buffers still posted\n", - svcpt->scp_nrqbds_posted); - - /* Normally, don't complain about 0 buffers posted; LNET won't - * drop incoming reqs since we set the portal lazy - */ - if (test_req_buffer_pressure && - ev->type != LNET_EVENT_UNLINK && - svcpt->scp_nrqbds_posted == 0) - CWARN("All %s request buffers busy\n", - service->srv_name); - - /* req takes over the network's ref on rqbd */ - } else { - /* req takes a ref on rqbd */ - rqbd->rqbd_refcount++; - } - - list_add_tail(&req->rq_list, &svcpt->scp_req_incoming); - svcpt->scp_nreqs_incoming++; - - /* NB everything can disappear under us once the request - * has been queued and we unlock, so do the wake now... - */ - wake_up(&svcpt->scp_waitq); - - spin_unlock(&svcpt->scp_lock); -} - -/* - * Server's outgoing reply callback - */ -void reply_out_callback(struct lnet_event *ev) -{ - struct ptlrpc_cb_id *cbid = ev->md.user_ptr; - struct ptlrpc_reply_state *rs = cbid->cbid_arg; - struct ptlrpc_service_part *svcpt = rs->rs_svcpt; - - LASSERT(ev->type == LNET_EVENT_SEND || - ev->type == LNET_EVENT_ACK || - ev->type == LNET_EVENT_UNLINK); - - if (!rs->rs_difficult) { - /* 'Easy' replies have no further processing so I drop the - * net's ref on 'rs' - */ - LASSERT(ev->unlinked); - ptlrpc_rs_decref(rs); - return; - } - - LASSERT(rs->rs_on_net); - - if (ev->unlinked) { - /* Last network callback. 
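(Aside: the history-ID layout documented before ptlrpc_req_add_history() above packs seconds, microseconds/16, a sequence number, and the CPT id into one sortable 64-bit value. A minimal sketch of that composition, using hypothetical names; the real code additionally keeps the value monotonic by bumping the sequence field when two requests land in the same usec bucket.)

#include <stdint.h>

#define HIST_SEC_SHIFT  32
#define HIST_USEC_SHIFT 16

/* seq must fit in (16 - cpt_bits) bits, cpt in cpt_bits bits. */
static uint64_t hist_id(uint64_t sec, uint32_t usec,
			uint32_t seq, uint32_t cpt, unsigned int cpt_bits)
{
	return (sec << HIST_SEC_SHIFT) |
	       ((uint64_t)(usec / 16) << HIST_USEC_SHIFT) |
	       ((uint64_t)seq << cpt_bits) | cpt;
}

Dividing microseconds by 16 keeps the largest value (999999 / 16 = 62499) inside the 16-bit field, which is why the odd-looking divisor appears in the layout.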
The net's ref on 'rs' stays put - * until ptlrpc_handle_rs() is done with it - */ - spin_lock(&svcpt->scp_rep_lock); - spin_lock(&rs->rs_lock); - - rs->rs_on_net = 0; - if (!rs->rs_no_ack || - rs->rs_transno <= - rs->rs_export->exp_obd->obd_last_committed || - list_empty(&rs->rs_obd_list)) - ptlrpc_schedule_difficult_reply(rs); - - spin_unlock(&rs->rs_lock); - spin_unlock(&svcpt->scp_rep_lock); - } -} - -static void ptlrpc_master_callback(struct lnet_event *ev) -{ - struct ptlrpc_cb_id *cbid = ev->md.user_ptr; - void (*callback)(struct lnet_event *ev) = cbid->cbid_fn; - - /* Honestly, it's best to find out early. */ - LASSERT(cbid->cbid_arg != LP_POISON); - LASSERT(callback == request_out_callback || - callback == reply_in_callback || - callback == client_bulk_callback || - callback == request_in_callback || - callback == reply_out_callback); - - callback(ev); -} - -int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, - struct lnet_process_id *peer, lnet_nid_t *self) -{ - int best_dist = 0; - __u32 best_order = 0; - int count = 0; - int rc = -ENOENT; - int dist; - __u32 order; - lnet_nid_t dst_nid; - lnet_nid_t src_nid; - - peer->pid = LNET_PID_LUSTRE; - - /* Choose the matching UUID that's closest */ - while (lustre_uuid_to_peer(uuid->uuid, &dst_nid, count++) == 0) { - dist = LNetDist(dst_nid, &src_nid, &order); - if (dist < 0) - continue; - - if (dist == 0) { /* local! use loopback LND */ - peer->nid = *self = LNET_MKNID(LNET_MKNET(LOLND, 0), 0); - rc = 0; - break; - } - - if (rc < 0 || - dist < best_dist || - (dist == best_dist && order < best_order)) { - best_dist = dist; - best_order = order; - - peer->nid = dst_nid; - *self = src_nid; - rc = 0; - } - } - - CDEBUG(D_NET, "%s->%s\n", uuid->uuid, libcfs_id2str(*peer)); - return rc; -} - -static void ptlrpc_ni_fini(void) -{ - int rc; - int retries; - - /* Wait for the event queue to become idle since there may still be - * messages in flight with pending events (i.e. the fire-and-forget - * messages == client requests and "non-difficult" server - * replies - */ - - for (retries = 0;; retries++) { - rc = LNetEQFree(ptlrpc_eq_h); - switch (rc) { - default: - LBUG(); - - case 0: - LNetNIFini(); - return; - - case -EBUSY: - if (retries != 0) - CWARN("Event queue still busy\n"); - - schedule_timeout_uninterruptible(2 * HZ); - break; - } - } - /* notreached */ -} - -static lnet_pid_t ptl_get_pid(void) -{ - lnet_pid_t pid; - - pid = LNET_PID_LUSTRE; - return pid; -} - -static int ptlrpc_ni_init(void) -{ - int rc; - lnet_pid_t pid; - - pid = ptl_get_pid(); - CDEBUG(D_NET, "My pid is: %x\n", pid); - - /* We're not passing any limits yet... */ - rc = LNetNIInit(pid); - if (rc < 0) { - CDEBUG(D_NET, "Can't init network interface: %d\n", rc); - return rc; - } - - /* CAVEAT EMPTOR: how we process portals events is _radically_ - * different depending on... - */ - /* kernel LNet calls our master callback when there are new event, - * because we are guaranteed to get every event via callback, - * so we just set EQ size to 0 to avoid overhead of serializing - * enqueue/dequeue operations in LNet. 
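(Aside: ptlrpc_master_callback() above shows the dispatch pattern this file is built on: every memory descriptor carries a ptlrpc_cb_id as its user pointer, and one master callback fans out to the registered handler. A bare-bones sketch of the same pattern with hypothetical types:)

struct ev;  /* stand-in for struct lnet_event */

struct cb_id {
	void (*fn)(struct ev *ev);  /* handler to fan out to */
	void *arg;                  /* handler's private state */
};

struct ev {
	struct cb_id *user_ptr;     /* attached when the MD is created */
};

static void master_callback(struct ev *ev)
{
	struct cb_id *cbid = ev->user_ptr;

	cbid->fn(ev);  /* single dispatch point for all event kinds */
}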
- */ - rc = LNetEQAlloc(0, ptlrpc_master_callback, &ptlrpc_eq_h); - if (rc == 0) - return 0; - - CERROR("Failed to allocate event queue: %d\n", rc); - LNetNIFini(); - - return rc; -} - -int ptlrpc_init_portals(void) -{ - int rc = ptlrpc_ni_init(); - - if (rc != 0) { - CERROR("network initialisation failed\n"); - return rc; - } - rc = ptlrpcd_addref(); - if (rc == 0) - return 0; - - CERROR("rpcd initialisation failed\n"); - ptlrpc_ni_fini(); - return rc; -} - -void ptlrpc_exit_portals(void) -{ - ptlrpcd_decref(); - ptlrpc_ni_fini(); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/import.c b/drivers/staging/lustre/lustre/ptlrpc/import.c deleted file mode 100644 index 1a0f35dfab97ee6599aefdc48b7ed6eaeee424f8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/import.c +++ /dev/null @@ -1,1677 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/import.c - * - * Author: Mike Shaver - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -struct ptlrpc_connect_async_args { - __u64 pcaa_peer_committed; - int pcaa_initial_connect; -}; - -/** - * Updates import \a imp current state to provided \a state value - * Helper function. Must be called under imp_lock. - */ -static void __import_set_state(struct obd_import *imp, - enum lustre_imp_state state) -{ - switch (state) { - case LUSTRE_IMP_CLOSED: - case LUSTRE_IMP_NEW: - case LUSTRE_IMP_DISCON: - case LUSTRE_IMP_CONNECTING: - break; - case LUSTRE_IMP_REPLAY_WAIT: - imp->imp_replay_state = LUSTRE_IMP_REPLAY_LOCKS; - break; - default: - imp->imp_replay_state = LUSTRE_IMP_REPLAY; - } - - imp->imp_state = state; - imp->imp_state_hist[imp->imp_state_hist_idx].ish_state = state; - imp->imp_state_hist[imp->imp_state_hist_idx].ish_time = - ktime_get_real_seconds(); - imp->imp_state_hist_idx = (imp->imp_state_hist_idx + 1) % - IMP_STATE_HIST_LEN; -} - -/* A CLOSED import should remain so. 
*/ -#define IMPORT_SET_STATE_NOLOCK(imp, state) \ -do { \ - if (imp->imp_state != LUSTRE_IMP_CLOSED) { \ - CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \ - imp, obd2cli_tgt(imp->imp_obd), \ - ptlrpc_import_state_name(imp->imp_state), \ - ptlrpc_import_state_name(state)); \ - __import_set_state(imp, state); \ - } \ -} while (0) - -#define IMPORT_SET_STATE(imp, state) \ -do { \ - spin_lock(&imp->imp_lock); \ - IMPORT_SET_STATE_NOLOCK(imp, state); \ - spin_unlock(&imp->imp_lock); \ -} while (0) - -static int ptlrpc_connect_interpret(const struct lu_env *env, - struct ptlrpc_request *request, - void *data, int rc); -int ptlrpc_import_recovery_state_machine(struct obd_import *imp); - -/* Only this function is allowed to change the import state when it is - * CLOSED. I would rather refcount the import and free it after - * disconnection like we do with exports. To do that, the client_obd - * will need to save the peer info somewhere other than in the import, - * though. - */ -int ptlrpc_init_import(struct obd_import *imp) -{ - spin_lock(&imp->imp_lock); - - imp->imp_generation++; - imp->imp_state = LUSTRE_IMP_NEW; - - spin_unlock(&imp->imp_lock); - - return 0; -} -EXPORT_SYMBOL(ptlrpc_init_import); - -#define UUID_STR "_UUID" -static void deuuidify(char *uuid, const char *prefix, char **uuid_start, - int *uuid_len) -{ - *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix)) - ? uuid : uuid + strlen(prefix); - - *uuid_len = strlen(*uuid_start); - - if (*uuid_len < strlen(UUID_STR)) - return; - - if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR), - UUID_STR, strlen(UUID_STR))) - *uuid_len -= strlen(UUID_STR); -} - -/** - * Returns true if import was FULL, false if import was already not - * connected. - * @imp - import to be disconnected - * @conn_cnt - connection count (epoch) of the request that timed out - * and caused the disconnection. In some cases, multiple - * inflight requests can fail to a single target (e.g. OST - * bulk requests) and if one has already caused a reconnection - * (increasing the import->conn_cnt) the older failure should - * not also cause a reconnection. If zero it forces a reconnect. - */ -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) -{ - int rc = 0; - - spin_lock(&imp->imp_lock); - - if (imp->imp_state == LUSTRE_IMP_FULL && - (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) { - char *target_start; - int target_len; - - deuuidify(obd2cli_tgt(imp->imp_obd), NULL, - &target_start, &target_len); - - if (imp->imp_replayable) { - LCONSOLE_WARN("%s: Connection to %.*s (at %s) was lost; in progress operations using this service will wait for recovery to complete\n", - imp->imp_obd->obd_name, target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } else { - LCONSOLE_ERROR_MSG(0x166, "%s: Connection to %.*s (at %s) was lost; in progress operations using this service will fail\n", - imp->imp_obd->obd_name, - target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); - spin_unlock(&imp->imp_lock); - - if (obd_dump_on_timeout) - libcfs_debug_dumplog(); - - obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON); - rc = 1; - } else { - spin_unlock(&imp->imp_lock); - CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n", - imp->imp_client->cli_name, imp, - (imp->imp_state == LUSTRE_IMP_FULL && - imp->imp_conn_cnt > conn_cnt) ? 
- "reconnected" : "not connected", imp->imp_conn_cnt, - conn_cnt, ptlrpc_import_state_name(imp->imp_state)); - } - - return rc; -} - -/* - * This acts as a barrier; all existing requests are rejected, and - * no new requests will be accepted until the import is valid again. - */ -void ptlrpc_deactivate_import(struct obd_import *imp) -{ - CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd)); - - spin_lock(&imp->imp_lock); - imp->imp_invalid = 1; - imp->imp_generation++; - spin_unlock(&imp->imp_lock); - - ptlrpc_abort_inflight(imp); - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); -} -EXPORT_SYMBOL(ptlrpc_deactivate_import); - -static unsigned int -ptlrpc_inflight_deadline(struct ptlrpc_request *req, time64_t now) -{ - long dl; - - if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || - (req->rq_phase == RQ_PHASE_BULK) || - (req->rq_phase == RQ_PHASE_NEW))) - return 0; - - if (req->rq_timedout) - return 0; - - if (req->rq_phase == RQ_PHASE_NEW) - dl = req->rq_sent; - else - dl = req->rq_deadline; - - if (dl <= now) - return 0; - - return dl - now; -} - -static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) -{ - time64_t now = ktime_get_real_seconds(); - struct ptlrpc_request *req, *n; - unsigned int timeout = 0; - - spin_lock(&imp->imp_lock); - list_for_each_entry_safe(req, n, &imp->imp_sending_list, rq_list) - timeout = max(ptlrpc_inflight_deadline(req, now), timeout); - - spin_unlock(&imp->imp_lock); - return timeout; -} - -/** - * This function will invalidate the import, if necessary, then block - * for all the RPC completions, and finally notify the obd to - * invalidate its state (ie cancel locks, clear pending requests, - * etc). - */ -void ptlrpc_invalidate_import(struct obd_import *imp) -{ - struct ptlrpc_request *req, *n; - unsigned int timeout; - int rc; - - atomic_inc(&imp->imp_inval_count); - - if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) - ptlrpc_deactivate_import(imp); - - CFS_FAIL_TIMEOUT(OBD_FAIL_MGS_CONNECT_NET, 3 * cfs_fail_val / 2); - LASSERT(imp->imp_invalid); - - /* Wait forever until inflight == 0. We really can't do it another - * way because in some cases we need to wait for very long reply - * unlink. We can't do anything before that because there is really - * no guarantee that some rdma transfer is not in progress right now. - */ - do { - /* Calculate max timeout for waiting on rpcs to error - * out. Use obd_timeout if calculated value is smaller - * than it. - */ - if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { - timeout = ptlrpc_inflight_timeout(imp); - timeout += timeout / 3; - - if (timeout == 0) - timeout = obd_timeout; - } else { - /* decrease the interval to increase race condition */ - timeout = 1; - } - - CDEBUG(D_RPCTRACE, - "Sleeping %d sec for inflight to error out\n", - timeout); - - /* Wait for all requests to error out and call completion - * callbacks. Cap it at obd_timeout -- these should all - * have been locally cancelled by ptlrpc_abort_inflight. 
- */ - rc = wait_event_idle_timeout(imp->imp_recovery_waitq, - atomic_read(&imp->imp_inflight) == 0, - obd_timeout * HZ); - - if (rc == 0) { - const char *cli_tgt = obd2cli_tgt(imp->imp_obd); - - CERROR("%s: timeout waiting for callback (%d != 0)\n", - cli_tgt, - atomic_read(&imp->imp_inflight)); - - spin_lock(&imp->imp_lock); - if (atomic_read(&imp->imp_inflight) == 0) { - int count = atomic_read(&imp->imp_unregistering); - - /* We know that "unregistering" rpcs only can - * survive in sending or delaying lists (they - * may be waiting for a long reply unlink on - * sluggish nets). Let's check this. If there - * is no inflight and unregistering != 0, this - * is a bug. - */ - LASSERTF(count == 0, "Some RPCs are still unregistering: %d\n", - count); - - /* Let's save one loop as soon as inflight has - * dropped to zero. No new inflights possible at - * this point. - */ - rc = 0; - } else { - list_for_each_entry_safe(req, n, - &imp->imp_sending_list, rq_list) { - DEBUG_REQ(D_ERROR, req, - "still on sending list"); - } - list_for_each_entry_safe(req, n, - &imp->imp_delayed_list, rq_list) { - DEBUG_REQ(D_ERROR, req, - "still on delayed list"); - } - - CERROR("%s: Unregistering RPCs found (%d). Network is sluggish? Waiting for them to error out.\n", - cli_tgt, - atomic_read(&imp-> - imp_unregistering)); - } - spin_unlock(&imp->imp_lock); - } - } while (rc == 0); - - /* - * Let's additionally check that no new RPCs were added to the import in - * "invalidate" state. - */ - LASSERT(atomic_read(&imp->imp_inflight) == 0); - obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); - sptlrpc_import_flush_all_ctx(imp); - - atomic_dec(&imp->imp_inval_count); - wake_up_all(&imp->imp_recovery_waitq); -} -EXPORT_SYMBOL(ptlrpc_invalidate_import); - -/* unset imp_invalid */ -void ptlrpc_activate_import(struct obd_import *imp) -{ - struct obd_device *obd = imp->imp_obd; - - spin_lock(&imp->imp_lock); - if (imp->imp_deactive != 0) { - spin_unlock(&imp->imp_lock); - return; - } - - imp->imp_invalid = 0; - spin_unlock(&imp->imp_lock); - obd_import_event(obd, imp, IMP_EVENT_ACTIVE); -} -EXPORT_SYMBOL(ptlrpc_activate_import); - -void ptlrpc_pinger_force(struct obd_import *imp) -{ - CDEBUG(D_HA, "%s: waking up pinger s:%s\n", obd2cli_tgt(imp->imp_obd), - ptlrpc_import_state_name(imp->imp_state)); - - spin_lock(&imp->imp_lock); - imp->imp_force_verify = 1; - spin_unlock(&imp->imp_lock); - - if (imp->imp_state != LUSTRE_IMP_CONNECTING) - ptlrpc_pinger_wake_up(); -} -EXPORT_SYMBOL(ptlrpc_pinger_force); - -void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt) -{ - LASSERT(!imp->imp_dlm_fake); - - if (ptlrpc_set_import_discon(imp, conn_cnt)) { - if (!imp->imp_replayable) { - CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import(imp); - } - - ptlrpc_pinger_force(imp); - } -} - -int ptlrpc_reconnect_import(struct obd_import *imp) -{ - int rc; - - ptlrpc_pinger_force(imp); - - CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", - obd2cli_tgt(imp->imp_obd), obd_timeout); - - rc = wait_event_idle_timeout(imp->imp_recovery_waitq, - !ptlrpc_import_in_recovery(imp), - obd_timeout * HZ); - CDEBUG(D_HA, "%s: recovery finished s:%s\n", obd2cli_tgt(imp->imp_obd), - ptlrpc_import_state_name(imp->imp_state)); - return rc == 0 ? -ETIMEDOUT : 0; -} -EXPORT_SYMBOL(ptlrpc_reconnect_import); - -/** - * Connection on import \a imp is changed to another one (if more than one is - * present).
We typically choose the connection that we have not tried to connect to - * for the longest time - */ -static int import_select_connection(struct obd_import *imp) -{ - struct obd_import_conn *imp_conn = NULL, *conn; - struct obd_export *dlmexp; - char *target_start; - int target_len, tried_all = 1; - - spin_lock(&imp->imp_lock); - - if (list_empty(&imp->imp_conn_list)) { - CERROR("%s: no connections available\n", - imp->imp_obd->obd_name); - spin_unlock(&imp->imp_lock); - return -EINVAL; - } - - list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { - CDEBUG(D_HA, "%s: connect to NID %s last attempt %llu\n", - imp->imp_obd->obd_name, - libcfs_nid2str(conn->oic_conn->c_peer.nid), - conn->oic_last_attempt); - - /* If we have not tried this connection since - * the last successful attempt, go with this one - */ - if ((conn->oic_last_attempt == 0) || - time_before_eq64(conn->oic_last_attempt, - imp->imp_last_success_conn)) { - imp_conn = conn; - tried_all = 0; - break; - } - - /* If all of the connections have already been tried - * since the last successful connection, just choose the - * least recently used - */ - if (!imp_conn) - imp_conn = conn; - else if (time_before64(conn->oic_last_attempt, - imp_conn->oic_last_attempt)) - imp_conn = conn; - } - - /* if not found, simply choose the current one */ - if (!imp_conn || imp->imp_force_reconnect) { - LASSERT(imp->imp_conn_current); - imp_conn = imp->imp_conn_current; - tried_all = 0; - } - LASSERT(imp_conn->oic_conn); - - /* If we've tried everything, and we're back to the beginning of the - * list, increase our timeout and try again. It will be reset when - * we do finally connect. (FIXME: really we should wait for all network - * state associated with the last connection attempt to drain before - * trying to reconnect on it.)
- */ - if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { - struct adaptive_timeout *at = &imp->imp_at.iat_net_latency; - - if (at_get(at) < CONNECTION_SWITCH_MAX) { - at_measured(at, at_get(at) + CONNECTION_SWITCH_INC); - if (at_get(at) > CONNECTION_SWITCH_MAX) - at_reset(at, CONNECTION_SWITCH_MAX); - } - LASSERT(imp_conn->oic_last_attempt); - CDEBUG(D_HA, "%s: tried all connections, increasing latency to %ds\n", - imp->imp_obd->obd_name, at_get(at)); - } - - imp_conn->oic_last_attempt = get_jiffies_64(); - - /* switch connection, don't mind if it's same as the current one */ - ptlrpc_connection_put(imp->imp_connection); - imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); - - dlmexp = class_conn2export(&imp->imp_dlm_handle); - ptlrpc_connection_put(dlmexp->exp_connection); - dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn); - class_export_put(dlmexp); - - if (imp->imp_conn_current != imp_conn) { - if (imp->imp_conn_current) { - deuuidify(obd2cli_tgt(imp->imp_obd), NULL, - &target_start, &target_len); - - CDEBUG(D_HA, "%s: Connection changing to %.*s (at %s)\n", - imp->imp_obd->obd_name, - target_len, target_start, - libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); - } - - imp->imp_conn_current = imp_conn; - } - - CDEBUG(D_HA, "%s: import %p using connection %s/%s\n", - imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid, - libcfs_nid2str(imp_conn->oic_conn->c_peer.nid)); - - spin_unlock(&imp->imp_lock); - - return 0; -} - -/* - * must be called under imp_lock - */ -static int ptlrpc_first_transno(struct obd_import *imp, __u64 *transno) -{ - struct ptlrpc_request *req; - - /* The requests in committed_list always have smaller transnos than - * the requests in replay_list - */ - if (!list_empty(&imp->imp_committed_list)) { - req = list_first_entry(&imp->imp_committed_list, - struct ptlrpc_request, rq_replay_list); - *transno = req->rq_transno; - if (req->rq_transno == 0) { - DEBUG_REQ(D_ERROR, req, - "zero transno in committed_list"); - LBUG(); - } - return 1; - } - if (!list_empty(&imp->imp_replay_list)) { - req = list_first_entry(&imp->imp_replay_list, - struct ptlrpc_request, rq_replay_list); - *transno = req->rq_transno; - if (req->rq_transno == 0) { - DEBUG_REQ(D_ERROR, req, "zero transno in replay_list"); - LBUG(); - } - return 1; - } - return 0; -} - -/** - * Attempt to (re)connect import \a imp. This includes all preparations, - * initializing CONNECT RPC request and passing it to ptlrpcd for - * actual sending. - * Returns 0 on success or error code. 
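(Aside: when import_select_connection() above finds that every connection has been tried, it inflates the net-latency adaptive-timeout estimate by CONNECTION_SWITCH_INC up to CONNECTION_SWITCH_MAX, slowing the retry loop. A toy version of that capped backoff; the constants are hypothetical, not the removed ones:)

enum { SWITCH_INC = 5, SWITCH_MAX = 50 };  /* seconds, illustrative only */

static unsigned int bump_net_latency(unsigned int cur)
{
	if (cur < SWITCH_MAX) {
		cur += SWITCH_INC;         /* back off a little further */
		if (cur > SWITCH_MAX)
			cur = SWITCH_MAX;  /* but never past the cap */
	}
	return cur;
}

For example, starting from 5s the estimate walks 10, 15, ... 50 and then stays there until a successful connect resets it.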
- */ -int ptlrpc_connect_import(struct obd_import *imp) -{ - struct obd_device *obd = imp->imp_obd; - int initial_connect = 0; - int set_transno = 0; - __u64 committed_before_reconnect = 0; - struct ptlrpc_request *request; - char *bufs[] = { NULL, - obd2cli_tgt(imp->imp_obd), - obd->obd_uuid.uuid, - (char *)&imp->imp_dlm_handle, - (char *)&imp->imp_connect_data }; - struct ptlrpc_connect_async_args *aa; - int rc; - - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_CLOSED) { - spin_unlock(&imp->imp_lock); - CERROR("can't connect to a closed import\n"); - return -EINVAL; - } else if (imp->imp_state == LUSTRE_IMP_FULL) { - spin_unlock(&imp->imp_lock); - CERROR("already connected\n"); - return 0; - } else if (imp->imp_state == LUSTRE_IMP_CONNECTING || - imp->imp_connected) { - spin_unlock(&imp->imp_lock); - CERROR("already connecting\n"); - return -EALREADY; - } - - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); - - imp->imp_conn_cnt++; - imp->imp_resend_replay = 0; - - if (!lustre_handle_is_used(&imp->imp_remote_handle)) - initial_connect = 1; - else - committed_before_reconnect = imp->imp_peer_committed_transno; - - set_transno = ptlrpc_first_transno(imp, - &imp->imp_connect_data.ocd_transno); - spin_unlock(&imp->imp_lock); - - rc = import_select_connection(imp); - if (rc) - goto out; - - rc = sptlrpc_import_sec_adapt(imp, NULL, NULL); - if (rc) - goto out; - - /* Reset connect flags to the originally requested flags, in case - * the server is updated on-the-fly we will get the new features. - */ - imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig; - /* Reset ocd_version each time so the server knows the exact versions */ - imp->imp_connect_data.ocd_version = LUSTRE_VERSION_CODE; - imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; - imp->imp_msghdr_flags &= ~MSGHDR_CKSUM_INCOMPAT18; - - rc = obd_reconnect(NULL, imp->imp_obd->obd_self_export, obd, - &obd->obd_uuid, &imp->imp_connect_data, NULL); - if (rc) - goto out; - - request = ptlrpc_request_alloc(imp, &RQF_MDS_CONNECT); - if (!request) { - rc = -ENOMEM; - goto out; - } - - rc = ptlrpc_request_bufs_pack(request, LUSTRE_OBD_VERSION, - imp->imp_connect_op, bufs, NULL); - if (rc) { - ptlrpc_request_free(request); - goto out; - } - - /* Report the rpc service time to the server so that it knows how long - * to wait for clients to join recovery - */ - lustre_msg_set_service_time(request->rq_reqmsg, - at_timeout2est(request->rq_timeout)); - - /* The amount of time we give the server to process the connect req. - * import_select_connection will increase the net latency on - * repeated reconnect attempts to cover slow networks. 
- * We override/ignore the server rpc completion estimate here, - * which may be large if this is a reconnect attempt - */ - request->rq_timeout = INITIAL_CONNECT_TIMEOUT; - lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); - - request->rq_no_resend = 1; - request->rq_no_delay = 1; - request->rq_send_state = LUSTRE_IMP_CONNECTING; - /* Allow a slightly larger reply for future growth compatibility */ - req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, - sizeof(struct obd_connect_data) + - 16 * sizeof(__u64)); - ptlrpc_request_set_replen(request); - request->rq_interpret_reply = ptlrpc_connect_interpret; - - BUILD_BUG_ON(sizeof(*aa) > sizeof(request->rq_async_args)); - aa = ptlrpc_req_async_args(request); - memset(aa, 0, sizeof(*aa)); - - aa->pcaa_peer_committed = committed_before_reconnect; - aa->pcaa_initial_connect = initial_connect; - - if (aa->pcaa_initial_connect) { - spin_lock(&imp->imp_lock); - imp->imp_replayable = 1; - spin_unlock(&imp->imp_lock); - lustre_msg_add_op_flags(request->rq_reqmsg, - MSG_CONNECT_INITIAL); - } - - if (set_transno) - lustre_msg_add_op_flags(request->rq_reqmsg, - MSG_CONNECT_TRANSNO); - - DEBUG_REQ(D_RPCTRACE, request, "(re)connect request (timeout %d)", - request->rq_timeout); - ptlrpcd_add_req(request); - rc = 0; -out: - if (rc != 0) - IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); - - return rc; -} -EXPORT_SYMBOL(ptlrpc_connect_import); - -static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) -{ - int force_verify; - - spin_lock(&imp->imp_lock); - force_verify = imp->imp_force_verify != 0; - spin_unlock(&imp->imp_lock); - - if (force_verify) - ptlrpc_pinger_wake_up(); -} - -static int ptlrpc_busy_reconnect(int rc) -{ - return (rc == -EBUSY) || (rc == -EAGAIN); -} - -static int ptlrpc_connect_set_flags(struct obd_import *imp, - struct obd_connect_data *ocd, - u64 old_connect_flags, - struct obd_export *exp, int init_connect) -{ - struct client_obd *cli = &imp->imp_obd->u.cli; - static bool warned; - - if ((imp->imp_connect_flags_orig & OBD_CONNECT_IBITS) && - !(ocd->ocd_connect_flags & OBD_CONNECT_IBITS)) { - LCONSOLE_WARN("%s: MDS %s does not support ibits lock, either very old or invalid: requested %#llx, replied %#llx\n", - imp->imp_obd->obd_name, - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_connect_flags_orig, - ocd->ocd_connect_flags); - return -EPROTO; - } - - spin_lock(&imp->imp_lock); - list_del(&imp->imp_conn_current->oic_item); - list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list); - imp->imp_last_success_conn = imp->imp_conn_current->oic_last_attempt; - - spin_unlock(&imp->imp_lock); - - if (!warned && (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - (ocd->ocd_version > LUSTRE_VERSION_CODE + - LUSTRE_VERSION_OFFSET_WARN || - ocd->ocd_version < LUSTRE_VERSION_CODE - - LUSTRE_VERSION_OFFSET_WARN)) { - /* - * Sigh, some compilers do not like #ifdef in the middle - * of macro arguments - */ - const char *older = "older than client. Consider upgrading server"; - const char *newer = "newer than client. Consider recompiling application"; - - LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) is much %s (%s)\n", - obd2cli_tgt(imp->imp_obd), - OBD_OCD_VERSION_MAJOR(ocd->ocd_version), - OBD_OCD_VERSION_MINOR(ocd->ocd_version), - OBD_OCD_VERSION_PATCH(ocd->ocd_version), - OBD_OCD_VERSION_FIX(ocd->ocd_version), - ocd->ocd_version > LUSTRE_VERSION_CODE ? 
- newer : older, LUSTRE_VERSION_STRING); - warned = true; - } - -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 0, 53, 0) - /* - * Check if the server has the LU-1252 fix applied to not always swab - * the IR MNE entries. Do this only once per connection. This - * fixup is version-limited, because we don't want to carry the - * OBD_CONNECT_MNE_SWAB flag around forever, just so long as we - * need interop with unpatched 2.2 servers. For newer servers, - * the client will do MNE swabbing only as needed. LU-1644 - */ - if (unlikely((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - !(ocd->ocd_connect_flags & OBD_CONNECT_MNE_SWAB) && - OBD_OCD_VERSION_MAJOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_MINOR(ocd->ocd_version) == 2 && - OBD_OCD_VERSION_PATCH(ocd->ocd_version) < 55 && - !strcmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MGC_NAME))) - imp->imp_need_mne_swab = 1; - else /* clear if server was upgraded since last connect */ - imp->imp_need_mne_swab = 0; -#endif - - if (ocd->ocd_connect_flags & OBD_CONNECT_CKSUM) { - /* - * We sent to the server ocd_cksum_types with bits set - * for algorithms we understand. The server masked off - * the checksum types it doesn't support - */ - if (!(ocd->ocd_cksum_types & cksum_types_supported_client())) { - LCONSOLE_WARN("The negotiation of the checksum algorithm to use with server %s failed (%x/%x), disabling checksums\n", - obd2cli_tgt(imp->imp_obd), - ocd->ocd_cksum_types, - cksum_types_supported_client()); - cli->cl_checksum = 0; - cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; - } else { - cli->cl_supp_cksum_types = ocd->ocd_cksum_types; - } - } else { - /* - * The server does not support OBD_CONNECT_CKSUM. - * Enforce ADLER for backward compatibility - */ - cli->cl_supp_cksum_types = OBD_CKSUM_ADLER; - } - cli->cl_cksum_type = cksum_type_select(cli->cl_supp_cksum_types); - - if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) - cli->cl_max_pages_per_rpc = - min(ocd->ocd_brw_size >> PAGE_SHIFT, - cli->cl_max_pages_per_rpc); - else if (imp->imp_connect_op == MDS_CONNECT || - imp->imp_connect_op == MGS_CONNECT) - cli->cl_max_pages_per_rpc = 1; - - LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && - (cli->cl_max_pages_per_rpc > 0)); - - client_adjust_max_dirty(cli); - - /* - * Update client max modify RPCs in flight with the value returned - * by the server - */ - if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) - cli->cl_max_mod_rpcs_in_flight = min( - cli->cl_max_mod_rpcs_in_flight, - ocd->ocd_maxmodrpcs); - else - cli->cl_max_mod_rpcs_in_flight = 1; - - /* - * Reset ns_connect_flags only for initial connect. It might be - * changed while using the FS, and if we reset it on reconnect - * this leads to losing user settings applied earlier, such as - * disabling lru_resize, etc. - */ - if (old_connect_flags != exp_connect_flags(exp) || init_connect) { - CDEBUG(D_HA, "%s: Resetting ns_connect_flags to server flags: %#llx\n", - imp->imp_obd->obd_name, ocd->ocd_connect_flags); - imp->imp_obd->obd_namespace->ns_connect_flags = - ocd->ocd_connect_flags; - imp->imp_obd->obd_namespace->ns_orig_connect_flags = - ocd->ocd_connect_flags; - } - - if (ocd->ocd_connect_flags & OBD_CONNECT_AT) - /* - * We need a per-message support flag, because - * a. we don't know if the incoming connect reply - * supports AT or not (in reply_in_callback) - * until we unpack it. - * b. a failed-over server means the export and flags are gone - * (in ptlrpc_send_reply).
Can only be set when we know AT is supported at - * both ends - */ - imp->imp_msghdr_flags |= MSGHDR_AT_SUPPORT; - else - imp->imp_msghdr_flags &= ~MSGHDR_AT_SUPPORT; - - imp->imp_msghdr_flags |= MSGHDR_CKSUM_INCOMPAT18; - - return 0; -} - -/** - * Add all replay requests back to unreplied list before start replay, - * so that we can make sure the known replied XID only ever increases, - * even when replaying requests. - */ -static void ptlrpc_prepare_replay(struct obd_import *imp) -{ - struct ptlrpc_request *req; - - if (imp->imp_state != LUSTRE_IMP_REPLAY || - imp->imp_resend_replay) - return; - - /* - * If the server was restarted during replay, the requests may - * have been added to the unreplied list in former replay. - */ - spin_lock(&imp->imp_lock); - - list_for_each_entry(req, &imp->imp_committed_list, rq_replay_list) { - if (list_empty(&req->rq_unreplied_list)) - ptlrpc_add_unreplied(req); - } - - list_for_each_entry(req, &imp->imp_replay_list, rq_replay_list) { - if (list_empty(&req->rq_unreplied_list)) - ptlrpc_add_unreplied(req); - } - - imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); - spin_unlock(&imp->imp_lock); -} - -/** - * interpret_reply callback for connect RPCs. - * Looks at the returned status of the connect operation and decides - * what to do with the import - i.e. enter recovery, promote it to - * full state for normal operations, or disconnect it due to an error. - */ -static int ptlrpc_connect_interpret(const struct lu_env *env, - struct ptlrpc_request *request, - void *data, int rc) -{ - struct ptlrpc_connect_async_args *aa = data; - struct obd_import *imp = request->rq_import; - struct lustre_handle old_hdl; - __u64 old_connect_flags; - int msg_flags; - struct obd_connect_data *ocd; - struct obd_export *exp; - int ret; - - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_CLOSED) { - imp->imp_connect_tried = 1; - spin_unlock(&imp->imp_lock); - return 0; - } - - if (rc) { - /* if this was a reconnect to a busy export, there is no need to - * select a new target for connecting - */ - imp->imp_force_reconnect = ptlrpc_busy_reconnect(rc); - spin_unlock(&imp->imp_lock); - ptlrpc_maybe_ping_import_soon(imp); - goto out; - } - - /* - * LU-7558: indicate that we are interpreting the connect reply; - * ptlrpc_connect_import() will not try to reconnect until - * interpretation finishes. - */ - imp->imp_connected = 1; - spin_unlock(&imp->imp_lock); - - LASSERT(imp->imp_conn_current); - - msg_flags = lustre_msg_get_op_flags(request->rq_repmsg); - - ret = req_capsule_get_size(&request->rq_pill, &RMF_CONNECT_DATA, - RCL_SERVER); - /* server replied obd_connect_data is always bigger */ - ocd = req_capsule_server_sized_get(&request->rq_pill, - &RMF_CONNECT_DATA, ret); - - if (!ocd) { - CERROR("%s: no connect data from server\n", - imp->imp_obd->obd_name); - rc = -EPROTO; - goto out; - } - - spin_lock(&imp->imp_lock); - - /* All imports are pingable */ - imp->imp_pingable = 1; - imp->imp_force_reconnect = 0; - imp->imp_force_verify = 0; - - imp->imp_connect_data = *ocd; - - CDEBUG(D_HA, "%s: connect to target with instance %u\n", - imp->imp_obd->obd_name, ocd->ocd_instance); - exp = class_conn2export(&imp->imp_dlm_handle); - - spin_unlock(&imp->imp_lock); - - if (!exp) { - /* This could happen if the export is cleaned up during the - * connect attempt - */ - CERROR("%s: missing export after connect\n", - imp->imp_obd->obd_name); - rc = -ENODEV; - goto out; - } - - /* check that the server granted a subset of the flags we asked for. */
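(Aside: the check that follows verifies the server granted only flags the client actually requested, i.e. that granted is a bit-subset of requested. In isolation:)

#include <assert.h>
#include <stdint.h>

static int granted_is_subset(uint64_t requested, uint64_t granted)
{
	/* every granted bit must also be a requested bit */
	return (granted & requested) == granted;
}

int main(void)
{
	assert(granted_is_subset(0x7, 0x5));   /* 101 within 111: ok */
	assert(!granted_is_subset(0x5, 0x2));  /* 010 not in 101: reject */
	return 0;
}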
- if ((ocd->ocd_connect_flags & imp->imp_connect_flags_orig) != - ocd->ocd_connect_flags) { - CERROR("%s: Server didn't grant the requested subset of flags: asked=%#llx granted=%#llx\n", - imp->imp_obd->obd_name, imp->imp_connect_flags_orig, - ocd->ocd_connect_flags); - rc = -EPROTO; - goto out; - } - - old_connect_flags = exp_connect_flags(exp); - exp->exp_connect_data = *ocd; - imp->imp_obd->obd_self_export->exp_connect_data = *ocd; - - /* - * The net statistics after (re-)connect are not valid anymore, - * because they may reflect different routing, etc. - */ - at_init(&imp->imp_at.iat_net_latency, 0, 0); - ptlrpc_at_adj_net_latency(request, - lustre_msg_get_service_time(request->rq_repmsg)); - - /* Import flags should be updated before waking import at FULL state */ - rc = ptlrpc_connect_set_flags(imp, ocd, old_connect_flags, exp, - aa->pcaa_initial_connect); - class_export_put(exp); - if (rc) - goto out; - - obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD); - - if (aa->pcaa_initial_connect) { - spin_lock(&imp->imp_lock); - if (msg_flags & MSG_CONNECT_REPLAYABLE) { - imp->imp_replayable = 1; - spin_unlock(&imp->imp_lock); - CDEBUG(D_HA, "connected to replayable target: %s\n", - obd2cli_tgt(imp->imp_obd)); - } else { - imp->imp_replayable = 0; - spin_unlock(&imp->imp_lock); - } - - /* if applicable, adjust the imp->imp_msg_magic here - * according to reply flags - */ - - imp->imp_remote_handle = - *lustre_msg_get_handle(request->rq_repmsg); - - /* Initial connects are allowed for clients with non-random - * uuids when servers are in recovery. Simply signal the - * server that replay is complete and wait in REPLAY_WAIT. - */ - if (msg_flags & MSG_CONNECT_RECOVERING) { - CDEBUG(D_HA, "connect to %s during recovery\n", - obd2cli_tgt(imp->imp_obd)); - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); - } else { - IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); - } - - rc = 0; - goto finish; - } - - /* Determine what recovery state to move the import to. */ - if (msg_flags & MSG_CONNECT_RECONNECT) { - memset(&old_hdl, 0, sizeof(old_hdl)); - if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg), - sizeof(old_hdl))) { - LCONSOLE_WARN("Reconnect to %s (at @%s) failed due to bad handle %#llx\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_dlm_handle.cookie); - rc = -ENOTCONN; - goto out; - } - - if (memcmp(&imp->imp_remote_handle, - lustre_msg_get_handle(request->rq_repmsg), - sizeof(imp->imp_remote_handle))) { - int level = msg_flags & MSG_CONNECT_RECOVERING ? - D_HA : D_WARNING; - - /* Bug 16611/14775: if the server handle has changed, - * that means some sort of disconnection happened. - * If the server is not in recovery, that also means it - * already erased all of our state because of previous - * eviction.
If it is in recovery - we are safe to - * participate since we can reestablish all of our state - * with server again - */ - if ((msg_flags & MSG_CONNECT_RECOVERING)) { - CDEBUG(level, "%s@%s changed server handle from %#llx to %#llx but is still in recovery\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_remote_handle.cookie, - lustre_msg_get_handle( - request->rq_repmsg)->cookie); - } else { - LCONSOLE_WARN("Evicted from %s (at %s) after server handle changed from %#llx to %#llx\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection-> \ - c_remote_uuid.uuid, - imp->imp_remote_handle.cookie, - lustre_msg_get_handle( - request->rq_repmsg)->cookie); - } - - imp->imp_remote_handle = - *lustre_msg_get_handle(request->rq_repmsg); - - if (!(msg_flags & MSG_CONNECT_RECOVERING)) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); - rc = 0; - goto finish; - } - - } else { - CDEBUG(D_HA, "reconnected to %s@%s after partition\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - } - - if (imp->imp_invalid) { - CDEBUG(D_HA, "%s: reconnected but import is invalid; marking evicted\n", - imp->imp_obd->obd_name); - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); - } else if (msg_flags & MSG_CONNECT_RECOVERING) { - CDEBUG(D_HA, "%s: reconnected to %s during replay\n", - imp->imp_obd->obd_name, - obd2cli_tgt(imp->imp_obd)); - - spin_lock(&imp->imp_lock); - imp->imp_resend_replay = 1; - spin_unlock(&imp->imp_lock); - - IMPORT_SET_STATE(imp, imp->imp_replay_state); - } else { - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); - } - } else if ((msg_flags & MSG_CONNECT_RECOVERING) && !imp->imp_invalid) { - LASSERT(imp->imp_replayable); - imp->imp_remote_handle = - *lustre_msg_get_handle(request->rq_repmsg); - imp->imp_last_replay_transno = 0; - imp->imp_replay_cursor = &imp->imp_committed_list; - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); - } else { - DEBUG_REQ(D_HA, request, "%s: evicting (reconnect/recover flags not set: %x)", - imp->imp_obd->obd_name, msg_flags); - imp->imp_remote_handle = - *lustre_msg_get_handle(request->rq_repmsg); - IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); - } - - /* Sanity checks for a reconnected import. */ - if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) - CERROR("imp_replayable flag does not match server after reconnect. We should LBUG right here.\n"); - - if (lustre_msg_get_last_committed(request->rq_repmsg) > 0 && - lustre_msg_get_last_committed(request->rq_repmsg) < - aa->pcaa_peer_committed) - CERROR("%s went back in time (transno %lld was previously committed, server now claims %lld)! 
See https://bugzilla.lustre.org/show_bug.cgi?id=9646\n", - obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed, - lustre_msg_get_last_committed(request->rq_repmsg)); - -finish: - ptlrpc_prepare_replay(imp); - rc = ptlrpc_import_recovery_state_machine(imp); - if (rc == -ENOTCONN) { - CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery; invalidating and reconnecting\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - ptlrpc_connect_import(imp); - spin_lock(&imp->imp_lock); - imp->imp_connected = 0; - imp->imp_connect_tried = 1; - spin_unlock(&imp->imp_lock); - return 0; - } - -out: - spin_lock(&imp->imp_lock); - imp->imp_connected = 0; - imp->imp_connect_tried = 1; - spin_unlock(&imp->imp_lock); - - if (rc != 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON); - if (rc == -EACCES) { - /* - * Give up trying to reconnect - * EACCES means client has no permission for connection - */ - imp->imp_obd->obd_no_recov = 1; - ptlrpc_deactivate_import(imp); - } - - if (rc == -EPROTO) { - struct obd_connect_data *ocd; - - /* reply message might not be ready */ - if (!request->rq_repmsg) - return -EPROTO; - - ocd = req_capsule_server_get(&request->rq_pill, - &RMF_CONNECT_DATA); - if (ocd && - (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) && - (ocd->ocd_version != LUSTRE_VERSION_CODE)) { - /* - * Actually servers are only supposed to refuse - * connection from liblustre clients, so we - * should never see this from VFS context - */ - LCONSOLE_ERROR_MSG(0x16a, "Server %s version (%d.%d.%d.%d) refused connection from this client with an incompatible version (%s). Client must be recompiled\n", - obd2cli_tgt(imp->imp_obd), - OBD_OCD_VERSION_MAJOR(ocd->ocd_version), - OBD_OCD_VERSION_MINOR(ocd->ocd_version), - OBD_OCD_VERSION_PATCH(ocd->ocd_version), - OBD_OCD_VERSION_FIX(ocd->ocd_version), - LUSTRE_VERSION_STRING); - ptlrpc_deactivate_import(imp); - IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED); - } - return -EPROTO; - } - - ptlrpc_maybe_ping_import_soon(imp); - - CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n", - obd2cli_tgt(imp->imp_obd), - (char *)imp->imp_connection->c_remote_uuid.uuid, rc); - } - - wake_up_all(&imp->imp_recovery_waitq); - return rc; -} - -/** - * interpret callback for "completed replay" RPCs. - * \see signal_completed_replay - */ -static int completed_replay_interpret(const struct lu_env *env, - struct ptlrpc_request *req, - void *data, int rc) -{ - atomic_dec(&req->rq_import->imp_replay_inflight); - if (req->rq_status == 0 && - !req->rq_import->imp_vbr_failed) { - ptlrpc_import_recovery_state_machine(req->rq_import); - } else { - if (req->rq_import->imp_vbr_failed) { - CDEBUG(D_WARNING, - "%s: version recovery fails, reconnecting\n", - req->rq_import->imp_obd->obd_name); - } else { - CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, reconnecting\n", - req->rq_import->imp_obd->obd_name, - req->rq_status); - } - ptlrpc_connect_import(req->rq_import); - } - - return 0; -} - -/** - * Let server know that we have no requests to replay anymore. 
Achieved by just sending a PING request - */ -static int signal_completed_replay(struct obd_import *imp) -{ - struct ptlrpc_request *req; - - if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_FINISH_REPLAY))) - return 0; - - LASSERT(atomic_read(&imp->imp_replay_inflight) == 0); - atomic_inc(&imp->imp_replay_inflight); - - req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, LUSTRE_OBD_VERSION, - OBD_PING); - if (!req) { - atomic_dec(&imp->imp_replay_inflight); - return -ENOMEM; - } - - ptlrpc_request_set_replen(req); - req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT; - lustre_msg_add_flags(req->rq_reqmsg, - MSG_LOCK_REPLAY_DONE | MSG_REQ_REPLAY_DONE); - if (AT_OFF) - req->rq_timeout *= 3; - req->rq_interpret_reply = completed_replay_interpret; - - ptlrpcd_add_req(req); - return 0; -} - -/** - * In kernel code all import invalidation happens in its own - * separate thread, so that whatever application happened to encounter - * a problem could still be killed or otherwise continue - */ -static int ptlrpc_invalidate_import_thread(void *data) -{ - struct obd_import *imp = data; - - unshare_fs_struct(); - - CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - - ptlrpc_invalidate_import(imp); - - if (obd_dump_on_eviction) { - CERROR("dump the log upon eviction\n"); - libcfs_debug_dumplog(); - } - - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); - ptlrpc_import_recovery_state_machine(imp); - - class_import_put(imp); - return 0; -} - -/** - * This is the state machine for client-side recovery on import. - * - * Typically we have two possible paths. If we came to a server that is not - * in recovery, we just enter IMP_EVICTED state, invalidate our import - * state and reconnect from scratch. - * If we came to a server that is in recovery, we enter IMP_REPLAY import state. - * We go through our list of requests to replay and send them to the server one by - * one. - * After sending all requests from the list we change import state to - * IMP_REPLAY_LOCKS and re-request all the locks we believe we have from the server - * and also all the locks we don't yet have and wait for the server to grant us. - * After that we send a special "replay completed" request and change import - * state to IMP_REPLAY_WAIT. - * Upon receiving reply to that "replay completed" RPC we enter IMP_RECOVER - * state and resend all requests from sending list. - * After that we promote import to FULL state and send all delayed requests - * and import is fully operational after that. - * - */ -int ptlrpc_import_recovery_state_machine(struct obd_import *imp) -{ - int rc = 0; - int inflight; - char *target_start; - int target_len; - - if (imp->imp_state == LUSTRE_IMP_EVICTED) { - deuuidify(obd2cli_tgt(imp->imp_obd), NULL, - &target_start, &target_len); - /* Don't care about MGC eviction */ - if (strcmp(imp->imp_obd->obd_type->typ_name, - LUSTRE_MGC_NAME) != 0) { - LCONSOLE_ERROR_MSG(0x167, "%s: This client was evicted by %.*s; in progress operations using this service will fail.\n", - imp->imp_obd->obd_name, target_len, - target_start); - } - CDEBUG(D_HA, "evicted from %s@%s; invalidating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - /* reset vbr_failed flag upon eviction */ - spin_lock(&imp->imp_lock); - imp->imp_vbr_failed = 0; - spin_unlock(&imp->imp_lock); - - { - struct task_struct *task; - /* bug 17802: XXX client_disconnect_export vs connect request - * race.
if client is evicted at this time, we start - * invalidate thread without reference to import and import can - * be freed at same time. - */ - class_import_get(imp); - task = kthread_run(ptlrpc_invalidate_import_thread, imp, - "ll_imp_inval"); - if (IS_ERR(task)) { - class_import_put(imp); - CERROR("error starting invalidate thread: %d\n", rc); - rc = PTR_ERR(task); - } else { - rc = 0; - } - return rc; - } - } - - if (imp->imp_state == LUSTRE_IMP_REPLAY) { - CDEBUG(D_HA, "replay requested by %s\n", - obd2cli_tgt(imp->imp_obd)); - rc = ptlrpc_replay_next(imp, &inflight); - if (inflight == 0 && - atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS); - rc = ldlm_replay_locks(imp); - if (rc) - goto out; - } - rc = 0; - } - - if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) - if (atomic_read(&imp->imp_replay_inflight) == 0) { - IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT); - rc = signal_completed_replay(imp); - if (rc) - goto out; - } - - if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) - if (atomic_read(&imp->imp_replay_inflight) == 0) - IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER); - - if (imp->imp_state == LUSTRE_IMP_RECOVER) { - CDEBUG(D_HA, "reconnected to %s@%s\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - - rc = ptlrpc_resend(imp); - if (rc) - goto out; - IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); - - deuuidify(obd2cli_tgt(imp->imp_obd), NULL, - &target_start, &target_len); - LCONSOLE_INFO("%s: Connection restored to %.*s (at %s)\n", - imp->imp_obd->obd_name, - target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); - } - - if (imp->imp_state == LUSTRE_IMP_FULL) { - wake_up_all(&imp->imp_recovery_waitq); - ptlrpc_wake_delayed(imp); - } - -out: - return rc; -} - -int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) -{ - struct ptlrpc_request *req; - int rq_opc, rc = 0; - - if (imp->imp_obd->obd_force) - goto set_state; - - switch (imp->imp_connect_op) { - case OST_CONNECT: - rq_opc = OST_DISCONNECT; - break; - case MDS_CONNECT: - rq_opc = MDS_DISCONNECT; - break; - case MGS_CONNECT: - rq_opc = MGS_DISCONNECT; - break; - default: - rc = -EINVAL; - CERROR("%s: don't know how to disconnect from %s (connect_op %d): rc = %d\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connect_op, rc); - return rc; - } - - if (ptlrpc_import_in_recovery(imp)) { - unsigned long timeout; - - if (AT_OFF) { - if (imp->imp_server_timeout) - timeout = obd_timeout * HZ / 2; - else - timeout = obd_timeout * HZ; - } else { - int idx = import_at_get_index(imp, - imp->imp_client->cli_request_portal); - timeout = at_get(&imp->imp_at.iat_service_estimate[idx]) * HZ; - } - - if (wait_event_idle_timeout(imp->imp_recovery_waitq, - !ptlrpc_import_in_recovery(imp), - max(timeout, 1UL)) == 0) - l_wait_event_abortable( - imp->imp_recovery_waitq, - !ptlrpc_import_in_recovery(imp)); - } - - spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_FULL) - goto out; - spin_unlock(&imp->imp_lock); - - req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_DISCONNECT, - LUSTRE_OBD_VERSION, rq_opc); - if (req) { - /* We are disconnecting, do not retry a failed DISCONNECT rpc if - * it fails. We can get through the above with a down server - * if the client doesn't know the server is gone yet. - */ - req->rq_no_resend = 1; - - /* We want client umounts to happen quickly, no matter the - * server state... 
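(Aside: the docstring of ptlrpc_import_recovery_state_machine() above describes a linear ladder: EVICTED invalidates and reconnects, otherwise REPLAY -> REPLAY_LOCKS -> REPLAY_WAIT -> RECOVER -> FULL, with each rung advancing once its in-flight work drains. A toy rendering of that progression with hypothetical names:)

enum imp_state { EVICTED, REPLAY, REPLAY_LOCKS, REPLAY_WAIT, RECOVER, FULL };

/* Advance one rung of the recovery ladder once the stage has drained. */
static enum imp_state advance(enum imp_state s, int inflight)
{
	if (inflight)
		return s;  /* outstanding replay work: stay put */

	switch (s) {
	case REPLAY:       return REPLAY_LOCKS;
	case REPLAY_LOCKS: return REPLAY_WAIT;
	case REPLAY_WAIT:  return RECOVER;
	case RECOVER:      return FULL;
	default:           return s;
	}
}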
- */ - req->rq_timeout = min_t(int, req->rq_timeout, - INITIAL_CONNECT_TIMEOUT); - - IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); - req->rq_send_state = LUSTRE_IMP_CONNECTING; - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - } - -set_state: - spin_lock(&imp->imp_lock); -out: - if (noclose) - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); - else - IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); - memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); - spin_unlock(&imp->imp_lock); - - if (rc == -ETIMEDOUT || rc == -ENOTCONN || rc == -ESHUTDOWN) - rc = 0; - - return rc; -} -EXPORT_SYMBOL(ptlrpc_disconnect_import); - -/* Adaptive Timeout utils */ -extern unsigned int at_min, at_max, at_history; - -/* - *Update at_current with the specified value (bounded by at_min and at_max), - * as well as the AT history "bins". - * - Bin into timeslices using AT_BINS bins. - * - This gives us a max of the last at_history seconds without the storage, - * but still smoothing out a return to normalcy from a slow response. - * - (E.g. remember the maximum latency in each minute of the last 4 minutes.) - */ -int at_measured(struct adaptive_timeout *at, unsigned int val) -{ - unsigned int old = at->at_current; - time64_t now = ktime_get_real_seconds(); - long binlimit = max_t(long, at_history / AT_BINS, 1); - - LASSERT(at); - CDEBUG(D_OTHER, "add %u to %p time=%lu v=%u (%u %u %u %u)\n", - val, at, (long)(now - at->at_binstart), at->at_current, - at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); - - if (val == 0) - /* 0's don't count, because we never want our timeout to - * drop to 0, and because 0 could mean an error - */ - return 0; - - spin_lock(&at->at_lock); - - if (unlikely(at->at_binstart == 0)) { - /* Special case to remove default from history */ - at->at_current = val; - at->at_worst_ever = val; - at->at_worst_time = now; - at->at_hist[0] = val; - at->at_binstart = now; - } else if (now - at->at_binstart < binlimit) { - /* in bin 0 */ - at->at_hist[0] = max(val, at->at_hist[0]); - at->at_current = max(val, at->at_current); - } else { - int i, shift; - unsigned int maxv = val; - /* move bins over */ - shift = (u32)(now - at->at_binstart) / binlimit; - LASSERT(shift > 0); - for (i = AT_BINS - 1; i >= 0; i--) { - if (i >= shift) { - at->at_hist[i] = at->at_hist[i - shift]; - maxv = max(maxv, at->at_hist[i]); - } else { - at->at_hist[i] = 0; - } - } - at->at_hist[0] = val; - at->at_current = maxv; - at->at_binstart += shift * binlimit; - } - - if (at->at_current > at->at_worst_ever) { - at->at_worst_ever = at->at_current; - at->at_worst_time = now; - } - - if (at->at_flags & AT_FLG_NOHIST) - /* Only keep last reported val; keeping the rest of the history - * for debugfs only - */ - at->at_current = val; - - if (at_max > 0) - at->at_current = min(at->at_current, at_max); - at->at_current = max(at->at_current, at_min); - - if (at->at_current != old) - CDEBUG(D_OTHER, "AT %p change: old=%u new=%u delta=%d (val=%u) hist %u %u %u %u\n", - at, - old, at->at_current, at->at_current - old, val, - at->at_hist[0], at->at_hist[1], at->at_hist[2], - at->at_hist[3]); - - /* if we changed, report the old value */ - old = (at->at_current != old) ? 
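/*
 * Editor's worked example for at_measured() above (assumes
 * at_history = 240 and AT_BINS = 4, so binlimit = 60s; the
 * at_min/at_max clamps and AT_FLG_NOHIST are ignored):
 *
 *	t=10s,  val=7: first sample   -> hist = {7,0,0,0}, at_current = 7
 *	t=70s,  val=3: shift = 1 bin  -> hist = {3,7,0,0}, at_current = 7
 *	t=310s, val=2: shift = 4 bins -> hist = {2,0,0,0}, at_current = 2
 *
 * i.e. the estimate decays back to normal only once the old worst case
 * ages out of the four-minute window, exactly as the comment above
 * ("remember the maximum latency in each minute of the last 4 minutes")
 * describes.
 */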
old : 0; - - spin_unlock(&at->at_lock); - return old; -} - -/* Find the imp_at index for a given portal; assign if space available */ -int import_at_get_index(struct obd_import *imp, int portal) -{ - struct imp_at *at = &imp->imp_at; - int i; - - for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { - if (at->iat_portal[i] == portal) - return i; - if (at->iat_portal[i] == 0) - /* unused */ - break; - } - - /* Not found in list, add it under a lock */ - spin_lock(&imp->imp_lock); - - /* Check unused under lock */ - for (; i < IMP_AT_MAX_PORTALS; i++) { - if (at->iat_portal[i] == portal) - goto out; - if (at->iat_portal[i] == 0) - /* unused */ - break; - } - - /* Not enough portals? */ - LASSERT(i < IMP_AT_MAX_PORTALS); - - at->iat_portal[i] = portal; -out: - spin_unlock(&imp->imp_lock); - return i; -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/layout.c b/drivers/staging/lustre/lustre/ptlrpc/layout.c deleted file mode 100644 index 417d4a151433cdff1b656aa07583bb086ea0e669..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/layout.c +++ /dev/null @@ -1,2232 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/layout.c - * - * Lustre Metadata Target (mdt) request handler - * - * Author: Nikita Danilov - */ -/* - * This file contains the "capsule/pill" abstraction layered above PTLRPC. - * - * Every struct ptlrpc_request contains a "pill", which points to a description - * of the format that the request conforms to. - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include - -#include - -#include -#include -#include -#include -#include -#include - -/* struct ptlrpc_request, lustre_msg* */ -#include -#include - -/* - * RQFs (see below) refer to two struct req_msg_field arrays describing the - * client request and server reply, respectively. - */ -/* empty set of fields... for suitable definition of emptiness. 
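/*
 * Editor's sketch of the client-side capsule flow that layout.c
 * implements (the RQF/RMF names are real; the ordering is condensed --
 * in practice ptlrpc_request_alloc_pack() drives init/set/pack, and the
 * 16-byte name length is made up for illustration):
 */
static int capsule_flow_sketch(struct ptlrpc_request *req)
{
	struct req_capsule *pill = &req->rq_pill;
	char *name;

	req_capsule_init(pill, req, RCL_CLIENT);
	req_capsule_set(pill, &RQF_MDS_GETATTR_NAME);
	/* variable-length field: record its size before packing */
	req_capsule_set_size(pill, &RMF_NAME, RCL_CLIENT, 16);
	/* ... lustre_pack_request() allocates the buffers here ... */
	name = req_capsule_client_get(pill, &RMF_NAME);
	return name ? 0 : -EPROTO;
}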
*/ -static const struct req_msg_field *empty[] = { - &RMF_PTLRPC_BODY -}; - -static const struct req_msg_field *mgs_target_info_only[] = { - &RMF_PTLRPC_BODY, - &RMF_MGS_TARGET_INFO -}; - -static const struct req_msg_field *mgs_set_info[] = { - &RMF_PTLRPC_BODY, - &RMF_MGS_SEND_PARAM -}; - -static const struct req_msg_field *mgs_config_read_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MGS_CONFIG_BODY -}; - -static const struct req_msg_field *mgs_config_read_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MGS_CONFIG_RES -}; - -static const struct req_msg_field *log_cancel_client[] = { - &RMF_PTLRPC_BODY, - &RMF_LOGCOOKIES -}; - -static const struct req_msg_field *mdt_body_only[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY -}; - -static const struct req_msg_field *mdt_body_capa[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_CAPA1 -}; - -static const struct req_msg_field *quotactl_only[] = { - &RMF_PTLRPC_BODY, - &RMF_OBD_QUOTACTL -}; - -static const struct req_msg_field *mdt_close_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_EPOCH, - &RMF_REC_REINT, - &RMF_CAPA1 -}; - -static const struct req_msg_field *mdt_intent_close_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_EPOCH, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CLOSE_DATA -}; - -static const struct req_msg_field *obd_statfs_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OBD_STATFS -}; - -static const struct req_msg_field *seq_query_client[] = { - &RMF_PTLRPC_BODY, - &RMF_SEQ_OPC, - &RMF_SEQ_RANGE -}; - -static const struct req_msg_field *seq_query_server[] = { - &RMF_PTLRPC_BODY, - &RMF_SEQ_RANGE -}; - -static const struct req_msg_field *fld_query_client[] = { - &RMF_PTLRPC_BODY, - &RMF_FLD_OPC, - &RMF_FLD_MDFLD -}; - -static const struct req_msg_field *fld_query_server[] = { - &RMF_PTLRPC_BODY, - &RMF_FLD_MDFLD -}; - -static const struct req_msg_field *fld_read_client[] = { - &RMF_PTLRPC_BODY, - &RMF_FLD_MDFLD -}; - -static const struct req_msg_field *fld_read_server[] = { - &RMF_PTLRPC_BODY, - &RMF_GENERIC_DATA -}; - -static const struct req_msg_field *mds_getattr_name_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_CAPA1, - &RMF_NAME -}; - -static const struct req_msg_field *mds_reint_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT -}; - -static const struct req_msg_field *mds_reint_create_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME -}; - -static const struct req_msg_field *mds_reint_create_slave_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mds_reint_create_acl_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mds_reint_create_sym_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_SYMTGT, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mds_reint_open_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_EADATA -}; - -static const struct req_msg_field *mds_reint_open_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2 -}; - -static const struct req_msg_field *mds_reint_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mds_reint_link_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_DLM_REQ -}; - -static const struct req_msg_field 
*mds_reint_rename_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_SYMTGT, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mds_reint_migrate_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_SYMTGT, - &RMF_DLM_REQ, - &RMF_MDT_EPOCH, - &RMF_CLOSE_DATA -}; - -static const struct req_msg_field *mds_last_unlink_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_LOGCOOKIES, - &RMF_CAPA1, - &RMF_CAPA2 -}; - -static const struct req_msg_field *mds_reint_setattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_MDT_EPOCH, - &RMF_EADATA, - &RMF_LOGCOOKIES, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mds_reint_setxattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_REC_REINT, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *mdt_swap_layouts[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_SWAP_LAYOUTS, - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *obd_connect_client[] = { - &RMF_PTLRPC_BODY, - &RMF_TGTUUID, - &RMF_CLUUID, - &RMF_CONN, - &RMF_CONNECT_DATA -}; - -static const struct req_msg_field *obd_connect_server[] = { - &RMF_PTLRPC_BODY, - &RMF_CONNECT_DATA -}; - -static const struct req_msg_field *obd_set_info_client[] = { - &RMF_PTLRPC_BODY, - &RMF_SETINFO_KEY, - &RMF_SETINFO_VAL -}; - -static const struct req_msg_field *ost_grant_shrink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_SETINFO_KEY, - &RMF_OST_BODY -}; - -static const struct req_msg_field *mds_getinfo_client[] = { - &RMF_PTLRPC_BODY, - &RMF_GETINFO_KEY, - &RMF_GETINFO_VALLEN -}; - -static const struct req_msg_field *mds_getinfo_server[] = { - &RMF_PTLRPC_BODY, - &RMF_GETINFO_VAL, -}; - -static const struct req_msg_field *ldlm_enqueue_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ -}; - -static const struct req_msg_field *ldlm_enqueue_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP -}; - -static const struct req_msg_field *ldlm_enqueue_lvb_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_DLM_LVB -}; - -static const struct req_msg_field *ldlm_cp_callback_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_DLM_LVB -}; - -static const struct req_msg_field *ldlm_gl_callback_desc_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_DLM_GL_DESC -}; - -static const struct req_msg_field *ldlm_gl_callback_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_LVB -}; - -static const struct req_msg_field *ldlm_intent_basic_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, -}; - -static const struct req_msg_field *ldlm_intent_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_REC_REINT -}; - -static const struct req_msg_field *ldlm_intent_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL -}; - -static const struct req_msg_field *ldlm_intent_layout_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_LAYOUT_INTENT, - &RMF_EADATA /* for new layout to be set up */ -}; - -static const struct req_msg_field *ldlm_intent_open_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2 -}; - -static const struct req_msg_field *ldlm_intent_getattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_MDT_BODY, /* coincides with mds_getattr_name_client[] */ - &RMF_CAPA1, - &RMF_NAME -}; - -static const struct req_msg_field 
*ldlm_intent_getattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1 -}; - -static const struct req_msg_field *ldlm_intent_create_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_REC_REINT, /* coincides with mds_reint_create_client[] */ - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA -}; - -static const struct req_msg_field *ldlm_intent_open_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_REC_REINT, /* coincides with mds_reint_open_client[] */ - &RMF_CAPA1, - &RMF_CAPA2, - &RMF_NAME, - &RMF_EADATA -}; - -static const struct req_msg_field *ldlm_intent_unlink_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_REC_REINT, /* coincides with mds_reint_unlink_client[] */ - &RMF_CAPA1, - &RMF_NAME -}; - -static const struct req_msg_field *ldlm_intent_getxattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REQ, - &RMF_LDLM_INTENT, - &RMF_MDT_BODY, - &RMF_CAPA1, -}; - -static const struct req_msg_field *ldlm_intent_getxattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_DLM_REP, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, /* for req_capsule_extend/mdt_intent_policy */ - &RMF_EADATA, - &RMF_EAVALS, - &RMF_EAVALS_LENS -}; - -static const struct req_msg_field *mds_getxattr_client[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_CAPA1, - &RMF_NAME, - &RMF_EADATA -}; - -static const struct req_msg_field *mds_getxattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_EADATA -}; - -static const struct req_msg_field *mds_getattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2 -}; - -static const struct req_msg_field *mds_setattr_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDT_MD, - &RMF_ACL, - &RMF_CAPA1, - &RMF_CAPA2 -}; - -static const struct req_msg_field *llog_origin_handle_create_client[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOGD_BODY, - &RMF_NAME -}; - -static const struct req_msg_field *llogd_body_only[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOGD_BODY -}; - -static const struct req_msg_field *llog_log_hdr_only[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOG_LOG_HDR -}; - -static const struct req_msg_field *llogd_conn_body_only[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOGD_CONN_BODY -}; - -static const struct req_msg_field *llog_origin_handle_next_block_server[] = { - &RMF_PTLRPC_BODY, - &RMF_LLOGD_BODY, - &RMF_EADATA -}; - -static const struct req_msg_field *ost_body_only[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY -}; - -static const struct req_msg_field *ost_body_capa[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_CAPA1 -}; - -static const struct req_msg_field *ost_destroy_client[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_DLM_REQ, - &RMF_CAPA1 -}; - -static const struct req_msg_field *ost_brw_client[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_OBD_IOOBJ, - &RMF_NIOBUF_REMOTE, - &RMF_CAPA1 -}; - -static const struct req_msg_field *ost_brw_read_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY -}; - -static const struct req_msg_field *ost_brw_write_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OST_BODY, - &RMF_RCS -}; - -static const struct req_msg_field *ost_get_info_generic_server[] = { - &RMF_PTLRPC_BODY, - &RMF_GENERIC_DATA, -}; - -static const struct req_msg_field *ost_get_info_generic_client[] = { - &RMF_PTLRPC_BODY, - &RMF_GETINFO_KEY -}; - -static const struct req_msg_field *ost_get_last_id_server[] = { - &RMF_PTLRPC_BODY, - &RMF_OBD_ID -}; - -static const struct req_msg_field *ost_get_last_fid_client[] = { - 
&RMF_PTLRPC_BODY, - &RMF_GETINFO_KEY, - &RMF_FID, -}; - -static const struct req_msg_field *ost_get_last_fid_server[] = { - &RMF_PTLRPC_BODY, - &RMF_FID, -}; - -static const struct req_msg_field *ost_get_fiemap_client[] = { - &RMF_PTLRPC_BODY, - &RMF_FIEMAP_KEY, - &RMF_FIEMAP_VAL -}; - -static const struct req_msg_field *ost_get_fiemap_server[] = { - &RMF_PTLRPC_BODY, - &RMF_FIEMAP_VAL -}; - -static const struct req_msg_field *mdt_hsm_progress[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDS_HSM_PROGRESS, -}; - -static const struct req_msg_field *mdt_hsm_ct_register[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDS_HSM_ARCHIVE, -}; - -static const struct req_msg_field *mdt_hsm_ct_unregister[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, -}; - -static const struct req_msg_field *mdt_hsm_action_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDS_HSM_CURRENT_ACTION, -}; - -static const struct req_msg_field *mdt_hsm_state_get_server[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_HSM_USER_STATE, -}; - -static const struct req_msg_field *mdt_hsm_state_set[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_CAPA1, - &RMF_HSM_STATE_SET, -}; - -static const struct req_msg_field *mdt_hsm_request[] = { - &RMF_PTLRPC_BODY, - &RMF_MDT_BODY, - &RMF_MDS_HSM_REQUEST, - &RMF_MDS_HSM_USER_ITEM, - &RMF_GENERIC_DATA, -}; - -static struct req_format *req_formats[] = { - &RQF_OBD_PING, - &RQF_OBD_SET_INFO, - &RQF_SEC_CTX, - &RQF_MGS_TARGET_REG, - &RQF_MGS_SET_INFO, - &RQF_MGS_CONFIG_READ, - &RQF_SEQ_QUERY, - &RQF_FLD_QUERY, - &RQF_FLD_READ, - &RQF_MDS_CONNECT, - &RQF_MDS_DISCONNECT, - &RQF_MDS_GET_INFO, - &RQF_MDS_GETSTATUS, - &RQF_MDS_STATFS, - &RQF_MDS_GETATTR, - &RQF_MDS_GETATTR_NAME, - &RQF_MDS_GETXATTR, - &RQF_MDS_SYNC, - &RQF_MDS_CLOSE, - &RQF_MDS_INTENT_CLOSE, - &RQF_MDS_READPAGE, - &RQF_MDS_WRITEPAGE, - &RQF_MDS_REINT, - &RQF_MDS_REINT_CREATE, - &RQF_MDS_REINT_CREATE_ACL, - &RQF_MDS_REINT_CREATE_SLAVE, - &RQF_MDS_REINT_CREATE_SYM, - &RQF_MDS_REINT_OPEN, - &RQF_MDS_REINT_UNLINK, - &RQF_MDS_REINT_LINK, - &RQF_MDS_REINT_RENAME, - &RQF_MDS_REINT_MIGRATE, - &RQF_MDS_REINT_SETATTR, - &RQF_MDS_REINT_SETXATTR, - &RQF_MDS_QUOTACTL, - &RQF_MDS_HSM_PROGRESS, - &RQF_MDS_HSM_CT_REGISTER, - &RQF_MDS_HSM_CT_UNREGISTER, - &RQF_MDS_HSM_STATE_GET, - &RQF_MDS_HSM_STATE_SET, - &RQF_MDS_HSM_ACTION, - &RQF_MDS_HSM_REQUEST, - &RQF_MDS_SWAP_LAYOUTS, - &RQF_OST_CONNECT, - &RQF_OST_DISCONNECT, - &RQF_OST_QUOTACTL, - &RQF_OST_GETATTR, - &RQF_OST_SETATTR, - &RQF_OST_CREATE, - &RQF_OST_PUNCH, - &RQF_OST_SYNC, - &RQF_OST_DESTROY, - &RQF_OST_BRW_READ, - &RQF_OST_BRW_WRITE, - &RQF_OST_STATFS, - &RQF_OST_SET_GRANT_INFO, - &RQF_OST_GET_INFO, - &RQF_OST_GET_INFO_LAST_ID, - &RQF_OST_GET_INFO_LAST_FID, - &RQF_OST_SET_INFO_LAST_FID, - &RQF_OST_GET_INFO_FIEMAP, - &RQF_LDLM_ENQUEUE, - &RQF_LDLM_ENQUEUE_LVB, - &RQF_LDLM_CONVERT, - &RQF_LDLM_CANCEL, - &RQF_LDLM_CALLBACK, - &RQF_LDLM_CP_CALLBACK, - &RQF_LDLM_BL_CALLBACK, - &RQF_LDLM_GL_CALLBACK, - &RQF_LDLM_GL_DESC_CALLBACK, - &RQF_LDLM_INTENT, - &RQF_LDLM_INTENT_BASIC, - &RQF_LDLM_INTENT_LAYOUT, - &RQF_LDLM_INTENT_GETATTR, - &RQF_LDLM_INTENT_OPEN, - &RQF_LDLM_INTENT_CREATE, - &RQF_LDLM_INTENT_UNLINK, - &RQF_LDLM_INTENT_GETXATTR, - &RQF_LOG_CANCEL, - &RQF_LLOG_ORIGIN_HANDLE_CREATE, - &RQF_LLOG_ORIGIN_HANDLE_DESTROY, - &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, - &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, - &RQF_LLOG_ORIGIN_CONNECT, - &RQF_CONNECT, -}; - -struct req_msg_field { - const __u32 rmf_flags; - const char *rmf_name; - /** - * Field length. 
(-1) means "variable length". If the - * \a RMF_F_STRUCT_ARRAY flag is set the field is also variable-length, - * but the actual size must be a whole multiple of \a rmf_size. - */ - const int rmf_size; - void (*rmf_swabber)(void *); - void (*rmf_dumper)(void *); - int rmf_offset[ARRAY_SIZE(req_formats)][RCL_NR]; -}; - -enum rmf_flags { - /** - * The field is a string, must be NUL-terminated. - */ - RMF_F_STRING = BIT(0), - /** - * The field's buffer size need not match the declared \a rmf_size. - */ - RMF_F_NO_SIZE_CHECK = BIT(1), - /** - * The field's buffer size must be a whole multiple of the declared \a - * rmf_size and the \a rmf_swabber function must work on the declared \a - * rmf_size worth of bytes. - */ - RMF_F_STRUCT_ARRAY = BIT(2) -}; - -struct req_capsule; - -/* - * Request fields. - */ -#define DEFINE_MSGF(name, flags, size, swabber, dumper) { \ - .rmf_name = (name), \ - .rmf_flags = (flags), \ - .rmf_size = (size), \ - .rmf_swabber = (void (*)(void *))(swabber), \ - .rmf_dumper = (void (*)(void *))(dumper) \ -} - -struct req_msg_field RMF_GENERIC_DATA = - DEFINE_MSGF("generic_data", 0, - -1, NULL, NULL); -EXPORT_SYMBOL(RMF_GENERIC_DATA); - -struct req_msg_field RMF_MGS_TARGET_INFO = - DEFINE_MSGF("mgs_target_info", 0, - sizeof(struct mgs_target_info), - lustre_swab_mgs_target_info, NULL); -EXPORT_SYMBOL(RMF_MGS_TARGET_INFO); - -struct req_msg_field RMF_MGS_SEND_PARAM = - DEFINE_MSGF("mgs_send_param", 0, - sizeof(struct mgs_send_param), - NULL, NULL); -EXPORT_SYMBOL(RMF_MGS_SEND_PARAM); - -struct req_msg_field RMF_MGS_CONFIG_BODY = - DEFINE_MSGF("mgs_config_read request", 0, - sizeof(struct mgs_config_body), - lustre_swab_mgs_config_body, NULL); -EXPORT_SYMBOL(RMF_MGS_CONFIG_BODY); - -struct req_msg_field RMF_MGS_CONFIG_RES = - DEFINE_MSGF("mgs_config_read reply ", 0, - sizeof(struct mgs_config_res), - lustre_swab_mgs_config_res, NULL); -EXPORT_SYMBOL(RMF_MGS_CONFIG_RES); - -struct req_msg_field RMF_U32 = - DEFINE_MSGF("generic u32", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); -EXPORT_SYMBOL(RMF_U32); - -struct req_msg_field RMF_SETINFO_VAL = - DEFINE_MSGF("setinfo_val", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_SETINFO_VAL); - -struct req_msg_field RMF_GETINFO_KEY = - DEFINE_MSGF("getinfo_key", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_GETINFO_KEY); - -struct req_msg_field RMF_GETINFO_VALLEN = - DEFINE_MSGF("getinfo_vallen", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); -EXPORT_SYMBOL(RMF_GETINFO_VALLEN); - -struct req_msg_field RMF_GETINFO_VAL = - DEFINE_MSGF("getinfo_val", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_GETINFO_VAL); - -struct req_msg_field RMF_SEQ_OPC = - DEFINE_MSGF("seq_query_opc", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); -EXPORT_SYMBOL(RMF_SEQ_OPC); - -struct req_msg_field RMF_SEQ_RANGE = - DEFINE_MSGF("seq_query_range", 0, - sizeof(struct lu_seq_range), - lustre_swab_lu_seq_range, NULL); -EXPORT_SYMBOL(RMF_SEQ_RANGE); - -struct req_msg_field RMF_FLD_OPC = - DEFINE_MSGF("fld_query_opc", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); -EXPORT_SYMBOL(RMF_FLD_OPC); - -struct req_msg_field RMF_FLD_MDFLD = - DEFINE_MSGF("fld_query_mdfld", 0, - sizeof(struct lu_seq_range), - lustre_swab_lu_seq_range, NULL); -EXPORT_SYMBOL(RMF_FLD_MDFLD); - -struct req_msg_field RMF_MDT_BODY = - DEFINE_MSGF("mdt_body", 0, - sizeof(struct mdt_body), lustre_swab_mdt_body, NULL); -EXPORT_SYMBOL(RMF_MDT_BODY); - -struct req_msg_field RMF_OBD_QUOTACTL = - DEFINE_MSGF("obd_quotactl", 0, - sizeof(struct obd_quotactl), - lustre_swab_obd_quotactl, NULL); 
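/*
 * Editor's sketch: declaring a new fixed-size field with the
 * DEFINE_MSGF() macro above. "struct foo_body" and lustre_swab_foo_body
 * are hypothetical names; everything else follows the pattern of the
 * real RMF definitions in this file.
 */
struct req_msg_field RMF_FOO_BODY =
	DEFINE_MSGF("foo_body", 0,		/* no RMF_F_* flags */
		    sizeof(struct foo_body),	/* fixed wire size */
		    lustre_swab_foo_body,	/* byte-order fixup */
		    NULL);			/* no debug dumper */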
-EXPORT_SYMBOL(RMF_OBD_QUOTACTL); - -struct req_msg_field RMF_MDT_EPOCH = - DEFINE_MSGF("mdt_ioepoch", 0, - sizeof(struct mdt_ioepoch), lustre_swab_mdt_ioepoch, NULL); -EXPORT_SYMBOL(RMF_MDT_EPOCH); - -struct req_msg_field RMF_PTLRPC_BODY = - DEFINE_MSGF("ptlrpc_body", 0, - sizeof(struct ptlrpc_body), lustre_swab_ptlrpc_body, NULL); -EXPORT_SYMBOL(RMF_PTLRPC_BODY); - -struct req_msg_field RMF_CLOSE_DATA = - DEFINE_MSGF("data_version", 0, - sizeof(struct close_data), lustre_swab_close_data, NULL); -EXPORT_SYMBOL(RMF_CLOSE_DATA); - -struct req_msg_field RMF_OBD_STATFS = - DEFINE_MSGF("obd_statfs", 0, - sizeof(struct obd_statfs), lustre_swab_obd_statfs, NULL); -EXPORT_SYMBOL(RMF_OBD_STATFS); - -struct req_msg_field RMF_SETINFO_KEY = - DEFINE_MSGF("setinfo_key", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_SETINFO_KEY); - -struct req_msg_field RMF_NAME = - DEFINE_MSGF("name", RMF_F_STRING, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_NAME); - -struct req_msg_field RMF_SYMTGT = - DEFINE_MSGF("symtgt", RMF_F_STRING, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_SYMTGT); - -struct req_msg_field RMF_TGTUUID = - DEFINE_MSGF("tgtuuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, - NULL); -EXPORT_SYMBOL(RMF_TGTUUID); - -struct req_msg_field RMF_CLUUID = - DEFINE_MSGF("cluuid", RMF_F_STRING, sizeof(struct obd_uuid) - 1, NULL, - NULL); -EXPORT_SYMBOL(RMF_CLUUID); - -struct req_msg_field RMF_STRING = - DEFINE_MSGF("string", RMF_F_STRING, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_STRING); - -struct req_msg_field RMF_LLOGD_BODY = - DEFINE_MSGF("llogd_body", 0, - sizeof(struct llogd_body), lustre_swab_llogd_body, NULL); -EXPORT_SYMBOL(RMF_LLOGD_BODY); - -struct req_msg_field RMF_LLOG_LOG_HDR = - DEFINE_MSGF("llog_log_hdr", 0, - sizeof(struct llog_log_hdr), lustre_swab_llog_hdr, NULL); -EXPORT_SYMBOL(RMF_LLOG_LOG_HDR); - -struct req_msg_field RMF_LLOGD_CONN_BODY = - DEFINE_MSGF("llogd_conn_body", 0, - sizeof(struct llogd_conn_body), - lustre_swab_llogd_conn_body, NULL); -EXPORT_SYMBOL(RMF_LLOGD_CONN_BODY); - -/* - * connection handle received in MDS_CONNECT request. - * - * No swabbing needed because struct lustre_handle contains only a 64-bit cookie - * that the client does not interpret at all. 
- */ -struct req_msg_field RMF_CONN = - DEFINE_MSGF("conn", 0, sizeof(struct lustre_handle), NULL, NULL); -EXPORT_SYMBOL(RMF_CONN); - -struct req_msg_field RMF_CONNECT_DATA = - DEFINE_MSGF("cdata", - RMF_F_NO_SIZE_CHECK /* we allow extra space for interop */, - sizeof(struct obd_connect_data), - lustre_swab_connect, NULL); -EXPORT_SYMBOL(RMF_CONNECT_DATA); - -struct req_msg_field RMF_DLM_REQ = - DEFINE_MSGF("dlm_req", RMF_F_NO_SIZE_CHECK /* ldlm_request_bufsize */, - sizeof(struct ldlm_request), - lustre_swab_ldlm_request, NULL); -EXPORT_SYMBOL(RMF_DLM_REQ); - -struct req_msg_field RMF_DLM_REP = - DEFINE_MSGF("dlm_rep", 0, - sizeof(struct ldlm_reply), lustre_swab_ldlm_reply, NULL); -EXPORT_SYMBOL(RMF_DLM_REP); - -struct req_msg_field RMF_LDLM_INTENT = - DEFINE_MSGF("ldlm_intent", 0, - sizeof(struct ldlm_intent), lustre_swab_ldlm_intent, NULL); -EXPORT_SYMBOL(RMF_LDLM_INTENT); - -struct req_msg_field RMF_DLM_LVB = - DEFINE_MSGF("dlm_lvb", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_DLM_LVB); - -struct req_msg_field RMF_DLM_GL_DESC = - DEFINE_MSGF("dlm_gl_desc", 0, sizeof(union ldlm_gl_desc), - lustre_swab_gl_desc, NULL); -EXPORT_SYMBOL(RMF_DLM_GL_DESC); - -struct req_msg_field RMF_MDT_MD = - DEFINE_MSGF("mdt_md", RMF_F_NO_SIZE_CHECK, MIN_MD_SIZE, NULL, NULL); -EXPORT_SYMBOL(RMF_MDT_MD); - -struct req_msg_field RMF_REC_REINT = - DEFINE_MSGF("rec_reint", 0, sizeof(struct mdt_rec_reint), - lustre_swab_mdt_rec_reint, NULL); -EXPORT_SYMBOL(RMF_REC_REINT); - -/* FIXME: this length should be defined as a macro */ -struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, - NULL, NULL); -EXPORT_SYMBOL(RMF_EADATA); - -struct req_msg_field RMF_EAVALS = DEFINE_MSGF("eavals", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_EAVALS); - -struct req_msg_field RMF_ACL = DEFINE_MSGF("acl", 0, -1, NULL, NULL); -EXPORT_SYMBOL(RMF_ACL); - -/* FIXME: this should be made to use RMF_F_STRUCT_ARRAY */ -struct req_msg_field RMF_LOGCOOKIES = - DEFINE_MSGF("logcookies", RMF_F_NO_SIZE_CHECK /* multiple cookies */, - sizeof(struct llog_cookie), NULL, NULL); -EXPORT_SYMBOL(RMF_LOGCOOKIES); - -struct req_msg_field RMF_CAPA1 = - DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), - lustre_swab_lustre_capa, NULL); -EXPORT_SYMBOL(RMF_CAPA1); - -struct req_msg_field RMF_CAPA2 = - DEFINE_MSGF("capa", 0, sizeof(struct lustre_capa), - lustre_swab_lustre_capa, NULL); -EXPORT_SYMBOL(RMF_CAPA2); - -struct req_msg_field RMF_LAYOUT_INTENT = - DEFINE_MSGF("layout_intent", 0, - sizeof(struct layout_intent), lustre_swab_layout_intent, - NULL); -EXPORT_SYMBOL(RMF_LAYOUT_INTENT); - -/* - * OST request field. 
- */ -struct req_msg_field RMF_OST_BODY = - DEFINE_MSGF("ost_body", 0, - sizeof(struct ost_body), lustre_swab_ost_body, dump_ost_body); -EXPORT_SYMBOL(RMF_OST_BODY); - -struct req_msg_field RMF_OBD_IOOBJ = - DEFINE_MSGF("obd_ioobj", RMF_F_STRUCT_ARRAY, - sizeof(struct obd_ioobj), lustre_swab_obd_ioobj, dump_ioo); -EXPORT_SYMBOL(RMF_OBD_IOOBJ); - -struct req_msg_field RMF_NIOBUF_REMOTE = - DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, - sizeof(struct niobuf_remote), lustre_swab_niobuf_remote, - dump_rniobuf); -EXPORT_SYMBOL(RMF_NIOBUF_REMOTE); - -struct req_msg_field RMF_RCS = - DEFINE_MSGF("niobuf_remote", RMF_F_STRUCT_ARRAY, sizeof(__u32), - lustre_swab_generic_32s, dump_rcs); -EXPORT_SYMBOL(RMF_RCS); - -struct req_msg_field RMF_EAVALS_LENS = - DEFINE_MSGF("eavals_lens", RMF_F_STRUCT_ARRAY, sizeof(__u32), - lustre_swab_generic_32s, NULL); -EXPORT_SYMBOL(RMF_EAVALS_LENS); - -struct req_msg_field RMF_OBD_ID = - DEFINE_MSGF("u64", 0, - sizeof(u64), lustre_swab_ost_last_id, NULL); -EXPORT_SYMBOL(RMF_OBD_ID); - -struct req_msg_field RMF_FID = - DEFINE_MSGF("fid", 0, - sizeof(struct lu_fid), lustre_swab_lu_fid, NULL); -EXPORT_SYMBOL(RMF_FID); - -struct req_msg_field RMF_OST_ID = - DEFINE_MSGF("ost_id", 0, - sizeof(struct ost_id), lustre_swab_ost_id, NULL); -EXPORT_SYMBOL(RMF_OST_ID); - -struct req_msg_field RMF_FIEMAP_KEY = - DEFINE_MSGF("fiemap", 0, sizeof(struct ll_fiemap_info_key), - lustre_swab_fiemap, NULL); -EXPORT_SYMBOL(RMF_FIEMAP_KEY); - -struct req_msg_field RMF_FIEMAP_VAL = - DEFINE_MSGF("fiemap", 0, -1, lustre_swab_fiemap, NULL); -EXPORT_SYMBOL(RMF_FIEMAP_VAL); - -struct req_msg_field RMF_HSM_USER_STATE = - DEFINE_MSGF("hsm_user_state", 0, sizeof(struct hsm_user_state), - lustre_swab_hsm_user_state, NULL); -EXPORT_SYMBOL(RMF_HSM_USER_STATE); - -struct req_msg_field RMF_HSM_STATE_SET = - DEFINE_MSGF("hsm_state_set", 0, sizeof(struct hsm_state_set), - lustre_swab_hsm_state_set, NULL); -EXPORT_SYMBOL(RMF_HSM_STATE_SET); - -struct req_msg_field RMF_MDS_HSM_PROGRESS = - DEFINE_MSGF("hsm_progress", 0, sizeof(struct hsm_progress_kernel), - lustre_swab_hsm_progress_kernel, NULL); -EXPORT_SYMBOL(RMF_MDS_HSM_PROGRESS); - -struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION = - DEFINE_MSGF("hsm_current_action", 0, sizeof(struct hsm_current_action), - lustre_swab_hsm_current_action, NULL); -EXPORT_SYMBOL(RMF_MDS_HSM_CURRENT_ACTION); - -struct req_msg_field RMF_MDS_HSM_USER_ITEM = - DEFINE_MSGF("hsm_user_item", RMF_F_STRUCT_ARRAY, - sizeof(struct hsm_user_item), lustre_swab_hsm_user_item, - NULL); -EXPORT_SYMBOL(RMF_MDS_HSM_USER_ITEM); - -struct req_msg_field RMF_MDS_HSM_ARCHIVE = - DEFINE_MSGF("hsm_archive", 0, - sizeof(__u32), lustre_swab_generic_32s, NULL); -EXPORT_SYMBOL(RMF_MDS_HSM_ARCHIVE); - -struct req_msg_field RMF_MDS_HSM_REQUEST = - DEFINE_MSGF("hsm_request", 0, sizeof(struct hsm_request), - lustre_swab_hsm_request, NULL); -EXPORT_SYMBOL(RMF_MDS_HSM_REQUEST); - -struct req_msg_field RMF_SWAP_LAYOUTS = - DEFINE_MSGF("swap_layouts", 0, sizeof(struct mdc_swap_layouts), - lustre_swab_swap_layouts, NULL); -EXPORT_SYMBOL(RMF_SWAP_LAYOUTS); -/* - * Request formats. 
- */ - -struct req_format { - const char *rf_name; - size_t rf_idx; - struct { - size_t nr; - const struct req_msg_field **d; - } rf_fields[RCL_NR]; -}; - -#define DEFINE_REQ_FMT(name, client, client_nr, server, server_nr) { \ - .rf_name = name, \ - .rf_fields = { \ - [RCL_CLIENT] = { \ - .nr = client_nr, \ - .d = client \ - }, \ - [RCL_SERVER] = { \ - .nr = server_nr, \ - .d = server \ - } \ - } \ -} - -#define DEFINE_REQ_FMT0(name, client, server) \ -DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) - -struct req_format RQF_OBD_PING = - DEFINE_REQ_FMT0("OBD_PING", empty, empty); -EXPORT_SYMBOL(RQF_OBD_PING); - -struct req_format RQF_OBD_SET_INFO = - DEFINE_REQ_FMT0("OBD_SET_INFO", obd_set_info_client, empty); -EXPORT_SYMBOL(RQF_OBD_SET_INFO); - -struct req_format RQF_SEC_CTX = - DEFINE_REQ_FMT0("SEC_CTX", empty, empty); -EXPORT_SYMBOL(RQF_SEC_CTX); - -struct req_format RQF_MGS_TARGET_REG = - DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, - mgs_target_info_only); -EXPORT_SYMBOL(RQF_MGS_TARGET_REG); - -struct req_format RQF_MGS_SET_INFO = - DEFINE_REQ_FMT0("MGS_SET_INFO", mgs_set_info, - mgs_set_info); -EXPORT_SYMBOL(RQF_MGS_SET_INFO); - -struct req_format RQF_MGS_CONFIG_READ = - DEFINE_REQ_FMT0("MGS_CONFIG_READ", mgs_config_read_client, - mgs_config_read_server); -EXPORT_SYMBOL(RQF_MGS_CONFIG_READ); - -struct req_format RQF_SEQ_QUERY = - DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); -EXPORT_SYMBOL(RQF_SEQ_QUERY); - -struct req_format RQF_FLD_QUERY = - DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); -EXPORT_SYMBOL(RQF_FLD_QUERY); - -/* - * The 'fld_read_server' uses 'RMF_GENERIC_DATA' to hold the 'FLD_QUERY' - * RPC reply that is composed of 'struct lu_seq_range_array'. But there - * is not registered swabber function for 'RMF_GENERIC_DATA'. So the RPC - * peers need to handle the RPC reply with fixed little-endian format. - * - * In theory, we can define new structure with some swabber registered to - * handle the 'FLD_QUERY' RPC reply result automatically. But from the - * implementation view, it is not easy to be done within current "struct - * req_msg_field" framework. Because the sequence range array in the RPC - * reply is not fixed length, instead, its length depends on 'lu_seq_range' - * count, that is unknown when prepare the RPC buffer. Generally, for such - * flexible length RPC usage, there will be a field in the RPC layout to - * indicate the data length. But for the 'FLD_READ' RPC, we have no way to - * do that unless we add new length filed that will broken the on-wire RPC - * protocol and cause interoperability trouble with old peer. 
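/*
 * Editor's sketch of the fixed little-endian handling the comment above
 * calls for. Assumption: lu_seq_range carries __u64 lsr_start/lsr_end
 * bounds; since no swabber is registered for RMF_GENERIC_DATA, each
 * range in the FLD_READ reply would need an explicit conversion along
 * these lines:
 */
static void fld_read_le_fixup(struct lu_seq_range *range)
{
	range->lsr_start = le64_to_cpu(range->lsr_start);
	range->lsr_end = le64_to_cpu(range->lsr_end);
}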
- */ -struct req_format RQF_FLD_READ = - DEFINE_REQ_FMT0("FLD_READ", fld_read_client, fld_read_server); -EXPORT_SYMBOL(RQF_FLD_READ); - -struct req_format RQF_LOG_CANCEL = - DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); -EXPORT_SYMBOL(RQF_LOG_CANCEL); - -struct req_format RQF_MDS_QUOTACTL = - DEFINE_REQ_FMT0("MDS_QUOTACTL", quotactl_only, quotactl_only); -EXPORT_SYMBOL(RQF_MDS_QUOTACTL); - -struct req_format RQF_OST_QUOTACTL = - DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); -EXPORT_SYMBOL(RQF_OST_QUOTACTL); - -struct req_format RQF_MDS_GETSTATUS = - DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa); -EXPORT_SYMBOL(RQF_MDS_GETSTATUS); - -struct req_format RQF_MDS_STATFS = - DEFINE_REQ_FMT0("MDS_STATFS", empty, obd_statfs_server); -EXPORT_SYMBOL(RQF_MDS_STATFS); - -struct req_format RQF_MDS_SYNC = - DEFINE_REQ_FMT0("MDS_SYNC", mdt_body_capa, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_SYNC); - -struct req_format RQF_MDS_GETATTR = - DEFINE_REQ_FMT0("MDS_GETATTR", mdt_body_capa, mds_getattr_server); -EXPORT_SYMBOL(RQF_MDS_GETATTR); - -struct req_format RQF_MDS_GETXATTR = - DEFINE_REQ_FMT0("MDS_GETXATTR", - mds_getxattr_client, mds_getxattr_server); -EXPORT_SYMBOL(RQF_MDS_GETXATTR); - -struct req_format RQF_MDS_GETATTR_NAME = - DEFINE_REQ_FMT0("MDS_GETATTR_NAME", - mds_getattr_name_client, mds_getattr_server); -EXPORT_SYMBOL(RQF_MDS_GETATTR_NAME); - -struct req_format RQF_MDS_REINT = - DEFINE_REQ_FMT0("MDS_REINT", mds_reint_client, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_REINT); - -struct req_format RQF_MDS_REINT_CREATE = - DEFINE_REQ_FMT0("MDS_REINT_CREATE", - mds_reint_create_client, mdt_body_capa); -EXPORT_SYMBOL(RQF_MDS_REINT_CREATE); - -struct req_format RQF_MDS_REINT_CREATE_ACL = - DEFINE_REQ_FMT0("MDS_REINT_CREATE_ACL", - mds_reint_create_acl_client, mdt_body_capa); -EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_ACL); - -struct req_format RQF_MDS_REINT_CREATE_SLAVE = - DEFINE_REQ_FMT0("MDS_REINT_CREATE_EA", - mds_reint_create_slave_client, mdt_body_capa); -EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SLAVE); - -struct req_format RQF_MDS_REINT_CREATE_SYM = - DEFINE_REQ_FMT0("MDS_REINT_CREATE_SYM", - mds_reint_create_sym_client, mdt_body_capa); -EXPORT_SYMBOL(RQF_MDS_REINT_CREATE_SYM); - -struct req_format RQF_MDS_REINT_OPEN = - DEFINE_REQ_FMT0("MDS_REINT_OPEN", - mds_reint_open_client, mds_reint_open_server); -EXPORT_SYMBOL(RQF_MDS_REINT_OPEN); - -struct req_format RQF_MDS_REINT_UNLINK = - DEFINE_REQ_FMT0("MDS_REINT_UNLINK", mds_reint_unlink_client, - mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_REINT_UNLINK); - -struct req_format RQF_MDS_REINT_LINK = - DEFINE_REQ_FMT0("MDS_REINT_LINK", - mds_reint_link_client, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_REINT_LINK); - -struct req_format RQF_MDS_REINT_RENAME = - DEFINE_REQ_FMT0("MDS_REINT_RENAME", mds_reint_rename_client, - mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_REINT_RENAME); - -struct req_format RQF_MDS_REINT_MIGRATE = - DEFINE_REQ_FMT0("MDS_REINT_MIGRATE", mds_reint_migrate_client, - mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_REINT_MIGRATE); - -struct req_format RQF_MDS_REINT_SETATTR = - DEFINE_REQ_FMT0("MDS_REINT_SETATTR", - mds_reint_setattr_client, mds_setattr_server); -EXPORT_SYMBOL(RQF_MDS_REINT_SETATTR); - -struct req_format RQF_MDS_REINT_SETXATTR = - DEFINE_REQ_FMT0("MDS_REINT_SETXATTR", - mds_reint_setxattr_client, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_REINT_SETXATTR); - -struct req_format RQF_MDS_CONNECT = - DEFINE_REQ_FMT0("MDS_CONNECT", - obd_connect_client, obd_connect_server); 
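/*
 * Editor's sketch: a new request format is just a name plus the
 * client/server field arrays, as the definitions above show.
 * "RQF_FOO_GETATTR" is hypothetical; mdt_body_capa and mdt_body_only
 * are the real arrays defined earlier. A real format would also need an
 * entry in req_formats[] so req_layout_init() can index it.
 */
struct req_format RQF_FOO_GETATTR =
	DEFINE_REQ_FMT0("FOO_GETATTR", mdt_body_capa, mdt_body_only);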
-EXPORT_SYMBOL(RQF_MDS_CONNECT); - -struct req_format RQF_MDS_DISCONNECT = - DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); -EXPORT_SYMBOL(RQF_MDS_DISCONNECT); - -struct req_format RQF_MDS_GET_INFO = - DEFINE_REQ_FMT0("MDS_GET_INFO", mds_getinfo_client, - mds_getinfo_server); -EXPORT_SYMBOL(RQF_MDS_GET_INFO); - -struct req_format RQF_LDLM_ENQUEUE = - DEFINE_REQ_FMT0("LDLM_ENQUEUE", - ldlm_enqueue_client, ldlm_enqueue_lvb_server); -EXPORT_SYMBOL(RQF_LDLM_ENQUEUE); - -struct req_format RQF_LDLM_ENQUEUE_LVB = - DEFINE_REQ_FMT0("LDLM_ENQUEUE_LVB", - ldlm_enqueue_client, ldlm_enqueue_lvb_server); -EXPORT_SYMBOL(RQF_LDLM_ENQUEUE_LVB); - -struct req_format RQF_LDLM_CONVERT = - DEFINE_REQ_FMT0("LDLM_CONVERT", - ldlm_enqueue_client, ldlm_enqueue_server); -EXPORT_SYMBOL(RQF_LDLM_CONVERT); - -struct req_format RQF_LDLM_CANCEL = - DEFINE_REQ_FMT0("LDLM_CANCEL", ldlm_enqueue_client, empty); -EXPORT_SYMBOL(RQF_LDLM_CANCEL); - -struct req_format RQF_LDLM_CALLBACK = - DEFINE_REQ_FMT0("LDLM_CALLBACK", ldlm_enqueue_client, empty); -EXPORT_SYMBOL(RQF_LDLM_CALLBACK); - -struct req_format RQF_LDLM_CP_CALLBACK = - DEFINE_REQ_FMT0("LDLM_CP_CALLBACK", ldlm_cp_callback_client, empty); -EXPORT_SYMBOL(RQF_LDLM_CP_CALLBACK); - -struct req_format RQF_LDLM_BL_CALLBACK = - DEFINE_REQ_FMT0("LDLM_BL_CALLBACK", ldlm_enqueue_client, empty); -EXPORT_SYMBOL(RQF_LDLM_BL_CALLBACK); - -struct req_format RQF_LDLM_GL_CALLBACK = - DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_enqueue_client, - ldlm_gl_callback_server); -EXPORT_SYMBOL(RQF_LDLM_GL_CALLBACK); - -struct req_format RQF_LDLM_GL_DESC_CALLBACK = - DEFINE_REQ_FMT0("LDLM_GL_CALLBACK", ldlm_gl_callback_desc_client, - ldlm_gl_callback_server); -EXPORT_SYMBOL(RQF_LDLM_GL_DESC_CALLBACK); - -struct req_format RQF_LDLM_INTENT_BASIC = - DEFINE_REQ_FMT0("LDLM_INTENT_BASIC", - ldlm_intent_basic_client, ldlm_enqueue_lvb_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_BASIC); - -struct req_format RQF_LDLM_INTENT = - DEFINE_REQ_FMT0("LDLM_INTENT", - ldlm_intent_client, ldlm_intent_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT); - -struct req_format RQF_LDLM_INTENT_LAYOUT = - DEFINE_REQ_FMT0("LDLM_INTENT_LAYOUT ", - ldlm_intent_layout_client, ldlm_enqueue_lvb_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_LAYOUT); - -struct req_format RQF_LDLM_INTENT_GETATTR = - DEFINE_REQ_FMT0("LDLM_INTENT_GETATTR", - ldlm_intent_getattr_client, ldlm_intent_getattr_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_GETATTR); - -struct req_format RQF_LDLM_INTENT_OPEN = - DEFINE_REQ_FMT0("LDLM_INTENT_OPEN", - ldlm_intent_open_client, ldlm_intent_open_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_OPEN); - -struct req_format RQF_LDLM_INTENT_CREATE = - DEFINE_REQ_FMT0("LDLM_INTENT_CREATE", - ldlm_intent_create_client, ldlm_intent_getattr_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_CREATE); - -struct req_format RQF_LDLM_INTENT_UNLINK = - DEFINE_REQ_FMT0("LDLM_INTENT_UNLINK", - ldlm_intent_unlink_client, ldlm_intent_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_UNLINK); - -struct req_format RQF_LDLM_INTENT_GETXATTR = - DEFINE_REQ_FMT0("LDLM_INTENT_GETXATTR", - ldlm_intent_getxattr_client, - ldlm_intent_getxattr_server); -EXPORT_SYMBOL(RQF_LDLM_INTENT_GETXATTR); - -struct req_format RQF_MDS_CLOSE = - DEFINE_REQ_FMT0("MDS_CLOSE", - mdt_close_client, mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_CLOSE); - -struct req_format RQF_MDS_INTENT_CLOSE = - DEFINE_REQ_FMT0("MDS_CLOSE", - mdt_intent_close_client, mds_last_unlink_server); -EXPORT_SYMBOL(RQF_MDS_INTENT_CLOSE); - -struct req_format RQF_MDS_READPAGE = - DEFINE_REQ_FMT0("MDS_READPAGE", - mdt_body_capa, 
mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_READPAGE); - -struct req_format RQF_MDS_HSM_ACTION = - DEFINE_REQ_FMT0("MDS_HSM_ACTION", mdt_body_capa, mdt_hsm_action_server); -EXPORT_SYMBOL(RQF_MDS_HSM_ACTION); - -struct req_format RQF_MDS_HSM_PROGRESS = - DEFINE_REQ_FMT0("MDS_HSM_PROGRESS", mdt_hsm_progress, empty); -EXPORT_SYMBOL(RQF_MDS_HSM_PROGRESS); - -struct req_format RQF_MDS_HSM_CT_REGISTER = - DEFINE_REQ_FMT0("MDS_HSM_CT_REGISTER", mdt_hsm_ct_register, empty); -EXPORT_SYMBOL(RQF_MDS_HSM_CT_REGISTER); - -struct req_format RQF_MDS_HSM_CT_UNREGISTER = - DEFINE_REQ_FMT0("MDS_HSM_CT_UNREGISTER", mdt_hsm_ct_unregister, empty); -EXPORT_SYMBOL(RQF_MDS_HSM_CT_UNREGISTER); - -struct req_format RQF_MDS_HSM_STATE_GET = - DEFINE_REQ_FMT0("MDS_HSM_STATE_GET", - mdt_body_capa, mdt_hsm_state_get_server); -EXPORT_SYMBOL(RQF_MDS_HSM_STATE_GET); - -struct req_format RQF_MDS_HSM_STATE_SET = - DEFINE_REQ_FMT0("MDS_HSM_STATE_SET", mdt_hsm_state_set, empty); -EXPORT_SYMBOL(RQF_MDS_HSM_STATE_SET); - -struct req_format RQF_MDS_HSM_REQUEST = - DEFINE_REQ_FMT0("MDS_HSM_REQUEST", mdt_hsm_request, empty); -EXPORT_SYMBOL(RQF_MDS_HSM_REQUEST); - -struct req_format RQF_MDS_SWAP_LAYOUTS = - DEFINE_REQ_FMT0("MDS_SWAP_LAYOUTS", - mdt_swap_layouts, empty); -EXPORT_SYMBOL(RQF_MDS_SWAP_LAYOUTS); - -/* This is for split */ -struct req_format RQF_MDS_WRITEPAGE = - DEFINE_REQ_FMT0("MDS_WRITEPAGE", - mdt_body_capa, mdt_body_only); -EXPORT_SYMBOL(RQF_MDS_WRITEPAGE); - -struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_CREATE", - llog_origin_handle_create_client, llogd_body_only); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_CREATE); - -struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_DESTROY", - llogd_body_only, llogd_body_only); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_DESTROY); - -struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_NEXT_BLOCK", - llogd_body_only, llog_origin_handle_next_block_server); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK); - -struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_PREV_BLOCK", - llogd_body_only, llog_origin_handle_next_block_server); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK); - -struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER = - DEFINE_REQ_FMT0("LLOG_ORIGIN_HANDLE_READ_HEADER", - llogd_body_only, llog_log_hdr_only); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_HANDLE_READ_HEADER); - -struct req_format RQF_LLOG_ORIGIN_CONNECT = - DEFINE_REQ_FMT0("LLOG_ORIGIN_CONNECT", llogd_conn_body_only, empty); -EXPORT_SYMBOL(RQF_LLOG_ORIGIN_CONNECT); - -struct req_format RQF_CONNECT = - DEFINE_REQ_FMT0("CONNECT", obd_connect_client, obd_connect_server); -EXPORT_SYMBOL(RQF_CONNECT); - -struct req_format RQF_OST_CONNECT = - DEFINE_REQ_FMT0("OST_CONNECT", - obd_connect_client, obd_connect_server); -EXPORT_SYMBOL(RQF_OST_CONNECT); - -struct req_format RQF_OST_DISCONNECT = - DEFINE_REQ_FMT0("OST_DISCONNECT", empty, empty); -EXPORT_SYMBOL(RQF_OST_DISCONNECT); - -struct req_format RQF_OST_GETATTR = - DEFINE_REQ_FMT0("OST_GETATTR", ost_body_capa, ost_body_only); -EXPORT_SYMBOL(RQF_OST_GETATTR); - -struct req_format RQF_OST_SETATTR = - DEFINE_REQ_FMT0("OST_SETATTR", ost_body_capa, ost_body_only); -EXPORT_SYMBOL(RQF_OST_SETATTR); - -struct req_format RQF_OST_CREATE = - DEFINE_REQ_FMT0("OST_CREATE", ost_body_only, ost_body_only); -EXPORT_SYMBOL(RQF_OST_CREATE); - -struct req_format RQF_OST_PUNCH = - DEFINE_REQ_FMT0("OST_PUNCH", ost_body_capa, ost_body_only); 
-EXPORT_SYMBOL(RQF_OST_PUNCH); - -struct req_format RQF_OST_SYNC = - DEFINE_REQ_FMT0("OST_SYNC", ost_body_capa, ost_body_only); -EXPORT_SYMBOL(RQF_OST_SYNC); - -struct req_format RQF_OST_DESTROY = - DEFINE_REQ_FMT0("OST_DESTROY", ost_destroy_client, ost_body_only); -EXPORT_SYMBOL(RQF_OST_DESTROY); - -struct req_format RQF_OST_BRW_READ = - DEFINE_REQ_FMT0("OST_BRW_READ", ost_brw_client, ost_brw_read_server); -EXPORT_SYMBOL(RQF_OST_BRW_READ); - -struct req_format RQF_OST_BRW_WRITE = - DEFINE_REQ_FMT0("OST_BRW_WRITE", ost_brw_client, ost_brw_write_server); -EXPORT_SYMBOL(RQF_OST_BRW_WRITE); - -struct req_format RQF_OST_STATFS = - DEFINE_REQ_FMT0("OST_STATFS", empty, obd_statfs_server); -EXPORT_SYMBOL(RQF_OST_STATFS); - -struct req_format RQF_OST_SET_GRANT_INFO = - DEFINE_REQ_FMT0("OST_SET_GRANT_INFO", ost_grant_shrink_client, - ost_body_only); -EXPORT_SYMBOL(RQF_OST_SET_GRANT_INFO); - -struct req_format RQF_OST_GET_INFO = - DEFINE_REQ_FMT0("OST_GET_INFO", ost_get_info_generic_client, - ost_get_info_generic_server); -EXPORT_SYMBOL(RQF_OST_GET_INFO); - -struct req_format RQF_OST_GET_INFO_LAST_ID = - DEFINE_REQ_FMT0("OST_GET_INFO_LAST_ID", ost_get_info_generic_client, - ost_get_last_id_server); -EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_ID); - -struct req_format RQF_OST_GET_INFO_LAST_FID = - DEFINE_REQ_FMT0("OST_GET_INFO_LAST_FID", ost_get_last_fid_client, - ost_get_last_fid_server); -EXPORT_SYMBOL(RQF_OST_GET_INFO_LAST_FID); - -struct req_format RQF_OST_SET_INFO_LAST_FID = - DEFINE_REQ_FMT0("OST_SET_INFO_LAST_FID", obd_set_info_client, - empty); -EXPORT_SYMBOL(RQF_OST_SET_INFO_LAST_FID); - -struct req_format RQF_OST_GET_INFO_FIEMAP = - DEFINE_REQ_FMT0("OST_GET_INFO_FIEMAP", ost_get_fiemap_client, - ost_get_fiemap_server); -EXPORT_SYMBOL(RQF_OST_GET_INFO_FIEMAP); - -/* Convenience macro */ -#define FMT_FIELD(fmt, i, j) ((fmt)->rf_fields[(i)].d[(j)]) - -/** - * Initializes the capsule abstraction by computing and setting the \a rf_idx - * field of RQFs and the \a rmf_offset field of RMFs. - */ -int req_layout_init(void) -{ - size_t i; - size_t j; - size_t k; - struct req_format *rf = NULL; - - for (i = 0; i < ARRAY_SIZE(req_formats); ++i) { - rf = req_formats[i]; - rf->rf_idx = i; - for (j = 0; j < RCL_NR; ++j) { - LASSERT(rf->rf_fields[j].nr <= REQ_MAX_FIELD_NR); - for (k = 0; k < rf->rf_fields[j].nr; ++k) { - struct req_msg_field *field; - - field = (typeof(field))rf->rf_fields[j].d[k]; - LASSERT(!(field->rmf_flags & RMF_F_STRUCT_ARRAY) - || field->rmf_size > 0); - LASSERT(field->rmf_offset[i][j] == 0); - /* - * k + 1 to detect unused format/field - * combinations. - */ - field->rmf_offset[i][j] = k + 1; - } - } - } - return 0; -} -EXPORT_SYMBOL(req_layout_init); - -void req_layout_fini(void) -{ -} -EXPORT_SYMBOL(req_layout_fini); - -/** - * Initializes the expected sizes of each RMF in a \a pill (\a rc_area) to -1. - * - * Actual/expected field sizes are set elsewhere in functions in this file: - * req_capsule_init(), req_capsule_server_pack(), req_capsule_set_size() and - * req_capsule_msg_size(). The \a rc_area information is used by. - * ptlrpc_request_set_replen(). - */ -static void req_capsule_init_area(struct req_capsule *pill) -{ - size_t i; - - for (i = 0; i < ARRAY_SIZE(pill->rc_area[RCL_CLIENT]); i++) { - pill->rc_area[RCL_CLIENT][i] = -1; - pill->rc_area[RCL_SERVER][i] = -1; - } -} - -/** - * Initialize a pill. - * - * The \a location indicates whether the caller is executing on the client side - * (RCL_CLIENT) or server side (RCL_SERVER).. 
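/*
 * Editor's note on the "k + 1" stored by req_layout_init() above:
 * rmf_offset[][] is zero-filled statically, so 0 must keep meaning
 * "this field does not occur in this format". Storing k + 1 keeps
 * buffer index 0 usable; __req_capsule_offset() undoes it:
 *
 *	offset = field->rmf_offset[fmt->rf_idx][loc];
 *	LASSERT(offset > 0);	(field really belongs to this format)
 *	offset--;		(back to the 0-based buffer index)
 */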
- */ -void req_capsule_init(struct req_capsule *pill, - struct ptlrpc_request *req, - enum req_location location) -{ - LASSERT(location == RCL_SERVER || location == RCL_CLIENT); - - /* - * Today all capsules are embedded in ptlrpc_request structs, - * but just in case that ever isn't the case, we don't reach - * into req unless req != NULL and pill is the one embedded in - * the req. - * - * The req->rq_pill_init flag makes it safe to initialize a pill - * twice, which might happen in the OST paths as a result of the - * high-priority RPC queue getting peeked at before ost_handle() - * handles an OST RPC. - */ - if (req && pill == &req->rq_pill && req->rq_pill_init) - return; - - memset(pill, 0, sizeof(*pill)); - pill->rc_req = req; - pill->rc_loc = location; - req_capsule_init_area(pill); - - if (req && pill == &req->rq_pill) - req->rq_pill_init = 1; -} -EXPORT_SYMBOL(req_capsule_init); - -void req_capsule_fini(struct req_capsule *pill) -{ -} -EXPORT_SYMBOL(req_capsule_fini); - -static int __req_format_is_sane(const struct req_format *fmt) -{ - return fmt->rf_idx < ARRAY_SIZE(req_formats) && - req_formats[fmt->rf_idx] == fmt; -} - -static struct lustre_msg *__req_msg(const struct req_capsule *pill, - enum req_location loc) -{ - struct ptlrpc_request *req; - - req = pill->rc_req; - return loc == RCL_CLIENT ? req->rq_reqmsg : req->rq_repmsg; -} - -/** - * Set the format (\a fmt) of a \a pill; format changes are not allowed here - * (see req_capsule_extend()). - */ -void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt) -{ - LASSERT(!pill->rc_fmt || pill->rc_fmt == fmt); - LASSERT(__req_format_is_sane(fmt)); - - pill->rc_fmt = fmt; -} -EXPORT_SYMBOL(req_capsule_set); - -/** - * Fills in any parts of the \a rc_area of a \a pill that haven't been filled in - * yet. - - * \a rc_area is an array of REQ_MAX_FIELD_NR elements, used to store sizes of - * variable-sized fields. The field sizes come from the declared \a rmf_size - * field of a \a pill's \a rc_fmt's RMF's. - */ -size_t req_capsule_filled_sizes(struct req_capsule *pill, - enum req_location loc) -{ - const struct req_format *fmt = pill->rc_fmt; - size_t i; - - for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { - if (pill->rc_area[loc][i] == -1) { - pill->rc_area[loc][i] = - fmt->rf_fields[loc].d[i]->rmf_size; - if (pill->rc_area[loc][i] == -1) { - /* - * Skip the following fields. - * - * If this LASSERT() trips then you're missing a - * call to req_capsule_set_size(). - */ - LASSERT(loc != RCL_SERVER); - break; - } - } - } - return i; -} -EXPORT_SYMBOL(req_capsule_filled_sizes); - -/** - * Capsule equivalent of lustre_pack_request() and lustre_pack_reply(). - * - * This function uses the \a pill's \a rc_area as filled in by - * req_capsule_set_size() or req_capsule_filled_sizes() (the latter is called by - * this function). - */ -int req_capsule_server_pack(struct req_capsule *pill) -{ - const struct req_format *fmt; - int count; - int rc; - - LASSERT(pill->rc_loc == RCL_SERVER); - fmt = pill->rc_fmt; - LASSERT(fmt); - - count = req_capsule_filled_sizes(pill, RCL_SERVER); - rc = lustre_pack_reply(pill->rc_req, count, - pill->rc_area[RCL_SERVER], NULL); - if (rc != 0) { - DEBUG_REQ(D_ERROR, pill->rc_req, - "Cannot pack %d fields in format `%s': ", - count, fmt->rf_name); - } - return rc; -} -EXPORT_SYMBOL(req_capsule_server_pack); - -/** - * Returns the PTLRPC request or reply (\a loc) buffer offset of a \a pill - * corresponding to the given RMF (\a field). 
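/*
 * Editor's sketch of the usual server-side sequence built on
 * req_capsule_server_pack() above (error handling trimmed; the
 * zero-length RMF_MDT_MD is just an example of sizing a variable-length
 * reply field before packing):
 */
static int server_reply_sketch(struct req_capsule *pill)
{
	int rc;

	req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, 0);
	rc = req_capsule_server_pack(pill);	/* lustre_pack_reply() */
	if (rc)
		return rc;
	/* reply buffers now exist and can be filled via
	 * req_capsule_server_get()
	 */
	return 0;
}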
- */ -static u32 __req_capsule_offset(const struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc) -{ - u32 offset; - - offset = field->rmf_offset[pill->rc_fmt->rf_idx][loc]; - LASSERTF(offset > 0, "%s:%s, off=%d, loc=%d\n", pill->rc_fmt->rf_name, - field->rmf_name, offset, loc); - offset--; - - LASSERT(offset < REQ_MAX_FIELD_NR); - return offset; -} - -/** - * Helper for __req_capsule_get(); swabs value / array of values and/or dumps - * them if desired. - */ -static -void -swabber_dumper_helper(struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc, - int offset, - void *value, int len, int dump, void (*swabber)(void *)) -{ - void *p; - int i; - int n; - int do_swab; - int inout = loc == RCL_CLIENT; - - swabber = swabber ?: field->rmf_swabber; - - if (ptlrpc_buf_need_swab(pill->rc_req, inout, offset) && - swabber && value) - do_swab = 1; - else - do_swab = 0; - - if (!field->rmf_dumper) - dump = 0; - - if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY)) { - if (dump) { - CDEBUG(D_RPCTRACE, "Dump of %sfield %s follows\n", - do_swab ? "unswabbed " : "", field->rmf_name); - field->rmf_dumper(value); - } - if (!do_swab) - return; - swabber(value); - ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); - if (dump && field->rmf_dumper) { - CDEBUG(D_RPCTRACE, "Dump of swabbed field %s follows\n", - field->rmf_name); - field->rmf_dumper(value); - } - - return; - } - - /* - * We're swabbing an array; swabber() swabs a single array element, so - * swab every element. - */ - LASSERT((len % field->rmf_size) == 0); - for (p = value, i = 0, n = len / field->rmf_size; - i < n; - i++, p += field->rmf_size) { - if (dump) { - CDEBUG(D_RPCTRACE, "Dump of %sarray field %s, element %d follows\n", - do_swab ? "unswabbed " : "", field->rmf_name, i); - field->rmf_dumper(p); - } - if (!do_swab) - continue; - swabber(p); - if (dump) { - CDEBUG(D_RPCTRACE, "Dump of swabbed array field %s, element %d follows\n", - field->rmf_name, i); - field->rmf_dumper(value); - } - } - if (do_swab) - ptlrpc_buf_set_swabbed(pill->rc_req, inout, offset); -} - -/** - * Returns the pointer to a PTLRPC request or reply (\a loc) buffer of a \a pill - * corresponding to the given RMF (\a field). - * - * The buffer will be swabbed using the given \a swabber. If \a swabber == NULL - * then the \a rmf_swabber from the RMF will be used. Soon there will be no - * calls to __req_capsule_get() with a non-NULL \a swabber; \a swabber will then - * be removed. Fields with the \a RMF_F_STRUCT_ARRAY flag set will have each - * element of the array swabbed. - */ -static void *__req_capsule_get(struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc, - void (*swabber)(void *), - int dump) -{ - const struct req_format *fmt; - struct lustre_msg *msg; - void *value; - u32 len; - u32 offset; - - void *(*getter)(struct lustre_msg *m, u32 n, u32 minlen); - - static const char *rcl_names[RCL_NR] = { - [RCL_CLIENT] = "client", - [RCL_SERVER] = "server" - }; - - fmt = pill->rc_fmt; - LASSERT(fmt); - LASSERT(fmt != LP_POISON); - LASSERT(__req_format_is_sane(fmt)); - - offset = __req_capsule_offset(pill, field, loc); - - msg = __req_msg(pill, loc); - LASSERT(msg); - - getter = (field->rmf_flags & RMF_F_STRING) ? - (typeof(getter))lustre_msg_string : lustre_msg_buf; - - if (field->rmf_flags & (RMF_F_STRUCT_ARRAY | RMF_F_NO_SIZE_CHECK)) { - /* - * We've already asserted that field->rmf_size > 0 in - * req_layout_init(). 
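/*
 * Editor's note on swabber_dumper_helper() above: an RMF_F_STRUCT_ARRAY
 * swabber only ever sees a single element, so for a field like RMF_RCS
 * (an array of __u32) the helper's loop effectively expands to:
 *
 *	n = len / sizeof(__u32);
 *	for (i = 0; i < n; i++)
 *		lustre_swab_generic_32s(&rcs[i]);
 */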
- */ - len = lustre_msg_buflen(msg, offset); - if (!(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && - (len % field->rmf_size)) { - CERROR("%s: array field size mismatch %d modulo %u != 0 (%d)\n", - field->rmf_name, len, field->rmf_size, loc); - return NULL; - } - } else if (pill->rc_area[loc][offset] != -1) { - len = pill->rc_area[loc][offset]; - } else { - len = max_t(typeof(field->rmf_size), field->rmf_size, 0); - } - value = getter(msg, offset, len); - - if (!value) { - DEBUG_REQ(D_ERROR, pill->rc_req, - "Wrong buffer for field `%s' (%u of %u) in format `%s': %u vs. %u (%s)\n", - field->rmf_name, offset, lustre_msg_bufcount(msg), - fmt->rf_name, lustre_msg_buflen(msg, offset), len, - rcl_names[loc]); - } else { - swabber_dumper_helper(pill, field, loc, offset, value, len, - dump, swabber); - } - - return value; -} - -/** - * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC request - * buffer corresponding to the given RMF (\a field) of a \a pill. - */ -void *req_capsule_client_get(struct req_capsule *pill, - const struct req_msg_field *field) -{ - return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); -} -EXPORT_SYMBOL(req_capsule_client_get); - -/** - * Same as req_capsule_client_get(), but with a \a swabber argument. - * - * Currently unused; will be removed when req_capsule_server_swab_get() is - * unused too. - */ -void *req_capsule_client_swab_get(struct req_capsule *pill, - const struct req_msg_field *field, - void *swabber) -{ - return __req_capsule_get(pill, field, RCL_CLIENT, swabber, 0); -} -EXPORT_SYMBOL(req_capsule_client_swab_get); - -/** - * Utility that combines req_capsule_set_size() and req_capsule_client_get(). - * - * First the \a pill's request \a field's size is set (\a rc_area) using - * req_capsule_set_size() with the given \a len. Then the actual buffer is - * returned. - */ -void *req_capsule_client_sized_get(struct req_capsule *pill, - const struct req_msg_field *field, - u32 len) -{ - req_capsule_set_size(pill, field, RCL_CLIENT, len); - return __req_capsule_get(pill, field, RCL_CLIENT, NULL, 0); -} -EXPORT_SYMBOL(req_capsule_client_sized_get); - -/** - * Trivial wrapper around __req_capsule_get(), that returns the PTLRPC reply - * buffer corresponding to the given RMF (\a field) of a \a pill. - */ -void *req_capsule_server_get(struct req_capsule *pill, - const struct req_msg_field *field) -{ - return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); -} -EXPORT_SYMBOL(req_capsule_server_get); - -/** - * Same as req_capsule_server_get(), but with a \a swabber argument. - * - * Ideally all swabbing should be done pursuant to RMF definitions, with no - * swabbing done outside this capsule abstraction. - */ -void *req_capsule_server_swab_get(struct req_capsule *pill, - const struct req_msg_field *field, - void *swabber) -{ - return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); -} -EXPORT_SYMBOL(req_capsule_server_swab_get); - -/** - * Utility that combines req_capsule_set_size() and req_capsule_server_get(). - * - * First the \a pill's request \a field's size is set (\a rc_area) using - * req_capsule_set_size() with the given \a len. Then the actual buffer is - * returned. 
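/*
 * Editor's sketch: filling a variable-length request field with
 * req_capsule_client_sized_get() above. Assumes the request was packed
 * with room for namelen + 1 bytes; the helper records the size in
 * rc_area and returns the buffer in one step.
 */
static int pack_name_sketch(struct req_capsule *pill,
			    const char *name, u32 namelen)
{
	char *buf = req_capsule_client_sized_get(pill, &RMF_NAME,
						 namelen + 1);

	if (!buf)
		return -EPROTO;
	memcpy(buf, name, namelen);
	buf[namelen] = '\0';	/* RMF_NAME is RMF_F_STRING */
	return 0;
}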
- */ -void *req_capsule_server_sized_get(struct req_capsule *pill, - const struct req_msg_field *field, - u32 len) -{ - req_capsule_set_size(pill, field, RCL_SERVER, len); - return __req_capsule_get(pill, field, RCL_SERVER, NULL, 0); -} -EXPORT_SYMBOL(req_capsule_server_sized_get); - -void *req_capsule_server_sized_swab_get(struct req_capsule *pill, - const struct req_msg_field *field, - u32 len, void *swabber) -{ - req_capsule_set_size(pill, field, RCL_SERVER, len); - return __req_capsule_get(pill, field, RCL_SERVER, swabber, 0); -} -EXPORT_SYMBOL(req_capsule_server_sized_swab_get); - -/** - * Set the size of the PTLRPC request/reply (\a loc) buffer for the given \a - * field of the given \a pill. - * - * This function must be used when constructing variable sized fields of a - * request or reply. - */ -void req_capsule_set_size(struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc, u32 size) -{ - LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); - - if ((size != (u32)field->rmf_size) && - (field->rmf_size != -1) && - !(field->rmf_flags & RMF_F_NO_SIZE_CHECK) && - (size > 0)) { - u32 rmf_size = (u32)field->rmf_size; - - if ((field->rmf_flags & RMF_F_STRUCT_ARRAY) && - (size % rmf_size != 0)) { - CERROR("%s: array field size mismatch %u %% %u != 0 (%d)\n", - field->rmf_name, size, rmf_size, loc); - LBUG(); - } else if (!(field->rmf_flags & RMF_F_STRUCT_ARRAY) && - size < rmf_size) { - CERROR("%s: field size mismatch %u != %u (%d)\n", - field->rmf_name, size, rmf_size, loc); - LBUG(); - } - } - - pill->rc_area[loc][__req_capsule_offset(pill, field, loc)] = size; -} -EXPORT_SYMBOL(req_capsule_set_size); - -/** - * Return the actual PTLRPC buffer length of a request or reply (\a loc) - * for the given \a pill's given \a field. - * - * NB: this function doesn't correspond with req_capsule_set_size(), which - * actually sets the size in pill.rc_area[loc][offset], but this function - * returns the message buflen[offset], maybe we should use another name. - */ -u32 req_capsule_get_size(const struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc) -{ - LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); - - return lustre_msg_buflen(__req_msg(pill, loc), - __req_capsule_offset(pill, field, loc)); -} -EXPORT_SYMBOL(req_capsule_get_size); - -/** - * Wrapper around lustre_msg_size() that returns the PTLRPC size needed for the - * given \a pill's request or reply (\a loc) given the field size recorded in - * the \a pill's rc_area. - * - * See also req_capsule_set_size(). - */ -u32 req_capsule_msg_size(struct req_capsule *pill, enum req_location loc) -{ - return lustre_msg_size(pill->rc_req->rq_import->imp_msg_magic, - pill->rc_fmt->rf_fields[loc].nr, - pill->rc_area[loc]); -} - -/** - * While req_capsule_msg_size() computes the size of a PTLRPC request or reply - * (\a loc) given a \a pill's \a rc_area, this function computes the size of a - * PTLRPC request or reply given only an RQF (\a fmt). - * - * This function should not be used for formats which contain variable size - * fields. - */ -u32 req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, - enum req_location loc) -{ - size_t i = 0; - u32 size; - - /* - * This function should probably LASSERT() that fmt has no fields with - * RMF_F_STRUCT_ARRAY in rmf_flags, since we can't know here how many - * elements in the array there will ultimately be, but then, we could - * assume that there will be at least one element, and that's just what - * we do. 
- */ - size = lustre_msg_hdr_size(magic, fmt->rf_fields[loc].nr); - if (!size) - return size; - - for (; i < fmt->rf_fields[loc].nr; ++i) - if (fmt->rf_fields[loc].d[i]->rmf_size != -1) - size += cfs_size_round(fmt->rf_fields[loc].d[i]-> - rmf_size); - return size; -} - -/** - * Changes the format of an RPC. - * - * The pill must already have been initialized, which means that it already has - * a request format. The new format \a fmt must be an extension of the pill's - * old format. Specifically: the new format must have as many request and reply - * fields as the old one, and all fields shared by the old and new format must - * be at least as large in the new format. - * - * The new format's fields may be of different "type" than the old format, but - * only for fields that are "opaque" blobs: fields which have a) have no - * \a rmf_swabber, b) \a rmf_flags == 0 or RMF_F_NO_SIZE_CHECK, and c) \a - * rmf_size == -1 or \a rmf_flags == RMF_F_NO_SIZE_CHECK. For example, - * OBD_SET_INFO has a key field and an opaque value field that gets interpreted - * according to the key field. When the value, according to the key, contains a - * structure (or array thereof) to be swabbed, the format should be changed to - * one where the value field has \a rmf_size/rmf_flags/rmf_swabber set - * accordingly. - */ -void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt) -{ - int i; - size_t j; - - const struct req_format *old; - - LASSERT(pill->rc_fmt); - LASSERT(__req_format_is_sane(fmt)); - - old = pill->rc_fmt; - /* - * Sanity checking... - */ - for (i = 0; i < RCL_NR; ++i) { - LASSERT(fmt->rf_fields[i].nr >= old->rf_fields[i].nr); - for (j = 0; j < old->rf_fields[i].nr - 1; ++j) { - const struct req_msg_field *ofield = FMT_FIELD(old, i, j); - - /* "opaque" fields can be transmogrified */ - if (!ofield->rmf_swabber && - (ofield->rmf_flags & ~RMF_F_NO_SIZE_CHECK) == 0 && - (ofield->rmf_size == -1 || - ofield->rmf_flags == RMF_F_NO_SIZE_CHECK)) - continue; - LASSERT(FMT_FIELD(fmt, i, j) == FMT_FIELD(old, i, j)); - } - /* - * Last field in old format can be shorter than in new. - */ - LASSERT(FMT_FIELD(fmt, i, j)->rmf_size >= - FMT_FIELD(old, i, j)->rmf_size); - } - - pill->rc_fmt = fmt; -} -EXPORT_SYMBOL(req_capsule_extend); - -/** - * This function returns a non-zero value if the given \a field is present in - * the format (\a rc_fmt) of \a pill's PTLRPC request or reply (\a loc), else it - * returns 0. - */ -int req_capsule_has_field(const struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc) -{ - LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); - - return field->rmf_offset[pill->rc_fmt->rf_idx][loc]; -} -EXPORT_SYMBOL(req_capsule_has_field); - -/** - * Returns a non-zero value if the given \a field is present in the given \a - * pill's PTLRPC request or reply (\a loc), else it returns 0. - */ -static int req_capsule_field_present(const struct req_capsule *pill, - const struct req_msg_field *field, - enum req_location loc) -{ - u32 offset; - - LASSERT(loc == RCL_SERVER || loc == RCL_CLIENT); - LASSERT(req_capsule_has_field(pill, field, loc)); - - offset = __req_capsule_offset(pill, field, loc); - return lustre_msg_bufcount(__req_msg(pill, loc)) > offset; -} - -/** - * This function shrinks the size of the _buffer_ of the \a pill's PTLRPC - * request or reply (\a loc). - * - * This is not the opposite of req_capsule_extend(). 
- */ -void req_capsule_shrink(struct req_capsule *pill, - const struct req_msg_field *field, - u32 newlen, enum req_location loc) -{ - const struct req_format *fmt; - struct lustre_msg *msg; - u32 len; - int offset; - - fmt = pill->rc_fmt; - LASSERT(fmt); - LASSERT(__req_format_is_sane(fmt)); - LASSERT(req_capsule_has_field(pill, field, loc)); - LASSERT(req_capsule_field_present(pill, field, loc)); - - offset = __req_capsule_offset(pill, field, loc); - - msg = __req_msg(pill, loc); - len = lustre_msg_buflen(msg, offset); - LASSERTF(newlen <= len, "%s:%s, oldlen=%u, newlen=%u\n", - fmt->rf_name, field->rmf_name, len, newlen); - - if (loc == RCL_CLIENT) - pill->rc_req->rq_reqlen = lustre_shrink_msg(msg, offset, newlen, - 1); - else - pill->rc_req->rq_replen = lustre_shrink_msg(msg, offset, newlen, - 1); -} -EXPORT_SYMBOL(req_capsule_shrink); diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c b/drivers/staging/lustre/lustre/ptlrpc/llog_client.c deleted file mode 100644 index 946d538121dea139ef367aaf95cd747bee076137..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/llog_client.c +++ /dev/null @@ -1,338 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/llog_client.c - * - * remote api for llog - client side - * - * Author: Andreas Dilger - */ - -#define DEBUG_SUBSYSTEM S_LOG - -#include -#include -#include -#include - -#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ - mutex_lock(&ctxt->loc_mutex); \ - if (ctxt->loc_imp) { \ - imp = class_import_get(ctxt->loc_imp); \ - } else { \ - CERROR("ctxt->loc_imp == NULL for context idx %d." \ - "Unable to complete MDS/OSS recovery," \ - "but I'll try again next time. Not fatal.\n", \ - ctxt->loc_idx); \ - imp = NULL; \ - mutex_unlock(&ctxt->loc_mutex); \ - return (-EINVAL); \ - } \ - mutex_unlock(&ctxt->loc_mutex); \ -} while (0) - -#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ - mutex_lock(&ctxt->loc_mutex); \ - if (ctxt->loc_imp != imp) \ - CWARN("loc_imp has changed from %p to %p\n", \ - ctxt->loc_imp, imp); \ - class_import_put(imp); \ - mutex_unlock(&ctxt->loc_mutex); \ -} while (0) - -/* This is a callback from the llog_* functions. - * Assumes caller has already pushed us into the kernel context. 
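
The LLOG_CLIENT_ENTRY/EXIT macro pair above is the classic "take a counted reference under the lock, use it unlocked, drop it afterwards" idiom; the CWARN on exit only detects, and cannot prevent, loc_imp being repointed mid-operation. A minimal userspace rendering of the same pattern (the refcounted import type here is hypothetical):

#include <pthread.h>
#include <stdio.h>

struct import {
        int refcount;
};

static struct import the_import = { .refcount = 1 };
static struct import *loc_imp = &the_import;
static pthread_mutex_t loc_mutex = PTHREAD_MUTEX_INITIALIZER;

static struct import *client_entry(void)
{
        struct import *imp;

        pthread_mutex_lock(&loc_mutex);
        imp = loc_imp;
        if (imp)
                imp->refcount++;        /* class_import_get() */
        pthread_mutex_unlock(&loc_mutex);
        return imp;     /* NULL means "retry later", as in the macro */
}

static void client_exit(struct import *imp)
{
        pthread_mutex_lock(&loc_mutex);
        if (loc_imp != imp)
                fprintf(stderr, "import changed under us\n");
        imp->refcount--;                /* class_import_put() */
        pthread_mutex_unlock(&loc_mutex);
}

int main(void)
{
        struct import *imp = client_entry();

        if (imp) {
                /* ... issue RPCs against imp here ... */
                client_exit(imp);
        }
        printf("final refcount: %d\n", the_import.refcount);
        return 0;
}
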
- */ -static int llog_client_open(const struct lu_env *env, - struct llog_handle *lgh, struct llog_logid *logid, - char *name, enum llog_open_param open_param) -{ - struct obd_import *imp; - struct llogd_body *body; - struct llog_ctxt *ctxt = lgh->lgh_ctxt; - struct ptlrpc_request *req = NULL; - int rc; - - LLOG_CLIENT_ENTRY(ctxt, imp); - - /* client cannot create llog */ - LASSERTF(open_param != LLOG_OPEN_NEW, "%#x\n", open_param); - LASSERT(lgh); - - req = ptlrpc_request_alloc(imp, &RQF_LLOG_ORIGIN_HANDLE_CREATE); - if (!req) { - rc = -ENOMEM; - goto out; - } - - if (name) - req_capsule_set_size(&req->rq_pill, &RMF_NAME, RCL_CLIENT, - strlen(name) + 1); - - rc = ptlrpc_request_pack(req, LUSTRE_LOG_VERSION, - LLOG_ORIGIN_HANDLE_CREATE); - if (rc) { - ptlrpc_request_free(req); - req = NULL; - goto out; - } - ptlrpc_request_set_replen(req); - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - if (logid) - body->lgd_logid = *logid; - body->lgd_ctxt_idx = ctxt->loc_idx - 1; - - if (name) { - char *tmp; - - tmp = req_capsule_client_sized_get(&req->rq_pill, &RMF_NAME, - strlen(name) + 1); - LASSERT(tmp); - strcpy(tmp, name); - } - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); - if (!body) { - rc = -EFAULT; - goto out; - } - - lgh->lgh_id = body->lgd_logid; - lgh->lgh_ctxt = ctxt; -out: - LLOG_CLIENT_EXIT(ctxt, imp); - ptlrpc_req_finished(req); - return rc; -} - -static int llog_client_next_block(const struct lu_env *env, - struct llog_handle *loghandle, - int *cur_idx, int next_idx, - __u64 *cur_offset, void *buf, int len) -{ - struct obd_import *imp; - struct ptlrpc_request *req = NULL; - struct llogd_body *body; - void *ptr; - int rc; - - LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); - req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK, - LUSTRE_LOG_VERSION, - LLOG_ORIGIN_HANDLE_NEXT_BLOCK); - if (!req) { - rc = -ENOMEM; - goto err_exit; - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - body->lgd_logid = loghandle->lgh_id; - body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; - body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; - body->lgd_index = next_idx; - body->lgd_saved_index = *cur_idx; - body->lgd_len = len; - body->lgd_cur_offset = *cur_offset; - - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); - if (!body) { - rc = -EFAULT; - goto out; - } - - /* The log records are swabbed as they are processed */ - ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); - if (!ptr) { - rc = -EFAULT; - goto out; - } - - *cur_idx = body->lgd_saved_index; - *cur_offset = body->lgd_cur_offset; - - memcpy(buf, ptr, len); -out: - ptlrpc_req_finished(req); -err_exit: - LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); - return rc; -} - -static int llog_client_prev_block(const struct lu_env *env, - struct llog_handle *loghandle, - int prev_idx, void *buf, int len) -{ - struct obd_import *imp; - struct ptlrpc_request *req = NULL; - struct llogd_body *body; - void *ptr; - int rc; - - LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); - req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK, - LUSTRE_LOG_VERSION, - LLOG_ORIGIN_HANDLE_PREV_BLOCK); - if (!req) { - rc = -ENOMEM; - goto err_exit; - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - body->lgd_logid = 
loghandle->lgh_id; - body->lgd_ctxt_idx = loghandle->lgh_ctxt->loc_idx - 1; - body->lgd_llh_flags = loghandle->lgh_hdr->llh_flags; - body->lgd_index = prev_idx; - body->lgd_len = len; - - req_capsule_set_size(&req->rq_pill, &RMF_EADATA, RCL_SERVER, len); - ptlrpc_request_set_replen(req); - - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - body = req_capsule_server_get(&req->rq_pill, &RMF_LLOGD_BODY); - if (!body) { - rc = -EFAULT; - goto out; - } - - ptr = req_capsule_server_get(&req->rq_pill, &RMF_EADATA); - if (!ptr) { - rc = -EFAULT; - goto out; - } - - memcpy(buf, ptr, len); -out: - ptlrpc_req_finished(req); -err_exit: - LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); - return rc; -} - -static int llog_client_read_header(const struct lu_env *env, - struct llog_handle *handle) -{ - struct obd_import *imp; - struct ptlrpc_request *req = NULL; - struct llogd_body *body; - struct llog_log_hdr *hdr; - struct llog_rec_hdr *llh_hdr; - int rc; - - LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); - req = ptlrpc_request_alloc_pack(imp, &RQF_LLOG_ORIGIN_HANDLE_READ_HEADER, - LUSTRE_LOG_VERSION, - LLOG_ORIGIN_HANDLE_READ_HEADER); - if (!req) { - rc = -ENOMEM; - goto err_exit; - } - - body = req_capsule_client_get(&req->rq_pill, &RMF_LLOGD_BODY); - body->lgd_logid = handle->lgh_id; - body->lgd_ctxt_idx = handle->lgh_ctxt->loc_idx - 1; - body->lgd_llh_flags = handle->lgh_hdr->llh_flags; - - ptlrpc_request_set_replen(req); - rc = ptlrpc_queue_wait(req); - if (rc) - goto out; - - hdr = req_capsule_server_get(&req->rq_pill, &RMF_LLOG_LOG_HDR); - if (!hdr) { - rc = -EFAULT; - goto out; - } - - if (handle->lgh_hdr_size < hdr->llh_hdr.lrh_len) { - rc = -EFAULT; - goto out; - } - - memcpy(handle->lgh_hdr, hdr, hdr->llh_hdr.lrh_len); - handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; - - /* sanity checks */ - llh_hdr = &handle->lgh_hdr->llh_hdr; - if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { - CERROR("bad log header magic: %#x (expecting %#x)\n", - llh_hdr->lrh_type, LLOG_HDR_MAGIC); - rc = -EIO; - } else if (llh_hdr->lrh_len != - LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len || - (llh_hdr->lrh_len & (llh_hdr->lrh_len - 1)) || - llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || - llh_hdr->lrh_len > handle->lgh_hdr_size) { - CERROR("incorrectly sized log header: %#x (expecting %#x) (power of two > 8192)\n", - llh_hdr->lrh_len, - LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len); - CERROR("you may need to re-run lconf --write_conf.\n"); - rc = -EIO; - } -out: - ptlrpc_req_finished(req); -err_exit: - LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); - return rc; -} - -static int llog_client_close(const struct lu_env *env, - struct llog_handle *handle) -{ - /* this doesn't call LLOG_ORIGIN_HANDLE_CLOSE because - * the servers all close the file at the end of every - * other LLOG_ RPC. - */ - return 0; -} - -struct llog_operations llog_client_ops = { - .lop_next_block = llog_client_next_block, - .lop_prev_block = llog_client_prev_block, - .lop_read_header = llog_client_read_header, - .lop_open = llog_client_open, - .lop_close = llog_client_close, -}; -EXPORT_SYMBOL(llog_client_ops); diff --git a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c b/drivers/staging/lustre/lustre/ptlrpc/llog_net.c deleted file mode 100644 index b871d9e40a9efa031a718bd91b68c8c352abdd80..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/llog_net.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
- * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/llog_net.c - * - * OST<->MDS recovery logging infrastructure. - * - * Invariants in implementation: - * - we do not share logs among different OST<->MDS connections, so that - * if an OST or MDS fails it need only look at log(s) relevant to itself - * - * Author: Andreas Dilger - */ - -#define DEBUG_SUBSYSTEM S_LOG - -#include -#include -#include - -int llog_initiator_connect(struct llog_ctxt *ctxt) -{ - struct obd_import *new_imp; - - LASSERT(ctxt); - new_imp = ctxt->loc_obd->u.cli.cl_import; - LASSERTF(!ctxt->loc_imp || ctxt->loc_imp == new_imp, - "%p - %p\n", ctxt->loc_imp, new_imp); - mutex_lock(&ctxt->loc_mutex); - if (ctxt->loc_imp != new_imp) { - if (ctxt->loc_imp) - class_import_put(ctxt->loc_imp); - ctxt->loc_imp = class_import_get(new_imp); - } - mutex_unlock(&ctxt->loc_mutex); - return 0; -} -EXPORT_SYMBOL(llog_initiator_connect); diff --git a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c b/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c deleted file mode 100644 index 0b638837f88b0169b0b297752014e9a99f9865b7..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/lproc_ptlrpc.c +++ /dev/null @@ -1,1316 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
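
The opcode table that opens this file must stay in lockstep with opcode_offset() in ptlrpc_internal.h; the LASSERTFs in ll_opcode2str() exist precisely to catch an opcode that was added to the wire protocol but not to the table, as its own comment explains. A toy version of that dense-table-plus-consistency-check scheme, with an invented two-range opcode space:

#include <assert.h>
#include <stdio.h>

/* hypothetical opcode space: two disjoint ranges, like OST_* and MDS_* */
enum { OP_READ = 3, OP_WRITE = 4, OP_GETATTR = 101, MAX_OPS = 3 };

static const struct {
        unsigned int opcode;
        const char *name;
} opcode_table[MAX_OPS] = {
        { OP_READ, "read" },
        { OP_WRITE, "write" },
        { OP_GETATTR, "getattr" },
};

/* collapse the sparse opcode space into a dense table index */
static unsigned int opcode_offset(unsigned int opcode)
{
        return opcode >= 100 ? 2 + (opcode - 101) : opcode - 3;
}

static const char *opcode2str(unsigned int opcode)
{
        unsigned int off = opcode_offset(opcode);

        assert(off < MAX_OPS);
        assert(opcode_table[off].opcode == opcode); /* table out of sync? */
        return opcode_table[off].name;
}

int main(void)
{
        printf("%s %s %s\n", opcode2str(OP_READ), opcode2str(OP_WRITE),
               opcode2str(OP_GETATTR));
        return 0;
}
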
- */ -#define DEBUG_SUBSYSTEM S_CLASS - -#include -#include -#include -#include -#include -#include -#include "ptlrpc_internal.h" - -static struct ll_rpc_opcode { - __u32 opcode; - const char *opname; -} ll_rpc_opcode_table[LUSTRE_MAX_OPCODES] = { - { OST_REPLY, "ost_reply" }, - { OST_GETATTR, "ost_getattr" }, - { OST_SETATTR, "ost_setattr" }, - { OST_READ, "ost_read" }, - { OST_WRITE, "ost_write" }, - { OST_CREATE, "ost_create" }, - { OST_DESTROY, "ost_destroy" }, - { OST_GET_INFO, "ost_get_info" }, - { OST_CONNECT, "ost_connect" }, - { OST_DISCONNECT, "ost_disconnect" }, - { OST_PUNCH, "ost_punch" }, - { OST_OPEN, "ost_open" }, - { OST_CLOSE, "ost_close" }, - { OST_STATFS, "ost_statfs" }, - { 14, NULL }, /* formerly OST_SAN_READ */ - { 15, NULL }, /* formerly OST_SAN_WRITE */ - { OST_SYNC, "ost_sync" }, - { OST_SET_INFO, "ost_set_info" }, - { OST_QUOTACHECK, "ost_quotacheck" }, - { OST_QUOTACTL, "ost_quotactl" }, - { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, - { MDS_GETATTR, "mds_getattr" }, - { MDS_GETATTR_NAME, "mds_getattr_lock" }, - { MDS_CLOSE, "mds_close" }, - { MDS_REINT, "mds_reint" }, - { MDS_READPAGE, "mds_readpage" }, - { MDS_CONNECT, "mds_connect" }, - { MDS_DISCONNECT, "mds_disconnect" }, - { MDS_GETSTATUS, "mds_getstatus" }, - { MDS_STATFS, "mds_statfs" }, - { MDS_PIN, "mds_pin" }, - { MDS_UNPIN, "mds_unpin" }, - { MDS_SYNC, "mds_sync" }, - { MDS_DONE_WRITING, "mds_done_writing" }, - { MDS_SET_INFO, "mds_set_info" }, - { MDS_QUOTACHECK, "mds_quotacheck" }, - { MDS_QUOTACTL, "mds_quotactl" }, - { MDS_GETXATTR, "mds_getxattr" }, - { MDS_SETXATTR, "mds_setxattr" }, - { MDS_WRITEPAGE, "mds_writepage" }, - { MDS_IS_SUBDIR, "mds_is_subdir" }, - { MDS_GET_INFO, "mds_get_info" }, - { MDS_HSM_STATE_GET, "mds_hsm_state_get" }, - { MDS_HSM_STATE_SET, "mds_hsm_state_set" }, - { MDS_HSM_ACTION, "mds_hsm_action" }, - { MDS_HSM_PROGRESS, "mds_hsm_progress" }, - { MDS_HSM_REQUEST, "mds_hsm_request" }, - { MDS_HSM_CT_REGISTER, "mds_hsm_ct_register" }, - { MDS_HSM_CT_UNREGISTER, "mds_hsm_ct_unregister" }, - { MDS_SWAP_LAYOUTS, "mds_swap_layouts" }, - { LDLM_ENQUEUE, "ldlm_enqueue" }, - { LDLM_CONVERT, "ldlm_convert" }, - { LDLM_CANCEL, "ldlm_cancel" }, - { LDLM_BL_CALLBACK, "ldlm_bl_callback" }, - { LDLM_CP_CALLBACK, "ldlm_cp_callback" }, - { LDLM_GL_CALLBACK, "ldlm_gl_callback" }, - { LDLM_SET_INFO, "ldlm_set_info" }, - { MGS_CONNECT, "mgs_connect" }, - { MGS_DISCONNECT, "mgs_disconnect" }, - { MGS_EXCEPTION, "mgs_exception" }, - { MGS_TARGET_REG, "mgs_target_reg" }, - { MGS_TARGET_DEL, "mgs_target_del" }, - { MGS_SET_INFO, "mgs_set_info" }, - { MGS_CONFIG_READ, "mgs_config_read" }, - { OBD_PING, "obd_ping" }, - { OBD_LOG_CANCEL, "llog_cancel" }, - { OBD_QC_CALLBACK, "obd_quota_callback" }, - { OBD_IDX_READ, "dt_index_read" }, - { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_open" }, - { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, - { LLOG_ORIGIN_HANDLE_READ_HEADER, "llog_origin_handle_read_header" }, - { LLOG_ORIGIN_HANDLE_WRITE_REC, "llog_origin_handle_write_rec" }, - { LLOG_ORIGIN_HANDLE_CLOSE, "llog_origin_handle_close" }, - { LLOG_ORIGIN_CONNECT, "llog_origin_connect" }, - { LLOG_CATINFO, "llog_catinfo" }, - { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, - { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, - { QUOTA_DQACQ, "quota_acquire" }, - { QUOTA_DQREL, "quota_release" }, - { SEQ_QUERY, "seq_query" }, - { SEC_CTX_INIT, "sec_ctx_init" }, - { SEC_CTX_INIT_CONT, "sec_ctx_init_cont" }, - { SEC_CTX_FINI, "sec_ctx_fini" 
}, - { FLD_QUERY, "fld_query" }, - { FLD_READ, "fld_read" }, -}; - -static struct ll_eopcode { - __u32 opcode; - const char *opname; -} ll_eopcode_table[EXTRA_LAST_OPC] = { - { LDLM_GLIMPSE_ENQUEUE, "ldlm_glimpse_enqueue" }, - { LDLM_PLAIN_ENQUEUE, "ldlm_plain_enqueue" }, - { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, - { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, - { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, - { MDS_REINT_SETATTR, "mds_reint_setattr" }, - { MDS_REINT_CREATE, "mds_reint_create" }, - { MDS_REINT_LINK, "mds_reint_link" }, - { MDS_REINT_UNLINK, "mds_reint_unlink" }, - { MDS_REINT_RENAME, "mds_reint_rename" }, - { MDS_REINT_OPEN, "mds_reint_open" }, - { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, - { BRW_READ_BYTES, "read_bytes" }, - { BRW_WRITE_BYTES, "write_bytes" }, -}; - -const char *ll_opcode2str(__u32 opcode) -{ - /* When one of the assertions below fail, chances are that: - * 1) A new opcode was added in include/lustre/lustre_idl.h, - * but is missing from the table above. - * or 2) The opcode space was renumbered or rearranged, - * and the opcode_offset() function in - * ptlrpc_internal.h needs to be modified. - */ - __u32 offset = opcode_offset(opcode); - - LASSERTF(offset < LUSTRE_MAX_OPCODES, - "offset %u >= LUSTRE_MAX_OPCODES %u\n", - offset, LUSTRE_MAX_OPCODES); - LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, - "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", - offset, ll_rpc_opcode_table[offset].opcode, opcode); - return ll_rpc_opcode_table[offset].opname; -} - -static const char *ll_eopcode2str(__u32 opcode) -{ - LASSERT(ll_eopcode_table[opcode].opcode == opcode); - return ll_eopcode_table[opcode].opname; -} - -static void -ptlrpc_ldebugfs_register(struct dentry *root, char *dir, - char *name, - struct dentry **debugfs_root_ret, - struct lprocfs_stats **stats_ret) -{ - struct dentry *svc_debugfs_entry; - struct lprocfs_stats *svc_stats; - int i; - unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | - LPROCFS_CNTR_STDDEV; - - LASSERT(!*debugfs_root_ret); - LASSERT(!*stats_ret); - - svc_stats = lprocfs_alloc_stats(EXTRA_MAX_OPCODES + LUSTRE_MAX_OPCODES, - 0); - if (!svc_stats) - return; - - if (dir) - svc_debugfs_entry = debugfs_create_dir(dir, root); - else - svc_debugfs_entry = root; - - lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, - svc_counter_config, "req_waittime", "usec"); - lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, - svc_counter_config, "req_qdepth", "reqs"); - lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, - svc_counter_config, "req_active", "reqs"); - lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, - svc_counter_config, "req_timeout", "sec"); - lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, - svc_counter_config, "reqbuf_avail", "bufs"); - for (i = 0; i < EXTRA_LAST_OPC; i++) { - char *units; - - switch (i) { - case BRW_WRITE_BYTES: - case BRW_READ_BYTES: - units = "bytes"; - break; - default: - units = "reqs"; - break; - } - lprocfs_counter_init(svc_stats, PTLRPC_LAST_CNTR + i, - svc_counter_config, - ll_eopcode2str(i), units); - } - for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { - __u32 opcode = ll_rpc_opcode_table[i].opcode; - - lprocfs_counter_init(svc_stats, - EXTRA_MAX_OPCODES + i, svc_counter_config, - ll_opcode2str(opcode), "usec"); - } - - debugfs_create_file("stats", 0644, svc_debugfs_entry, svc_stats, - &lprocfs_stats_seq_fops); - if (dir) - *debugfs_root_ret = svc_debugfs_entry; - *stats_ret = svc_stats; -} - -static int -ptlrpc_lprocfs_req_history_len_seq_show(struct seq_file *m, 
void *v) -{ - struct ptlrpc_service *svc = m->private; - struct ptlrpc_service_part *svcpt; - int total = 0; - int i; - - ptlrpc_service_for_each_part(svcpt, i, svc) - total += svcpt->scp_hist_nrqbds; - - seq_printf(m, "%d\n", total); - return 0; -} - -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_req_history_len); - -static int -ptlrpc_lprocfs_req_history_max_seq_show(struct seq_file *m, void *n) -{ - struct ptlrpc_service *svc = m->private; - struct ptlrpc_service_part *svcpt; - int total = 0; - int i; - - ptlrpc_service_for_each_part(svcpt, i, svc) - total += svc->srv_hist_nrqbds_cpt_max; - - seq_printf(m, "%d\n", total); - return 0; -} - -static ssize_t -ptlrpc_lprocfs_req_history_max_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; - int bufpages; - int val; - int rc; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc < 0) - return rc; - - if (val < 0) - return -ERANGE; - - /* This sanity check is more of an insanity check; we can still - * hose a kernel by allowing the request history to grow too - * far. - */ - bufpages = (svc->srv_buf_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (val > totalram_pages / (2 * bufpages)) - return -ERANGE; - - spin_lock(&svc->srv_lock); - - if (val == 0) - svc->srv_hist_nrqbds_cpt_max = 0; - else - svc->srv_hist_nrqbds_cpt_max = max(1, (val / svc->srv_ncpts)); - - spin_unlock(&svc->srv_lock); - - return count; -} - -LPROC_SEQ_FOPS(ptlrpc_lprocfs_req_history_max); - -static ssize_t threads_min_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - - return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_init * svc->srv_ncpts); -} - -static ssize_t threads_min_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - unsigned long val; - int rc = kstrtoul(buffer, 10, &val); - - if (rc < 0) - return rc; - - if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) - return -ERANGE; - - spin_lock(&svc->srv_lock); - if (val > svc->srv_nthrs_cpt_limit * svc->srv_ncpts) { - spin_unlock(&svc->srv_lock); - return -ERANGE; - } - - svc->srv_nthrs_cpt_init = val / svc->srv_ncpts; - - spin_unlock(&svc->srv_lock); - - return count; -} -LUSTRE_RW_ATTR(threads_min); - -static ssize_t threads_started_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - struct ptlrpc_service_part *svcpt; - int total = 0; - int i; - - ptlrpc_service_for_each_part(svcpt, i, svc) - total += svcpt->scp_nthrs_running; - - return sprintf(buf, "%d\n", total); -} -LUSTRE_RO_ATTR(threads_started); - -static ssize_t threads_max_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - - return sprintf(buf, "%d\n", svc->srv_nthrs_cpt_limit * svc->srv_ncpts); -} - -static ssize_t threads_max_store(struct kobject *kobj, struct attribute *attr, - const char *buffer, size_t count) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - unsigned long val; - int rc = kstrtoul(buffer, 10, &val); - - if (rc < 0) - return rc; - - if (val / svc->srv_ncpts < PTLRPC_NTHRS_INIT) - return -ERANGE; - - spin_lock(&svc->srv_lock); - if (val < svc->srv_nthrs_cpt_init * 
svc->srv_ncpts) { - spin_unlock(&svc->srv_lock); - return -ERANGE; - } - - svc->srv_nthrs_cpt_limit = val / svc->srv_ncpts; - - spin_unlock(&svc->srv_lock); - - return count; -} -LUSTRE_RW_ATTR(threads_max); - -/** - * \addtogoup nrs - * @{ - */ - -/** - * Translates \e ptlrpc_nrs_pol_state values to human-readable strings. - * - * \param[in] state The policy state - */ -static const char *nrs_state2str(enum ptlrpc_nrs_pol_state state) -{ - switch (state) { - default: - LBUG(); - case NRS_POL_STATE_INVALID: - return "invalid"; - case NRS_POL_STATE_STOPPED: - return "stopped"; - case NRS_POL_STATE_STOPPING: - return "stopping"; - case NRS_POL_STATE_STARTING: - return "starting"; - case NRS_POL_STATE_STARTED: - return "started"; - } -} - -/** - * Obtains status information for \a policy. - * - * Information is copied in \a info. - * - * \param[in] policy The policy - * \param[out] info Holds returned status information - */ -static void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_pol_info *info) -{ - assert_spin_locked(&policy->pol_nrs->nrs_lock); - - memcpy(info->pi_name, policy->pol_desc->pd_name, NRS_POL_NAME_MAX); - - info->pi_fallback = !!(policy->pol_flags & PTLRPC_NRS_FL_FALLBACK); - info->pi_state = policy->pol_state; - /** - * XXX: These are accessed without holding - * ptlrpc_service_part::scp_req_lock. - */ - info->pi_req_queued = policy->pol_req_queued; - info->pi_req_started = policy->pol_req_started; -} - -/** - * Reads and prints policy status information for all policies of a PTLRPC - * service. - */ -static int ptlrpc_lprocfs_nrs_seq_show(struct seq_file *m, void *n) -{ - struct ptlrpc_service *svc = m->private; - struct ptlrpc_service_part *svcpt; - struct ptlrpc_nrs *nrs; - struct ptlrpc_nrs_policy *policy; - struct ptlrpc_nrs_pol_info *infos; - struct ptlrpc_nrs_pol_info tmp; - unsigned int num_pols; - unsigned int pol_idx = 0; - bool hp = false; - int i; - int rc = 0; - - /** - * Serialize NRS core lprocfs operations with policy registration/ - * unregistration. - */ - mutex_lock(&nrs_core.nrs_mutex); - - /** - * Use the first service partition's regular NRS head in order to obtain - * the number of policies registered with NRS heads of this service. All - * service partitions will have the same number of policies. - */ - nrs = nrs_svcpt2nrs(svc->srv_parts[0], false); - - spin_lock(&nrs->nrs_lock); - num_pols = svc->srv_parts[0]->scp_nrs_reg.nrs_num_pols; - spin_unlock(&nrs->nrs_lock); - - infos = kcalloc(num_pols, sizeof(*infos), GFP_NOFS); - if (!infos) { - rc = -ENOMEM; - goto unlock; - } -again: - - ptlrpc_service_for_each_part(svcpt, i, svc) { - nrs = nrs_svcpt2nrs(svcpt, hp); - spin_lock(&nrs->nrs_lock); - - pol_idx = 0; - - list_for_each_entry(policy, &nrs->nrs_policy_list, pol_list) { - LASSERT(pol_idx < num_pols); - - nrs_policy_get_info_locked(policy, &tmp); - /** - * Copy values when handling the first service - * partition. - */ - if (i == 0) { - memcpy(infos[pol_idx].pi_name, tmp.pi_name, - NRS_POL_NAME_MAX); - memcpy(&infos[pol_idx].pi_state, &tmp.pi_state, - sizeof(tmp.pi_state)); - infos[pol_idx].pi_fallback = tmp.pi_fallback; - /** - * For the rest of the service partitions - * sanity-check the values we get. - */ - } else { - LASSERT(strncmp(infos[pol_idx].pi_name, - tmp.pi_name, - NRS_POL_NAME_MAX) == 0); - /** - * Not asserting ptlrpc_nrs_pol_info::pi_state, - * because it may be different between - * instances of the same policy in different - * service partitions. 
- */ - LASSERT(infos[pol_idx].pi_fallback == - tmp.pi_fallback); - } - - infos[pol_idx].pi_req_queued += tmp.pi_req_queued; - infos[pol_idx].pi_req_started += tmp.pi_req_started; - - pol_idx++; - } - spin_unlock(&nrs->nrs_lock); - } - - /** - * Policy status information output is in YAML format. - * For example: - * - * regular_requests: - * - name: fifo - * state: started - * fallback: yes - * queued: 0 - * active: 0 - * - * - name: crrn - * state: started - * fallback: no - * queued: 2015 - * active: 384 - * - * high_priority_requests: - * - name: fifo - * state: started - * fallback: yes - * queued: 0 - * active: 2 - * - * - name: crrn - * state: stopped - * fallback: no - * queued: 0 - * active: 0 - */ - seq_printf(m, "%s\n", - !hp ? "\nregular_requests:" : "high_priority_requests:"); - - for (pol_idx = 0; pol_idx < num_pols; pol_idx++) { - seq_printf(m, " - name: %s\n" - " state: %s\n" - " fallback: %s\n" - " queued: %-20d\n" - " active: %-20d\n\n", - infos[pol_idx].pi_name, - nrs_state2str(infos[pol_idx].pi_state), - infos[pol_idx].pi_fallback ? "yes" : "no", - (int)infos[pol_idx].pi_req_queued, - (int)infos[pol_idx].pi_req_started); - } - - if (!hp && nrs_svc_has_hp(svc)) { - memset(infos, 0, num_pols * sizeof(*infos)); - - /** - * Redo the processing for the service's HP NRS heads' policies. - */ - hp = true; - goto again; - } - - kfree(infos); -unlock: - mutex_unlock(&nrs_core.nrs_mutex); - - return rc; -} - -/** - * The longest valid command string is the maximum policy name size, plus the - * length of the " reg" substring - */ -#define LPROCFS_NRS_WR_MAX_CMD (NRS_POL_NAME_MAX + sizeof(" reg") - 1) - -/** - * Starts and stops a given policy on a PTLRPC service. - * - * Commands consist of the policy name, followed by an optional [reg|hp] token; - * if the optional token is omitted, the operation is performed on both the - * regular and high-priority (if the service has one) NRS head. - */ -static ssize_t ptlrpc_lprocfs_nrs_seq_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *off) -{ - struct ptlrpc_service *svc = ((struct seq_file *)file->private_data)->private; - enum ptlrpc_nrs_queue_type queue = PTLRPC_NRS_QUEUE_BOTH; - char *cmd; - char *cmd_copy = NULL; - char *token; - int rc = 0; - - if (count >= LPROCFS_NRS_WR_MAX_CMD) - return -EINVAL; - - cmd = kzalloc(LPROCFS_NRS_WR_MAX_CMD, GFP_NOFS); - if (!cmd) - return -ENOMEM; - /** - * strsep() modifies its argument, so keep a copy - */ - cmd_copy = cmd; - - if (copy_from_user(cmd, buffer, count)) { - rc = -EFAULT; - goto out; - } - - cmd[count] = '\0'; - - token = strsep(&cmd, " "); - - if (strlen(token) > NRS_POL_NAME_MAX - 1) { - rc = -EINVAL; - goto out; - } - - /** - * No [reg|hp] token has been specified - */ - if (!cmd) - goto default_queue; - - /** - * The second token is either NULL, or an optional [reg|hp] string - */ - if (strcmp(cmd, "reg") == 0) { - queue = PTLRPC_NRS_QUEUE_REG; - } else if (strcmp(cmd, "hp") == 0) { - queue = PTLRPC_NRS_QUEUE_HP; - } else { - rc = -EINVAL; - goto out; - } - -default_queue: - - if (queue == PTLRPC_NRS_QUEUE_HP && !nrs_svc_has_hp(svc)) { - rc = -ENODEV; - goto out; - } else if (queue == PTLRPC_NRS_QUEUE_BOTH && !nrs_svc_has_hp(svc)) { - queue = PTLRPC_NRS_QUEUE_REG; - } - - /** - * Serialize NRS core lprocfs operations with policy registration/ - * unregistration. 
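
For reference, the command parsing earlier in ptlrpc_lprocfs_nrs_seq_write() accepts "policy_name [reg|hp]", with a missing second token meaning both queues. A standalone sketch of that strsep()-based parse (glibc strsep(); the helper name is invented):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

/* parse "policy [reg|hp]" the way ptlrpc_lprocfs_nrs_seq_write() does */
static int parse_nrs_cmd(char *cmd, const char **policy, const char **queue)
{
        char *token = strsep(&cmd, " ");    /* strsep() modifies cmd */

        if (*token == '\0')
                return -1;
        *policy = token;
        if (!cmd)                           /* no second token: both queues */
                *queue = "both";
        else if (strcmp(cmd, "reg") == 0 || strcmp(cmd, "hp") == 0)
                *queue = cmd;
        else
                return -1;
        return 0;
}

int main(void)
{
        char buf1[] = "crrn hp", buf2[] = "fifo";
        const char *pol, *q;

        if (parse_nrs_cmd(buf1, &pol, &q) == 0)
                printf("start %s on %s\n", pol, q);
        if (parse_nrs_cmd(buf2, &pol, &q) == 0)
                printf("start %s on %s\n", pol, q);
        return 0;
}
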
- */ - mutex_lock(&nrs_core.nrs_mutex); - - rc = ptlrpc_nrs_policy_control(svc, queue, token, PTLRPC_NRS_CTL_START, - false, NULL); - - mutex_unlock(&nrs_core.nrs_mutex); -out: - kfree(cmd_copy); - - return rc < 0 ? rc : count; -} - -LPROC_SEQ_FOPS(ptlrpc_lprocfs_nrs); - -/** @} nrs */ - -struct ptlrpc_srh_iterator { - int srhi_idx; - __u64 srhi_seq; - struct ptlrpc_request *srhi_req; -}; - -static int -ptlrpc_lprocfs_svc_req_history_seek(struct ptlrpc_service_part *svcpt, - struct ptlrpc_srh_iterator *srhi, - __u64 seq) -{ - struct list_head *e; - struct ptlrpc_request *req; - - if (srhi->srhi_req && srhi->srhi_seq > svcpt->scp_hist_seq_culled && - srhi->srhi_seq <= seq) { - /* If srhi_req was set previously, hasn't been culled and - * we're searching for a seq on or after it (i.e. more - * recent), search from it onwards. - * Since the service history is LRU (i.e. culled reqs will - * be near the head), we shouldn't have to do long - * re-scans - */ - LASSERTF(srhi->srhi_seq == srhi->srhi_req->rq_history_seq, - "%s:%d: seek seq %llu, request seq %llu\n", - svcpt->scp_service->srv_name, svcpt->scp_cpt, - srhi->srhi_seq, srhi->srhi_req->rq_history_seq); - LASSERTF(!list_empty(&svcpt->scp_hist_reqs), - "%s:%d: seek offset %llu, request seq %llu, last culled %llu\n", - svcpt->scp_service->srv_name, svcpt->scp_cpt, - seq, srhi->srhi_seq, svcpt->scp_hist_seq_culled); - e = &srhi->srhi_req->rq_history_list; - } else { - /* search from start */ - e = svcpt->scp_hist_reqs.next; - } - - while (e != &svcpt->scp_hist_reqs) { - req = list_entry(e, struct ptlrpc_request, rq_history_list); - - if (req->rq_history_seq >= seq) { - srhi->srhi_seq = req->rq_history_seq; - srhi->srhi_req = req; - return 0; - } - e = e->next; - } - - return -ENOENT; -} - -/* - * ptlrpc history sequence is used as "position" of seq_file, in some case, - * seq_read() will increase "position" to indicate reading the next - * element, however, low bits of history sequence are reserved for CPT id - * (check the details from comments before ptlrpc_req_add_history), which - * means seq_read() might change CPT id of history sequence and never - * finish reading of requests on a CPT. To make it work, we have to shift - * CPT id to high bits and timestamp to low bits, so seq_read() will only - * increase timestamp which can correctly indicate the next position. - */ - -/* convert seq_file pos to cpt */ -#define PTLRPC_REQ_POS2CPT(svc, pos) \ - ((svc)->srv_cpt_bits == 0 ? 0 : \ - (__u64)(pos) >> (64 - (svc)->srv_cpt_bits)) - -/* make up seq_file pos from cpt */ -#define PTLRPC_REQ_CPT2POS(svc, cpt) \ - ((svc)->srv_cpt_bits == 0 ? 0 : \ - (cpt) << (64 - (svc)->srv_cpt_bits)) - -/* convert sequence to position */ -#define PTLRPC_REQ_SEQ2POS(svc, seq) \ - ((svc)->srv_cpt_bits == 0 ? (seq) : \ - ((seq) >> (svc)->srv_cpt_bits) | \ - ((seq) << (64 - (svc)->srv_cpt_bits))) - -/* convert position to sequence */ -#define PTLRPC_REQ_POS2SEQ(svc, pos) \ - ((svc)->srv_cpt_bits == 0 ? 
(pos) : \ - ((__u64)(pos) << (svc)->srv_cpt_bits) | \ - ((__u64)(pos) >> (64 - (svc)->srv_cpt_bits))) - -static void * -ptlrpc_lprocfs_svc_req_history_start(struct seq_file *s, loff_t *pos) -{ - struct ptlrpc_service *svc = s->private; - struct ptlrpc_service_part *svcpt; - struct ptlrpc_srh_iterator *srhi; - unsigned int cpt; - int rc; - int i; - - if (sizeof(loff_t) != sizeof(__u64)) { /* can't support */ - CWARN("Failed to read request history because size of loff_t %d can't match size of u64\n", - (int)sizeof(loff_t)); - return NULL; - } - - srhi = kzalloc(sizeof(*srhi), GFP_NOFS); - if (!srhi) - return NULL; - - srhi->srhi_seq = 0; - srhi->srhi_req = NULL; - - cpt = PTLRPC_REQ_POS2CPT(svc, *pos); - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (i < cpt) /* skip */ - continue; - if (i > cpt) /* make up the lowest position for this CPT */ - *pos = PTLRPC_REQ_CPT2POS(svc, i); - - spin_lock(&svcpt->scp_lock); - rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, - PTLRPC_REQ_POS2SEQ(svc, *pos)); - spin_unlock(&svcpt->scp_lock); - if (rc == 0) { - *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); - srhi->srhi_idx = i; - return srhi; - } - } - - kfree(srhi); - return NULL; -} - -static void -ptlrpc_lprocfs_svc_req_history_stop(struct seq_file *s, void *iter) -{ - struct ptlrpc_srh_iterator *srhi = iter; - - kfree(srhi); -} - -static void * -ptlrpc_lprocfs_svc_req_history_next(struct seq_file *s, - void *iter, loff_t *pos) -{ - struct ptlrpc_service *svc = s->private; - struct ptlrpc_srh_iterator *srhi = iter; - struct ptlrpc_service_part *svcpt; - __u64 seq; - int rc; - int i; - - for (i = srhi->srhi_idx; i < svc->srv_ncpts; i++) { - svcpt = svc->srv_parts[i]; - - if (i > srhi->srhi_idx) { /* reset iterator for a new CPT */ - srhi->srhi_req = NULL; - seq = 0; - srhi->srhi_seq = 0; - } else { /* the next sequence */ - seq = srhi->srhi_seq + (1 << svc->srv_cpt_bits); - } - - spin_lock(&svcpt->scp_lock); - rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, seq); - spin_unlock(&svcpt->scp_lock); - if (rc == 0) { - *pos = PTLRPC_REQ_SEQ2POS(svc, srhi->srhi_seq); - srhi->srhi_idx = i; - return srhi; - } - } - - kfree(srhi); - return NULL; -} - -static int ptlrpc_lprocfs_svc_req_history_show(struct seq_file *s, void *iter) -{ - struct ptlrpc_service *svc = s->private; - struct ptlrpc_srh_iterator *srhi = iter; - struct ptlrpc_service_part *svcpt; - struct ptlrpc_request *req; - int rc; - - LASSERT(srhi->srhi_idx < svc->srv_ncpts); - - svcpt = svc->srv_parts[srhi->srhi_idx]; - - spin_lock(&svcpt->scp_lock); - - rc = ptlrpc_lprocfs_svc_req_history_seek(svcpt, srhi, srhi->srhi_seq); - - if (rc == 0) { - struct timespec64 arrival, sent, arrivaldiff; - char nidstr[LNET_NIDSTR_SIZE]; - - req = srhi->srhi_req; - - libcfs_nid2str_r(req->rq_self, nidstr, sizeof(nidstr)); - arrival.tv_sec = req->rq_arrival_time.tv_sec; - arrival.tv_nsec = req->rq_arrival_time.tv_nsec; - sent.tv_sec = req->rq_sent; - sent.tv_nsec = 0; - arrivaldiff = timespec64_sub(sent, arrival); - - /* Print common req fields. - * CAVEAT EMPTOR: we're racing with the service handler - * here. The request could contain any old crap, so you - * must be just as careful as the service's request - * parser. Currently I only print stuff here I know is OK - * to look at coz it was set up in request_in_callback()!!! 
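
The PTLRPC_REQ_* macros above amount to a bit rotation: the CPT id moves into the top srv_cpt_bits bits of the loff_t, so the position increment done by seq_read() only ever advances the per-CPT sequence number. A self-checking userspace rendition, assuming 64-bit positions and an arbitrary two-bit CPT field:

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

#define CPT_BITS 2      /* hypothetical: 4 CPTs */

static uint64_t seq2pos(uint64_t seq)
{
        return (seq >> CPT_BITS) | (seq << (64 - CPT_BITS));
}

static uint64_t pos2seq(uint64_t pos)
{
        return (pos << CPT_BITS) | (pos >> (64 - CPT_BITS));
}

static unsigned int pos2cpt(uint64_t pos)
{
        return pos >> (64 - CPT_BITS);
}

int main(void)
{
        /* history seq: CPT id in the low bits, timestamp above them */
        uint64_t seq = (12345ULL << CPT_BITS) | 3;      /* CPT 3 */
        uint64_t pos = seq2pos(seq);

        assert(pos2seq(pos) == seq);    /* rotation round-trips */
        assert(pos2cpt(pos) == 3);      /* CPT now in the high bits */
        assert(pos2cpt(pos + 1) == 3);  /* pos++ stays on the same CPT */
        printf("pos %" PRIu64 " -> seq %" PRIu64 "\n", pos, pos2seq(pos));
        return 0;
}
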
- */ - seq_printf(s, "%lld:%s:%s:x%llu:%d:%s:%lld.%06lld:%lld.%06llds(%+lld.0s) ", - req->rq_history_seq, nidstr, - libcfs_id2str(req->rq_peer), req->rq_xid, - req->rq_reqlen, ptlrpc_rqphase2str(req), - (s64)req->rq_arrival_time.tv_sec, - (s64)req->rq_arrival_time.tv_nsec / NSEC_PER_USEC, - (s64)arrivaldiff.tv_sec, - (s64)(arrivaldiff.tv_nsec / NSEC_PER_USEC), - (s64)(req->rq_sent - req->rq_deadline)); - if (!svc->srv_ops.so_req_printer) - seq_putc(s, '\n'); - else - svc->srv_ops.so_req_printer(s, srhi->srhi_req); - } - - spin_unlock(&svcpt->scp_lock); - return rc; -} - -static int -ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) -{ - static const struct seq_operations sops = { - .start = ptlrpc_lprocfs_svc_req_history_start, - .stop = ptlrpc_lprocfs_svc_req_history_stop, - .next = ptlrpc_lprocfs_svc_req_history_next, - .show = ptlrpc_lprocfs_svc_req_history_show, - }; - struct seq_file *seqf; - int rc; - - rc = seq_open(file, &sops); - if (rc) - return rc; - - seqf = file->private_data; - seqf->private = inode->i_private; - return 0; -} - -/* See also lprocfs_rd_timeouts */ -static int ptlrpc_lprocfs_timeouts_seq_show(struct seq_file *m, void *n) -{ - struct ptlrpc_service *svc = m->private; - struct ptlrpc_service_part *svcpt; - struct dhms ts; - time64_t worstt; - unsigned int cur; - unsigned int worst; - int i; - - if (AT_OFF) { - seq_printf(m, "adaptive timeouts off, using obd_timeout %u\n", - obd_timeout); - return 0; - } - - ptlrpc_service_for_each_part(svcpt, i, svc) { - cur = at_get(&svcpt->scp_at_estimate); - worst = svcpt->scp_at_estimate.at_worst_ever; - worstt = svcpt->scp_at_estimate.at_worst_time; - s2dhms(&ts, ktime_get_real_seconds() - worstt); - - seq_printf(m, "%10s : cur %3u worst %3u (at %lld, " - DHMS_FMT " ago) ", "service", - cur, worst, (s64)worstt, DHMS_VARS(&ts)); - - lprocfs_at_hist_helper(m, &svcpt->scp_at_estimate); - } - - return 0; -} - -LPROC_SEQ_FOPS_RO(ptlrpc_lprocfs_timeouts); - -static ssize_t high_priority_ratio_show(struct kobject *kobj, - struct attribute *attr, - char *buf) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - return sprintf(buf, "%d\n", svc->srv_hpreq_ratio); -} - -static ssize_t high_priority_ratio_store(struct kobject *kobj, - struct attribute *attr, - const char *buffer, - size_t count) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - int rc; - int val; - - rc = kstrtoint(buffer, 10, &val); - if (rc < 0) - return rc; - - if (val < 0) - return -ERANGE; - - spin_lock(&svc->srv_lock); - svc->srv_hpreq_ratio = val; - spin_unlock(&svc->srv_lock); - - return count; -} -LUSTRE_RW_ATTR(high_priority_ratio); - -static struct attribute *ptlrpc_svc_attrs[] = { - &lustre_attr_threads_min.attr, - &lustre_attr_threads_started.attr, - &lustre_attr_threads_max.attr, - &lustre_attr_high_priority_ratio.attr, - NULL, -}; - -static void ptlrpc_sysfs_svc_release(struct kobject *kobj) -{ - struct ptlrpc_service *svc = container_of(kobj, struct ptlrpc_service, - srv_kobj); - - complete(&svc->srv_kobj_unregister); -} - -static struct kobj_type ptlrpc_svc_ktype = { - .default_attrs = ptlrpc_svc_attrs, - .sysfs_ops = &lustre_sysfs_ops, - .release = ptlrpc_sysfs_svc_release, -}; - -void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc) -{ - /* Let's see if we had a chance at initialization first */ - if (svc->srv_kobj.kset) { - kobject_put(&svc->srv_kobj); - wait_for_completion(&svc->srv_kobj_unregister); - } -} - -int 
ptlrpc_sysfs_register_service(struct kset *parent, - struct ptlrpc_service *svc) -{ - int rc; - - svc->srv_kobj.kset = parent; - init_completion(&svc->srv_kobj_unregister); - rc = kobject_init_and_add(&svc->srv_kobj, &ptlrpc_svc_ktype, NULL, - "%s", svc->srv_name); - - return rc; -} - -void ptlrpc_ldebugfs_register_service(struct dentry *entry, - struct ptlrpc_service *svc) -{ - struct lprocfs_vars lproc_vars[] = { - {.name = "req_buffer_history_len", - .fops = &ptlrpc_lprocfs_req_history_len_fops, - .data = svc}, - {.name = "req_buffer_history_max", - .fops = &ptlrpc_lprocfs_req_history_max_fops, - .data = svc}, - {.name = "timeouts", - .fops = &ptlrpc_lprocfs_timeouts_fops, - .data = svc}, - {.name = "nrs_policies", - .fops = &ptlrpc_lprocfs_nrs_fops, - .data = svc}, - {NULL} - }; - static const struct file_operations req_history_fops = { - .owner = THIS_MODULE, - .open = ptlrpc_lprocfs_svc_req_history_open, - .read = seq_read, - .llseek = seq_lseek, - .release = lprocfs_seq_release, - }; - - ptlrpc_ldebugfs_register(entry, svc->srv_name, - "stats", &svc->srv_debugfs_entry, - &svc->srv_stats); - - if (IS_ERR_OR_NULL(svc->srv_debugfs_entry)) - return; - - ldebugfs_add_vars(svc->srv_debugfs_entry, lproc_vars, NULL); - - debugfs_create_file("req_history", 0400, svc->srv_debugfs_entry, svc, - &req_history_fops); -} - -void ptlrpc_lprocfs_register_obd(struct obd_device *obddev) -{ - ptlrpc_ldebugfs_register(obddev->obd_debugfs_entry, NULL, "stats", - &obddev->obd_svc_debugfs_entry, - &obddev->obd_svc_stats); -} -EXPORT_SYMBOL(ptlrpc_lprocfs_register_obd); - -void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount) -{ - struct lprocfs_stats *svc_stats; - __u32 op = lustre_msg_get_opc(req->rq_reqmsg); - int opc = opcode_offset(op); - - svc_stats = req->rq_import->imp_obd->obd_svc_stats; - if (!svc_stats || opc <= 0) - return; - LASSERT(opc < LUSTRE_MAX_OPCODES); - if (!(op == LDLM_ENQUEUE || op == MDS_REINT)) - lprocfs_counter_add(svc_stats, opc + EXTRA_MAX_OPCODES, amount); -} - -void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) -{ - struct lprocfs_stats *svc_stats; - int idx; - - if (!req->rq_import) - return; - svc_stats = req->rq_import->imp_obd->obd_svc_stats; - if (!svc_stats) - return; - idx = lustre_msg_get_opc(req->rq_reqmsg); - switch (idx) { - case OST_READ: - idx = BRW_READ_BYTES + PTLRPC_LAST_CNTR; - break; - case OST_WRITE: - idx = BRW_WRITE_BYTES + PTLRPC_LAST_CNTR; - break; - default: - LASSERTF(0, "unsupported opcode %u\n", idx); - break; - } - - lprocfs_counter_add(svc_stats, idx, bytes); -} -EXPORT_SYMBOL(ptlrpc_lprocfs_brw); - -void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc) -{ - debugfs_remove_recursive(svc->srv_debugfs_entry); - - if (svc->srv_stats) - lprocfs_free_stats(&svc->srv_stats); -} - -void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) -{ - debugfs_remove_recursive(obd->obd_svc_debugfs_entry); - - if (obd->obd_svc_stats) - lprocfs_free_stats(&obd->obd_svc_stats); -} -EXPORT_SYMBOL(ptlrpc_lprocfs_unregister_obd); - -#undef BUFLEN - -int lprocfs_wr_ping(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct ptlrpc_request *req; - int rc; - - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - req = ptlrpc_prep_ping(obd->u.cli.cl_import); - up_read(&obd->u.cli.cl_sem); - if (!req) - return -ENOMEM; - - req->rq_send_state = LUSTRE_IMP_FULL; - - rc = ptlrpc_queue_wait(req); - - ptlrpc_req_finished(req); 
- if (rc >= 0) - return count; - return rc; -} -EXPORT_SYMBOL(lprocfs_wr_ping); - -/* Write the connection UUID to this file to attempt to connect to that node. - * The connection UUID is a node's primary NID. For example, - * "echo connection=192.168.0.1@tcp0::instance > .../import". - */ -int lprocfs_wr_import(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct obd_import *imp = obd->u.cli.cl_import; - char *kbuf = NULL; - char *uuid; - char *ptr; - int do_reconn = 1; - const char prefix[] = "connection="; - const int prefix_len = sizeof(prefix) - 1; - - if (count > PAGE_SIZE - 1 || count <= prefix_len) - return -EINVAL; - - kbuf = kzalloc(count + 1, GFP_NOFS); - if (!kbuf) - return -ENOMEM; - - if (copy_from_user(kbuf, buffer, count)) { - count = -EFAULT; - goto out; - } - - kbuf[count] = 0; - - /* only support connection=uuid::instance now */ - if (strncmp(prefix, kbuf, prefix_len) != 0) { - count = -EINVAL; - goto out; - } - - uuid = kbuf + prefix_len; - ptr = strstr(uuid, "::"); - if (ptr) { - __u32 inst; - char *endptr; - - *ptr = 0; - do_reconn = 0; - ptr += strlen("::"); - inst = simple_strtoul(ptr, &endptr, 10); - if (*endptr) { - CERROR("config: wrong instance # %s\n", ptr); - } else if (inst != imp->imp_connect_data.ocd_instance) { - CDEBUG(D_INFO, "IR: %s is connecting to an obsoleted target(%u/%u), reconnecting...\n", - imp->imp_obd->obd_name, - imp->imp_connect_data.ocd_instance, inst); - do_reconn = 1; - } else { - CDEBUG(D_INFO, "IR: %s has already been connecting to new target(%u)\n", - imp->imp_obd->obd_name, inst); - } - } - - if (do_reconn) - ptlrpc_recover_import(imp, uuid, 1); - -out: - kfree(kbuf); - return count; -} -EXPORT_SYMBOL(lprocfs_wr_import); - -int lprocfs_rd_pinger_recov(struct seq_file *m, void *n) -{ - struct obd_device *obd = m->private; - struct obd_import *imp = obd->u.cli.cl_import; - int rc; - - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - seq_printf(m, "%d\n", !imp->imp_no_pinger_recover); - up_read(&obd->u.cli.cl_sem); - - return 0; -} -EXPORT_SYMBOL(lprocfs_rd_pinger_recov); - -int lprocfs_wr_pinger_recov(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct obd_device *obd = ((struct seq_file *)file->private_data)->private; - struct client_obd *cli = &obd->u.cli; - struct obd_import *imp = cli->cl_import; - int rc, val; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc < 0) - return rc; - - if (val != 0 && val != 1) - return -ERANGE; - - rc = lprocfs_climp_check(obd); - if (rc) - return rc; - - spin_lock(&imp->imp_lock); - imp->imp_no_pinger_recover = !val; - spin_unlock(&imp->imp_lock); - up_read(&obd->u.cli.cl_sem); - - return count; -} -EXPORT_SYMBOL(lprocfs_wr_pinger_recov); diff --git a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c b/drivers/staging/lustre/lustre/ptlrpc/niobuf.c deleted file mode 100644 index 2897afb8806c6de88e3d5c297b6688516b1ac986..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/niobuf.c +++ /dev/null @@ -1,771 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC -#include -#include -#include -#include -#include -#include -#include "ptlrpc_internal.h" - -/** - * Helper function. Sends \a len bytes from \a base at offset \a offset - * over \a conn connection to portal \a portal. - * Returns 0 on success or error code. - */ -static int ptl_send_buf(struct lnet_handle_md *mdh, void *base, int len, - enum lnet_ack_req ack, struct ptlrpc_cb_id *cbid, - struct ptlrpc_connection *conn, int portal, __u64 xid, - unsigned int offset) -{ - int rc; - struct lnet_md md; - - LASSERT(portal != 0); - CDEBUG(D_INFO, "conn=%p id %s\n", conn, libcfs_id2str(conn->c_peer)); - md.start = base; - md.length = len; - md.threshold = (ack == LNET_ACK_REQ) ? 2 : 1; - md.options = PTLRPC_MD_OPTIONS; - md.user_ptr = cbid; - md.eq_handle = ptlrpc_eq_h; - - if (unlikely(ack == LNET_ACK_REQ && - OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_ACK, - OBD_FAIL_ONCE))) { - /* don't ask for the ack to simulate failing client */ - ack = LNET_NOACK_REQ; - } - - rc = LNetMDBind(md, LNET_UNLINK, mdh); - if (unlikely(rc != 0)) { - CERROR("LNetMDBind failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - return -ENOMEM; - } - - CDEBUG(D_NET, "Sending %d bytes to portal %d, xid %lld, offset %u\n", - len, portal, xid, offset); - - rc = LNetPut(conn->c_self, *mdh, ack, - conn->c_peer, portal, xid, offset, 0); - if (unlikely(rc != 0)) { - int rc2; - /* We're going to get an UNLINK event when I unlink below, - * which will complete just like any other failed send, so - * I fall through and return success here! - */ - CERROR("LNetPut(%s, %d, %lld) failed: %d\n", - libcfs_id2str(conn->c_peer), portal, xid, rc); - rc2 = LNetMDUnlink(*mdh); - LASSERTF(rc2 == 0, "rc2 = %d\n", rc2); - } - - return 0; -} - -static void mdunlink_iterate_helper(struct lnet_handle_md *bd_mds, int count) -{ - int i; - - for (i = 0; i < count; i++) - LNetMDUnlink(bd_mds[i]); -} - -/** - * Register bulk at the sender for later transfer. - * Returns 0 on success or error code. 
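
ptlrpc_register_bulk() below derives per-MD match bits from the request's final match bits: with total_md descriptors, the first MD uses rq_mbits - total_md + 1 and the last uses rq_mbits itself, which is what the LASSERTF against PTLRPC_BULK_OPS_MASK enforces. A quick standalone check of that arithmetic (the MAX_IOV and mbits values are made up; PTLRPC_BULK_OPS_COUNT is taken as 16 here):

#include <assert.h>
#include <inttypes.h>
#include <stdio.h>

#define BULK_OPS_COUNT 16ULL                    /* assumed PTLRPC_BULK_OPS_COUNT */
#define BULK_OPS_MASK (~(BULK_OPS_COUNT - 1))   /* PTLRPC_BULK_OPS_MASK */
#define MAX_IOV 256                             /* stand-in for LNET_MAX_IOV */

int main(void)
{
        unsigned int iov_count = 1000;          /* pages in this bulk */
        unsigned int total_md = (iov_count + MAX_IOV - 1) / MAX_IOV;
        /* rq_mbits of the *last* MD; low bits encode total_md - 1 */
        uint64_t rq_mbits = 0xabcd0ULL + total_md - 1;
        uint64_t first = rq_mbits - total_md + 1;

        assert(first == (rq_mbits & BULK_OPS_MASK));    /* the LASSERTF below */
        printf("%u MDs, mbits x%" PRIx64 "..x%" PRIx64 "\n",
               total_md, first, rq_mbits);
        return 0;
}

Keeping the whole run inside one PTLRPC_BULK_OPS_COUNT-aligned window is what lets the peer match any of the MDs from the single rq_mbits value it sees in the request.
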
- */ -static int ptlrpc_register_bulk(struct ptlrpc_request *req) -{ - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - struct lnet_process_id peer; - int rc = 0; - int rc2; - int posted_md; - int total_md; - u64 mbits; - struct lnet_handle_me me_h; - struct lnet_md md; - - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_BULK_GET_NET)) - return 0; - - /* NB no locking required until desc is on the network */ - LASSERT(desc->bd_nob > 0); - LASSERT(desc->bd_md_count == 0); - LASSERT(desc->bd_md_max_brw <= PTLRPC_BULK_OPS_COUNT); - LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); - LASSERT(desc->bd_req); - LASSERT(ptlrpc_is_bulk_op_passive(desc->bd_type)); - - /* cleanup the state of the bulk for it will be reused */ - if (req->rq_resend || req->rq_send_state == LUSTRE_IMP_REPLAY) - desc->bd_nob_transferred = 0; - else - LASSERT(desc->bd_nob_transferred == 0); - - desc->bd_failure = 0; - - peer = desc->bd_import->imp_connection->c_peer; - - LASSERT(desc->bd_cbid.cbid_fn == client_bulk_callback); - LASSERT(desc->bd_cbid.cbid_arg == desc); - - total_md = DIV_ROUND_UP(desc->bd_iov_count, LNET_MAX_IOV); - /* rq_mbits is matchbits of the final bulk */ - mbits = req->rq_mbits - total_md + 1; - - LASSERTF(mbits == (req->rq_mbits & PTLRPC_BULK_OPS_MASK), - "first mbits = x%llu, last mbits = x%llu\n", - mbits, req->rq_mbits); - LASSERTF(!(desc->bd_registered && - req->rq_send_state != LUSTRE_IMP_REPLAY) || - mbits != desc->bd_last_mbits, - "registered: %d rq_mbits: %llu bd_last_mbits: %llu\n", - desc->bd_registered, mbits, desc->bd_last_mbits); - - desc->bd_registered = 1; - desc->bd_last_mbits = mbits; - desc->bd_md_count = total_md; - md.user_ptr = &desc->bd_cbid; - md.eq_handle = ptlrpc_eq_h; - md.threshold = 1; /* PUT or GET */ - - for (posted_md = 0; posted_md < total_md; posted_md++, mbits++) { - md.options = PTLRPC_MD_OPTIONS | - (ptlrpc_is_bulk_op_get(desc->bd_type) ? - LNET_MD_OP_GET : LNET_MD_OP_PUT); - ptlrpc_fill_bulk_md(&md, desc, posted_md); - - rc = LNetMEAttach(desc->bd_portal, peer, mbits, 0, - LNET_UNLINK, LNET_INS_AFTER, &me_h); - if (rc != 0) { - CERROR("%s: LNetMEAttach failed x%llu/%d: rc = %d\n", - desc->bd_import->imp_obd->obd_name, mbits, - posted_md, rc); - break; - } - - /* About to let the network at it... */ - rc = LNetMDAttach(me_h, md, LNET_UNLINK, - &desc->bd_mds[posted_md]); - if (rc != 0) { - CERROR("%s: LNetMDAttach failed x%llu/%d: rc = %d\n", - desc->bd_import->imp_obd->obd_name, mbits, - posted_md, rc); - rc2 = LNetMEUnlink(me_h); - LASSERT(rc2 == 0); - break; - } - } - - if (rc != 0) { - LASSERT(rc == -ENOMEM); - spin_lock(&desc->bd_lock); - desc->bd_md_count -= total_md - posted_md; - spin_unlock(&desc->bd_lock); - LASSERT(desc->bd_md_count >= 0); - mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); - req->rq_status = -ENOMEM; - return -ENOMEM; - } - - spin_lock(&desc->bd_lock); - /* Holler if peer manages to touch buffers before he knows the mbits */ - if (desc->bd_md_count != total_md) - CWARN("%s: Peer %s touched %d buffers while I registered\n", - desc->bd_import->imp_obd->obd_name, libcfs_id2str(peer), - total_md - desc->bd_md_count); - spin_unlock(&desc->bd_lock); - - CDEBUG(D_NET, "Setup %u bulk %s buffers: %u pages %u bytes, mbits x%#llx-%#llx, portal %u\n", - desc->bd_md_count, - ptlrpc_is_bulk_op_get(desc->bd_type) ? "get-source" : "put-sink", - desc->bd_iov_count, desc->bd_nob, - desc->bd_last_mbits, req->rq_mbits, desc->bd_portal); - - return 0; -} - -/** - * Disconnect a bulk desc from the network. Idempotent. Not - * thread-safe (i.e. 
only interlocks with completion callback). - * Returns 1 on success or 0 if network unregistration failed for whatever - * reason. - */ -int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) -{ - struct ptlrpc_bulk_desc *desc = req->rq_bulk; - wait_queue_head_t *wq; - int rc; - - LASSERT(!in_interrupt()); /* might sleep */ - - /* Let's set up a deadline for reply unlink. */ - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && - async && req->rq_bulk_deadline == 0 && cfs_fail_val == 0) - req->rq_bulk_deadline = ktime_get_real_seconds() + LONG_UNLINK; - - if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ - return 1; /* never registered */ - - LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ - - /* the unlink ensures the callback happens ASAP and is the last - * one. If it fails, it must be because completion just happened, - * but we must still wait_event() in this case to give liblustre - * a chance to run client_bulk_callback() - */ - mdunlink_iterate_helper(desc->bd_mds, desc->bd_md_max_brw); - - if (ptlrpc_client_bulk_active(req) == 0) /* completed or */ - return 1; /* never registered */ - - /* Move to "Unregistering" phase as bulk was not unlinked yet. */ - ptlrpc_rqphase_move(req, RQ_PHASE_UNREG_BULK); - - /* Do not wait for unlink to finish. */ - if (async) - return 0; - - if (req->rq_set) - wq = &req->rq_set->set_waitq; - else - wq = &req->rq_reply_waitq; - - for (;;) { - /* Network access will complete in finite time but the HUGE - * timeout lets us CWARN for visibility of sluggish LNDs - */ - int cnt = 0; - while (cnt < LONG_UNLINK && - (rc = wait_event_idle_timeout(*wq, - !ptlrpc_client_bulk_active(req), - HZ)) == 0) - cnt += 1; - if (rc > 0) { - ptlrpc_rqphase_move(req, req->rq_next_phase); - return 1; - } - - DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", - desc); - } - return 0; -} - -static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) -{ - struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; - struct ptlrpc_service *svc = svcpt->scp_service; - int service_time = max_t(int, ktime_get_real_seconds() - - req->rq_arrival_time.tv_sec, 1); - - if (!(flags & PTLRPC_REPLY_EARLY) && - (req->rq_type != PTL_RPC_MSG_ERR) && req->rq_reqmsg && - !(lustre_msg_get_flags(req->rq_reqmsg) & - (MSG_RESENT | MSG_REPLAY | - MSG_REQ_REPLAY_DONE | MSG_LOCK_REPLAY_DONE))) { - /* early replies, errors and recovery requests don't count - * toward our service time estimate - */ - int oldse = at_measured(&svcpt->scp_at_estimate, service_time); - - if (oldse != 0) { - DEBUG_REQ(D_ADAPTTO, req, - "svc %s changed estimate from %d to %d", - svc->srv_name, oldse, - at_get(&svcpt->scp_at_estimate)); - } - } - /* Report actual service time for client latency calc */ - lustre_msg_set_service_time(req->rq_repmsg, service_time); - /* Report service time estimate for future client reqs, but report 0 - * (to be ignored by client) if it's an error reply during recovery.
- * (bz15815) - */ - if (req->rq_type == PTL_RPC_MSG_ERR && !req->rq_export) - lustre_msg_set_timeout(req->rq_repmsg, 0); - else - lustre_msg_set_timeout(req->rq_repmsg, - at_get(&svcpt->scp_at_estimate)); - - if (req->rq_reqmsg && - !(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { - CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x req_flags=%#x magic=%x/%x len=%d\n", - flags, lustre_msg_get_flags(req->rq_reqmsg), - lustre_msg_get_magic(req->rq_reqmsg), - lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); - } -} - -/** - * Send request reply from request \a req reply buffer. - * \a flags defines reply types - * Returns 0 on success or error code - */ -int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) -{ - struct ptlrpc_reply_state *rs = req->rq_reply_state; - struct ptlrpc_connection *conn; - int rc; - - /* We must already have a reply buffer (only ptlrpc_error() may be - * called without one). The reply generated by sptlrpc layer (e.g. - * error notify, etc.) might have NULL rq->reqmsg; otherwise we must - * have a request buffer which is either the actual (swabbed) incoming - * request, or a saved copy if this is a req saved in - * target_queue_final_reply(). - */ - LASSERT(req->rq_no_reply == 0); - LASSERT(req->rq_reqbuf); - LASSERT(rs); - LASSERT((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); - LASSERT(req->rq_repmsg); - LASSERT(req->rq_repmsg == rs->rs_msg); - LASSERT(rs->rs_cb_id.cbid_fn == reply_out_callback); - LASSERT(rs->rs_cb_id.cbid_arg == rs); - - /* There may be no rq_export during failover */ - - if (unlikely(req->rq_export && req->rq_export->exp_obd && - req->rq_export->exp_obd->obd_fail)) { - /* Failed obd's only send ENODEV */ - req->rq_type = PTL_RPC_MSG_ERR; - req->rq_status = -ENODEV; - CDEBUG(D_HA, "sending ENODEV from failed obd %d\n", - req->rq_export->exp_obd->obd_minor); - } - - /* In order to keep interoperability with the client (< 2.3) which - * doesn't have pb_jobid in ptlrpc_body, we have to shrink the - * ptlrpc_body in reply buffer to ptlrpc_body_v2, otherwise the - * reply buffer on the client will overflow. - * - * XXX Remove this whenever we drop the interoperability with - * such clients. - */ - req->rq_replen = lustre_shrink_msg(req->rq_repmsg, 0, - sizeof(struct ptlrpc_body_v2), 1); - - if (req->rq_type != PTL_RPC_MSG_ERR) - req->rq_type = PTL_RPC_MSG_REPLY; - - lustre_msg_set_type(req->rq_repmsg, req->rq_type); - lustre_msg_set_status(req->rq_repmsg, - ptlrpc_status_hton(req->rq_status)); - lustre_msg_set_opc(req->rq_repmsg, - req->rq_reqmsg ? - lustre_msg_get_opc(req->rq_reqmsg) : 0); - - target_pack_pool_reply(req); - - ptlrpc_at_set_reply(req, flags); - - if (!req->rq_export || !req->rq_export->exp_connection) - conn = ptlrpc_connection_get(req->rq_peer, req->rq_self, NULL); - else - conn = ptlrpc_connection_addref(req->rq_export->exp_connection); - - if (unlikely(!conn)) { - CERROR("not replying on NULL connection\n"); /* bug 9635 */ - return -ENOTCONN; - } - ptlrpc_rs_addref(rs); /* +1 ref for the network */ - - rc = sptlrpc_svc_wrap_reply(req); - if (unlikely(rc)) - goto out; - - req->rq_sent = ktime_get_real_seconds(); - - rc = ptl_send_buf(&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, - (rs->rs_difficult && !rs->rs_no_ack) ?
- LNET_ACK_REQ : LNET_NOACK_REQ, - &rs->rs_cb_id, conn, - ptlrpc_req2svc(req)->srv_rep_portal, - req->rq_xid, req->rq_reply_off); -out: - if (unlikely(rc != 0)) - ptlrpc_req_drop_rs(req); - ptlrpc_connection_put(conn); - return rc; -} - -int ptlrpc_reply(struct ptlrpc_request *req) -{ - if (req->rq_no_reply) - return 0; - return ptlrpc_send_reply(req, 0); -} - -/** - * For request \a req send an error reply back. Create empty - * reply buffers if necessary. - */ -int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) -{ - int rc; - - if (req->rq_no_reply) - return 0; - - if (!req->rq_repmsg) { - rc = lustre_pack_reply(req, 1, NULL, NULL); - if (rc) - return rc; - } - - if (req->rq_status != -ENOSPC && req->rq_status != -EACCES && - req->rq_status != -EPERM && req->rq_status != -ENOENT && - req->rq_status != -EINPROGRESS && req->rq_status != -EDQUOT) - req->rq_type = PTL_RPC_MSG_ERR; - - rc = ptlrpc_send_reply(req, may_be_difficult); - return rc; -} - -int ptlrpc_error(struct ptlrpc_request *req) -{ - return ptlrpc_send_error(req, 0); -} - -/** - * Send request \a request. - * if \a noreply is set, don't expect any reply back and don't set up - * reply buffers. - * Returns 0 on success or error code. - */ -int ptl_send_rpc(struct ptlrpc_request *request, int noreply) -{ - int rc; - int rc2; - unsigned int mpflag = 0; - struct ptlrpc_connection *connection; - struct lnet_handle_me reply_me_h; - struct lnet_md reply_md; - struct obd_import *imp = request->rq_import; - struct obd_device *obd = imp->imp_obd; - - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_RPC)) - return 0; - - LASSERT(request->rq_type == PTL_RPC_MSG_REQUEST); - LASSERT(request->rq_wait_ctx == 0); - - /* If this is a re-transmit, we're required to have disengaged - * cleanly from the previous attempt - */ - LASSERT(!request->rq_receiving_reply); - LASSERT(!((lustre_msg_get_flags(request->rq_reqmsg) & MSG_REPLAY) && - (imp->imp_state == LUSTRE_IMP_FULL))); - - if (unlikely(obd && obd->obd_fail)) { - CDEBUG(D_HA, "muting rpc for failed imp obd %s\n", - obd->obd_name); - /* this prevents us from waiting in ptlrpc_queue_wait */ - spin_lock(&request->rq_lock); - request->rq_err = 1; - spin_unlock(&request->rq_lock); - request->rq_status = -ENODEV; - return -ENODEV; - } - - connection = imp->imp_connection; - - lustre_msg_set_handle(request->rq_reqmsg, - &imp->imp_remote_handle); - lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); - lustre_msg_set_conn_cnt(request->rq_reqmsg, imp->imp_conn_cnt); - lustre_msghdr_set_flags(request->rq_reqmsg, imp->imp_msghdr_flags); - - /* - * If it's the first time to resend the request for EINPROGRESS, - * we need to allocate a new XID (see after_reply()), it's different - * from the resend for reply timeout. 
- */ - if (request->rq_nr_resend && list_empty(&request->rq_unreplied_list)) { - __u64 min_xid = 0; - /* - * resend for EINPROGRESS, allocate new xid to avoid reply - * reconstruction - */ - spin_lock(&imp->imp_lock); - ptlrpc_assign_next_xid_nolock(request); - min_xid = ptlrpc_known_replied_xid(imp); - spin_unlock(&imp->imp_lock); - - lustre_msg_set_last_xid(request->rq_reqmsg, min_xid); - DEBUG_REQ(D_RPCTRACE, request, "Allocating new xid for resend on EINPROGRESS"); - } - - if (request->rq_bulk) { - ptlrpc_set_bulk_mbits(request); - lustre_msg_set_mbits(request->rq_reqmsg, request->rq_mbits); - } - - if (list_empty(&request->rq_unreplied_list) || - request->rq_xid <= imp->imp_known_replied_xid) { - DEBUG_REQ(D_ERROR, request, - "xid: %llu, replied: %llu, list_empty:%d\n", - request->rq_xid, imp->imp_known_replied_xid, - list_empty(&request->rq_unreplied_list)); - LBUG(); - } - - /** - * For enabled AT all requests should have AT_SUPPORT in the - * FULL import state when OBD_CONNECT_AT is set - */ - LASSERT(AT_OFF || imp->imp_state != LUSTRE_IMP_FULL || - (imp->imp_msghdr_flags & MSGHDR_AT_SUPPORT) || - !(imp->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_AT)); - - if (request->rq_resend) - lustre_msg_add_flags(request->rq_reqmsg, MSG_RESENT); - - if (request->rq_memalloc) - mpflag = memalloc_noreclaim_save(); - - rc = sptlrpc_cli_wrap_request(request); - if (rc) { - /* - * set rq_sent so that this request is treated - * as a delayed send in the upper layers - */ - if (rc == -ENOMEM) - request->rq_sent = ktime_get_seconds(); - goto out; - } - - /* bulk register should be done after wrap_request() */ - if (request->rq_bulk) { - rc = ptlrpc_register_bulk(request); - if (rc != 0) - goto out; - } - - if (!noreply) { - LASSERT(request->rq_replen != 0); - if (!request->rq_repbuf) { - LASSERT(!request->rq_repdata); - LASSERT(!request->rq_repmsg); - rc = sptlrpc_cli_alloc_repbuf(request, - request->rq_replen); - if (rc) { - /* this prevents us from looping in - * ptlrpc_queue_wait - */ - spin_lock(&request->rq_lock); - request->rq_err = 1; - spin_unlock(&request->rq_lock); - request->rq_status = rc; - goto cleanup_bulk; - } - } else { - request->rq_repdata = NULL; - request->rq_repmsg = NULL; - } - - rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ - connection->c_peer, request->rq_xid, 0, - LNET_UNLINK, LNET_INS_AFTER, &reply_me_h); - if (rc != 0) { - CERROR("LNetMEAttach failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - rc = -ENOMEM; - goto cleanup_bulk; - } - } - - spin_lock(&request->rq_lock); - /* We are responsible for unlinking the reply buffer */ - request->rq_reply_unlinked = noreply; - request->rq_receiving_reply = !noreply; - /* Clear any flags that may be present from previous sends.
*/ - request->rq_req_unlinked = 0; - request->rq_replied = 0; - request->rq_err = 0; - request->rq_timedout = 0; - request->rq_net_err = 0; - request->rq_resend = 0; - request->rq_restart = 0; - request->rq_reply_truncated = 0; - spin_unlock(&request->rq_lock); - - if (!noreply) { - reply_md.start = request->rq_repbuf; - reply_md.length = request->rq_repbuf_len; - /* Allow multiple early replies */ - reply_md.threshold = LNET_MD_THRESH_INF; - /* Manage remote for early replies */ - reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | - LNET_MD_MANAGE_REMOTE | - LNET_MD_TRUNCATE; /* allow truncation to raise EOVERFLOW */ - reply_md.user_ptr = &request->rq_reply_cbid; - reply_md.eq_handle = ptlrpc_eq_h; - - /* We must see the unlink callback to set rq_reply_unlinked, - * so we can't auto-unlink - */ - rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, - &request->rq_reply_md_h); - if (rc != 0) { - CERROR("LNetMDAttach failed: %d\n", rc); - LASSERT(rc == -ENOMEM); - spin_lock(&request->rq_lock); - /* ...but the MD attach didn't succeed... */ - request->rq_receiving_reply = 0; - spin_unlock(&request->rq_lock); - rc = -ENOMEM; - goto cleanup_me; - } - - CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid %llu, portal %u\n", - request->rq_repbuf_len, request->rq_xid, - request->rq_reply_portal); - } - - /* add references on request for request_out_callback */ - ptlrpc_request_addref(request); - if (obd && obd->obd_svc_stats) - lprocfs_counter_add(obd->obd_svc_stats, PTLRPC_REQACTIVE_CNTR, - atomic_read(&imp->imp_inflight)); - - OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); - - ktime_get_real_ts64(&request->rq_sent_tv); - request->rq_sent = ktime_get_real_seconds(); - /* We give the server rq_timeout secs to process the req, and - * add the network latency for our local timeout. - */ - request->rq_deadline = request->rq_sent + request->rq_timeout + - ptlrpc_at_get_net_latency(request); - - ptlrpc_pinger_sending_on_import(imp); - - DEBUG_REQ(D_INFO, request, "send flg=%x", - lustre_msg_get_flags(request->rq_reqmsg)); - rc = ptl_send_buf(&request->rq_req_md_h, - request->rq_reqbuf, request->rq_reqdata_len, - LNET_NOACK_REQ, &request->rq_req_cbid, - connection, - request->rq_request_portal, - request->rq_xid, 0); - if (likely(rc == 0)) - goto out; - - request->rq_req_unlinked = 1; - ptlrpc_req_finished(request); - if (noreply) - goto out; - - cleanup_me: - /* MEUnlink is safe; the PUT didn't even get off the ground, and - * nobody apart from the PUT's target has the right nid+XID to - * access the reply buffer. - */ - rc2 = LNetMEUnlink(reply_me_h); - LASSERT(rc2 == 0); - /* UNLINKED callback called synchronously */ - LASSERT(!request->rq_receiving_reply); - - cleanup_bulk: - /* We do a sync unlink here as there was no real transfer, so - * the chance of a long unlink due to a sluggish net is smaller. - */ - ptlrpc_unregister_bulk(request, 0); - out: - if (request->rq_memalloc) - memalloc_noreclaim_restore(mpflag); - return rc; -} -EXPORT_SYMBOL(ptl_send_rpc); - -/** - * Register request buffer descriptor for request receiving.
- */ -int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) -{ - struct ptlrpc_service *service = rqbd->rqbd_svcpt->scp_service; - static struct lnet_process_id match_id = {LNET_NID_ANY, LNET_PID_ANY}; - int rc; - struct lnet_md md; - struct lnet_handle_me me_h; - - CDEBUG(D_NET, "LNetMEAttach: portal %d\n", - service->srv_req_portal); - - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_RQBD)) - return -ENOMEM; - - /* NB: CPT affinity service should use new LNet flag LNET_INS_LOCAL, - * which means buffer can only be attached on local CPT, and LND - * threads can find it by grabbing a local lock - */ - rc = LNetMEAttach(service->srv_req_portal, - match_id, 0, ~0, LNET_UNLINK, - rqbd->rqbd_svcpt->scp_cpt >= 0 ? - LNET_INS_LOCAL : LNET_INS_AFTER, &me_h); - if (rc != 0) { - CERROR("LNetMEAttach failed: %d\n", rc); - return -ENOMEM; - } - - LASSERT(rqbd->rqbd_refcount == 0); - rqbd->rqbd_refcount = 1; - - md.start = rqbd->rqbd_buffer; - md.length = service->srv_buf_size; - md.max_size = service->srv_max_req_size; - md.threshold = LNET_MD_THRESH_INF; - md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | LNET_MD_MAX_SIZE; - md.user_ptr = &rqbd->rqbd_cbid; - md.eq_handle = ptlrpc_eq_h; - - rc = LNetMDAttach(me_h, md, LNET_UNLINK, &rqbd->rqbd_md_h); - if (rc == 0) - return 0; - - CERROR("LNetMDAttach failed: %d;\n", rc); - LASSERT(rc == -ENOMEM); - rc = LNetMEUnlink(me_h); - LASSERT(rc == 0); - rqbd->rqbd_refcount = 0; - - return -ENOMEM; -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs.c b/drivers/staging/lustre/lustre/ptlrpc/nrs.c deleted file mode 100644 index e09b86529c5dc3e22e865d5923792a9f12f2c34b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/nrs.c +++ /dev/null @@ -1,1613 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License version 2 for more details. A copy is - * included in the COPYING file that accompanied this code. - - * GPL HEADER END - */ -/* - * Copyright (c) 2011 Intel Corporation - * - * Copyright 2012 Xyratex Technology Limited - */ -/* - * lustre/ptlrpc/nrs.c - * - * Network Request Scheduler (NRS) - * - * Allows the handling of RPCs at servers to be reordered. - * - * Author: Liang Zhen - * Author: Nikitas Angelinas - */ -/** - * \addtogroup nrs - * @{ - */ - -#define DEBUG_SUBSYSTEM S_RPC -#include -#include -#include -#include -#include -#include -#include "ptlrpc_internal.h" - -/** - * NRS core object. - */ -struct nrs_core nrs_core; - -static int nrs_policy_init(struct ptlrpc_nrs_policy *policy) -{ - return policy->pol_desc->pd_ops->op_policy_init ?
- policy->pol_desc->pd_ops->op_policy_init(policy) : 0; -} - -static void nrs_policy_fini(struct ptlrpc_nrs_policy *policy) -{ - LASSERT(policy->pol_ref == 0); - LASSERT(policy->pol_req_queued == 0); - - if (policy->pol_desc->pd_ops->op_policy_fini) - policy->pol_desc->pd_ops->op_policy_fini(policy); -} - -static int nrs_policy_ctl_locked(struct ptlrpc_nrs_policy *policy, - enum ptlrpc_nrs_ctl opc, void *arg) -{ - /** - * The policy may be stopped, but the lprocfs files and - * ptlrpc_nrs_policy instances remain present until unregistration time. - * Do not perform the ctl operation if the policy is stopped, as - * policy->pol_private will be NULL in such a case. - */ - if (policy->pol_state == NRS_POL_STATE_STOPPED) - return -ENODEV; - - return policy->pol_desc->pd_ops->op_policy_ctl ? - policy->pol_desc->pd_ops->op_policy_ctl(policy, opc, arg) : - -ENOSYS; -} - -static void nrs_policy_stop0(struct ptlrpc_nrs_policy *policy) -{ - if (policy->pol_desc->pd_ops->op_policy_stop) - policy->pol_desc->pd_ops->op_policy_stop(policy); - - LASSERT(list_empty(&policy->pol_list_queued)); - LASSERT(policy->pol_req_queued == 0 && - policy->pol_req_started == 0); - - policy->pol_private = NULL; - - policy->pol_state = NRS_POL_STATE_STOPPED; - - if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) - module_put(policy->pol_desc->pd_owner); -} - -static int nrs_policy_stop_locked(struct ptlrpc_nrs_policy *policy) -{ - struct ptlrpc_nrs *nrs = policy->pol_nrs; - - if (nrs->nrs_policy_fallback == policy && !nrs->nrs_stopping) - return -EPERM; - - if (policy->pol_state == NRS_POL_STATE_STARTING) - return -EAGAIN; - - /* In progress or already stopped */ - if (policy->pol_state != NRS_POL_STATE_STARTED) - return 0; - - policy->pol_state = NRS_POL_STATE_STOPPING; - - /* Immediately make it invisible */ - if (nrs->nrs_policy_primary == policy) { - nrs->nrs_policy_primary = NULL; - - } else { - LASSERT(nrs->nrs_policy_fallback == policy); - nrs->nrs_policy_fallback = NULL; - } - - /* I have the only refcount */ - if (policy->pol_ref == 1) - nrs_policy_stop0(policy); - - return 0; -} - -/** - * Transitions the \a nrs NRS head's primary policy to - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING and if the policy has no - * pending usage references, to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. - * - * \param[in] nrs the NRS head to carry out this operation on - */ -static void nrs_policy_stop_primary(struct ptlrpc_nrs *nrs) -{ - struct ptlrpc_nrs_policy *tmp = nrs->nrs_policy_primary; - - if (!tmp) - return; - - nrs->nrs_policy_primary = NULL; - - LASSERT(tmp->pol_state == NRS_POL_STATE_STARTED); - tmp->pol_state = NRS_POL_STATE_STOPPING; - - if (tmp->pol_ref == 0) - nrs_policy_stop0(tmp); -} - -/** - * Transitions a policy across the ptlrpc_nrs_pol_state range of values, in - * response to an lprocfs command to start a policy. - * - * If a primary policy different to the current one is specified, this function - * will transition the new policy to the - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTING and then to - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED, and will then transition - * the old primary policy (if there is one) to - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding - * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED.
- * - * If the fallback policy is specified, this is taken to indicate an instruction - * to stop the current primary policy, without substituting it with another - * primary policy, so the primary policy (if any) is transitioned to - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING, and if there are no outstanding - * references on the policy to ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED. In - * this case, only the fallback policy is left active in the NRS head. - */ -static int nrs_policy_start_locked(struct ptlrpc_nrs_policy *policy) -{ - struct ptlrpc_nrs *nrs = policy->pol_nrs; - int rc = 0; - - /** - * Don't allow multiple simultaneous starts; this is too complex, and has no real - * benefit. - */ - if (nrs->nrs_policy_starting) - return -EAGAIN; - - LASSERT(policy->pol_state != NRS_POL_STATE_STARTING); - - if (policy->pol_state == NRS_POL_STATE_STOPPING) - return -EAGAIN; - - if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { - /** - * This is for cases in which the user sets the policy to the - * fallback policy (currently fifo for all services); i.e. the - * user is resetting the policy to the default; so we stop the - * primary policy, if any. - */ - if (policy == nrs->nrs_policy_fallback) { - nrs_policy_stop_primary(nrs); - return 0; - } - - /** - * If we reach here, we must be setting up the fallback policy - * at service startup time, and only a single policy with the - * nrs_policy_flags::PTLRPC_NRS_FL_FALLBACK flag set can - * register with NRS core. - */ - LASSERT(!nrs->nrs_policy_fallback); - } else { - /** - * Shouldn't start primary policy if w/o fallback policy. - */ - if (!nrs->nrs_policy_fallback) - return -EPERM; - - if (policy->pol_state == NRS_POL_STATE_STARTED) - return 0; - } - - /** - * Increase the module usage count for policies registering from other - * modules. - */ - if (atomic_inc_return(&policy->pol_desc->pd_refs) == 1 && - !try_module_get(policy->pol_desc->pd_owner)) { - atomic_dec(&policy->pol_desc->pd_refs); - CERROR("NRS: cannot get module for policy %s; is it alive?\n", - policy->pol_desc->pd_name); - return -ENODEV; - } - - /** - * Serialize policy starting across the NRS head - */ - nrs->nrs_policy_starting = 1; - - policy->pol_state = NRS_POL_STATE_STARTING; - - if (policy->pol_desc->pd_ops->op_policy_start) { - spin_unlock(&nrs->nrs_lock); - - rc = policy->pol_desc->pd_ops->op_policy_start(policy); - - spin_lock(&nrs->nrs_lock); - if (rc != 0) { - if (atomic_dec_and_test(&policy->pol_desc->pd_refs)) - module_put(policy->pol_desc->pd_owner); - - policy->pol_state = NRS_POL_STATE_STOPPED; - goto out; - } - } - - policy->pol_state = NRS_POL_STATE_STARTED; - - if (policy->pol_flags & PTLRPC_NRS_FL_FALLBACK) { - /** - * This path is only used at PTLRPC service setup time. - */ - nrs->nrs_policy_fallback = policy; - } else { - /* - * Try to stop the current primary policy if there is one. - */ - nrs_policy_stop_primary(nrs); - - /** - * And set the newly-started policy as the primary one. - */ - nrs->nrs_policy_primary = policy; - } - -out: - nrs->nrs_policy_starting = 0; - - return rc; -} - -/** - * Increases the policy's usage reference count. - */ -static inline void nrs_policy_get_locked(struct ptlrpc_nrs_policy *policy) -{ - policy->pol_ref++; -} - -/** - * Decreases the policy's usage reference count, and stops the policy in case it - * was already stopping and has no more outstanding usage references (which - * indicates it has no more queued or started requests, and can be safely - * stopped).
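
A rough userspace sketch of the get/put lifecycle described above, where a pending stop only completes once the last reference is dropped (the types, field and state names are illustrative; locking is omitted):

#include <assert.h>
#include <stdio.h>

enum pol_state { POL_STARTED, POL_STOPPING, POL_STOPPED };

struct policy {
	int ref;
	enum pol_state state;
};

static void policy_get(struct policy *p)
{
	p->ref++;
}

static void policy_put(struct policy *p)
{
	assert(p->ref > 0);
	if (--p->ref == 0 && p->state == POL_STOPPING) {
		/* last user gone while a stop was pending: finish the stop */
		p->state = POL_STOPPED;
		printf("policy stopped\n");
	}
}

int main(void)
{
	struct policy p = { .ref = 1, .state = POL_STARTED };

	policy_get(&p);		/* a queued request takes a reference */
	p.state = POL_STOPPING;	/* stop requested while the policy is busy */
	policy_put(&p);		/* request completes: ref 2 -> 1, no stop yet */
	policy_put(&p);		/* initial reference dropped: ref 0 -> STOPPED */
	return 0;
}
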
- */ -static void nrs_policy_put_locked(struct ptlrpc_nrs_policy *policy) -{ - LASSERT(policy->pol_ref > 0); - - policy->pol_ref--; - if (unlikely(policy->pol_ref == 0 && - policy->pol_state == NRS_POL_STATE_STOPPING)) - nrs_policy_stop0(policy); -} - -static void nrs_policy_put(struct ptlrpc_nrs_policy *policy) -{ - spin_lock(&policy->pol_nrs->nrs_lock); - nrs_policy_put_locked(policy); - spin_unlock(&policy->pol_nrs->nrs_lock); -} - -/** - * Find and return a policy by name. - */ -static struct ptlrpc_nrs_policy *nrs_policy_find_locked(struct ptlrpc_nrs *nrs, - char *name) -{ - struct ptlrpc_nrs_policy *tmp; - - list_for_each_entry(tmp, &nrs->nrs_policy_list, pol_list) { - if (strncmp(tmp->pol_desc->pd_name, name, - NRS_POL_NAME_MAX) == 0) { - nrs_policy_get_locked(tmp); - return tmp; - } - } - return NULL; -} - -/** - * Release references for the resource hierarchy moving upwards towards the - * policy instance resource. - */ -static void nrs_resource_put(struct ptlrpc_nrs_resource *res) -{ - struct ptlrpc_nrs_policy *policy = res->res_policy; - - if (policy->pol_desc->pd_ops->op_res_put) { - struct ptlrpc_nrs_resource *parent; - - for (; res; res = parent) { - parent = res->res_parent; - policy->pol_desc->pd_ops->op_res_put(policy, res); - } - } -} - -/** - * Obtains references for each resource in the resource hierarchy for request - * \a nrq if it is to be handled by \a policy. - * - * \param[in] policy the policy - * \param[in] nrq the request - * \param[in] moving_req denotes whether this is a call to the function by - * ldlm_lock_reorder_req(), in order to move \a nrq to - * the high-priority NRS head; we should not sleep when - * set. - * - * \retval NULL resource hierarchy references not obtained - * \retval valid-pointer the bottom level of the resource hierarchy - * - * \see ptlrpc_nrs_pol_ops::op_res_get() - */ -static -struct ptlrpc_nrs_resource *nrs_resource_get(struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq, - bool moving_req) -{ - /** - * Set to NULL to traverse the resource hierarchy from the top. - */ - struct ptlrpc_nrs_resource *res = NULL; - struct ptlrpc_nrs_resource *tmp = NULL; - int rc; - - while (1) { - rc = policy->pol_desc->pd_ops->op_res_get(policy, nrq, res, - &tmp, moving_req); - if (rc < 0) { - if (res) - nrs_resource_put(res); - return NULL; - } - - tmp->res_parent = res; - tmp->res_policy = policy; - res = tmp; - tmp = NULL; - /** - * Return once we have obtained a reference to the bottom level - * of the resource hierarchy. - */ - if (rc > 0) - return res; - } -} - -/** - * Obtains resources for the resource hierarchies and policy references for - * the fallback and current primary policy (if any), that will later be used - * to handle request \a nrq. - * - * \param[in] nrs the NRS head instance that will be handling request \a nrq. - * \param[in] nrq the request that is being handled. - * \param[out] resp the array where references to the resource hierarchy are - * stored. - * \param[in] moving_req is set when obtaining resources while moving a - * request from a policy on the regular NRS head to a - * policy on the HP NRS head (via - * ldlm_lock_reorder_req()). It signifies that - * allocations to get resources should be atomic; for - * a full explanation, see comment in - * ptlrpc_nrs_pol_ops::op_res_get(). 
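
nrs_resource_get() above builds a chain in which each level points at its parent, and nrs_resource_put() tears the chain down walking upwards; a minimal userspace sketch of that pattern with a hypothetical two-level hierarchy:

#include <stdio.h>
#include <stdlib.h>

struct res {
	struct res *parent;	/* cf. ptlrpc_nrs_resource::res_parent */
	const char *name;
};

static struct res *res_get(struct res *parent, const char *name)
{
	struct res *r = malloc(sizeof(*r));

	if (!r)
		exit(EXIT_FAILURE);
	r->parent = parent;
	r->name = name;
	return r;
}

static void res_put(struct res *bottom)
{
	struct res *parent;

	for (; bottom; bottom = parent) {	/* release upwards */
		parent = bottom->parent;
		printf("put %s\n", bottom->name);
		free(bottom);
	}
}

int main(void)
{
	struct res *top = res_get(NULL, "policy");
	struct res *bottom = res_get(top, "object");

	res_put(bottom);	/* frees "object", then its parent "policy" */
	return 0;
}
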
- */ -static void nrs_resource_get_safe(struct ptlrpc_nrs *nrs, - struct ptlrpc_nrs_request *nrq, - struct ptlrpc_nrs_resource **resp, - bool moving_req) -{ - struct ptlrpc_nrs_policy *primary = NULL; - struct ptlrpc_nrs_policy *fallback = NULL; - - memset(resp, 0, sizeof(resp[0]) * NRS_RES_MAX); - - /** - * Obtain policy references. - */ - spin_lock(&nrs->nrs_lock); - - fallback = nrs->nrs_policy_fallback; - nrs_policy_get_locked(fallback); - - primary = nrs->nrs_policy_primary; - if (primary) - nrs_policy_get_locked(primary); - - spin_unlock(&nrs->nrs_lock); - - /** - * Obtain resource hierarchy references. - */ - resp[NRS_RES_FALLBACK] = nrs_resource_get(fallback, nrq, moving_req); - LASSERT(resp[NRS_RES_FALLBACK]); - - if (primary) { - resp[NRS_RES_PRIMARY] = nrs_resource_get(primary, nrq, - moving_req); - /** - * A primary policy may exist which may not wish to serve a - * particular request for different reasons; release the - * reference on the policy as it will not be used for this - * request. - */ - if (!resp[NRS_RES_PRIMARY]) - nrs_policy_put(primary); - } -} - -/** - * Releases references to resource hierarchies and policies, because they are no - * longer required; used when request handling has been completed, or the - * request is moving to the high priority NRS head. - * - * \param resp the resource hierarchy that is being released - * - * \see ptlrpc_nrs_req_finalize() - */ -static void nrs_resource_put_safe(struct ptlrpc_nrs_resource **resp) -{ - struct ptlrpc_nrs_policy *pols[NRS_RES_MAX]; - int i; - - for (i = 0; i < NRS_RES_MAX; i++) { - if (resp[i]) { - pols[i] = resp[i]->res_policy; - nrs_resource_put(resp[i]); - resp[i] = NULL; - } else { - pols[i] = NULL; - } - } - - for (i = 0; i < NRS_RES_MAX; i++) { - if (pols[i]) - nrs_policy_put(pols[i]); - } -} - -/** - * Obtains an NRS request from \a policy for handling or examination; the - * request should be removed in the 'handling' case. - * - * Calling into this function implies we already know the policy has a request - * waiting to be handled. - * - * \param[in] policy the policy from which a request is to be obtained - * \param[in] peek when set, signifies that we just want to examine the - * request, and not handle it, so the request is not removed - * from the policy. - * \param[in] force when set, it will force a policy to return a request if it - * has one pending - * - * \retval the NRS request to be handled - */ -static inline -struct ptlrpc_nrs_request *nrs_request_get(struct ptlrpc_nrs_policy *policy, - bool peek, bool force) -{ - struct ptlrpc_nrs_request *nrq; - - LASSERT(policy->pol_req_queued > 0); - - nrq = policy->pol_desc->pd_ops->op_req_get(policy, peek, force); - - LASSERT(ergo(nrq, nrs_request_policy(nrq) == policy)); - - return nrq; -} - -/** - * Enqueues request \a nrq for later handling, via one of the policies for - * which resources were earlier obtained via nrs_resource_get_safe(). The - * function attempts to enqueue the request first on the primary policy - * (if any), since this is the preferred choice. - * - * \param nrq the request being enqueued - * - * \see nrs_resource_get_safe() - */ -static inline void nrs_request_enqueue(struct ptlrpc_nrs_request *nrq) -{ - struct ptlrpc_nrs_policy *policy; - int rc; - int i; - - /** - * Try in descending order, because the primary policy (if any) is - * the preferred choice.
- */ - for (i = NRS_RES_MAX - 1; i >= 0; i--) { - if (!nrq->nr_res_ptrs[i]) - continue; - - nrq->nr_res_idx = i; - policy = nrq->nr_res_ptrs[i]->res_policy; - - rc = policy->pol_desc->pd_ops->op_req_enqueue(policy, nrq); - if (rc == 0) { - policy->pol_nrs->nrs_req_queued++; - policy->pol_req_queued++; - return; - } - } - /** - * Should never get here, as at least the primary policy's - * ptlrpc_nrs_pol_ops::op_req_enqueue() implementation should always - * succeed. - */ - LBUG(); -} - -/** - * Called when a request has been handled - * - * \param[in] nrq the request that has been handled; can be used for - * job/resource control. - * - * \see ptlrpc_nrs_req_stop_nolock() - */ -static inline void nrs_request_stop(struct ptlrpc_nrs_request *nrq) -{ - struct ptlrpc_nrs_policy *policy = nrs_request_policy(nrq); - - if (policy->pol_desc->pd_ops->op_req_stop) - policy->pol_desc->pd_ops->op_req_stop(policy, nrq); - - LASSERT(policy->pol_nrs->nrs_req_started > 0); - LASSERT(policy->pol_req_started > 0); - - policy->pol_nrs->nrs_req_started--; - policy->pol_req_started--; -} - -/** - * Handler for operations that can be carried out on policies. - * - * Handles opcodes that are common to all policy types within NRS core, and - * passes any unknown opcodes to the policy-specific control function. - * - * \param[in] nrs the NRS head this policy belongs to. - * \param[in] name the human-readable policy name; should be the same as - * ptlrpc_nrs_pol_desc::pd_name. - * \param[in] opc the opcode of the operation being carried out. - * \param[in,out] arg can be used to pass information in and out when - * carrying out an operation; usually data that is private to - * the policy at some level, or generic policy status - * information. - * - * \retval -ve error condition - * \retval 0 operation was carried out successfully - */ -static int nrs_policy_ctl(struct ptlrpc_nrs *nrs, char *name, - enum ptlrpc_nrs_ctl opc, void *arg) -{ - struct ptlrpc_nrs_policy *policy; - int rc = 0; - - spin_lock(&nrs->nrs_lock); - - policy = nrs_policy_find_locked(nrs, name); - if (!policy) { - rc = -ENOENT; - goto out; - } - - if (policy->pol_state != NRS_POL_STATE_STARTED && - policy->pol_state != NRS_POL_STATE_STOPPED) { - rc = -EAGAIN; - goto out; - } - - switch (opc) { - /** - * Unknown opcode, pass it down to the policy-specific control - * function for handling. - */ - default: - rc = nrs_policy_ctl_locked(policy, opc, arg); - break; - - /** - * Start \e policy - */ - case PTLRPC_NRS_CTL_START: - rc = nrs_policy_start_locked(policy); - break; - } -out: - if (policy) - nrs_policy_put_locked(policy); - - spin_unlock(&nrs->nrs_lock); - - return rc; -} - -/** - * Unregisters a policy by name. - * - * \param[in] nrs the NRS head this policy belongs to.
- * \param[in] name the human-readable policy name; should be the same as - * ptlrpc_nrs_pol_desc::pd_name - * - * \retval -ve error - * \retval 0 success - */ -static int nrs_policy_unregister(struct ptlrpc_nrs *nrs, char *name) -{ - struct ptlrpc_nrs_policy *policy = NULL; - - spin_lock(&nrs->nrs_lock); - - policy = nrs_policy_find_locked(nrs, name); - if (!policy) { - spin_unlock(&nrs->nrs_lock); - - CERROR("Can't find NRS policy %s\n", name); - return -ENOENT; - } - - if (policy->pol_ref > 1) { - CERROR("Policy %s is busy with %d references\n", name, - (int)policy->pol_ref); - nrs_policy_put_locked(policy); - - spin_unlock(&nrs->nrs_lock); - return -EBUSY; - } - - LASSERT(policy->pol_req_queued == 0); - LASSERT(policy->pol_req_started == 0); - - if (policy->pol_state != NRS_POL_STATE_STOPPED) { - nrs_policy_stop_locked(policy); - LASSERT(policy->pol_state == NRS_POL_STATE_STOPPED); - } - - list_del(&policy->pol_list); - nrs->nrs_num_pols--; - - nrs_policy_put_locked(policy); - - spin_unlock(&nrs->nrs_lock); - - nrs_policy_fini(policy); - - LASSERT(!policy->pol_private); - kfree(policy); - - return 0; -} - -/** - * Registers a policy from policy descriptor \a desc with NRS head \a nrs. - * - * \param[in] nrs the NRS head on which the policy will be registered. - * \param[in] desc the policy descriptor from which the information will be - * obtained to register the policy. - * - * \retval -ve error - * \retval 0 success - */ -static int nrs_policy_register(struct ptlrpc_nrs *nrs, - struct ptlrpc_nrs_pol_desc *desc) -{ - struct ptlrpc_nrs_policy *policy; - struct ptlrpc_nrs_policy *tmp; - struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; - int rc; - - LASSERT(desc->pd_ops->op_res_get); - LASSERT(desc->pd_ops->op_req_get); - LASSERT(desc->pd_ops->op_req_enqueue); - LASSERT(desc->pd_ops->op_req_dequeue); - LASSERT(desc->pd_compat); - - policy = kzalloc_node(sizeof(*policy), GFP_NOFS, - cfs_cpt_spread_node(svcpt->scp_service->srv_cptable, - svcpt->scp_cpt)); - if (!policy) - return -ENOMEM; - - policy->pol_nrs = nrs; - policy->pol_desc = desc; - policy->pol_state = NRS_POL_STATE_STOPPED; - policy->pol_flags = desc->pd_flags; - - INIT_LIST_HEAD(&policy->pol_list); - INIT_LIST_HEAD(&policy->pol_list_queued); - - rc = nrs_policy_init(policy); - if (rc != 0) { - kfree(policy); - return rc; - } - - spin_lock(&nrs->nrs_lock); - - tmp = nrs_policy_find_locked(nrs, policy->pol_desc->pd_name); - if (tmp) { - CERROR("NRS policy %s has been registered, can't register it for %s\n", - policy->pol_desc->pd_name, - svcpt->scp_service->srv_name); - nrs_policy_put_locked(tmp); - - spin_unlock(&nrs->nrs_lock); - nrs_policy_fini(policy); - kfree(policy); - - return -EEXIST; - } - - list_add_tail(&policy->pol_list, &nrs->nrs_policy_list); - nrs->nrs_num_pols++; - - if (policy->pol_flags & PTLRPC_NRS_FL_REG_START) - rc = nrs_policy_start_locked(policy); - - spin_unlock(&nrs->nrs_lock); - - if (rc != 0) - (void)nrs_policy_unregister(nrs, policy->pol_desc->pd_name); - - return rc; -} - -/** - * Enqueue request \a req using one of the policies its resources refer - * to. - * - * \param[in] req the request to enqueue.
- */ -static void ptlrpc_nrs_req_add_nolock(struct ptlrpc_request *req) -{ - struct ptlrpc_nrs_policy *policy; - - LASSERT(req->rq_nrq.nr_initialized); - LASSERT(!req->rq_nrq.nr_enqueued); - - nrs_request_enqueue(&req->rq_nrq); - req->rq_nrq.nr_enqueued = 1; - - policy = nrs_request_policy(&req->rq_nrq); - /** - * Add the policy to the NRS head's list of policies with enqueued - * requests, if it has not been added there. - */ - if (unlikely(list_empty(&policy->pol_list_queued))) - list_add_tail(&policy->pol_list_queued, - &policy->pol_nrs->nrs_policy_queued); -} - -/** - * Enqueue a request on the high priority NRS head. - * - * \param req the request to enqueue. - */ -static void ptlrpc_nrs_hpreq_add_nolock(struct ptlrpc_request *req) -{ - int opc = lustre_msg_get_opc(req->rq_reqmsg); - - spin_lock(&req->rq_lock); - req->rq_hp = 1; - ptlrpc_nrs_req_add_nolock(req); - if (opc != OBD_PING) - DEBUG_REQ(D_NET, req, "high priority req"); - spin_unlock(&req->rq_lock); -} - -/** - * Returns a boolean predicate indicating whether the policy described by - * \a desc is adequate for use with service \a svc. - * - * \param[in] svc the service - * \param[in] desc the policy descriptor - * - * \retval false the policy is not compatible with the service - * \retval true the policy is compatible with the service - */ -static inline bool nrs_policy_compatible(const struct ptlrpc_service *svc, - const struct ptlrpc_nrs_pol_desc *desc) -{ - return desc->pd_compat(svc, desc); -} - -/** - * Registers all compatible policies in nrs_core.nrs_policies, for NRS head - * \a nrs. - * - * \param[in] nrs the NRS head - * - * \retval -ve error - * \retval 0 success - * - * \pre mutex_is_locked(&nrs_core.nrs_mutex) - * - * \see ptlrpc_service_nrs_setup() - */ -static int nrs_register_policies_locked(struct ptlrpc_nrs *nrs) -{ - struct ptlrpc_nrs_pol_desc *desc; - /* for convenience */ - struct ptlrpc_service_part *svcpt = nrs->nrs_svcpt; - struct ptlrpc_service *svc = svcpt->scp_service; - int rc = -EINVAL; - - LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); - - list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { - if (nrs_policy_compatible(svc, desc)) { - rc = nrs_policy_register(nrs, desc); - if (rc != 0) { - CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n", - desc->pd_name, svcpt->scp_cpt, - svc->srv_name, rc); - /** - * Fail registration if any of the policies' - * registration fails. - */ - break; - } - } - } - - return rc; -} - -/** - * Initializes NRS head \a nrs of service partition \a svcpt, and registers all - * compatible policies in NRS core, with the NRS head. 
- * - * \param[in] nrs the NRS head - * \param[in] svcpt the PTLRPC service partition to setup - * - * \retval -ve error - * \retval 0 success - * - * \pre mutex_is_locked(&nrs_core.nrs_mutex) - */ -static int nrs_svcpt_setup_locked0(struct ptlrpc_nrs *nrs, - struct ptlrpc_service_part *svcpt) -{ - enum ptlrpc_nrs_queue_type queue; - - LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); - - if (nrs == &svcpt->scp_nrs_reg) - queue = PTLRPC_NRS_QUEUE_REG; - else if (nrs == svcpt->scp_nrs_hp) - queue = PTLRPC_NRS_QUEUE_HP; - else - LBUG(); - - nrs->nrs_svcpt = svcpt; - nrs->nrs_queue_type = queue; - spin_lock_init(&nrs->nrs_lock); - INIT_LIST_HEAD(&nrs->nrs_policy_list); - INIT_LIST_HEAD(&nrs->nrs_policy_queued); - - return nrs_register_policies_locked(nrs); -} - -/** - * Allocates a regular and optionally a high-priority NRS head (if the service - * handles high-priority RPCs), and then registers all available compatible - * policies on those NRS heads. - * - * \param[in,out] svcpt the PTLRPC service partition to setup - * - * \pre mutex_is_locked(&nrs_core.nrs_mutex) - */ -static int nrs_svcpt_setup_locked(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_nrs *nrs; - int rc; - - LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); - - /** - * Initialize the regular NRS head. - */ - nrs = nrs_svcpt2nrs(svcpt, false); - rc = nrs_svcpt_setup_locked0(nrs, svcpt); - if (rc < 0) - goto out; - - /** - * Optionally allocate a high-priority NRS head. - */ - if (!svcpt->scp_service->srv_ops.so_hpreq_handler) - goto out; - - svcpt->scp_nrs_hp = - kzalloc_node(sizeof(*svcpt->scp_nrs_hp), GFP_NOFS, - cfs_cpt_spread_node(svcpt->scp_service->srv_cptable, - svcpt->scp_cpt)); - if (!svcpt->scp_nrs_hp) { - rc = -ENOMEM; - goto out; - } - - nrs = nrs_svcpt2nrs(svcpt, true); - rc = nrs_svcpt_setup_locked0(nrs, svcpt); - -out: - return rc; -} - -/** - * Unregisters all policies on all available NRS heads in a service partition; - * called at PTLRPC service unregistration time. - * - * \param[in] svcpt the PTLRPC service partition - * - * \pre mutex_is_locked(&nrs_core.nrs_mutex) - */ -static void nrs_svcpt_cleanup_locked(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_nrs *nrs; - struct ptlrpc_nrs_policy *policy; - struct ptlrpc_nrs_policy *tmp; - int rc; - bool hp = false; - - LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); - -again: - /* scp_nrs_hp could be NULL due to a shortage of memory. */ - nrs = hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; - /* check the nrs_svcpt to see if nrs is initialized. */ - if (!nrs || !nrs->nrs_svcpt) - return; - nrs->nrs_stopping = 1; - - list_for_each_entry_safe(policy, tmp, &nrs->nrs_policy_list, pol_list) { - rc = nrs_policy_unregister(nrs, policy->pol_desc->pd_name); - LASSERT(rc == 0); - } - - /** - * If the service partition has an HP NRS head, clean that up as well. - */ - if (!hp && nrs_svcpt_has_hp(svcpt)) { - hp = true; - goto again; - } - - if (hp) - kfree(nrs); -} - -/** - * Returns the descriptor for a policy as identified by \a name. - * - * \param[in] name the policy name - * - * \retval the policy descriptor - * \retval NULL - */ -static struct ptlrpc_nrs_pol_desc *nrs_policy_find_desc_locked(const char *name) -{ - struct ptlrpc_nrs_pol_desc *tmp; - - list_for_each_entry(tmp, &nrs_core.nrs_policies, pd_list) { - if (strncmp(tmp->pd_name, name, NRS_POL_NAME_MAX) == 0) - return tmp; - } - return NULL; -} - -/** - * Removes the policy from all supported NRS heads of all partitions of all - * PTLRPC services.
- * - * \param[in] desc the policy descriptor to unregister - * - * \retval -ve error - * \retval 0 successfully unregistered policy on all supported NRS heads - * - * \pre mutex_is_locked(&nrs_core.nrs_mutex) - * \pre mutex_is_locked(&ptlrpc_all_services_mutex) - */ -static int nrs_policy_unregister_locked(struct ptlrpc_nrs_pol_desc *desc) -{ - struct ptlrpc_nrs *nrs; - struct ptlrpc_service *svc; - struct ptlrpc_service_part *svcpt; - int i; - int rc = 0; - - LASSERT(mutex_is_locked(&nrs_core.nrs_mutex)); - LASSERT(mutex_is_locked(&ptlrpc_all_services_mutex)); - - list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { - if (!nrs_policy_compatible(svc, desc) || - unlikely(svc->srv_is_stopping)) - continue; - - ptlrpc_service_for_each_part(svcpt, i, svc) { - bool hp = false; - -again: - nrs = nrs_svcpt2nrs(svcpt, hp); - rc = nrs_policy_unregister(nrs, desc->pd_name); - /** - * Ignore -ENOENT as the policy may not have registered - * successfully on all service partitions. - */ - if (rc == -ENOENT) { - rc = 0; - } else if (rc != 0) { - CERROR("Failed to unregister NRS policy %s for partition %d of service %s: %d\n", - desc->pd_name, svcpt->scp_cpt, - svcpt->scp_service->srv_name, rc); - return rc; - } - - if (!hp && nrs_svc_has_hp(svc)) { - hp = true; - goto again; - } - } - - if (desc->pd_ops->op_lprocfs_fini) - desc->pd_ops->op_lprocfs_fini(svc); - } - - return rc; -} - -/** - * Registers a new policy with NRS core. - * - * The function will only succeed if policy registration with all compatible - * service partitions (if any) is successful. - * - * N.B. This function should be called either at ptlrpc module initialization - * time when registering a policy that ships with NRS core, or in a - * module's init() function for policies registering from other modules. - * - * \param[in] conf configuration information for the new policy to register - * - * \retval -ve error - * \retval 0 success - */ -static int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf) -{ - struct ptlrpc_service *svc; - struct ptlrpc_nrs_pol_desc *desc; - size_t len; - int rc = 0; - - LASSERT(conf->nc_ops); - LASSERT(conf->nc_compat); - LASSERT(ergo(conf->nc_compat == nrs_policy_compat_one, - conf->nc_compat_svc_name)); - LASSERT(ergo((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0, - conf->nc_owner)); - - conf->nc_name[NRS_POL_NAME_MAX - 1] = '\0'; - - /** - * External policies are not allowed to start immediately upon - * registration, as there is a relatively higher chance that their - * registration might fail. In such a case, some policy instances may - * already have requests queued when unregistration needs to happen as - * part of cleanup; since there is currently no way to drain requests - * from a policy unless the service is unregistering, we just disallow - * this. - */ - if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) && - (conf->nc_flags & (PTLRPC_NRS_FL_FALLBACK | - PTLRPC_NRS_FL_REG_START))) { - CERROR("NRS: failing to register policy %s.
Please check policy flags; external policies cannot act as fallback policies, or be started immediately upon registration without interaction with lprocfs\n", - conf->nc_name); - return -EINVAL; - } - - mutex_lock(&nrs_core.nrs_mutex); - - if (nrs_policy_find_desc_locked(conf->nc_name)) { - CERROR("NRS: failing to register policy %s which has already been registered with NRS core!\n", - conf->nc_name); - rc = -EEXIST; - goto fail; - } - - desc = kzalloc(sizeof(*desc), GFP_NOFS); - if (!desc) { - rc = -ENOMEM; - goto fail; - } - - len = strlcpy(desc->pd_name, conf->nc_name, sizeof(desc->pd_name)); - if (len >= sizeof(desc->pd_name)) { - kfree(desc); - rc = -E2BIG; - goto fail; - } - desc->pd_ops = conf->nc_ops; - desc->pd_compat = conf->nc_compat; - desc->pd_compat_svc_name = conf->nc_compat_svc_name; - if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) != 0) - desc->pd_owner = conf->nc_owner; - desc->pd_flags = conf->nc_flags; - atomic_set(&desc->pd_refs, 0); - - /** - * For policies that are held in the same module as NRS (currently - * ptlrpc), do not register the policy with all compatible services, - * as the services will not have started at this point, since we are - * calling from ptlrpc module initialization code. In such cases each - * service will register all compatible policies later, via - * ptlrpc_service_nrs_setup(). - */ - if ((conf->nc_flags & PTLRPC_NRS_FL_REG_EXTERN) == 0) - goto internal; - - /** - * Register the new policy on all compatible services - */ - mutex_lock(&ptlrpc_all_services_mutex); - - list_for_each_entry(svc, &ptlrpc_all_services, srv_list) { - struct ptlrpc_service_part *svcpt; - int i; - int rc2; - - if (!nrs_policy_compatible(svc, desc) || - unlikely(svc->srv_is_stopping)) - continue; - - ptlrpc_service_for_each_part(svcpt, i, svc) { - struct ptlrpc_nrs *nrs; - bool hp = false; -again: - nrs = nrs_svcpt2nrs(svcpt, hp); - rc = nrs_policy_register(nrs, desc); - if (rc != 0) { - CERROR("Failed to register NRS policy %s for partition %d of service %s: %d\n", - desc->pd_name, svcpt->scp_cpt, - svcpt->scp_service->srv_name, rc); - - rc2 = nrs_policy_unregister_locked(desc); - /** - * Should not fail at this point - */ - LASSERT(rc2 == 0); - mutex_unlock(&ptlrpc_all_services_mutex); - kfree(desc); - goto fail; - } - - if (!hp && nrs_svc_has_hp(svc)) { - hp = true; - goto again; - } - } - - /** - * No need to take a reference to other modules here, as we - * will be calling from the module's init() function. - */ - if (desc->pd_ops->op_lprocfs_init) { - rc = desc->pd_ops->op_lprocfs_init(svc); - if (rc != 0) { - rc2 = nrs_policy_unregister_locked(desc); - /** - * Should not fail at this point - */ - LASSERT(rc2 == 0); - mutex_unlock(&ptlrpc_all_services_mutex); - kfree(desc); - goto fail; - } - } - } - - mutex_unlock(&ptlrpc_all_services_mutex); -internal: - list_add_tail(&desc->pd_list, &nrs_core.nrs_policies); -fail: - mutex_unlock(&nrs_core.nrs_mutex); - - return rc; -} - -/** - * Setup NRS heads on all service partitions of service \a svc, and register - * all compatible policies on those NRS heads. - * - * To be called from within ptlrpc_register_service(). - * \param[in] svc the service to setup - * - * \retval -ve error, the calling logic should eventually call - * ptlrpc_service_nrs_cleanup() to undo any work performed - * by this function.
- * - * \see ptlrpc_register_service() - * \see ptlrpc_service_nrs_cleanup() - */ -int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - const struct ptlrpc_nrs_pol_desc *desc; - int i; - int rc = 0; - - mutex_lock(&nrs_core.nrs_mutex); - - /** - * Initialize NRS heads on all service CPTs. - */ - ptlrpc_service_for_each_part(svcpt, i, svc) { - rc = nrs_svcpt_setup_locked(svcpt); - if (rc != 0) - goto failed; - } - - /** - * Set up lprocfs interfaces for all supported policies for the - * service. - */ - list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { - if (!nrs_policy_compatible(svc, desc)) - continue; - - if (desc->pd_ops->op_lprocfs_init) { - rc = desc->pd_ops->op_lprocfs_init(svc); - if (rc != 0) - goto failed; - } - } - -failed: - - mutex_unlock(&nrs_core.nrs_mutex); - - return rc; -} - -/** - * Unregisters all policies on all service partitions of service \a svc. - * - * \param[in] svc the PTLRPC service to unregister - */ -void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - const struct ptlrpc_nrs_pol_desc *desc; - int i; - - mutex_lock(&nrs_core.nrs_mutex); - - /** - * Clean up NRS heads on all service partitions - */ - ptlrpc_service_for_each_part(svcpt, i, svc) - nrs_svcpt_cleanup_locked(svcpt); - - /** - * Clean up lprocfs interfaces for all supported policies for the - * service. - */ - list_for_each_entry(desc, &nrs_core.nrs_policies, pd_list) { - if (!nrs_policy_compatible(svc, desc)) - continue; - - if (desc->pd_ops->op_lprocfs_fini) - desc->pd_ops->op_lprocfs_fini(svc); - } - - mutex_unlock(&nrs_core.nrs_mutex); -} - -/** - * Obtains NRS head resources for request \a req. - * - * These could be either on the regular or HP NRS head of \a svcpt; resources - * taken on the regular head can later be swapped for HP head resources by - * ldlm_lock_reorder_req(). - * - * \param[in] svcpt the service partition - * \param[in] req the request - * \param[in] hp which NRS head of \a svcpt to use - */ -void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req, bool hp) -{ - struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); - - memset(&req->rq_nrq, 0, sizeof(req->rq_nrq)); - nrs_resource_get_safe(nrs, &req->rq_nrq, req->rq_nrq.nr_res_ptrs, - false); - - /** - * It is fine to access \e nr_initialized without locking as there is - * no contention at this early stage. - */ - req->rq_nrq.nr_initialized = 1; -} - -/** - * Releases resources for a request; is called after the request has been - * handled. - * - * \param[in] req the request - * - * \see ptlrpc_server_finish_request() - */ -void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req) -{ - if (req->rq_nrq.nr_initialized) { - nrs_resource_put_safe(req->rq_nrq.nr_res_ptrs); - /* no protection on bit nr_initialized because no - * contention at this late stage - */ - req->rq_nrq.nr_finalized = 1; - } -} - -void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req) -{ - if (req->rq_nrq.nr_started) - nrs_request_stop(&req->rq_nrq); -} - -/** - * Enqueues request \a req on either the regular or high-priority NRS head - * of service partition \a svcpt. - * - * \param[in] svcpt the service partition - * \param[in] req the request to be enqueued - * \param[in] hp whether to enqueue the request on the regular or - * high-priority NRS head. 
- */ -void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req, bool hp) -{ - spin_lock(&svcpt->scp_req_lock); - - if (hp) - ptlrpc_nrs_hpreq_add_nolock(req); - else - ptlrpc_nrs_req_add_nolock(req); - - spin_unlock(&svcpt->scp_req_lock); -} - -static void nrs_request_removed(struct ptlrpc_nrs_policy *policy) -{ - LASSERT(policy->pol_nrs->nrs_req_queued > 0); - LASSERT(policy->pol_req_queued > 0); - - policy->pol_nrs->nrs_req_queued--; - policy->pol_req_queued--; - - /** - * If the policy has no more requests queued, remove it from - * ptlrpc_nrs::nrs_policy_queued. - */ - if (unlikely(policy->pol_req_queued == 0)) { - list_del_init(&policy->pol_list_queued); - - /** - * If there are other policies with queued requests, move the - * current policy to the end so that we can round robin over - * all policies and drain the requests. - */ - } else if (policy->pol_req_queued != policy->pol_nrs->nrs_req_queued) { - LASSERT(policy->pol_req_queued < - policy->pol_nrs->nrs_req_queued); - - list_move_tail(&policy->pol_list_queued, - &policy->pol_nrs->nrs_policy_queued); - } -} - -/** - * Obtains a request for handling from an NRS head of service partition - * \a svcpt. - * - * \param[in] svcpt the service partition - * \param[in] hp whether to obtain a request from the regular or - * high-priority NRS head. - * \param[in] peek when set, signifies that we just want to examine the - * request, and not handle it, so the request is not removed - * from the policy. - * \param[in] force when set, it will force a policy to return a request if it - * has one pending - * - * \retval the request to be handled - * \retval NULL the head has no requests to serve - */ -struct ptlrpc_request * -ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, - bool peek, bool force) -{ - struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); - struct ptlrpc_nrs_policy *policy; - struct ptlrpc_nrs_request *nrq; - - /** - * Always try to drain requests from all NRS policies even if they are - * inactive, because the user can change policy status at runtime. - */ - list_for_each_entry(policy, &nrs->nrs_policy_queued, pol_list_queued) { - nrq = nrs_request_get(policy, peek, force); - if (nrq) { - if (likely(!peek)) { - nrq->nr_started = 1; - - policy->pol_req_started++; - policy->pol_nrs->nrs_req_started++; - - nrs_request_removed(policy); - } - - return container_of(nrq, struct ptlrpc_request, rq_nrq); - } - } - - return NULL; -} - -/** - * Returns whether there are any requests currently enqueued on any of the - * policies of service partition's \a svcpt NRS head specified by \a hp. Should - * be called while holding ptlrpc_service_part::scp_req_lock to get a reliable - * result. - * - * \param[in] svcpt the service partition to enquire. - * \param[in] hp whether the regular or high-priority NRS head is to be - * enquired. - * - * \retval false the indicated NRS head has no enqueued requests. - * \retval true the indicated NRS head has some enqueued requests. - */ -bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp) -{ - struct ptlrpc_nrs *nrs = nrs_svcpt2nrs(svcpt, hp); - - return nrs->nrs_req_queued > 0; -} - -/** - * Carries out a control operation \a opc on the policy identified by the - * human-readable \a name, on either all partitions, or only on the first - * partition of service \a svc. - * - * \param[in] svc the service the policy belongs to.
- * \param[in] queue whether to carry out the command on the policy which - * belongs to the regular, high-priority, or both NRS - * heads of service partitions of \a svc. - * \param[in] name the policy to act upon, by human-readable name - * \param[in] opc the opcode of the operation to carry out - * \param[in] single when set, the operation will only be carried out on the - * NRS heads of the first service partition of \a svc. - * This is useful for some policies which e.g. share - * identical values on the same parameters of different - * service partitions; when reading these parameters via - * lprocfs, these policies may just want to obtain and - * print out the values from the first service partition. - * Storing these values centrally elsewhere could be - * another solution for this. - * \param[in,out] arg can be used as a generic in/out buffer between control - * operations and the user environment. - * - * \retval -ve error condition - * \retval 0 operation was carried out successfully - */ -int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, - enum ptlrpc_nrs_queue_type queue, char *name, - enum ptlrpc_nrs_ctl opc, bool single, void *arg) -{ - struct ptlrpc_service_part *svcpt; - int i; - int rc = 0; - - LASSERT(opc != PTLRPC_NRS_CTL_INVALID); - - if ((queue & PTLRPC_NRS_QUEUE_BOTH) == 0) - return -EINVAL; - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if ((queue & PTLRPC_NRS_QUEUE_REG) != 0) { - rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, false), name, - opc, arg); - if (rc != 0 || (queue == PTLRPC_NRS_QUEUE_REG && - single)) - goto out; - } - - if ((queue & PTLRPC_NRS_QUEUE_HP) != 0) { - /** - * XXX: We could optionally check for - * nrs_svc_has_hp(svc) here, and return an error if it - * is false. Right now we rely on the policies' lprocfs - * handlers that call the present function to make this - * check; if they fail to do so, they might hit the - * assertion inside nrs_svcpt2nrs() below. - */ - rc = nrs_policy_ctl(nrs_svcpt2nrs(svcpt, true), name, - opc, arg); - if (rc != 0 || single) - goto out; - } - } -out: - return rc; -} - -/** - * Adds all policies that ship with the ptlrpc module to NRS core's list of - * policies \e nrs_core.nrs_policies. - * - * \retval 0 all policies have been registered successfully - * \retval -ve error - */ -int ptlrpc_nrs_init(void) -{ - int rc; - - mutex_init(&nrs_core.nrs_mutex); - INIT_LIST_HEAD(&nrs_core.nrs_policies); - - rc = ptlrpc_nrs_policy_register(&nrs_conf_fifo); - if (rc != 0) - goto fail; - - return rc; -fail: - /** - * Since no PTLRPC services have been started at this point, all we need - * to do for cleanup is to free the descriptors. - */ - ptlrpc_nrs_fini(); - - return rc; -} - -/** - * Removes all policy descriptors from nrs_core::nrs_policies, and frees the - * policy descriptors. - * - * Since all PTLRPC services are stopped at this point, there are no more - * instances of any policies, because each service will have stopped its policy - * instances in ptlrpc_service_nrs_cleanup(), so we just need to free the - * descriptors here.
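The list_move_tail() rotation in nrs_request_removed() earlier is what makes draining fair: a policy that still has queued requests goes to the back of ptlrpc_nrs::nrs_policy_queued so the other policies get a turn. A self-contained model of that rotation, where an array stands in for the kernel list (illustrative only, not the kernel code):

/* Model of the round-robin rotation in nrs_request_removed(): serve the
 * first queued policy, then rotate it to the tail unless it drained. */
#include <stdio.h>

#define NPOL 3

int main(void)
{
	int queued[NPOL] = { 2, 1, 1 };	/* requests queued per policy */
	int order[NPOL] = { 0, 1, 2 };	/* stand-in for nrs_policy_queued */
	int nqueued = NPOL, total = 4, i;

	while (total > 0) {
		int pol = order[0];	/* first policy on the queued list */

		queued[pol]--;
		total--;
		printf("served policy %d\n", pol);

		/* shift the remaining entries forward... */
		for (i = 0; i < nqueued - 1; i++)
			order[i] = order[i + 1];
		if (queued[pol] > 0)
			order[nqueued - 1] = pol;	/* list_move_tail() */
		else
			nqueued--;			/* list_del_init() */
	}
	return 0;	/* prints 0, 1, 2, 0: fair round robin */
}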
- */ -void ptlrpc_nrs_fini(void) -{ - struct ptlrpc_nrs_pol_desc *desc; - struct ptlrpc_nrs_pol_desc *tmp; - - list_for_each_entry_safe(desc, tmp, &nrs_core.nrs_policies, pd_list) { - list_del_init(&desc->pd_list); - kfree(desc); - } -} - -/** @} nrs */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c b/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c deleted file mode 100644 index ff630d94dd264ca65f79f27b3c6f8ba559dedefb..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/nrs_fifo.c +++ /dev/null @@ -1,270 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License version 2 for more details. A copy is - * included in the COPYING file that accompanied this code. - - * GPL HEADER END - */ -/* - * Copyright (c) 2011 Intel Corporation - * - * Copyright 2012 Xyratex Technology Limited - */ -/* - * lustre/ptlrpc/nrs_fifo.c - * - * Network Request Scheduler (NRS) FIFO policy - * - * Handles RPCs in a FIFO manner, as received from the network. This policy is - * a logical wrapper around previous, non-NRS functionality. It is used as the - * default and fallback policy for all types of RPCs on all PTLRPC service - * partitions, for both regular and high-priority NRS heads. Default here means - * the policy is the one enabled at PTLRPC service partition startup time, and - * fallback means the policy is used to handle RPCs that are not handled - * successfully or are not handled at all by any primary policy that may be - * enabled on a given NRS head. - * - * Author: Liang Zhen - * Author: Nikitas Angelinas - */ -/** - * \addtogroup nrs - * @{ - */ - -#define DEBUG_SUBSYSTEM S_RPC -#include -#include -#include -#include -#include "ptlrpc_internal.h" - -/** - * \name fifo - * - * The FIFO policy is a logical wrapper around previous, non-NRS functionality. - * It schedules RPCs in the same order as they are queued from LNet. - * - * @{ - */ - -#define NRS_POL_NAME_FIFO "fifo" - -/** - * Is called before the policy transitions into - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED; allocates and initializes a - * policy-specific private data structure. - * - * \param[in] policy The policy to start - * - * \retval -ENOMEM OOM error - * \retval 0 success - * - * \see nrs_policy_register() - * \see nrs_policy_ctl() - */ -static int nrs_fifo_start(struct ptlrpc_nrs_policy *policy) -{ - struct nrs_fifo_head *head; - - head = kzalloc_node(sizeof(*head), GFP_NOFS, - cfs_cpt_spread_node(nrs_pol2cptab(policy), - nrs_pol2cptid(policy))); - if (!head) - return -ENOMEM; - - INIT_LIST_HEAD(&head->fh_list); - policy->pol_private = head; - return 0; -} - -/** - * Is called before the policy transitions into - * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPED; deallocates the policy-specific - * private data structure.
- * - * \param[in] policy The policy to stop - * - * \see nrs_policy_stop0() - */ -static void nrs_fifo_stop(struct ptlrpc_nrs_policy *policy) -{ - struct nrs_fifo_head *head = policy->pol_private; - - LASSERT(head); - LASSERT(list_empty(&head->fh_list)); - - kfree(head); -} - -/** - * Is called for obtaining a FIFO policy resource. - * - * \param[in] policy The policy on which the request is being asked for - * \param[in] nrq The request for which resources are being taken - * \param[in] parent Parent resource, unused in this policy - * \param[out] resp Resources references are placed in this array - * \param[in] moving_req Signifies limited caller context; unused in this - * policy - * - * \retval 1 The FIFO policy only has a one-level resource hierarchy; since - * it implements a simple scheduling algorithm in which request - * priority is determined by request arrival order, it does not - * need to maintain a set of resources that would otherwise be used - * to calculate a request's priority. - * - * \see nrs_resource_get_safe() - */ -static int nrs_fifo_res_get(struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq, - const struct ptlrpc_nrs_resource *parent, - struct ptlrpc_nrs_resource **resp, bool moving_req) -{ - /** - * Just return the resource embedded inside nrs_fifo_head, and end this - * resource hierarchy reference request. - */ - *resp = &((struct nrs_fifo_head *)policy->pol_private)->fh_res; - return 1; -} - -/** - * Called when getting a request from the FIFO policy for handling, or just - * peeking; removes the request from the policy when it is to be handled. - * - * \param[in] policy The policy - * \param[in] peek When set, signifies that we just want to examine the - * request, and not handle it, so the request is not removed - * from the policy. - * \param[in] force Force the policy to return a request; unused in this - * policy - * - * \retval The request to be handled; this is the next request in the FIFO - * queue - * - * \see ptlrpc_nrs_req_get_nolock() - * \see nrs_request_get() - */ -static -struct ptlrpc_nrs_request *nrs_fifo_req_get(struct ptlrpc_nrs_policy *policy, - bool peek, bool force) -{ - struct nrs_fifo_head *head = policy->pol_private; - struct ptlrpc_nrs_request *nrq; - - nrq = unlikely(list_empty(&head->fh_list)) ? NULL : - list_entry(head->fh_list.next, struct ptlrpc_nrs_request, - nr_u.fifo.fr_list); - - if (likely(!peek && nrq)) { - struct ptlrpc_request *req = container_of(nrq, - struct ptlrpc_request, - rq_nrq); - - list_del_init(&nrq->nr_u.fifo.fr_list); - - CDEBUG(D_RPCTRACE, "NRS start %s request from %s, seq: %llu\n", - policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), - nrq->nr_u.fifo.fr_sequence); - } - - return nrq; - } - -/** - * Adds request \a nrq to \a policy's list of queued requests - * - * \param[in] policy The policy - * \param[in] nrq The request to add - * - * \retval 0 success; nrs_request_enqueue() assumes this function will always - * succeed - */ -static int nrs_fifo_req_add(struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq) -{ - struct nrs_fifo_head *head; - - head = container_of(nrs_request_resource(nrq), struct nrs_fifo_head, - fh_res); - /** - * Only used for debugging - */ - nrq->nr_u.fifo.fr_sequence = head->fh_sequence++; - list_add_tail(&nrq->nr_u.fifo.fr_list, &head->fh_list); - - return 0; -} - -/** - * Removes request \a nrq from \a policy's list of queued requests.
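Stripped of the NRS plumbing, nrs_fifo_req_add() and nrs_fifo_req_get() above implement a plain queue in which each request is stamped with a debug sequence number. A self-contained model of that behaviour, using a fixed-size ring as a stand-in for the kernel list_head (illustrative only):

/* Model of the FIFO policy's enqueue/dequeue: requests leave in arrival
 * order, each stamped with a sequence number used only for debugging. */
#include <stdint.h>
#include <stdio.h>

#define QSIZE 8

struct fifo_head {
	uint64_t seq[QSIZE];	/* fr_sequence of each queued request */
	unsigned int head, tail;
	uint64_t next_seq;	/* stands in for fh_sequence */
};

static void fifo_req_add(struct fifo_head *h)
{
	h->seq[h->tail++ % QSIZE] = h->next_seq++;	/* list_add_tail() */
}

static int fifo_req_get(struct fifo_head *h, uint64_t *seq)
{
	if (h->head == h->tail)
		return -1;			/* fh_list is empty */
	*seq = h->seq[h->head++ % QSIZE];	/* oldest request first */
	return 0;
}

int main(void)
{
	struct fifo_head h = { { 0 }, 0, 0, 0 };
	uint64_t seq;

	fifo_req_add(&h);
	fifo_req_add(&h);
	while (fifo_req_get(&h, &seq) == 0)
		printf("NRS start fifo request, seq: %llu\n",
		       (unsigned long long)seq);
	return 0;
}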
- * - * \param[in] policy The policy - * \param[in] nrq The request to remove - */ -static void nrs_fifo_req_del(struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq) -{ - LASSERT(!list_empty(&nrq->nr_u.fifo.fr_list)); - list_del_init(&nrq->nr_u.fifo.fr_list); -} - -/** - * Prints a debug statement right before the request \a nrq stops being - * handled. - * - * \param[in] policy The policy handling the request - * \param[in] nrq The request being handled - * - * \see ptlrpc_server_finish_request() - * \see ptlrpc_nrs_req_stop_nolock() - */ -static void nrs_fifo_req_stop(struct ptlrpc_nrs_policy *policy, - struct ptlrpc_nrs_request *nrq) -{ - struct ptlrpc_request *req = container_of(nrq, struct ptlrpc_request, - rq_nrq); - - CDEBUG(D_RPCTRACE, "NRS stop %s request from %s, seq: %llu\n", - policy->pol_desc->pd_name, libcfs_id2str(req->rq_peer), - nrq->nr_u.fifo.fr_sequence); -} - -/** - * FIFO policy operations - */ -static const struct ptlrpc_nrs_pol_ops nrs_fifo_ops = { - .op_policy_start = nrs_fifo_start, - .op_policy_stop = nrs_fifo_stop, - .op_res_get = nrs_fifo_res_get, - .op_req_get = nrs_fifo_req_get, - .op_req_enqueue = nrs_fifo_req_add, - .op_req_dequeue = nrs_fifo_req_del, - .op_req_stop = nrs_fifo_req_stop, -}; - -/** - * FIFO policy configuration - */ -struct ptlrpc_nrs_pol_conf nrs_conf_fifo = { - .nc_name = NRS_POL_NAME_FIFO, - .nc_ops = &nrs_fifo_ops, - .nc_compat = nrs_policy_compat_all, - .nc_flags = PTLRPC_NRS_FL_FALLBACK | - PTLRPC_NRS_FL_REG_START -}; - -/** @} fifo */ - -/** @} nrs */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c b/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c deleted file mode 100644 index 6ac9bb57066322d90b30e0a56a478f421ac2422c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/pack_generic.c +++ /dev/null @@ -1,2311 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/pack_generic.c - * - * (Un)packing of OST requests - * - * Author: Peter J. 
Braam - * Author: Phil Schwan - * Author: Eric Barton - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include - -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -static inline u32 lustre_msg_hdr_size_v2(u32 count) -{ - return cfs_size_round(offsetof(struct lustre_msg_v2, - lm_buflens[count])); -} - -u32 lustre_msg_hdr_size(__u32 magic, u32 count) -{ - switch (magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_hdr_size_v2(count); - default: - LASSERTF(0, "incorrect message magic: %08x\n", magic); - return 0; - } -} - -void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, - u32 index) -{ - if (inout) - lustre_set_req_swabbed(req, index); - else - lustre_set_rep_swabbed(req, index); -} - -int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, - u32 index) -{ - if (inout) - return (ptlrpc_req_need_swab(req) && - !lustre_req_swabbed(req, index)); - else - return (ptlrpc_rep_need_swab(req) && - !lustre_rep_swabbed(req, index)); -} - -/* early reply size */ -u32 lustre_msg_early_size(void) -{ - static u32 size; - - if (!size) { - /* Always reply old ptlrpc_body_v2 to keep interoperability - * with the old client (< 2.3) which doesn't have pb_jobid - * in the ptlrpc_body. - * - * XXX Remove this whenever we drop interoperability with such - * client. - */ - __u32 pblen = sizeof(struct ptlrpc_body_v2); - - size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, &pblen); - } - return size; -} -EXPORT_SYMBOL(lustre_msg_early_size); - -u32 lustre_msg_size_v2(int count, __u32 *lengths) -{ - u32 size; - int i; - - size = lustre_msg_hdr_size_v2(count); - for (i = 0; i < count; i++) - size += cfs_size_round(lengths[i]); - - return size; -} -EXPORT_SYMBOL(lustre_msg_size_v2); - -/* This returns the size of the buffer that is required to hold a lustre_msg - * with the given sub-buffer lengths. - * NOTE: this should only be used for NEW requests, and should always be - * in the form of a v2 request. If this is a connection to a v1 - * target then the first buffer will be stripped because the ptlrpc - * data is part of the lustre_msg_v1 header. b=14043 - */ -u32 lustre_msg_size(__u32 magic, int count, __u32 *lens) -{ - __u32 size[] = { sizeof(struct ptlrpc_body) }; - - if (!lens) { - LASSERT(count == 1); - lens = size; - } - - LASSERT(count > 0); - LASSERT(lens[MSG_PTLRPC_BODY_OFF] >= sizeof(struct ptlrpc_body_v2)); - - switch (magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_size_v2(count, lens); - default: - LASSERTF(0, "incorrect message magic: %08x\n", magic); - return 0; - } -} - -/* This is used to determine the size of a buffer that was already packed - * and will correctly handle the different message formats. 
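lustre_msg_hdr_size_v2() and lustre_msg_size_v2() above are pure arithmetic: the v2 header grows with the buffer count, and every buffer is rounded up to an 8-byte boundary by cfs_size_round(). A standalone worked example of the same sums; the layout constants are a simplified stand-in for struct lustre_msg_v2 (eight 32-bit header fields followed by one __u32 length per buffer, as seen in the swab routine later in this file):

/* Worked example of the packed-size arithmetic, under the simplified
 * layout assumption stated above; not the kernel code. */
#include <stdio.h>
#include <stdint.h>

#define HDR_FIXED (8 * sizeof(uint32_t))	/* fields before lm_buflens[] */

static uint32_t size_round(uint32_t v)	/* cfs_size_round(): 8-byte align */
{
	return (v + 7) & ~7u;
}

static uint32_t hdr_size_v2(uint32_t count)
{
	return size_round(HDR_FIXED + count * sizeof(uint32_t));
}

static uint32_t msg_size_v2(int count, const uint32_t *lens)
{
	uint32_t size = hdr_size_v2(count);
	int i;

	for (i = 0; i < count; i++)
		size += size_round(lens[i]);	/* each buffer 8-byte padded */
	return size;
}

int main(void)
{
	uint32_t lens[2] = { 152, 13 };	/* illustrative buffer lengths */

	/* header: 32 + 2*4 = 40; buffers: 152 + 16; total 208 */
	printf("hdr=%u total=%u\n", hdr_size_v2(2), msg_size_v2(2, lens));
	return 0;
}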
- */ -u32 lustre_packed_msg_size(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} - -void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, - char **bufs) -{ - char *ptr; - int i; - - msg->lm_bufcount = count; - /* XXX: lm_secflvr uninitialized here */ - msg->lm_magic = LUSTRE_MSG_MAGIC_V2; - - for (i = 0; i < count; i++) - msg->lm_buflens[i] = lens[i]; - - if (!bufs) - return; - - ptr = (char *)msg + lustre_msg_hdr_size_v2(count); - for (i = 0; i < count; i++) { - char *tmp = bufs[i]; - - if (tmp) - memcpy(ptr, tmp, lens[i]); - ptr += cfs_size_round(lens[i]); - } -} -EXPORT_SYMBOL(lustre_init_msg_v2); - -static int lustre_pack_request_v2(struct ptlrpc_request *req, - int count, __u32 *lens, char **bufs) -{ - int reqlen, rc; - - reqlen = lustre_msg_size_v2(count, lens); - - rc = sptlrpc_cli_alloc_reqbuf(req, reqlen); - if (rc) - return rc; - - req->rq_reqlen = reqlen; - - lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); - lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); - return 0; -} - -int lustre_pack_request(struct ptlrpc_request *req, __u32 magic, int count, - __u32 *lens, char **bufs) -{ - __u32 size[] = { sizeof(struct ptlrpc_body) }; - - if (!lens) { - LASSERT(count == 1); - lens = size; - } - - LASSERT(count > 0); - LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); - - /* only use new format, we don't need to be compatible with 1.4 */ - return lustre_pack_request_v2(req, count, lens, bufs); -} - -#if RS_DEBUG -LIST_HEAD(ptlrpc_rs_debug_lru); -spinlock_t ptlrpc_rs_debug_lock; - -#define PTLRPC_RS_DEBUG_LRU_ADD(rs) \ -do { \ - spin_lock(&ptlrpc_rs_debug_lock); \ - list_add_tail(&(rs)->rs_debug_list, &ptlrpc_rs_debug_lru); \ - spin_unlock(&ptlrpc_rs_debug_lock); \ -} while (0) - -#define PTLRPC_RS_DEBUG_LRU_DEL(rs) \ -do { \ - spin_lock(&ptlrpc_rs_debug_lock); \ - list_del(&(rs)->rs_debug_list); \ - spin_unlock(&ptlrpc_rs_debug_lock); \ -} while (0) -#else -# define PTLRPC_RS_DEBUG_LRU_ADD(rs) do {} while (0) -# define PTLRPC_RS_DEBUG_LRU_DEL(rs) do {} while (0) -#endif - -struct ptlrpc_reply_state * -lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_reply_state *rs = NULL; - - spin_lock(&svcpt->scp_rep_lock); - - /* See if we have anything in a pool, and wait if nothing */ - while (list_empty(&svcpt->scp_rep_idle)) { - int rc; - - spin_unlock(&svcpt->scp_rep_lock); - /* If we cannot get anything for a long time, we had better - * bail out instead of waiting indefinitely. - */ - rc = wait_event_idle_timeout(svcpt->scp_rep_waitq, - !list_empty(&svcpt->scp_rep_idle), - 10 * HZ); - if (rc == 0) - goto out; - spin_lock(&svcpt->scp_rep_lock); - } - - rs = list_entry(svcpt->scp_rep_idle.next, - struct ptlrpc_reply_state, rs_list); - list_del(&rs->rs_list); - - spin_unlock(&svcpt->scp_rep_lock); - - memset(rs, 0, svcpt->scp_service->srv_max_reply_size); - rs->rs_size = svcpt->scp_service->srv_max_reply_size; - rs->rs_svcpt = svcpt; - rs->rs_prealloc = 1; -out: - return rs; -} - -void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs) -{ - struct ptlrpc_service_part *svcpt = rs->rs_svcpt; - - spin_lock(&svcpt->scp_rep_lock); - list_add(&rs->rs_list, &svcpt->scp_rep_idle); - spin_unlock(&svcpt->scp_rep_lock); - wake_up(&svcpt->scp_rep_waitq); -} - -int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - __u32 *lens, char **bufs,
int flags) -{ - struct ptlrpc_reply_state *rs; - int msg_len, rc; - - LASSERT(!req->rq_reply_state); - - if ((flags & LPRFL_EARLY_REPLY) == 0) { - spin_lock(&req->rq_lock); - req->rq_packed_final = 1; - spin_unlock(&req->rq_lock); - } - - msg_len = lustre_msg_size_v2(count, lens); - rc = sptlrpc_svc_alloc_rs(req, msg_len); - if (rc) - return rc; - - rs = req->rq_reply_state; - atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ - rs->rs_cb_id.cbid_fn = reply_out_callback; - rs->rs_cb_id.cbid_arg = rs; - rs->rs_svcpt = req->rq_rqbd->rqbd_svcpt; - INIT_LIST_HEAD(&rs->rs_exp_list); - INIT_LIST_HEAD(&rs->rs_obd_list); - INIT_LIST_HEAD(&rs->rs_list); - spin_lock_init(&rs->rs_lock); - - req->rq_replen = msg_len; - req->rq_reply_state = rs; - req->rq_repmsg = rs->rs_msg; - - lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); - lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); - - PTLRPC_RS_DEBUG_LRU_ADD(rs); - - return 0; -} -EXPORT_SYMBOL(lustre_pack_reply_v2); - -int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, __u32 *lens, - char **bufs, int flags) -{ - int rc = 0; - __u32 size[] = { sizeof(struct ptlrpc_body) }; - - if (!lens) { - LASSERT(count == 1); - lens = size; - } - - LASSERT(count > 0); - LASSERT(lens[MSG_PTLRPC_BODY_OFF] == sizeof(struct ptlrpc_body)); - - switch (req->rq_reqmsg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - rc = lustre_pack_reply_v2(req, count, lens, bufs, flags); - break; - default: - LASSERTF(0, "incorrect message magic: %08x\n", - req->rq_reqmsg->lm_magic); - rc = -EINVAL; - } - if (rc != 0) - CERROR("lustre_pack_reply failed: rc=%d size=%d\n", rc, - lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens)); - return rc; -} - -int lustre_pack_reply(struct ptlrpc_request *req, int count, __u32 *lens, - char **bufs) -{ - return lustre_pack_reply_flags(req, count, lens, bufs, 0); -} -EXPORT_SYMBOL(lustre_pack_reply); - -void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, u32 n, u32 min_size) -{ - u32 i, offset, buflen, bufcount; - - bufcount = m->lm_bufcount; - if (unlikely(n >= bufcount)) { - CDEBUG(D_INFO, "msg %p buffer[%d] not present (count %d)\n", - m, n, bufcount); - return NULL; - } - - buflen = m->lm_buflens[n]; - if (unlikely(buflen < min_size)) { - CERROR("msg %p buffer[%d] size %d too small (required %d, opc=%d)\n", - m, n, buflen, min_size, - n == MSG_PTLRPC_BODY_OFF ? 
-1 : lustre_msg_get_opc(m)); - return NULL; - } - - offset = lustre_msg_hdr_size_v2(bufcount); - for (i = 0; i < n; i++) - offset += cfs_size_round(m->lm_buflens[i]); - - return (char *)m + offset; -} - -void *lustre_msg_buf(struct lustre_msg *m, u32 n, u32 min_size) -{ - switch (m->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_buf_v2(m, n, min_size); - default: - LASSERTF(0, "incorrect message magic: %08x (msg:%p)\n", - m->lm_magic, m); - return NULL; - } -} -EXPORT_SYMBOL(lustre_msg_buf); - -static int lustre_shrink_msg_v2(struct lustre_msg_v2 *msg, u32 segment, - unsigned int newlen, int move_data) -{ - char *tail = NULL, *newpos; - int tail_len = 0, n; - - LASSERT(msg); - LASSERT(msg->lm_bufcount > segment); - LASSERT(msg->lm_buflens[segment] >= newlen); - - if (msg->lm_buflens[segment] == newlen) - goto out; - - if (move_data && msg->lm_bufcount > segment + 1) { - tail = lustre_msg_buf_v2(msg, segment + 1, 0); - for (n = segment + 1; n < msg->lm_bufcount; n++) - tail_len += cfs_size_round(msg->lm_buflens[n]); - } - - msg->lm_buflens[segment] = newlen; - - if (tail && tail_len) { - newpos = lustre_msg_buf_v2(msg, segment + 1, 0); - LASSERT(newpos <= tail); - if (newpos != tail) - memmove(newpos, tail, tail_len); - } -out: - return lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); -} - -/* - * For @msg, shrink @segment to size @newlen. If @move_data is non-zero, - * we also move data forward from @segment + 1. - * - * If @newlen == 0, we remove the segment completely, but we still keep the - * total bufcount the same to avoid unnecessary data moving. This will leave - * an unused segment of size 0 at the tail, but that's OK. - * - * Returns the new msg size after shrinking. - * - * CAUTION: - * + if any buffer higher than @segment has been filled in, shrink must be - * called with non-zero @move_data. - * + the caller should NOT keep pointers to msg buffers higher than @segment - * after calling shrink. - */ -int lustre_shrink_msg(struct lustre_msg *msg, int segment, - unsigned int newlen, int move_data) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_shrink_msg_v2(msg, segment, newlen, move_data); - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } - return 0; -} -EXPORT_SYMBOL(lustre_shrink_msg); - -void lustre_free_reply_state(struct ptlrpc_reply_state *rs) -{ - PTLRPC_RS_DEBUG_LRU_DEL(rs); - - LASSERT(atomic_read(&rs->rs_refcount) == 0); - LASSERT(!rs->rs_difficult || rs->rs_handled); - LASSERT(!rs->rs_on_net); - LASSERT(!rs->rs_scheduled); - LASSERT(!rs->rs_export); - LASSERT(rs->rs_nlocks == 0); - LASSERT(list_empty(&rs->rs_exp_list)); - LASSERT(list_empty(&rs->rs_obd_list)); - - sptlrpc_svc_free_rs(rs); -} - -static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) -{ - int swabbed, required_len, i; - - /* Now we know the sender speaks my language.
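The tail move in lustre_shrink_msg_v2() above is the subtle part: when a middle segment shrinks, all following buffers slide forward in a single memmove. A self-contained model under the same 8-byte rounding assumption (buffer contents are just bytes here; illustrative only):

/* Model of lustre_shrink_msg_v2(): shrink segment 0 and slide the
 * following data forward with one memmove.  Not the kernel code. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

static uint32_t size_round(uint32_t v) { return (v + 7) & ~7u; }

int main(void)
{
	/* two segments packed back to back: 16 bytes then 8 bytes */
	uint32_t lens[2] = { 16, 8 };
	char buf[24] = "segment-zero... tail!!!";
	uint32_t newlen = 8;	/* shrink segment 0 from 16 to 8 */
	char *tail = buf + size_round(lens[0]);	/* old start of segment 1 */
	uint32_t tail_len = size_round(lens[1]);

	lens[0] = newlen;
	memmove(buf + size_round(lens[0]), tail, tail_len);

	printf("tail now at offset %u: %.8s\n", size_round(lens[0]),
	       buf + size_round(lens[0]));
	return 0;
}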
*/ - required_len = lustre_msg_hdr_size_v2(0); - if (len < required_len) { - /* can't even look inside the message */ - CERROR("message length %d too small for lustre_msg\n", len); - return -EINVAL; - } - - swabbed = (m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); - - if (swabbed) { - __swab32s(&m->lm_magic); - __swab32s(&m->lm_bufcount); - __swab32s(&m->lm_secflvr); - __swab32s(&m->lm_repsize); - __swab32s(&m->lm_cksum); - __swab32s(&m->lm_flags); - BUILD_BUG_ON(offsetof(typeof(*m), lm_padding_2) == 0); - BUILD_BUG_ON(offsetof(typeof(*m), lm_padding_3) == 0); - } - - required_len = lustre_msg_hdr_size_v2(m->lm_bufcount); - if (len < required_len) { - /* didn't receive all the buffer lengths */ - CERROR("message length %d too small for %d buflens\n", - len, m->lm_bufcount); - return -EINVAL; - } - - for (i = 0; i < m->lm_bufcount; i++) { - if (swabbed) - __swab32s(&m->lm_buflens[i]); - required_len += cfs_size_round(m->lm_buflens[i]); - } - - if (len < required_len) { - CERROR("len: %d, required_len %d\n", len, required_len); - CERROR("bufcount: %d\n", m->lm_bufcount); - for (i = 0; i < m->lm_bufcount; i++) - CERROR("buffer %d length %d\n", i, m->lm_buflens[i]); - return -EINVAL; - } - - return swabbed; -} - -int __lustre_unpack_msg(struct lustre_msg *m, int len) -{ - int required_len, rc; - - /* We can provide a slightly better error log, if we check the - * message magic and version first. In the future, struct - * lustre_msg may grow, and we'd like to log a version mismatch, - * rather than a short message. - * - */ - required_len = offsetof(struct lustre_msg, lm_magic) + - sizeof(m->lm_magic); - if (len < required_len) { - /* can't even look inside the message */ - CERROR("message length %d too small for magic/version check\n", - len); - return -EINVAL; - } - - rc = lustre_unpack_msg_v2(m, len); - - return rc; -} -EXPORT_SYMBOL(__lustre_unpack_msg); - -int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len) -{ - int rc; - - rc = __lustre_unpack_msg(req->rq_reqmsg, len); - if (rc == 1) { - lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); - rc = 0; - } - return rc; -} - -int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len) -{ - int rc; - - rc = __lustre_unpack_msg(req->rq_repmsg, len); - if (rc == 1) { - lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); - rc = 0; - } - return rc; -} - -static inline int lustre_unpack_ptlrpc_body_v2(struct ptlrpc_request *req, - const int inout, int offset) -{ - struct ptlrpc_body *pb; - struct lustre_msg_v2 *m = inout ? 
req->rq_reqmsg : req->rq_repmsg; - - pb = lustre_msg_buf_v2(m, offset, sizeof(struct ptlrpc_body_v2)); - if (!pb) { - CERROR("error unpacking ptlrpc body\n"); - return -EFAULT; - } - if (ptlrpc_buf_need_swab(req, inout, offset)) { - lustre_swab_ptlrpc_body(pb); - ptlrpc_buf_set_swabbed(req, inout, offset); - } - - if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { - CERROR("wrong lustre_msg version %08x\n", pb->pb_version); - return -EINVAL; - } - - if (!inout) - pb->pb_status = ptlrpc_status_ntoh(pb->pb_status); - - return 0; -} - -int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) -{ - switch (req->rq_reqmsg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_unpack_ptlrpc_body_v2(req, 1, offset); - default: - CERROR("bad lustre msg magic: %08x\n", - req->rq_reqmsg->lm_magic); - return -EINVAL; - } -} - -int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) -{ - switch (req->rq_repmsg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_unpack_ptlrpc_body_v2(req, 0, offset); - default: - CERROR("bad lustre msg magic: %08x\n", - req->rq_repmsg->lm_magic); - return -EINVAL; - } -} - -static inline u32 lustre_msg_buflen_v2(struct lustre_msg_v2 *m, u32 n) -{ - if (n >= m->lm_bufcount) - return 0; - - return m->lm_buflens[n]; -} - -/** - * lustre_msg_buflen - return the length of buffer \a n in message \a m - * \param m lustre_msg (request or reply) to look at - * \param n message index (base 0) - * - * returns zero for non-existent message indices - */ -u32 lustre_msg_buflen(struct lustre_msg *m, u32 n) -{ - switch (m->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_msg_buflen_v2(m, n); - default: - CERROR("incorrect message magic: %08x\n", m->lm_magic); - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_buflen); - -/* NB return the bufcount for lustre_msg_v2 format, so if message is packed - * in V1 format, the result is one bigger. (add struct ptlrpc_body). 
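The unpack paths above all hinge on one trick: a byte-swapped peer is detected purely from the magic field, because a known 32-bit constant read in the wrong byte order yields its byte-reversed twin. A standalone demo of that check; the magic values here are illustrative, not the real LUSTRE_MSG_MAGIC_V2 pair:

/* Demo of swab detection: the sender writes a known magic in its native
 * byte order; reading the byte-reversed value means every multi-byte
 * field needs swapping.  Constants are illustrative. */
#include <stdio.h>
#include <stdint.h>

#define MAGIC		0x0BD00BD3u
#define MAGIC_SWABBED	0xD30BD00Bu	/* MAGIC with its bytes reversed */

static uint32_t swab32(uint32_t v)	/* models __swab32s() */
{
	return (v >> 24) | ((v >> 8) & 0xff00) |
	       ((v << 8) & 0xff0000) | (v << 24);
}

int main(void)
{
	uint32_t wire_magic = MAGIC_SWABBED;	/* as read off the wire */
	int swabbed = (wire_magic == MAGIC_SWABBED);

	if (swabbed)
		wire_magic = swab32(wire_magic);	/* back to MAGIC */
	printf("swabbed=%d magic=%#x\n", swabbed, wire_magic);
	return 0;
}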
- */ -u32 lustre_msg_bufcount(struct lustre_msg *m) -{ - switch (m->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return m->lm_bufcount; - default: - CERROR("incorrect message magic: %08x\n", m->lm_magic); - return 0; - } -} - -char *lustre_msg_string(struct lustre_msg *m, u32 index, u32 max_len) -{ - /* max_len == 0 means the string should fill the buffer */ - char *str; - u32 slen, blen; - - switch (m->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - str = lustre_msg_buf_v2(m, index, 0); - blen = lustre_msg_buflen_v2(m, index); - break; - default: - LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); - } - - if (!str) { - CERROR("can't unpack string in msg %p buffer[%d]\n", m, index); - return NULL; - } - - slen = strnlen(str, blen); - - if (slen == blen) { /* not NULL terminated */ - CERROR("can't unpack non-NULL terminated string in msg %p buffer[%d] len %d\n", - m, index, blen); - return NULL; - } - - if (max_len == 0) { - if (slen != blen - 1) { - CERROR("can't unpack short string in msg %p buffer[%d] len %d: strlen %d\n", - m, index, blen, slen); - return NULL; - } - } else if (slen > max_len) { - CERROR("can't unpack oversized string in msg %p buffer[%d] len %d strlen %d: max %d expected\n", - m, index, blen, slen, max_len); - return NULL; - } - - return str; -} - -/* Wrap up the normal fixed length cases */ -static inline void *__lustre_swab_buf(struct lustre_msg *msg, u32 index, - u32 min_size, void *swabber) -{ - void *ptr = NULL; - - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - ptr = lustre_msg_buf_v2(msg, index, min_size); - break; - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - } - - if (ptr && swabber) - ((void (*)(void *))swabber)(ptr); - - return ptr; -} - -static inline struct ptlrpc_body *lustre_msg_ptlrpc_body(struct lustre_msg *msg) -{ - return lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, - sizeof(struct ptlrpc_body_v2)); -} - -__u32 lustre_msghdr_get_flags(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - /* already in host endian */ - return msg->lm_flags; - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} -EXPORT_SYMBOL(lustre_msghdr_get_flags); - -void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - msg->lm_flags = flags; - return; - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -__u32 lustre_msg_get_flags(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (pb) - return pb->pb_flags; - - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - } - /* fall through */ - default: - /* flags might be printed in debug code while message - * uninitialized - */ - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_get_flags); - -void lustre_msg_add_flags(struct lustre_msg *msg, u32 flags) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_flags |= flags; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_add_flags); - -void lustre_msg_set_flags(struct lustre_msg *msg, u32 flags) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_flags = flags; - 
return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_clear_flags(struct lustre_msg *msg, u32 flags) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_flags &= ~(flags & MSG_GEN_FLAG_MASK); - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_clear_flags); - -__u32 lustre_msg_get_op_flags(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (pb) - return pb->pb_op_flags; - - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - } - /* fall through */ - default: - return 0; - } -} - -void lustre_msg_add_op_flags(struct lustre_msg *msg, u32 flags) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_op_flags |= flags; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_add_op_flags); - -struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return NULL; - } - return &pb->pb_handle; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return NULL; - } -} - -__u32 lustre_msg_get_type(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return PTL_RPC_MSG_ERR; - } - return pb->pb_type; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return PTL_RPC_MSG_ERR; - } -} -EXPORT_SYMBOL(lustre_msg_get_type); - -void lustre_msg_add_version(struct lustre_msg *msg, u32 version) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_version |= version; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -__u32 lustre_msg_get_opc(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return pb->pb_opc; - } - default: - CERROR("incorrect message magic: %08x (msg:%p)\n", - msg->lm_magic, msg); - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_get_opc); - -__u16 lustre_msg_get_tag(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return pb->pb_tag; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_get_tag); - -__u64 lustre_msg_get_last_committed(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return 
pb->pb_last_committed; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_get_last_committed); - -__u64 *lustre_msg_get_versions(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return NULL; - } - return pb->pb_pre_versions; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return NULL; - } -} -EXPORT_SYMBOL(lustre_msg_get_versions); - -__u64 lustre_msg_get_transno(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return pb->pb_transno; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_get_transno); - -int lustre_msg_get_status(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (pb) - return pb->pb_status; - - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - } - /* fall through */ - default: - /* status might be printed in debug code while message - * uninitialized - */ - return -EINVAL; - } -} -EXPORT_SYMBOL(lustre_msg_get_status); - -__u64 lustre_msg_get_slv(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return -EINVAL; - } - return pb->pb_slv; - } - default: - CERROR("invalid msg magic %08x\n", msg->lm_magic); - return -EINVAL; - } -} - -void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return; - } - pb->pb_slv = slv; - return; - } - default: - CERROR("invalid msg magic %x\n", msg->lm_magic); - return; - } -} - -__u32 lustre_msg_get_limit(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return -EINVAL; - } - return pb->pb_limit; - } - default: - CERROR("invalid msg magic %x\n", msg->lm_magic); - return -EINVAL; - } -} - -void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return; - } - pb->pb_limit = limit; - return; - } - default: - CERROR("invalid msg magic %08x\n", msg->lm_magic); - return; - } -} - -__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return pb->pb_conn_cnt; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} -EXPORT_SYMBOL(lustre_msg_get_conn_cnt); - -__u32 lustre_msg_get_magic(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return msg->lm_magic; - default: - CERROR("incorrect message magic: %08x\n", 
msg->lm_magic); - return 0; - } -} - -__u32 lustre_msg_get_timeout(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return pb->pb_timeout; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return -EPROTO; - } -} - -__u32 lustre_msg_get_service_time(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - if (!pb) { - CERROR("invalid msg %p: no ptlrpc body!\n", msg); - return 0; - } - return pb->pb_service_time; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} - -__u32 lustre_msg_get_cksum(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return msg->lm_cksum; - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} - -__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - __u32 crc; - unsigned int hsize = 4; - - cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, (unsigned char *)pb, - lustre_msg_buflen(msg, - MSG_PTLRPC_BODY_OFF), - NULL, 0, (unsigned char *)&crc, &hsize); - return crc; - } - default: - CERROR("incorrect message magic: %08x\n", msg->lm_magic); - return 0; - } -} - -void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_handle = *handle; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_type(struct lustre_msg *msg, __u32 type) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_type = type; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_opc = opc; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_last_xid(struct lustre_msg *msg, u64 last_xid) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_last_xid = last_xid; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_tag(struct lustre_msg *msg, __u16 tag) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_tag = tag; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_set_tag); - -void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); 
- - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_pre_versions[0] = versions[0]; - pb->pb_pre_versions[1] = versions[1]; - pb->pb_pre_versions[2] = versions[2]; - pb->pb_pre_versions[3] = versions[3]; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_set_versions); - -void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_transno = transno; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_set_transno); - -void lustre_msg_set_status(struct lustre_msg *msg, __u32 status) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_status = status; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_set_status); - -void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_conn_cnt = conn_cnt; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_timeout = timeout; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_service_time = service_time; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - __u32 opc = lustre_msg_get_opc(msg); - struct ptlrpc_body *pb; - - /* Don't set jobid for ldlm ast RPCs, they've been shrunk. - * See the comment in ptlrpc_request_pack(). 
- */ - if (!opc || opc == LDLM_BL_CALLBACK || - opc == LDLM_CP_CALLBACK || opc == LDLM_GL_CALLBACK) - return; - - pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, - sizeof(struct ptlrpc_body)); - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - - if (jobid) - memcpy(pb->pb_jobid, jobid, LUSTRE_JOBID_SIZE); - else if (pb->pb_jobid[0] == '\0') - lustre_get_jobid(pb->pb_jobid); - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} -EXPORT_SYMBOL(lustre_msg_set_jobid); - -void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - msg->lm_cksum = cksum; - return; - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void lustre_msg_set_mbits(struct lustre_msg *msg, __u64 mbits) -{ - switch (msg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: { - struct ptlrpc_body *pb = lustre_msg_ptlrpc_body(msg); - - LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); - pb->pb_mbits = mbits; - return; - } - default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); - } -} - -void ptlrpc_request_set_replen(struct ptlrpc_request *req) -{ - int count = req_capsule_filled_sizes(&req->rq_pill, RCL_SERVER); - - req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, - req->rq_pill.rc_area[RCL_SERVER]); - if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) - req->rq_reqmsg->lm_repsize = req->rq_replen; -} -EXPORT_SYMBOL(ptlrpc_request_set_replen); - -/** - * Send a remote set_info_async. - * - * This may go from client to server or server to client. - */ -int do_set_info_async(struct obd_import *imp, - int opcode, int version, - u32 keylen, void *key, - u32 vallen, void *val, - struct ptlrpc_request_set *set) -{ - struct ptlrpc_request *req; - char *tmp; - int rc; - - req = ptlrpc_request_alloc(imp, &RQF_OBD_SET_INFO); - if (!req) - return -ENOMEM; - - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_KEY, - RCL_CLIENT, keylen); - req_capsule_set_size(&req->rq_pill, &RMF_SETINFO_VAL, - RCL_CLIENT, vallen); - rc = ptlrpc_request_pack(req, version, opcode); - if (rc) { - ptlrpc_request_free(req); - return rc; - } - - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_KEY); - memcpy(tmp, key, keylen); - tmp = req_capsule_client_get(&req->rq_pill, &RMF_SETINFO_VAL); - memcpy(tmp, val, vallen); - - ptlrpc_request_set_replen(req); - - if (set) { - ptlrpc_set_add_req(set, req); - ptlrpc_check_set(NULL, set); - } else { - rc = ptlrpc_queue_wait(req); - ptlrpc_req_finished(req); - } - - return rc; -} -EXPORT_SYMBOL(do_set_info_async); - -/* byte flipping routines for all wire types declared in - * lustre_idl.h implemented here. 
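The byte-flipping routines that follow lean on the kernel's in-place __swab16s()/__swab32s()/__swab64s() helpers. A standalone model of what those helpers do, written out by hand (a simplified sketch, not the kernel implementations):

/* Userspace stand-ins for the in-place byte-flipping helpers used by
 * the swab routines below; simplified, not the kernel code. */
#include <stdio.h>
#include <stdint.h>

static void demo_swab32s(uint32_t *v)
{
	*v = ((*v & 0x000000ffu) << 24) | ((*v & 0x0000ff00u) << 8) |
	     ((*v & 0x00ff0000u) >> 8)  | ((*v & 0xff000000u) >> 24);
}

static void demo_swab64s(uint64_t *v)
{
	uint32_t hi = (uint32_t)(*v >> 32);
	uint32_t lo = (uint32_t)*v;

	demo_swab32s(&hi);
	demo_swab32s(&lo);
	*v = ((uint64_t)lo << 32) | hi;	/* the halves swap places too */
}

int main(void)
{
	uint64_t x = 0x1122334455667788ull;

	demo_swab64s(&x);
	printf("%#llx\n", (unsigned long long)x);	/* 0x8877665544332211 */
	return 0;
}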
- */ -void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) -{ - __swab32s(&b->pb_type); - __swab32s(&b->pb_version); - __swab32s(&b->pb_opc); - __swab32s(&b->pb_status); - __swab64s(&b->pb_last_xid); - __swab16s(&b->pb_tag); - __swab64s(&b->pb_last_committed); - __swab64s(&b->pb_transno); - __swab32s(&b->pb_flags); - __swab32s(&b->pb_op_flags); - __swab32s(&b->pb_conn_cnt); - __swab32s(&b->pb_timeout); - __swab32s(&b->pb_service_time); - __swab32s(&b->pb_limit); - __swab64s(&b->pb_slv); - __swab64s(&b->pb_pre_versions[0]); - __swab64s(&b->pb_pre_versions[1]); - __swab64s(&b->pb_pre_versions[2]); - __swab64s(&b->pb_pre_versions[3]); - __swab64s(&b->pb_mbits); - BUILD_BUG_ON(offsetof(typeof(*b), pb_padding0) == 0); - BUILD_BUG_ON(offsetof(typeof(*b), pb_padding1) == 0); - BUILD_BUG_ON(offsetof(typeof(*b), pb_padding64_0) == 0); - BUILD_BUG_ON(offsetof(typeof(*b), pb_padding64_1) == 0); - BUILD_BUG_ON(offsetof(typeof(*b), pb_padding64_2) == 0); - /* While we need to maintain compatibility between - * clients and servers without ptlrpc_body_v2 (< 2.3) - * do not swab any fields beyond pb_jobid, as we are - * using this swab function for both ptlrpc_body - * and ptlrpc_body_v2. - */ - BUILD_BUG_ON(offsetof(typeof(*b), pb_jobid) == 0); -} - -void lustre_swab_connect(struct obd_connect_data *ocd) -{ - __swab64s(&ocd->ocd_connect_flags); - __swab32s(&ocd->ocd_version); - __swab32s(&ocd->ocd_grant); - __swab64s(&ocd->ocd_ibits_known); - __swab32s(&ocd->ocd_index); - __swab32s(&ocd->ocd_brw_size); - /* ocd_blocksize and ocd_inodespace don't need to be swabbed because - * they are 8-byte values - */ - __swab16s(&ocd->ocd_grant_extent); - __swab32s(&ocd->ocd_unused); - __swab64s(&ocd->ocd_transno); - __swab32s(&ocd->ocd_group); - __swab32s(&ocd->ocd_cksum_types); - __swab32s(&ocd->ocd_instance); - /* Fields after ocd_cksum_types are only accessible by the receiver - * if the corresponding flag in ocd_connect_flags is set. Accessing - * any field after ocd_maxbytes on the receiver without a valid flag - * may result in out-of-bound memory access and kernel oops. 
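The warning in that comment is why the tail of lustre_swab_connect() below only swabs fields whose connect flag is set: a peer without the feature may not have sent those bytes at all. A minimal model of the flag-gated pattern; the flag value and struct are illustrative stand-ins, and the byte swap uses a GCC/Clang builtin:

/* Minimal model of flag-gated swabbing: the flags field is always
 * swabbed first, and later fields are touched only when their feature
 * flag is set.  Illustrative, not the kernel code. */
#include <stdio.h>
#include <stdint.h>

#define CONNECT_MAXBYTES 0x1ull	/* stand-in for OBD_CONNECT_MAXBYTES */

struct demo_connect_data {
	uint64_t flags;
	uint64_t maxbytes;	/* only valid with CONNECT_MAXBYTES */
};

static uint64_t swab64(uint64_t v)
{
	return __builtin_bswap64(v);	/* GCC/Clang builtin */
}

static void swab_connect(struct demo_connect_data *d)
{
	d->flags = swab64(d->flags);		/* flags swabbed first */
	if (d->flags & CONNECT_MAXBYTES)	/* gate on the feature flag */
		d->maxbytes = swab64(d->maxbytes);
}

int main(void)
{
	/* wire-order data from a byte-swapped peer */
	struct demo_connect_data d = {
		swab64(CONNECT_MAXBYTES), swab64(1ull << 40)
	};

	swab_connect(&d);
	printf("flags=%#llx maxbytes=%llu\n",
	       (unsigned long long)d.flags, (unsigned long long)d.maxbytes);
	return 0;
}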
- */ - if (ocd->ocd_connect_flags & OBD_CONNECT_MAX_EASIZE) - __swab32s(&ocd->ocd_max_easize); - if (ocd->ocd_connect_flags & OBD_CONNECT_MAXBYTES) - __swab64s(&ocd->ocd_maxbytes); - if (ocd->ocd_connect_flags & OBD_CONNECT_MULTIMODRPCS) - __swab16s(&ocd->ocd_maxmodrpcs); - BUILD_BUG_ON(!offsetof(typeof(*ocd), padding0)); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding1) == 0); - if (ocd->ocd_connect_flags & OBD_CONNECT_FLAGS2) - __swab64s(&ocd->ocd_connect_flags2); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding3) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding4) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding5) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding6) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding7) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding8) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), padding9) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), paddingA) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), paddingB) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), paddingC) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), paddingD) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), paddingE) == 0); - BUILD_BUG_ON(offsetof(typeof(*ocd), paddingF) == 0); -} - -static void lustre_swab_obdo(struct obdo *o) -{ - __swab64s(&o->o_valid); - lustre_swab_ost_id(&o->o_oi); - __swab64s(&o->o_parent_seq); - __swab64s(&o->o_size); - __swab64s(&o->o_mtime); - __swab64s(&o->o_atime); - __swab64s(&o->o_ctime); - __swab64s(&o->o_blocks); - __swab64s(&o->o_grant); - __swab32s(&o->o_blksize); - __swab32s(&o->o_mode); - __swab32s(&o->o_uid); - __swab32s(&o->o_gid); - __swab32s(&o->o_flags); - __swab32s(&o->o_nlink); - __swab32s(&o->o_parent_oid); - __swab32s(&o->o_misc); - __swab64s(&o->o_ioepoch); - __swab32s(&o->o_stripe_idx); - __swab32s(&o->o_parent_ver); - /* o_handle is opaque */ - /* o_lcookie is swabbed elsewhere */ - __swab32s(&o->o_uid_h); - __swab32s(&o->o_gid_h); - __swab64s(&o->o_data_version); - BUILD_BUG_ON(offsetof(typeof(*o), o_padding_4) == 0); - BUILD_BUG_ON(offsetof(typeof(*o), o_padding_5) == 0); - BUILD_BUG_ON(offsetof(typeof(*o), o_padding_6) == 0); -} - -void lustre_swab_obd_statfs(struct obd_statfs *os) -{ - __swab64s(&os->os_type); - __swab64s(&os->os_blocks); - __swab64s(&os->os_bfree); - __swab64s(&os->os_bavail); - __swab64s(&os->os_files); - __swab64s(&os->os_ffree); - /* no need to swab os_fsid */ - __swab32s(&os->os_bsize); - __swab32s(&os->os_namelen); - __swab64s(&os->os_maxbytes); - __swab32s(&os->os_state); - BUILD_BUG_ON(offsetof(typeof(*os), os_fprecreated) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare2) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare3) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare4) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare5) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare6) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare7) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare8) == 0); - BUILD_BUG_ON(offsetof(typeof(*os), os_spare9) == 0); -} - -void lustre_swab_obd_ioobj(struct obd_ioobj *ioo) -{ - lustre_swab_ost_id(&ioo->ioo_oid); - __swab32s(&ioo->ioo_max_brw); - __swab32s(&ioo->ioo_bufcnt); -} - -void lustre_swab_niobuf_remote(struct niobuf_remote *nbr) -{ - __swab64s(&nbr->rnb_offset); - __swab32s(&nbr->rnb_len); - __swab32s(&nbr->rnb_flags); -} - -void lustre_swab_ost_body(struct ost_body *b) -{ - lustre_swab_obdo(&b->oa); -} - -void lustre_swab_ost_last_id(u64 *id) -{ - __swab64s(id); -} - -void lustre_swab_generic_32s(__u32 *val) -{ - __swab32s(val); -} - -void lustre_swab_gl_desc(union 
ldlm_gl_desc *desc) -{ - lustre_swab_lu_fid(&desc->lquota_desc.gl_id.qid_fid); - __swab64s(&desc->lquota_desc.gl_flags); - __swab64s(&desc->lquota_desc.gl_ver); - __swab64s(&desc->lquota_desc.gl_hardlimit); - __swab64s(&desc->lquota_desc.gl_softlimit); - __swab64s(&desc->lquota_desc.gl_time); - BUILD_BUG_ON(offsetof(typeof(desc->lquota_desc), gl_pad2) == 0); -} - -void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb) -{ - __swab64s(&lvb->lvb_size); - __swab64s(&lvb->lvb_mtime); - __swab64s(&lvb->lvb_atime); - __swab64s(&lvb->lvb_ctime); - __swab64s(&lvb->lvb_blocks); -} -EXPORT_SYMBOL(lustre_swab_ost_lvb_v1); - -void lustre_swab_ost_lvb(struct ost_lvb *lvb) -{ - __swab64s(&lvb->lvb_size); - __swab64s(&lvb->lvb_mtime); - __swab64s(&lvb->lvb_atime); - __swab64s(&lvb->lvb_ctime); - __swab64s(&lvb->lvb_blocks); - __swab32s(&lvb->lvb_mtime_ns); - __swab32s(&lvb->lvb_atime_ns); - __swab32s(&lvb->lvb_ctime_ns); - __swab32s(&lvb->lvb_padding); -} -EXPORT_SYMBOL(lustre_swab_ost_lvb); - -void lustre_swab_lquota_lvb(struct lquota_lvb *lvb) -{ - __swab64s(&lvb->lvb_flags); - __swab64s(&lvb->lvb_id_may_rel); - __swab64s(&lvb->lvb_id_rel); - __swab64s(&lvb->lvb_id_qunit); - __swab64s(&lvb->lvb_pad1); -} -EXPORT_SYMBOL(lustre_swab_lquota_lvb); - -void lustre_swab_mdt_body(struct mdt_body *b) -{ - lustre_swab_lu_fid(&b->mbo_fid1); - lustre_swab_lu_fid(&b->mbo_fid2); - /* handle is opaque */ - __swab64s(&b->mbo_valid); - __swab64s(&b->mbo_size); - __swab64s(&b->mbo_mtime); - __swab64s(&b->mbo_atime); - __swab64s(&b->mbo_ctime); - __swab64s(&b->mbo_blocks); - __swab64s(&b->mbo_ioepoch); - __swab64s(&b->mbo_t_state); - __swab32s(&b->mbo_fsuid); - __swab32s(&b->mbo_fsgid); - __swab32s(&b->mbo_capability); - __swab32s(&b->mbo_mode); - __swab32s(&b->mbo_uid); - __swab32s(&b->mbo_gid); - __swab32s(&b->mbo_flags); - __swab32s(&b->mbo_rdev); - __swab32s(&b->mbo_nlink); - BUILD_BUG_ON(offsetof(typeof(*b), mbo_unused2) == 0); - __swab32s(&b->mbo_suppgid); - __swab32s(&b->mbo_eadatasize); - __swab32s(&b->mbo_aclsize); - __swab32s(&b->mbo_max_mdsize); - BUILD_BUG_ON(!offsetof(typeof(*b), mbo_unused3)); - __swab32s(&b->mbo_uid_h); - __swab32s(&b->mbo_gid_h); - BUILD_BUG_ON(offsetof(typeof(*b), mbo_padding_5) == 0); -} - -void lustre_swab_mdt_ioepoch(struct mdt_ioepoch *b) -{ - /* handle is opaque */ - /* mio_handle is opaque */ - BUILD_BUG_ON(!offsetof(typeof(*b), mio_unused1)); - BUILD_BUG_ON(!offsetof(typeof(*b), mio_unused2)); - BUILD_BUG_ON(!offsetof(typeof(*b), mio_padding)); -} - -void lustre_swab_mgs_target_info(struct mgs_target_info *mti) -{ - int i; - - __swab32s(&mti->mti_lustre_ver); - __swab32s(&mti->mti_stripe_index); - __swab32s(&mti->mti_config_ver); - __swab32s(&mti->mti_flags); - __swab32s(&mti->mti_instance); - __swab32s(&mti->mti_nid_count); - BUILD_BUG_ON(sizeof(lnet_nid_t) != sizeof(__u64)); - for (i = 0; i < MTI_NIDS_MAX; i++) - __swab64s(&mti->mti_nids[i]); -} - -void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *entry) -{ - __u8 i; - - __swab64s(&entry->mne_version); - __swab32s(&entry->mne_instance); - __swab32s(&entry->mne_index); - __swab32s(&entry->mne_length); - - /* mne_nid_(count|type) must be one byte size because we're gonna - * access it w/o swapping. */ - BUILD_BUG_ON(sizeof(entry->mne_nid_count) != sizeof(__u8)); - BUILD_BUG_ON(sizeof(entry->mne_nid_type) != sizeof(__u8)); - - /* remove this assertion if ipv6 is supported. 
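The one-byte requirement on mne_nid_count and mne_nid_type above exists because those fields are read before any swabbing: single-byte fields look the same in either byte order, while wider fields do not. A small standalone demonstration of the difference:

/* Why a count field read before swabbing must be one byte wide:
 * illustrative only. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t wide = 3;	/* a 4-byte count */
	uint8_t narrow = 3;	/* a 1-byte count */
	uint8_t *raw = (uint8_t *)&wide;
	/* interpret the 4 raw bytes in the opposite byte order */
	uint32_t wrong_order = ((uint32_t)raw[0] << 24) |
			       ((uint32_t)raw[1] << 16) |
			       ((uint32_t)raw[2] << 8) | raw[3];

	/* On a little-endian host wrong_order is 0x03000000, not 3; the
	 * one-byte field is 3 no matter which end sent it. */
	printf("wide misread: %#x, narrow: %u\n", wrong_order, narrow);
	return 0;
}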
*/ - LASSERT(entry->mne_nid_type == 0); - for (i = 0; i < entry->mne_nid_count; i++) { - BUILD_BUG_ON(sizeof(lnet_nid_t) != sizeof(__u64)); - __swab64s(&entry->u.nids[i]); - } -} -EXPORT_SYMBOL(lustre_swab_mgs_nidtbl_entry); - -void lustre_swab_mgs_config_body(struct mgs_config_body *body) -{ - __swab64s(&body->mcb_offset); - __swab32s(&body->mcb_units); - __swab16s(&body->mcb_type); -} - -void lustre_swab_mgs_config_res(struct mgs_config_res *body) -{ - __swab64s(&body->mcr_offset); - __swab64s(&body->mcr_size); -} - -static void lustre_swab_obd_dqinfo(struct obd_dqinfo *i) -{ - __swab64s(&i->dqi_bgrace); - __swab64s(&i->dqi_igrace); - __swab32s(&i->dqi_flags); - __swab32s(&i->dqi_valid); -} - -static void lustre_swab_obd_dqblk(struct obd_dqblk *b) -{ - __swab64s(&b->dqb_ihardlimit); - __swab64s(&b->dqb_isoftlimit); - __swab64s(&b->dqb_curinodes); - __swab64s(&b->dqb_bhardlimit); - __swab64s(&b->dqb_bsoftlimit); - __swab64s(&b->dqb_curspace); - __swab64s(&b->dqb_btime); - __swab64s(&b->dqb_itime); - __swab32s(&b->dqb_valid); - BUILD_BUG_ON(offsetof(typeof(*b), dqb_padding) == 0); -} - -void lustre_swab_obd_quotactl(struct obd_quotactl *q) -{ - __swab32s(&q->qc_cmd); - __swab32s(&q->qc_type); - __swab32s(&q->qc_id); - __swab32s(&q->qc_stat); - lustre_swab_obd_dqinfo(&q->qc_dqinfo); - lustre_swab_obd_dqblk(&q->qc_dqblk); -} - -void lustre_swab_fid2path(struct getinfo_fid2path *gf) -{ - lustre_swab_lu_fid(&gf->gf_fid); - __swab64s(&gf->gf_recno); - __swab32s(&gf->gf_linkno); - __swab32s(&gf->gf_pathlen); -} -EXPORT_SYMBOL(lustre_swab_fid2path); - -static void lustre_swab_fiemap_extent(struct fiemap_extent *fm_extent) -{ - __swab64s(&fm_extent->fe_logical); - __swab64s(&fm_extent->fe_physical); - __swab64s(&fm_extent->fe_length); - __swab32s(&fm_extent->fe_flags); - __swab32s(&fm_extent->fe_device); -} - -void lustre_swab_fiemap(struct fiemap *fiemap) -{ - __u32 i; - - __swab64s(&fiemap->fm_start); - __swab64s(&fiemap->fm_length); - __swab32s(&fiemap->fm_flags); - __swab32s(&fiemap->fm_mapped_extents); - __swab32s(&fiemap->fm_extent_count); - __swab32s(&fiemap->fm_reserved); - - for (i = 0; i < fiemap->fm_mapped_extents; i++) - lustre_swab_fiemap_extent(&fiemap->fm_extents[i]); -} - -void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) -{ - __swab32s(&rr->rr_opcode); - __swab32s(&rr->rr_cap); - __swab32s(&rr->rr_fsuid); - /* rr_fsuid_h is unused */ - __swab32s(&rr->rr_fsgid); - /* rr_fsgid_h is unused */ - __swab32s(&rr->rr_suppgid1); - /* rr_suppgid1_h is unused */ - __swab32s(&rr->rr_suppgid2); - /* rr_suppgid2_h is unused */ - lustre_swab_lu_fid(&rr->rr_fid1); - lustre_swab_lu_fid(&rr->rr_fid2); - __swab64s(&rr->rr_mtime); - __swab64s(&rr->rr_atime); - __swab64s(&rr->rr_ctime); - __swab64s(&rr->rr_size); - __swab64s(&rr->rr_blocks); - __swab32s(&rr->rr_bias); - __swab32s(&rr->rr_mode); - __swab32s(&rr->rr_flags); - __swab32s(&rr->rr_flags_h); - __swab32s(&rr->rr_umask); - - BUILD_BUG_ON(offsetof(typeof(*rr), rr_padding_4) == 0); -}; - -void lustre_swab_lov_desc(struct lov_desc *ld) -{ - __swab32s(&ld->ld_tgt_count); - __swab32s(&ld->ld_active_tgt_count); - __swab32s(&ld->ld_default_stripe_count); - __swab32s(&ld->ld_pattern); - __swab64s(&ld->ld_default_stripe_size); - __swab64s(&ld->ld_default_stripe_offset); - __swab32s(&ld->ld_qos_maxage); - /* uuid endian insensitive */ -} -EXPORT_SYMBOL(lustre_swab_lov_desc); - -/* This structure is always in little-endian */ -static void lustre_swab_lmv_mds_md_v1(struct lmv_mds_md_v1 *lmm1) -{ - int i; - - __swab32s(&lmm1->lmv_magic); - 
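/* lmv_stripe_count is the bound of the FID loop below, so swab it first */ - 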
__swab32s(&lmm1->lmv_stripe_count); - __swab32s(&lmm1->lmv_master_mdt_index); - __swab32s(&lmm1->lmv_hash_type); - __swab32s(&lmm1->lmv_layout_version); - for (i = 0; i < lmm1->lmv_stripe_count; i++) - lustre_swab_lu_fid(&lmm1->lmv_stripe_fids[i]); -} - -void lustre_swab_lmv_mds_md(union lmv_mds_md *lmm) -{ - switch (lmm->lmv_magic) { - case LMV_MAGIC_V1: - lustre_swab_lmv_mds_md_v1(&lmm->lmv_md_v1); - break; - default: - break; - } -} -EXPORT_SYMBOL(lustre_swab_lmv_mds_md); - -void lustre_swab_lmv_user_md(struct lmv_user_md *lum) -{ - __swab32s(&lum->lum_magic); - __swab32s(&lum->lum_stripe_count); - __swab32s(&lum->lum_stripe_offset); - __swab32s(&lum->lum_hash_type); - __swab32s(&lum->lum_type); - BUILD_BUG_ON(!offsetof(typeof(*lum), lum_padding1)); -} -EXPORT_SYMBOL(lustre_swab_lmv_user_md); - -static void lustre_swab_lmm_oi(struct ost_id *oi) -{ - __swab64s(&oi->oi.oi_id); - __swab64s(&oi->oi.oi_seq); -} - -static void lustre_swab_lov_user_md_common(struct lov_user_md_v1 *lum) -{ - __swab32s(&lum->lmm_magic); - __swab32s(&lum->lmm_pattern); - lustre_swab_lmm_oi(&lum->lmm_oi); - __swab32s(&lum->lmm_stripe_size); - __swab16s(&lum->lmm_stripe_count); - __swab16s(&lum->lmm_stripe_offset); -} - -void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum) -{ - CDEBUG(D_IOCTL, "swabbing lov_user_md v1\n"); - lustre_swab_lov_user_md_common(lum); -} -EXPORT_SYMBOL(lustre_swab_lov_user_md_v1); - -void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum) -{ - CDEBUG(D_IOCTL, "swabbing lov_user_md v3\n"); - lustre_swab_lov_user_md_common((struct lov_user_md_v1 *)lum); - /* lmm_pool_name nothing to do with char */ -} -EXPORT_SYMBOL(lustre_swab_lov_user_md_v3); - -void lustre_swab_lov_mds_md(struct lov_mds_md *lmm) -{ - CDEBUG(D_IOCTL, "swabbing lov_mds_md\n"); - __swab32s(&lmm->lmm_magic); - __swab32s(&lmm->lmm_pattern); - lustre_swab_lmm_oi(&lmm->lmm_oi); - __swab32s(&lmm->lmm_stripe_size); - __swab16s(&lmm->lmm_stripe_count); - __swab16s(&lmm->lmm_layout_gen); -} -EXPORT_SYMBOL(lustre_swab_lov_mds_md); - -void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, - int stripe_count) -{ - int i; - - for (i = 0; i < stripe_count; i++) { - lustre_swab_ost_id(&lod[i].l_ost_oi); - __swab32s(&lod[i].l_ost_gen); - __swab32s(&lod[i].l_ost_idx); - } -} -EXPORT_SYMBOL(lustre_swab_lov_user_md_objects); - -static void lustre_swab_ldlm_res_id(struct ldlm_res_id *id) -{ - int i; - - for (i = 0; i < RES_NAME_SIZE; i++) - __swab64s(&id->name[i]); -} - -static void lustre_swab_ldlm_policy_data(union ldlm_wire_policy_data *d) -{ - /* the lock data is a union and the first two fields are always an - * extent so it's ok to process an LDLM_EXTENT and LDLM_FLOCK lock - * data the same way. 
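- * The flock-only fields (lfw_owner, lfw_pid) are still swabbed - * separately below.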
- */ - __swab64s(&d->l_extent.start); - __swab64s(&d->l_extent.end); - __swab64s(&d->l_extent.gid); - __swab64s(&d->l_flock.lfw_owner); - __swab32s(&d->l_flock.lfw_pid); -} - -void lustre_swab_ldlm_intent(struct ldlm_intent *i) -{ - __swab64s(&i->opc); -} - -static void lustre_swab_ldlm_resource_desc(struct ldlm_resource_desc *r) -{ - __swab32s(&r->lr_type); - BUILD_BUG_ON(offsetof(typeof(*r), lr_padding) == 0); - lustre_swab_ldlm_res_id(&r->lr_name); -} - -static void lustre_swab_ldlm_lock_desc(struct ldlm_lock_desc *l) -{ - lustre_swab_ldlm_resource_desc(&l->l_resource); - __swab32s(&l->l_req_mode); - __swab32s(&l->l_granted_mode); - lustre_swab_ldlm_policy_data(&l->l_policy_data); -} - -void lustre_swab_ldlm_request(struct ldlm_request *rq) -{ - __swab32s(&rq->lock_flags); - lustre_swab_ldlm_lock_desc(&rq->lock_desc); - __swab32s(&rq->lock_count); - /* lock_handle[] opaque */ -} - -void lustre_swab_ldlm_reply(struct ldlm_reply *r) -{ - __swab32s(&r->lock_flags); - BUILD_BUG_ON(offsetof(typeof(*r), lock_padding) == 0); - lustre_swab_ldlm_lock_desc(&r->lock_desc); - /* lock_handle opaque */ - __swab64s(&r->lock_policy_res1); - __swab64s(&r->lock_policy_res2); -} - -/* Dump functions */ -void dump_ioo(struct obd_ioobj *ioo) -{ - CDEBUG(D_RPCTRACE, - "obd_ioobj: ioo_oid=" DOSTID ", ioo_max_brw=%#x, ioo_bufct=%d\n", - POSTID(&ioo->ioo_oid), ioo->ioo_max_brw, - ioo->ioo_bufcnt); -} - -void dump_rniobuf(struct niobuf_remote *nb) -{ - CDEBUG(D_RPCTRACE, "niobuf_remote: offset=%llu, len=%d, flags=%x\n", - nb->rnb_offset, nb->rnb_len, nb->rnb_flags); -} - -static void dump_obdo(struct obdo *oa) -{ - __u32 valid = oa->o_valid; - - CDEBUG(D_RPCTRACE, "obdo: o_valid = %08x\n", valid); - if (valid & OBD_MD_FLID) - CDEBUG(D_RPCTRACE, "obdo: id = " DOSTID "\n", POSTID(&oa->o_oi)); - if (valid & OBD_MD_FLFID) - CDEBUG(D_RPCTRACE, "obdo: o_parent_seq = %#llx\n", - oa->o_parent_seq); - if (valid & OBD_MD_FLSIZE) - CDEBUG(D_RPCTRACE, "obdo: o_size = %lld\n", oa->o_size); - if (valid & OBD_MD_FLMTIME) - CDEBUG(D_RPCTRACE, "obdo: o_mtime = %lld\n", oa->o_mtime); - if (valid & OBD_MD_FLATIME) - CDEBUG(D_RPCTRACE, "obdo: o_atime = %lld\n", oa->o_atime); - if (valid & OBD_MD_FLCTIME) - CDEBUG(D_RPCTRACE, "obdo: o_ctime = %lld\n", oa->o_ctime); - if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ - CDEBUG(D_RPCTRACE, "obdo: o_blocks = %lld\n", oa->o_blocks); - if (valid & OBD_MD_FLGRANT) - CDEBUG(D_RPCTRACE, "obdo: o_grant = %lld\n", oa->o_grant); - if (valid & OBD_MD_FLBLKSZ) - CDEBUG(D_RPCTRACE, "obdo: o_blksize = %d\n", oa->o_blksize); - if (valid & (OBD_MD_FLTYPE | OBD_MD_FLMODE)) - CDEBUG(D_RPCTRACE, "obdo: o_mode = %o\n", - oa->o_mode & ((valid & OBD_MD_FLTYPE ? S_IFMT : 0) | - (valid & OBD_MD_FLMODE ? 
~S_IFMT : 0))); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid = %u\n", oa->o_uid); - if (valid & OBD_MD_FLUID) - CDEBUG(D_RPCTRACE, "obdo: o_uid_h = %u\n", oa->o_uid_h); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid = %u\n", oa->o_gid); - if (valid & OBD_MD_FLGID) - CDEBUG(D_RPCTRACE, "obdo: o_gid_h = %u\n", oa->o_gid_h); - if (valid & OBD_MD_FLFLAGS) - CDEBUG(D_RPCTRACE, "obdo: o_flags = %x\n", oa->o_flags); - if (valid & OBD_MD_FLNLINK) - CDEBUG(D_RPCTRACE, "obdo: o_nlink = %u\n", oa->o_nlink); - else if (valid & OBD_MD_FLCKSUM) - CDEBUG(D_RPCTRACE, "obdo: o_checksum (o_nlink) = %u\n", - oa->o_nlink); - if (valid & OBD_MD_FLGENER) - CDEBUG(D_RPCTRACE, "obdo: o_parent_oid = %x\n", - oa->o_parent_oid); - if (valid & OBD_MD_FLEPOCH) - CDEBUG(D_RPCTRACE, "obdo: o_ioepoch = %lld\n", - oa->o_ioepoch); - if (valid & OBD_MD_FLFID) { - CDEBUG(D_RPCTRACE, "obdo: o_stripe_idx = %u\n", - oa->o_stripe_idx); - CDEBUG(D_RPCTRACE, "obdo: o_parent_ver = %x\n", - oa->o_parent_ver); - } - if (valid & OBD_MD_FLHANDLE) - CDEBUG(D_RPCTRACE, "obdo: o_handle = %lld\n", - oa->o_handle.cookie); -} - -void dump_ost_body(struct ost_body *ob) -{ - dump_obdo(&ob->oa); -} - -void dump_rcs(__u32 *rc) -{ - CDEBUG(D_RPCTRACE, "rmf_rcs: %d\n", *rc); -} - -static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) -{ - LASSERT(req->rq_reqmsg); - - switch (req->rq_reqmsg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); - default: - CERROR("bad lustre msg magic: %#08X\n", - req->rq_reqmsg->lm_magic); - } - return 0; -} - -static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) -{ - LASSERT(req->rq_repmsg); - - switch (req->rq_repmsg->lm_magic) { - case LUSTRE_MSG_MAGIC_V2: - return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); - default: - /* uninitialized yet */ - return 0; - } -} - -void _debug_req(struct ptlrpc_request *req, - struct libcfs_debug_msg_data *msgdata, - const char *fmt, ...) -{ - int req_ok = req->rq_reqmsg != NULL; - int rep_ok = req->rq_repmsg != NULL; - lnet_nid_t nid = LNET_NID_ANY; - va_list args; - - if (ptlrpc_req_need_swab(req)) { - req_ok = req_ok && req_ptlrpc_body_swabbed(req); - rep_ok = rep_ok && rep_ptlrpc_body_swabbed(req); - } - - if (req->rq_import && req->rq_import->imp_connection) - nid = req->rq_import->imp_connection->c_peer.nid; - else if (req->rq_export && req->rq_export->exp_connection) - nid = req->rq_export->exp_connection->c_peer.nid; - - va_start(args, fmt); - libcfs_debug_vmsg2(msgdata, fmt, args, - " req@%p x%llu/t%lld(%lld) o%d->%s@%s:%d/%d lens %d/%d e %d to %lld dl %lld ref %d fl " REQ_FLAGS_FMT "/%x/%x rc %d/%d\n", - req, req->rq_xid, req->rq_transno, - req_ok ? lustre_msg_get_transno(req->rq_reqmsg) : 0, - req_ok ? lustre_msg_get_opc(req->rq_reqmsg) : -1, - req->rq_import ? - req->rq_import->imp_obd->obd_name : - req->rq_export ? - req->rq_export->exp_client_uuid.uuid : - "", - libcfs_nid2str(nid), - req->rq_request_portal, req->rq_reply_portal, - req->rq_reqlen, req->rq_replen, - req->rq_early_count, (s64)req->rq_timedout, - (s64)req->rq_deadline, - atomic_read(&req->rq_refcount), - DEBUG_REQ_FLAGS(req), - req_ok ? lustre_msg_get_flags(req->rq_reqmsg) : -1, - rep_ok ? lustre_msg_get_flags(req->rq_repmsg) : -1, - req->rq_status, - rep_ok ? 
lustre_msg_get_status(req->rq_repmsg) : -1); - va_end(args); -} -EXPORT_SYMBOL(_debug_req); - -void lustre_swab_lustre_capa(struct lustre_capa *c) -{ - lustre_swab_lu_fid(&c->lc_fid); - __swab64s(&c->lc_opc); - __swab64s(&c->lc_uid); - __swab64s(&c->lc_gid); - __swab32s(&c->lc_flags); - __swab32s(&c->lc_keyid); - __swab32s(&c->lc_timeout); - __swab32s(&c->lc_expiry); -} - -void lustre_swab_hsm_user_state(struct hsm_user_state *state) -{ - __swab32s(&state->hus_states); - __swab32s(&state->hus_archive_id); -} - -void lustre_swab_hsm_state_set(struct hsm_state_set *hss) -{ - __swab32s(&hss->hss_valid); - __swab64s(&hss->hss_setmask); - __swab64s(&hss->hss_clearmask); - __swab32s(&hss->hss_archive_id); -} -EXPORT_SYMBOL(lustre_swab_hsm_state_set); - -static void lustre_swab_hsm_extent(struct hsm_extent *extent) -{ - __swab64s(&extent->offset); - __swab64s(&extent->length); -} - -void lustre_swab_hsm_current_action(struct hsm_current_action *action) -{ - __swab32s(&action->hca_state); - __swab32s(&action->hca_action); - lustre_swab_hsm_extent(&action->hca_location); -} - -void lustre_swab_hsm_user_item(struct hsm_user_item *hui) -{ - lustre_swab_lu_fid(&hui->hui_fid); - lustre_swab_hsm_extent(&hui->hui_extent); -} - -void lustre_swab_layout_intent(struct layout_intent *li) -{ - __swab32s(&li->li_opc); - __swab32s(&li->li_flags); - __swab64s(&li->li_start); - __swab64s(&li->li_end); -} - -void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk) -{ - lustre_swab_lu_fid(&hpk->hpk_fid); - __swab64s(&hpk->hpk_cookie); - __swab64s(&hpk->hpk_extent.offset); - __swab64s(&hpk->hpk_extent.length); - __swab16s(&hpk->hpk_flags); - __swab16s(&hpk->hpk_errval); -} - -void lustre_swab_hsm_request(struct hsm_request *hr) -{ - __swab32s(&hr->hr_action); - __swab32s(&hr->hr_archive_id); - __swab64s(&hr->hr_flags); - __swab32s(&hr->hr_itemcount); - __swab32s(&hr->hr_data_len); -} - -void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl) -{ - __swab64s(&msl->msl_flags); -} -EXPORT_SYMBOL(lustre_swab_swap_layouts); - -void lustre_swab_close_data(struct close_data *cd) -{ - lustre_swab_lu_fid(&cd->cd_fid); - __swab64s(&cd->cd_data_version); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/pers.c b/drivers/staging/lustre/lustre/ptlrpc/pers.c deleted file mode 100644 index 2466868afb9c6c5d2aad5c2d633dd45d8f897131..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/pers.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2014, 2015, Intel Corporation. 
- */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, - int mdidx) -{ - int offset = mdidx * LNET_MAX_IOV; - - BUILD_BUG_ON(PTLRPC_MAX_BRW_PAGES >= LI_POISON); - - LASSERT(mdidx < desc->bd_md_max_brw); - LASSERT(desc->bd_iov_count <= PTLRPC_MAX_BRW_PAGES); - LASSERT(!(md->options & (LNET_MD_IOVEC | LNET_MD_KIOV | - LNET_MD_PHYS))); - - md->length = max(0, desc->bd_iov_count - mdidx * LNET_MAX_IOV); - md->length = min_t(unsigned int, LNET_MAX_IOV, md->length); - - if (ptlrpc_is_bulk_desc_kiov(desc->bd_type)) { - md->options |= LNET_MD_KIOV; - if (GET_ENC_KIOV(desc)) - md->start = &BD_GET_ENC_KIOV(desc, offset); - else - md->start = &BD_GET_KIOV(desc, offset); - } else { - md->options |= LNET_MD_IOVEC; - if (GET_ENC_KVEC(desc)) - md->start = &BD_GET_ENC_KVEC(desc, offset); - else - md->start = &BD_GET_KVEC(desc, offset); - } -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/pinger.c b/drivers/staging/lustre/lustre/ptlrpc/pinger.c deleted file mode 100644 index b3297b5ce3959d08bd56cb0e2cfbf61875887b1c..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/pinger.c +++ /dev/null @@ -1,474 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/pinger.c - * - * Portal-RPC reconnection and replay operations, for use in recovery. 
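- * - * The pinger periodically sends OBD_PING requests on each registered - * import, so dead connections are noticed and recovery can be initiated.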
- */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include "ptlrpc_internal.h" - -struct mutex pinger_mutex; -static LIST_HEAD(pinger_imports); -static struct list_head timeout_list = LIST_HEAD_INIT(timeout_list); - -struct ptlrpc_request * -ptlrpc_prep_ping(struct obd_import *imp) -{ - struct ptlrpc_request *req; - - req = ptlrpc_request_alloc_pack(imp, &RQF_OBD_PING, - LUSTRE_OBD_VERSION, OBD_PING); - if (req) { - ptlrpc_request_set_replen(req); - req->rq_no_resend = 1; - req->rq_no_delay = 1; - } - return req; -} - -int ptlrpc_obd_ping(struct obd_device *obd) -{ - int rc; - struct ptlrpc_request *req; - - req = ptlrpc_prep_ping(obd->u.cli.cl_import); - if (!req) - return -ENOMEM; - - req->rq_send_state = LUSTRE_IMP_FULL; - - rc = ptlrpc_queue_wait(req); - - ptlrpc_req_finished(req); - - return rc; -} -EXPORT_SYMBOL(ptlrpc_obd_ping); - -static int ptlrpc_ping(struct obd_import *imp) -{ - struct ptlrpc_request *req; - - req = ptlrpc_prep_ping(imp); - if (!req) { - CERROR("OOM trying to ping %s->%s\n", - imp->imp_obd->obd_uuid.uuid, - obd2cli_tgt(imp->imp_obd)); - return -ENOMEM; - } - - DEBUG_REQ(D_INFO, req, "pinging %s->%s", - imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); - ptlrpcd_add_req(req); - - return 0; -} - -static void ptlrpc_update_next_ping(struct obd_import *imp, int soon) -{ - int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL; - - if (imp->imp_state == LUSTRE_IMP_DISCON) { - int dtime = max_t(int, CONNECTION_SWITCH_MIN, - AT_OFF ? 0 : - at_get(&imp->imp_at.iat_net_latency)); - time = min(time, dtime); - } - imp->imp_next_ping = jiffies + time * HZ; -} - -static inline int imp_is_deactive(struct obd_import *imp) -{ - return (imp->imp_deactive || - OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_IMP_DEACTIVE)); -} - -static inline int ptlrpc_next_reconnect(struct obd_import *imp) -{ - if (imp->imp_server_timeout) - return jiffies + obd_timeout / 2 * HZ; - else - return jiffies + obd_timeout * HZ; -} - -static long pinger_check_timeout(unsigned long time) -{ - struct timeout_item *item; - unsigned long timeout = PING_INTERVAL; - - /* The timeout list is a increase order sorted list */ - mutex_lock(&pinger_mutex); - list_for_each_entry(item, &timeout_list, ti_chain) { - int ti_timeout = item->ti_timeout; - - if (timeout > ti_timeout) - timeout = ti_timeout; - break; - } - mutex_unlock(&pinger_mutex); - - return time + timeout * HZ - jiffies; -} - -static bool ir_up; - -void ptlrpc_pinger_ir_up(void) -{ - CDEBUG(D_HA, "IR up\n"); - ir_up = true; -} -EXPORT_SYMBOL(ptlrpc_pinger_ir_up); - -void ptlrpc_pinger_ir_down(void) -{ - CDEBUG(D_HA, "IR down\n"); - ir_up = false; -} -EXPORT_SYMBOL(ptlrpc_pinger_ir_down); - -static void ptlrpc_pinger_process_import(struct obd_import *imp, - unsigned long this_ping) -{ - int level; - int force; - int force_next; - int suppress; - - spin_lock(&imp->imp_lock); - - level = imp->imp_state; - force = imp->imp_force_verify; - force_next = imp->imp_force_next_verify; - /* - * This will be used below only if the import is "FULL". - */ - suppress = ir_up && OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS); - - imp->imp_force_verify = 0; - - if (time_after_eq(imp->imp_next_ping - 5, this_ping) && - !force) { - spin_unlock(&imp->imp_lock); - return; - } - - imp->imp_force_next_verify = 0; - - spin_unlock(&imp->imp_lock); - - CDEBUG(level == LUSTRE_IMP_FULL ? 
D_INFO : D_HA, "%s->%s: level %s/%u force %u force_next %u deactive %u pingable %u suppress %u\n", - imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), - ptlrpc_import_state_name(level), level, force, force_next, - imp->imp_deactive, imp->imp_pingable, suppress); - - if (level == LUSTRE_IMP_DISCON && !imp_is_deactive(imp)) { - /* wait for a while before trying recovery again */ - imp->imp_next_ping = ptlrpc_next_reconnect(imp); - if (!imp->imp_no_pinger_recover) - ptlrpc_initiate_recovery(imp); - } else if (level != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov || - imp_is_deactive(imp)) { - CDEBUG(D_HA, "%s->%s: not pinging (in recovery or recovery disabled: %s)\n", - imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd), - ptlrpc_import_state_name(level)); - if (force) { - spin_lock(&imp->imp_lock); - imp->imp_force_verify = 1; - spin_unlock(&imp->imp_lock); - } - } else if ((imp->imp_pingable && !suppress) || force_next || force) { - ptlrpc_ping(imp); - } -} - -static struct workqueue_struct *pinger_wq; -static void ptlrpc_pinger_main(struct work_struct *ws); -static DECLARE_DELAYED_WORK(ping_work, ptlrpc_pinger_main); - -static void ptlrpc_pinger_main(struct work_struct *ws) -{ - unsigned long this_ping = jiffies; - long time_to_next_wake; - struct timeout_item *item; - struct obd_import *imp; - - do { - mutex_lock(&pinger_mutex); - list_for_each_entry(item, &timeout_list, ti_chain) { - item->ti_cb(item, item->ti_cb_data); - } - list_for_each_entry(imp, &pinger_imports, imp_pinger_chain) { - ptlrpc_pinger_process_import(imp, this_ping); - /* obd_timeout might have changed */ - if (imp->imp_pingable && imp->imp_next_ping && - time_after(imp->imp_next_ping, - this_ping + PING_INTERVAL * HZ)) - ptlrpc_update_next_ping(imp, 0); - } - mutex_unlock(&pinger_mutex); - - /* Wait until the next ping time, or until we're stopped. */ - time_to_next_wake = pinger_check_timeout(this_ping); - /* The ping sent by ptlrpc_send_rpc may get sent out - * say .01 second after this. - * ptlrpc_pinger_sending_on_import will then set the - * next ping time to next_ping + .01 sec, which means - * we will SKIP the next ping at next_ping, and the - * ping will get sent 2 timeouts from now! Beware. - */ - CDEBUG(D_INFO, "next wakeup in %ld (%ld)\n", - time_to_next_wake, - this_ping + PING_INTERVAL * HZ); - } while (time_to_next_wake <= 0); - - queue_delayed_work(pinger_wq, &ping_work, - round_jiffies_up_relative(time_to_next_wake)); -} - -int ptlrpc_start_pinger(void) -{ - if (pinger_wq) - return -EALREADY; - - pinger_wq = alloc_workqueue("ptlrpc_pinger", WQ_MEM_RECLAIM, 1); - if (!pinger_wq) { - CERROR("cannot start pinger workqueue\n"); - return -ENOMEM; - } - - queue_delayed_work(pinger_wq, &ping_work, 0); - return 0; -} - -static int ptlrpc_pinger_remove_timeouts(void); - -int ptlrpc_stop_pinger(void) -{ - int rc = 0; - - if (!pinger_wq) - return -EALREADY; - - ptlrpc_pinger_remove_timeouts(); - cancel_delayed_work_sync(&ping_work); - destroy_workqueue(pinger_wq); - pinger_wq = NULL; - - return rc; -} - -void ptlrpc_pinger_sending_on_import(struct obd_import *imp) -{ - ptlrpc_update_next_ping(imp, 0); -} - -void ptlrpc_pinger_commit_expected(struct obd_import *imp) -{ - ptlrpc_update_next_ping(imp, 1); - assert_spin_locked(&imp->imp_lock); - /* - * Avoid reading stale imp_connect_data. When not sure if pings are - * expected or not on next connection, we assume they are not and force - * one anyway to guarantee the chance of updating - * imp_peer_committed_transno. 
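- * (The flag below stays clear only when the import is FULL and has not - * negotiated PINGLESS.)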
- */ - if (imp->imp_state != LUSTRE_IMP_FULL || - OCD_HAS_FLAG(&imp->imp_connect_data, PINGLESS)) - imp->imp_force_next_verify = 1; -} - -int ptlrpc_pinger_add_import(struct obd_import *imp) -{ - if (!list_empty(&imp->imp_pinger_chain)) - return -EALREADY; - - mutex_lock(&pinger_mutex); - CDEBUG(D_HA, "adding pingable import %s->%s\n", - imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); - /* if we add to pinger we want recovery on this import */ - imp->imp_obd->obd_no_recov = 0; - ptlrpc_update_next_ping(imp, 0); - /* XXX sort, blah blah */ - list_add_tail(&imp->imp_pinger_chain, &pinger_imports); - class_import_get(imp); - - ptlrpc_pinger_wake_up(); - mutex_unlock(&pinger_mutex); - - return 0; -} -EXPORT_SYMBOL(ptlrpc_pinger_add_import); - -int ptlrpc_pinger_del_import(struct obd_import *imp) -{ - if (list_empty(&imp->imp_pinger_chain)) - return -ENOENT; - - mutex_lock(&pinger_mutex); - list_del_init(&imp->imp_pinger_chain); - CDEBUG(D_HA, "removing pingable import %s->%s\n", - imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); - /* if we remove from pinger we don't want recovery on this import */ - imp->imp_obd->obd_no_recov = 1; - class_import_put(imp); - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_pinger_del_import); - -/** - * Register a timeout callback to the pinger list, and the callback will - * be called when timeout happens. - */ -static struct timeout_item *ptlrpc_new_timeout(int time, - enum timeout_event event, - timeout_cb_t cb, void *data) -{ - struct timeout_item *ti; - - ti = kzalloc(sizeof(*ti), GFP_NOFS); - if (!ti) - return NULL; - - INIT_LIST_HEAD(&ti->ti_obd_list); - INIT_LIST_HEAD(&ti->ti_chain); - ti->ti_timeout = time; - ti->ti_event = event; - ti->ti_cb = cb; - ti->ti_cb_data = data; - - return ti; -} - -/** - * Register timeout event on the pinger thread. - * Note: the timeout list is an sorted list with increased timeout value. - */ -static struct timeout_item* -ptlrpc_pinger_register_timeout(int time, enum timeout_event event, - timeout_cb_t cb, void *data) -{ - struct timeout_item *item, *tmp; - - LASSERT(mutex_is_locked(&pinger_mutex)); - - list_for_each_entry(item, &timeout_list, ti_chain) - if (item->ti_event == event) - goto out; - - item = ptlrpc_new_timeout(time, event, cb, data); - if (item) { - list_for_each_entry_reverse(tmp, &timeout_list, ti_chain) { - if (tmp->ti_timeout < time) { - list_add(&item->ti_chain, &tmp->ti_chain); - goto out; - } - } - list_add(&item->ti_chain, &timeout_list); - } -out: - return item; -} - -/* Add a client_obd to the timeout event list, when timeout(@time) - * happens, the callback(@cb) will be called. 
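- * The timeout_item for @event is created on first use and shared by - * every obd_list registered against it.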
- */ -int ptlrpc_add_timeout_client(int time, enum timeout_event event, - timeout_cb_t cb, void *data, - struct list_head *obd_list) -{ - struct timeout_item *ti; - - mutex_lock(&pinger_mutex); - ti = ptlrpc_pinger_register_timeout(time, event, cb, data); - if (!ti) { - mutex_unlock(&pinger_mutex); - return -EINVAL; - } - list_add(obd_list, &ti->ti_obd_list); - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_add_timeout_client); - -int ptlrpc_del_timeout_client(struct list_head *obd_list, - enum timeout_event event) -{ - struct timeout_item *ti = NULL, *item; - - if (list_empty(obd_list)) - return 0; - mutex_lock(&pinger_mutex); - list_del_init(obd_list); - /* - * If there are no obds attached to the timeout event - * list, remove this timeout event from the pinger. - */ - list_for_each_entry(item, &timeout_list, ti_chain) { - if (item->ti_event == event) { - ti = item; - break; - } - } - /* @event may never have been registered, in which case ti is NULL */ - if (ti && list_empty(&ti->ti_obd_list)) { - list_del(&ti->ti_chain); - kfree(ti); - } - mutex_unlock(&pinger_mutex); - return 0; -} -EXPORT_SYMBOL(ptlrpc_del_timeout_client); - -static int ptlrpc_pinger_remove_timeouts(void) -{ - struct timeout_item *item, *tmp; - - mutex_lock(&pinger_mutex); - list_for_each_entry_safe(item, tmp, &timeout_list, ti_chain) { - LASSERT(list_empty(&item->ti_obd_list)); - list_del(&item->ti_chain); - kfree(item); - } - mutex_unlock(&pinger_mutex); - return 0; -} - -void ptlrpc_pinger_wake_up(void) -{ - mod_delayed_work(pinger_wq, &ping_work, 0); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h deleted file mode 100644 index 134b7423451958fc17fc2a56de54dfde728afe23..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_internal.h +++ /dev/null @@ -1,371 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -/* Intramodule declarations for ptlrpc. 
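Nothing here is used outside the ptlrpc module itself.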
*/ - -#ifndef PTLRPC_INTERNAL_H -#define PTLRPC_INTERNAL_H - -#include "../ldlm/ldlm_internal.h" - -struct ldlm_namespace; -struct obd_import; -struct ldlm_res_id; -struct ptlrpc_request_set; -extern int test_req_buffer_pressure; -extern struct mutex ptlrpc_all_services_mutex; -extern struct list_head ptlrpc_all_services; - -extern struct mutex ptlrpcd_mutex; -extern struct mutex pinger_mutex; - -int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait); -/* ptlrpcd.c */ -int ptlrpcd_start(struct ptlrpcd_ctl *pc); - -/* client.c */ -void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req, - unsigned int service_time); -struct ptlrpc_bulk_desc *ptlrpc_new_bulk(unsigned int nfrags, - unsigned int max_brw, - enum ptlrpc_bulk_op_type type, - unsigned int portal, - const struct ptlrpc_bulk_frag_ops *ops); -int ptlrpc_request_cache_init(void); -void ptlrpc_request_cache_fini(void); -struct ptlrpc_request *ptlrpc_request_cache_alloc(gfp_t flags); -void ptlrpc_request_cache_free(struct ptlrpc_request *req); -void ptlrpc_init_xid(void); -void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, - struct ptlrpc_request *req); -void ptlrpc_expired_set(struct ptlrpc_request_set *set); -int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set); -void ptlrpc_resend_req(struct ptlrpc_request *request); -void ptlrpc_set_bulk_mbits(struct ptlrpc_request *req); -void ptlrpc_assign_next_xid_nolock(struct ptlrpc_request *req); -__u64 ptlrpc_known_replied_xid(struct obd_import *imp); -void ptlrpc_add_unreplied(struct ptlrpc_request *req); - -/* events.c */ -int ptlrpc_init_portals(void); -void ptlrpc_exit_portals(void); - -void ptlrpc_request_handle_notconn(struct ptlrpc_request *req); -void lustre_assert_wire_constants(void); -int ptlrpc_import_in_recovery(struct obd_import *imp); -int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt); -int ptlrpc_replay_next(struct obd_import *imp, int *inflight); -void ptlrpc_initiate_recovery(struct obd_import *imp); - -int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset); -int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset); - -int ptlrpc_sysfs_register_service(struct kset *parent, - struct ptlrpc_service *svc); -void ptlrpc_sysfs_unregister_service(struct ptlrpc_service *svc); - -void ptlrpc_ldebugfs_register_service(struct dentry *debugfs_entry, - struct ptlrpc_service *svc); -void ptlrpc_lprocfs_unregister_service(struct ptlrpc_service *svc); -void ptlrpc_lprocfs_rpc_sent(struct ptlrpc_request *req, long amount); - -/* NRS */ - -/** - * NRS core object. - * - * Holds NRS core fields. - */ -struct nrs_core { - /** - * Protects nrs_core::nrs_policies, serializes external policy - * registration/unregistration, and NRS core lprocfs operations. - */ - struct mutex nrs_mutex; - /** - * List of all policy descriptors registered with NRS core; protected - * by nrs_core::nrs_mutex. 
- */ - struct list_head nrs_policies; - -}; - -extern struct nrs_core nrs_core; - -int ptlrpc_service_nrs_setup(struct ptlrpc_service *svc); -void ptlrpc_service_nrs_cleanup(struct ptlrpc_service *svc); - -void ptlrpc_nrs_req_initialize(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req, bool hp); -void ptlrpc_nrs_req_finalize(struct ptlrpc_request *req); -void ptlrpc_nrs_req_stop_nolock(struct ptlrpc_request *req); -void ptlrpc_nrs_req_add(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req, bool hp); - -struct ptlrpc_request * -ptlrpc_nrs_req_get_nolock0(struct ptlrpc_service_part *svcpt, bool hp, - bool peek, bool force); - -static inline struct ptlrpc_request * -ptlrpc_nrs_req_get_nolock(struct ptlrpc_service_part *svcpt, bool hp, - bool force) -{ - return ptlrpc_nrs_req_get_nolock0(svcpt, hp, false, force); -} - -bool ptlrpc_nrs_req_pending_nolock(struct ptlrpc_service_part *svcpt, bool hp); - -int ptlrpc_nrs_policy_control(const struct ptlrpc_service *svc, - enum ptlrpc_nrs_queue_type queue, char *name, - enum ptlrpc_nrs_ctl opc, bool single, void *arg); - -int ptlrpc_nrs_init(void); -void ptlrpc_nrs_fini(void); - -static inline bool nrs_svcpt_has_hp(const struct ptlrpc_service_part *svcpt) -{ - return svcpt->scp_nrs_hp != NULL; -} - -static inline bool nrs_svc_has_hp(const struct ptlrpc_service *svc) -{ - /** - * If the first service partition has an HP NRS head, all service - * partitions will. - */ - return nrs_svcpt_has_hp(svc->srv_parts[0]); -} - -static inline -struct ptlrpc_nrs *nrs_svcpt2nrs(struct ptlrpc_service_part *svcpt, bool hp) -{ - LASSERT(ergo(hp, nrs_svcpt_has_hp(svcpt))); - return hp ? svcpt->scp_nrs_hp : &svcpt->scp_nrs_reg; -} - -static inline int nrs_pol2cptid(const struct ptlrpc_nrs_policy *policy) -{ - return policy->pol_nrs->nrs_svcpt->scp_cpt; -} - -static inline -struct ptlrpc_service *nrs_pol2svc(struct ptlrpc_nrs_policy *policy) -{ - return policy->pol_nrs->nrs_svcpt->scp_service; -} - -static inline -struct ptlrpc_service_part *nrs_pol2svcpt(struct ptlrpc_nrs_policy *policy) -{ - return policy->pol_nrs->nrs_svcpt; -} - -static inline -struct cfs_cpt_table *nrs_pol2cptab(struct ptlrpc_nrs_policy *policy) -{ - return nrs_pol2svc(policy)->srv_cptable; -} - -static inline struct ptlrpc_nrs_resource * -nrs_request_resource(struct ptlrpc_nrs_request *nrq) -{ - LASSERT(nrq->nr_initialized); - LASSERT(!nrq->nr_finalized); - - return nrq->nr_res_ptrs[nrq->nr_res_idx]; -} - -static inline -struct ptlrpc_nrs_policy *nrs_request_policy(struct ptlrpc_nrs_request *nrq) -{ - return nrs_request_resource(nrq)->res_policy; -} - -#define NRS_LPROCFS_QUANTUM_NAME_REG "reg_quantum:" -#define NRS_LPROCFS_QUANTUM_NAME_HP "hp_quantum:" - -/** - * the maximum size of nrs_crrn_client::cc_quantum and nrs_orr_data::od_quantum. - */ -#define LPROCFS_NRS_QUANTUM_MAX 65535 - -/** - * Max valid command string is the size of the labels, plus "65535" twice, plus - * a separating space character. 
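- * For example: "reg_quantum:65535 hp_quantum:65535".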
- */ -#define LPROCFS_NRS_WR_QUANTUM_MAX_CMD \ - sizeof(NRS_LPROCFS_QUANTUM_NAME_REG __stringify(LPROCFS_NRS_QUANTUM_MAX) " " \ - NRS_LPROCFS_QUANTUM_NAME_HP __stringify(LPROCFS_NRS_QUANTUM_MAX)) - -/* ptlrpc/nrs_fifo.c */ -extern struct ptlrpc_nrs_pol_conf nrs_conf_fifo; - -/* recovd_thread.c */ - -int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); - -/* pers.c */ -void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc, - int mdcnt); - -/* pack_generic.c */ -struct ptlrpc_reply_state * -lustre_get_emerg_rs(struct ptlrpc_service_part *svcpt); -void lustre_put_emerg_rs(struct ptlrpc_reply_state *rs); - -/* pinger.c */ -int ptlrpc_start_pinger(void); -int ptlrpc_stop_pinger(void); -void ptlrpc_pinger_sending_on_import(struct obd_import *imp); -void ptlrpc_pinger_commit_expected(struct obd_import *imp); -void ptlrpc_pinger_wake_up(void); - -/* sec_null.c */ -int sptlrpc_null_init(void); -void sptlrpc_null_fini(void); - -/* sec_plain.c */ -int sptlrpc_plain_init(void); -void sptlrpc_plain_fini(void); - -/* sec_bulk.c */ -int sptlrpc_enc_pool_init(void); -void sptlrpc_enc_pool_fini(void); -int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v); - -/* sec_lproc.c */ -void sptlrpc_lproc_init(void); -void sptlrpc_lproc_fini(void); - -/* sec_gc.c */ -int sptlrpc_gc_init(void); -void sptlrpc_gc_fini(void); - -/* sec_config.c */ -void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, - enum lustre_sec_part to, - struct obd_uuid *target, - lnet_nid_t nid, - struct sptlrpc_flavor *sf); -int sptlrpc_conf_init(void); -void sptlrpc_conf_fini(void); - -/* sec.c */ -int sptlrpc_init(void); -void sptlrpc_fini(void); - -static inline bool ptlrpc_recoverable_error(int rc) -{ - return (rc == -ENOTCONN || rc == -ENODEV); -} - -static inline int tgt_mod_init(void) -{ - return 0; -} - -static inline void tgt_mod_exit(void) -{ - return; -} - -static inline void ptlrpc_reqset_put(struct ptlrpc_request_set *set) -{ - if (atomic_dec_and_test(&set->set_refcount)) - kfree(set); -} - -/** initialise ptlrpc common fields */ -static inline void ptlrpc_req_comm_init(struct ptlrpc_request *req) -{ - spin_lock_init(&req->rq_lock); - atomic_set(&req->rq_refcount, 1); - INIT_LIST_HEAD(&req->rq_list); - INIT_LIST_HEAD(&req->rq_replay_list); -} - -/** initialise client side ptlrpc request */ -static inline void ptlrpc_cli_req_init(struct ptlrpc_request *req) -{ - struct ptlrpc_cli_req *cr = &req->rq_cli; - - ptlrpc_req_comm_init(req); - - req->rq_receiving_reply = 0; - req->rq_req_unlinked = 1; - req->rq_reply_unlinked = 1; - - INIT_LIST_HEAD(&cr->cr_set_chain); - INIT_LIST_HEAD(&cr->cr_ctx_chain); - INIT_LIST_HEAD(&cr->cr_unreplied_list); - init_waitqueue_head(&cr->cr_reply_waitq); - init_waitqueue_head(&cr->cr_set_waitq); -} - -/** initialise server side ptlrpc request */ -static inline void ptlrpc_srv_req_init(struct ptlrpc_request *req) -{ - struct ptlrpc_srv_req *sr = &req->rq_srv; - - ptlrpc_req_comm_init(req); - req->rq_srv_req = 1; - INIT_LIST_HEAD(&sr->sr_exp_list); - INIT_LIST_HEAD(&sr->sr_timed_list); - INIT_LIST_HEAD(&sr->sr_hist_list); -} - -static inline bool ptlrpc_req_is_connect(struct ptlrpc_request *req) -{ - if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CONNECT || - lustre_msg_get_opc(req->rq_reqmsg) == OST_CONNECT || - lustre_msg_get_opc(req->rq_reqmsg) == MGS_CONNECT) - return true; - else - return false; -} - -static inline bool ptlrpc_req_is_disconnect(struct 
ptlrpc_request *req) -{ - if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_DISCONNECT || - lustre_msg_get_opc(req->rq_reqmsg) == OST_DISCONNECT || - lustre_msg_get_opc(req->rq_reqmsg) == MGS_DISCONNECT) - return true; - else - return false; -} - -#endif /* PTLRPC_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c deleted file mode 100644 index 5c32b657b3b53f319f89b8293824a70fd1a3ffc2..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/ptlrpc_module.c +++ /dev/null @@ -1,186 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -extern spinlock_t ptlrpc_last_xid_lock; -#if RS_DEBUG -extern spinlock_t ptlrpc_rs_debug_lock; -#endif - -DEFINE_MUTEX(ptlrpc_startup); -static int ptlrpc_active = 0; - -int ptlrpc_inc_ref(void) -{ - int rc = 0; - - mutex_lock(&ptlrpc_startup); - if (ptlrpc_active++ == 0) { - ptlrpc_put_connection_superhack = ptlrpc_connection_put; - - rc = ptlrpc_init_portals(); - if (!rc) { - rc= ptlrpc_start_pinger(); - if (rc) - ptlrpc_exit_portals(); - } - if (rc) - ptlrpc_active--; - } - mutex_unlock(&ptlrpc_startup); - return rc; -} -EXPORT_SYMBOL(ptlrpc_inc_ref); - -void ptlrpc_dec_ref(void) -{ - mutex_lock(&ptlrpc_startup); - if (--ptlrpc_active == 0) { - ptlrpc_stop_pinger(); - ptlrpc_exit_portals(); - } - mutex_unlock(&ptlrpc_startup); -} -EXPORT_SYMBOL(ptlrpc_dec_ref); - -static int __init ptlrpc_init(void) -{ - int rc, cleanup_phase = 0; - - lustre_assert_wire_constants(); -#if RS_DEBUG - spin_lock_init(&ptlrpc_rs_debug_lock); -#endif - mutex_init(&ptlrpc_all_services_mutex); - mutex_init(&pinger_mutex); - mutex_init(&ptlrpcd_mutex); - ptlrpc_init_xid(); - - rc = libcfs_setup(); - if (rc) - return rc; - - rc = req_layout_init(); - if (rc) - return rc; - - rc = ptlrpc_hr_init(); - if (rc) - return rc; - - cleanup_phase = 1; - rc = ptlrpc_request_cache_init(); - if (rc) - goto cleanup; - - cleanup_phase = 3; - - rc = ptlrpc_connection_init(); - if (rc) - goto cleanup; - - cleanup_phase = 5; - rc = ldlm_init(); - if (rc) - goto cleanup; - - cleanup_phase = 6; - rc = sptlrpc_init(); - if (rc) - goto cleanup; - - cleanup_phase = 7; - rc = ptlrpc_nrs_init(); - if (rc) - goto cleanup; - - cleanup_phase = 8; - rc = tgt_mod_init(); - if (rc) - goto cleanup; - return 
0; - -cleanup: - switch (cleanup_phase) { - case 8: - ptlrpc_nrs_fini(); - /* Fall through */ - case 7: - sptlrpc_fini(); - /* Fall through */ - case 6: - ldlm_exit(); - /* Fall through */ - case 5: - ptlrpc_connection_fini(); - /* Fall through */ - case 3: - ptlrpc_request_cache_fini(); - /* Fall through */ - case 1: - ptlrpc_hr_fini(); - req_layout_fini(); - /* Fall through */ - default: - ; - } - - return rc; -} - -static void __exit ptlrpc_exit(void) -{ - tgt_mod_exit(); - ptlrpc_nrs_fini(); - sptlrpc_fini(); - ldlm_exit(); - ptlrpc_request_cache_fini(); - ptlrpc_hr_fini(); - ptlrpc_connection_fini(); -} - -MODULE_AUTHOR("OpenSFS, Inc. "); -MODULE_DESCRIPTION("Lustre Request Processor and Lock Management"); -MODULE_VERSION(LUSTRE_VERSION_STRING); -MODULE_LICENSE("GPL"); - -module_init(ptlrpc_init); -module_exit(ptlrpc_exit); diff --git a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c b/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c deleted file mode 100644 index 531005411edfc6e537dcb2d96e695dcc4cab9511..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/ptlrpcd.c +++ /dev/null @@ -1,914 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/ptlrpcd.c - */ - -/** \defgroup ptlrpcd PortalRPC daemon - * - * ptlrpcd is a special thread with its own set where other user might add - * requests when they don't want to wait for their completion. - * PtlRPCD will take care of sending such requests and then processing their - * replies and calling completion callbacks as necessary. - * The callbacks are called directly from ptlrpcd context. - * It is important to never significantly block (esp. on RPCs!) within such - * completion handler or a deadlock might occur where ptlrpcd enters some - * callback that attempts to send another RPC and wait for it to return, - * during which time ptlrpcd is completely blocked, so e.g. if import - * fails, recovery cannot progress because connection requests are also - * sent by ptlrpcd. - * - * @{ - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include -#include -#include -#include -#include /* for obd_zombie */ -#include /* for OBD_FAIL_CHECK */ -#include /* cl_env_{get,put}() */ -#include - -#include "ptlrpc_internal.h" - -/* One of these per CPT. 
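Holds the CPT's ptlrpcd control blocks in the pd_threads[] flexible array (pd_nthreads entries).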
*/ -struct ptlrpcd { - int pd_size; - int pd_index; - int pd_cpt; - int pd_cursor; - int pd_nthreads; - int pd_groupsize; - struct ptlrpcd_ctl pd_threads[0]; -}; - -/* - * max_ptlrpcds is obsolete, but retained to ensure that the kernel - * module will load on a system where it has been tuned. - * A value other than 0 implies it was tuned, in which case the value - * is used to derive a setting for ptlrpcd_per_cpt_max. - */ -static int max_ptlrpcds; -module_param(max_ptlrpcds, int, 0644); -MODULE_PARM_DESC(max_ptlrpcds, - "Max ptlrpcd thread count to be started (obsolete)."); - -/* - * ptlrpcd_bind_policy is obsolete, but retained to ensure that - * the kernel module will load on a system where it has been tuned. - * A value other than 0 implies it was tuned, in which case the value - * is used to derive a setting for ptlrpcd_partner_group_size. - */ -static int ptlrpcd_bind_policy; -module_param(ptlrpcd_bind_policy, int, 0644); -MODULE_PARM_DESC(ptlrpcd_bind_policy, - "Ptlrpcd threads binding mode (obsolete)."); - -/* - * ptlrpcd_per_cpt_max: The maximum number of ptlrpcd threads to run - * in a CPT. - */ -static int ptlrpcd_per_cpt_max; -module_param(ptlrpcd_per_cpt_max, int, 0644); -MODULE_PARM_DESC(ptlrpcd_per_cpt_max, - "Max ptlrpcd thread count to be started per CPT."); - -/* - * ptlrpcd_partner_group_size: The desired number of threads in each - * ptlrpcd partner thread group. Default is 2, corresponding to the - * old PDB_POLICY_PAIR. A negative value makes all ptlrpcd threads in - * a CPT partners of each other. - */ -static int ptlrpcd_partner_group_size; -module_param(ptlrpcd_partner_group_size, int, 0644); -MODULE_PARM_DESC(ptlrpcd_partner_group_size, - "Number of ptlrpcd threads in a partner group."); - -/* - * ptlrpcd_cpts: A CPT string describing the CPU partitions that - * ptlrpcd threads should run on. Used to make ptlrpcd threads run on - * a subset of all CPTs. - * - * ptlrpcd_cpts=2 - * ptlrpcd_cpts=[2] - * run ptlrpcd threads only on CPT 2. - * - * ptlrpcd_cpts=0-3 - * ptlrpcd_cpts=[0-3] - * run ptlrpcd threads on CPTs 0, 1, 2, and 3. - * - * ptlrpcd_cpts=[0-3,5,7] - * run ptlrpcd threads on CPTS 0, 1, 2, 3, 5, and 7. - */ -static char *ptlrpcd_cpts; -module_param(ptlrpcd_cpts, charp, 0644); -MODULE_PARM_DESC(ptlrpcd_cpts, - "CPU partitions ptlrpcd threads should run in"); - -/* ptlrpcds_cpt_idx maps cpt numbers to an index in the ptlrpcds array. */ -static int *ptlrpcds_cpt_idx; - -/* ptlrpcds_num is the number of entries in the ptlrpcds array. */ -static int ptlrpcds_num; -static struct ptlrpcd **ptlrpcds; - -/* - * In addition to the regular thread pool above, there is a single - * global recovery thread. Recovery isn't critical for performance, - * and doesn't block, but must always be able to proceed, and it is - * possible that all normal ptlrpcd threads are blocked. Hence the - * need for a dedicated thread. - */ -static struct ptlrpcd_ctl ptlrpcd_rcv; - -struct mutex ptlrpcd_mutex; -static int ptlrpcd_users; - -void ptlrpcd_wake(struct ptlrpc_request *req) -{ - struct ptlrpc_request_set *set = req->rq_set; - - wake_up(&set->set_waitq); -} -EXPORT_SYMBOL(ptlrpcd_wake); - -static struct ptlrpcd_ctl * -ptlrpcd_select_pc(struct ptlrpc_request *req) -{ - struct ptlrpcd *pd; - int cpt; - int idx; - - if (req && req->rq_send_state != LUSTRE_IMP_FULL) - return &ptlrpcd_rcv; - - cpt = cfs_cpt_current(cfs_cpt_tab, 1); - if (!ptlrpcds_cpt_idx) - idx = cpt; - else - idx = ptlrpcds_cpt_idx[cpt]; - pd = ptlrpcds[idx]; - - /* We do not care whether it is strict load balance. 
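A racy round-robin cursor is good enough.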
*/ - idx = pd->pd_cursor; - if (++idx == pd->pd_nthreads) - idx = 0; - pd->pd_cursor = idx; - - return &pd->pd_threads[idx]; -} - -/** - * Return transferred RPCs count. - */ -static int ptlrpcd_steal_rqset(struct ptlrpc_request_set *des, - struct ptlrpc_request_set *src) -{ - struct ptlrpc_request *req, *tmp; - int rc = 0; - - spin_lock(&src->set_new_req_lock); - if (likely(!list_empty(&src->set_new_requests))) { - list_for_each_entry_safe(req, tmp, &src->set_new_requests, rq_set_chain) - req->rq_set = des; - - list_splice_init(&src->set_new_requests, &des->set_requests); - rc = atomic_read(&src->set_new_count); - atomic_add(rc, &des->set_remaining); - atomic_set(&src->set_new_count, 0); - } - spin_unlock(&src->set_new_req_lock); - return rc; -} - -/** - * Requests that are added to the ptlrpcd queue are sent via - * ptlrpcd_check->ptlrpc_check_set(). - */ -void ptlrpcd_add_req(struct ptlrpc_request *req) -{ - struct ptlrpcd_ctl *pc; - - if (req->rq_reqmsg) - lustre_msg_set_jobid(req->rq_reqmsg, NULL); - - spin_lock(&req->rq_lock); - if (req->rq_invalid_rqset) { - req->rq_invalid_rqset = 0; - spin_unlock(&req->rq_lock); - if (wait_event_idle_timeout(req->rq_set_waitq, - !req->rq_set, - 5 * HZ) == 0) - wait_event_idle(req->rq_set_waitq, - !req->rq_set); - } else if (req->rq_set) { - /* If we have a valid "rq_set", just reuse it to avoid double - * linked. - */ - LASSERT(req->rq_phase == RQ_PHASE_NEW); - LASSERT(req->rq_send_state == LUSTRE_IMP_REPLAY); - - /* ptlrpc_check_set will decrease the count */ - atomic_inc(&req->rq_set->set_remaining); - spin_unlock(&req->rq_lock); - wake_up(&req->rq_set->set_waitq); - return; - } else { - spin_unlock(&req->rq_lock); - } - - pc = ptlrpcd_select_pc(req); - - DEBUG_REQ(D_INFO, req, "add req [%p] to pc [%s:%d]", - req, pc->pc_name, pc->pc_index); - - ptlrpc_set_add_new_req(pc, req); -} -EXPORT_SYMBOL(ptlrpcd_add_req); - -static inline void ptlrpc_reqset_get(struct ptlrpc_request_set *set) -{ - atomic_inc(&set->set_refcount); -} - -/** - * Check if there is more work to do on ptlrpcd set. - * Returns 1 if yes. - */ -static int ptlrpcd_check(struct lu_env *env, struct ptlrpcd_ctl *pc) -{ - struct ptlrpc_request *req, *tmp; - struct ptlrpc_request_set *set = pc->pc_set; - int rc = 0; - int rc2; - - if (atomic_read(&set->set_new_count)) { - spin_lock(&set->set_new_req_lock); - if (likely(!list_empty(&set->set_new_requests))) { - list_splice_init(&set->set_new_requests, - &set->set_requests); - atomic_add(atomic_read(&set->set_new_count), - &set->set_remaining); - atomic_set(&set->set_new_count, 0); - /* - * Need to calculate its timeout. - */ - rc = 1; - } - spin_unlock(&set->set_new_req_lock); - } - - /* We should call lu_env_refill() before handling new requests to make - * sure that env key the requests depending on really exists. - */ - rc2 = lu_env_refill(env); - if (rc2 != 0) { - /* - * XXX This is very awkward situation, because - * execution can neither continue (request - * interpreters assume that env is set up), nor repeat - * the loop (as this potentially results in a tight - * loop of -ENOMEM's). - * - * Fortunately, refill only ever does something when - * new modules are loaded, i.e., early during boot up. 
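- * Bailing out here and retrying on a later wakeup is therefore safe.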
- */ - CERROR("Failure to refill session: %d\n", rc2); - return rc; - } - - if (atomic_read(&set->set_remaining)) - rc |= ptlrpc_check_set(env, set); - - /* NB: ptlrpc_check_set has already moved completed request at the - * head of seq::set_requests - */ - list_for_each_entry_safe(req, tmp, &set->set_requests, rq_set_chain) { - if (req->rq_phase != RQ_PHASE_COMPLETE) - break; - - list_del_init(&req->rq_set_chain); - req->rq_set = NULL; - ptlrpc_req_finished(req); - } - - if (rc == 0) { - /* - * If new requests have been added, make sure to wake up. - */ - rc = atomic_read(&set->set_new_count); - - /* If we have nothing to do, check whether we can take some - * work from our partner threads. - */ - if (rc == 0 && pc->pc_npartners > 0) { - struct ptlrpcd_ctl *partner; - struct ptlrpc_request_set *ps; - int first = pc->pc_cursor; - - do { - partner = pc->pc_partners[pc->pc_cursor++]; - if (pc->pc_cursor >= pc->pc_npartners) - pc->pc_cursor = 0; - if (!partner) - continue; - - spin_lock(&partner->pc_lock); - ps = partner->pc_set; - if (!ps) { - spin_unlock(&partner->pc_lock); - continue; - } - - ptlrpc_reqset_get(ps); - spin_unlock(&partner->pc_lock); - - if (atomic_read(&ps->set_new_count)) { - rc = ptlrpcd_steal_rqset(set, ps); - if (rc > 0) - CDEBUG(D_RPCTRACE, "transfer %d async RPCs [%d->%d]\n", - rc, partner->pc_index, - pc->pc_index); - } - ptlrpc_reqset_put(ps); - } while (rc == 0 && pc->pc_cursor != first); - } - } - - return rc; -} - -/** - * Main ptlrpcd thread. - * ptlrpc's code paths like to execute in process context, so we have this - * thread which spins on a set which contains the rpcs and sends them. - * - */ -static int ptlrpcd(void *arg) -{ - struct ptlrpcd_ctl *pc = arg; - struct ptlrpc_request_set *set; - struct lu_context ses = { 0 }; - struct lu_env env = { .le_ses = &ses }; - int rc = 0; - int exit = 0; - - unshare_fs_struct(); - if (cfs_cpt_bind(cfs_cpt_tab, pc->pc_cpt) != 0) - CWARN("Failed to bind %s on CPT %d\n", pc->pc_name, pc->pc_cpt); - - /* - * Allocate the request set after the thread has been bound - * above. This is safe because no requests will be queued - * until all ptlrpcd threads have confirmed that they have - * successfully started. - */ - set = ptlrpc_prep_set(); - if (!set) { - rc = -ENOMEM; - goto failed; - } - spin_lock(&pc->pc_lock); - pc->pc_set = set; - spin_unlock(&pc->pc_lock); - /* - * XXX So far only "client" ptlrpcd uses an environment. In - * the future, ptlrpcd thread (or a thread-set) has to given - * an argument, describing its "scope". - */ - rc = lu_context_init(&env.le_ctx, - LCT_CL_THREAD | LCT_REMEMBER | LCT_NOREF); - if (rc == 0) { - rc = lu_context_init(env.le_ses, - LCT_SESSION | LCT_REMEMBER | LCT_NOREF); - if (rc != 0) - lu_context_fini(&env.le_ctx); - } - - if (rc != 0) - goto failed; - - complete(&pc->pc_starting); - - /* - * This mainloop strongly resembles ptlrpc_set_wait() except that our - * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when - * there are requests in the set. New requests come in on the set's - * new_req_list and ptlrpcd_check() moves them into the set. - */ - do { - int timeout; - - timeout = ptlrpc_set_next_timeout(set); - - lu_context_enter(&env.le_ctx); - lu_context_enter(env.le_ses); - if (wait_event_idle_timeout(set->set_waitq, - ptlrpcd_check(&env, pc), - (timeout ? timeout : 1) * HZ) == 0) - ptlrpc_expired_set(set); - - lu_context_exit(&env.le_ctx); - lu_context_exit(env.le_ses); - - /* - * Abort inflight rpcs for forced stop case. 
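- * (LIOD_STOP ends the loop; LIOD_FORCE additionally aborts the set - * below.)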
- */ - if (test_bit(LIOD_STOP, &pc->pc_flags)) { - if (test_bit(LIOD_FORCE, &pc->pc_flags)) - ptlrpc_abort_set(set); - exit++; - } - - /* - * Let's make one more loop to make sure that ptlrpcd_check() - * copied all raced new rpcs into the set so we can kill them. - */ - } while (exit < 2); - - /* - * Wait for inflight requests to drain. - */ - if (!list_empty(&set->set_requests)) - ptlrpc_set_wait(set); - lu_context_fini(&env.le_ctx); - lu_context_fini(env.le_ses); - - complete(&pc->pc_finishing); - - return 0; -failed: - pc->pc_error = rc; - complete(&pc->pc_starting); - return rc; -} - -static void ptlrpcd_ctl_init(struct ptlrpcd_ctl *pc, int index, int cpt) -{ - pc->pc_index = index; - pc->pc_cpt = cpt; - init_completion(&pc->pc_starting); - init_completion(&pc->pc_finishing); - spin_lock_init(&pc->pc_lock); - - if (index < 0) { - /* Recovery thread. */ - snprintf(pc->pc_name, sizeof(pc->pc_name), "ptlrpcd_rcv"); - } else { - /* Regular thread. */ - snprintf(pc->pc_name, sizeof(pc->pc_name), - "ptlrpcd_%02d_%02d", cpt, index); - } -} - -/* XXX: We want multiple CPU cores to share the async RPC load. So we - * start many ptlrpcd threads. We also want to reduce the ptlrpcd - * overhead caused by data transfer cross-CPU cores. So we bind - * all ptlrpcd threads to a CPT, in the expectation that CPTs - * will be defined in a way that matches these boundaries. Within - * a CPT a ptlrpcd thread can be scheduled on any available core. - * - * Each ptlrpcd thread has its own request queue. This can cause - * response delay if the thread is already busy. To help with - * this we define partner threads: these are other threads bound - * to the same CPT which will check for work in each other's - * request queues if they have no work to do. - * - * The desired number of partner threads can be tuned by setting - * ptlrpcd_partner_group_size. The default is to create pairs of - * partner threads. - */ -static int ptlrpcd_partners(struct ptlrpcd *pd, int index) -{ - struct ptlrpcd_ctl *pc; - struct ptlrpcd_ctl **ppc; - int first; - int i; - int rc = 0; - int size; - - LASSERT(index >= 0 && index < pd->pd_nthreads); - pc = &pd->pd_threads[index]; - pc->pc_npartners = pd->pd_groupsize - 1; - - if (pc->pc_npartners <= 0) - goto out; - - size = sizeof(struct ptlrpcd_ctl *) * pc->pc_npartners; - pc->pc_partners = kzalloc_node(size, GFP_NOFS, - cfs_cpt_spread_node(cfs_cpt_tab, - pc->pc_cpt)); - if (!pc->pc_partners) { - pc->pc_npartners = 0; - rc = -ENOMEM; - goto out; - } - - first = index - index % pd->pd_groupsize; - ppc = pc->pc_partners; - for (i = first; i < first + pd->pd_groupsize; i++) { - if (i != index) - *ppc++ = &pd->pd_threads[i]; - } -out: - return rc; -} - -int ptlrpcd_start(struct ptlrpcd_ctl *pc) -{ - struct task_struct *task; - int rc = 0; - - /* - * Do not allow start second thread for one pc. 
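- * test_and_set_bit() makes the check and the claim of LIOD_START
- * atomic, so two racing callers cannot both spawn a thread for
- * the same pc.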
- */ - if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { - CWARN("Starting second thread (%s) for same pc %p\n", - pc->pc_name, pc); - return 0; - } - - task = kthread_run(ptlrpcd, pc, "%s", pc->pc_name); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - goto out_set; - } - - wait_for_completion(&pc->pc_starting); - rc = pc->pc_error; - if (rc != 0) - goto out_set; - - return 0; - -out_set: - if (pc->pc_set) { - struct ptlrpc_request_set *set = pc->pc_set; - - spin_lock(&pc->pc_lock); - pc->pc_set = NULL; - spin_unlock(&pc->pc_lock); - ptlrpc_set_destroy(set); - } - clear_bit(LIOD_START, &pc->pc_flags); - return rc; -} - -void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) -{ - if (!test_bit(LIOD_START, &pc->pc_flags)) { - CWARN("Thread for pc %p was not started\n", pc); - return; - } - - set_bit(LIOD_STOP, &pc->pc_flags); - if (force) - set_bit(LIOD_FORCE, &pc->pc_flags); - wake_up(&pc->pc_set->set_waitq); -} - -void ptlrpcd_free(struct ptlrpcd_ctl *pc) -{ - struct ptlrpc_request_set *set = pc->pc_set; - - if (!test_bit(LIOD_START, &pc->pc_flags)) { - CWARN("Thread for pc %p was not started\n", pc); - goto out; - } - - wait_for_completion(&pc->pc_finishing); - - spin_lock(&pc->pc_lock); - pc->pc_set = NULL; - spin_unlock(&pc->pc_lock); - ptlrpc_set_destroy(set); - - clear_bit(LIOD_START, &pc->pc_flags); - clear_bit(LIOD_STOP, &pc->pc_flags); - clear_bit(LIOD_FORCE, &pc->pc_flags); - -out: - if (pc->pc_npartners > 0) { - LASSERT(pc->pc_partners); - - kfree(pc->pc_partners); - pc->pc_partners = NULL; - } - pc->pc_npartners = 0; - pc->pc_error = 0; -} - -static void ptlrpcd_fini(void) -{ - int i; - int j; - - if (ptlrpcds) { - for (i = 0; i < ptlrpcds_num; i++) { - if (!ptlrpcds[i]) - break; - for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) - ptlrpcd_stop(&ptlrpcds[i]->pd_threads[j], 0); - for (j = 0; j < ptlrpcds[i]->pd_nthreads; j++) - ptlrpcd_free(&ptlrpcds[i]->pd_threads[j]); - kfree(ptlrpcds[i]); - ptlrpcds[i] = NULL; - } - kfree(ptlrpcds); - } - ptlrpcds_num = 0; - - ptlrpcd_stop(&ptlrpcd_rcv, 0); - ptlrpcd_free(&ptlrpcd_rcv); - - kfree(ptlrpcds_cpt_idx); - ptlrpcds_cpt_idx = NULL; -} - -static int ptlrpcd_init(void) -{ - int nthreads; - int groupsize; - int size; - int i; - int j; - int rc = 0; - struct cfs_cpt_table *cptable; - __u32 *cpts = NULL; - int ncpts; - int cpt; - struct ptlrpcd *pd; - - /* - * Determine the CPTs that ptlrpcd threads will run on. - */ - cptable = cfs_cpt_tab; - ncpts = cfs_cpt_number(cptable); - if (ptlrpcd_cpts) { - struct cfs_expr_list *el; - - size = ncpts * sizeof(ptlrpcds_cpt_idx[0]); - ptlrpcds_cpt_idx = kzalloc(size, GFP_KERNEL); - if (!ptlrpcds_cpt_idx) { - rc = -ENOMEM; - goto out; - } - - rc = cfs_expr_list_parse(ptlrpcd_cpts, - strlen(ptlrpcd_cpts), - 0, ncpts - 1, &el); - - if (rc != 0) { - CERROR("ptlrpcd_cpts: invalid CPT pattern string: %s", - ptlrpcd_cpts); - rc = -EINVAL; - goto out; - } - - rc = cfs_expr_list_values(el, ncpts, &cpts); - cfs_expr_list_free(el); - if (rc <= 0) { - CERROR("ptlrpcd_cpts: failed to parse CPT array %s: %d\n", - ptlrpcd_cpts, rc); - if (rc == 0) - rc = -EINVAL; - goto out; - } - - /* - * Create the cpt-to-index map. When there is no match - * in the cpt table, pick a cpt at random. This could - * be changed to take the topology of the system into - * account. 
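- * (In fact the fallback below is deterministic: cpt modulo the
- * number of selected CPTs, not random.)  For example, with
- * ncpts = 4 and ptlrpcd_cpts selecting {0, 2}, CPT 0 maps to
- * index 0 and CPT 2 to index 1 directly, while the unmatched
- * CPTs 1 and 3 both fall back to 1 % 2 = 3 % 2 = 1.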
- */ - for (cpt = 0; cpt < ncpts; cpt++) { - for (i = 0; i < rc; i++) - if (cpts[i] == cpt) - break; - if (i >= rc) - i = cpt % rc; - ptlrpcds_cpt_idx[cpt] = i; - } - - cfs_expr_list_values_free(cpts, rc); - ncpts = rc; - } - ptlrpcds_num = ncpts; - - size = ncpts * sizeof(ptlrpcds[0]); - ptlrpcds = kzalloc(size, GFP_KERNEL); - if (!ptlrpcds) { - rc = -ENOMEM; - goto out; - } - - /* - * The max_ptlrpcds parameter is obsolete, but do something - * sane if it has been tuned, and complain if - * ptlrpcd_per_cpt_max has also been tuned. - */ - if (max_ptlrpcds != 0) { - CWARN("max_ptlrpcds is obsolete.\n"); - if (ptlrpcd_per_cpt_max == 0) { - ptlrpcd_per_cpt_max = max_ptlrpcds / ncpts; - /* Round up if there is a remainder. */ - if (max_ptlrpcds % ncpts != 0) - ptlrpcd_per_cpt_max++; - CWARN("Setting ptlrpcd_per_cpt_max = %d\n", - ptlrpcd_per_cpt_max); - } else { - CWARN("ptlrpd_per_cpt_max is also set!\n"); - } - } - - /* - * The ptlrpcd_bind_policy parameter is obsolete, but do - * something sane if it has been tuned, and complain if - * ptlrpcd_partner_group_size is also tuned. - */ - if (ptlrpcd_bind_policy != 0) { - CWARN("ptlrpcd_bind_policy is obsolete.\n"); - if (ptlrpcd_partner_group_size == 0) { - switch (ptlrpcd_bind_policy) { - case 1: /* PDB_POLICY_NONE */ - case 2: /* PDB_POLICY_FULL */ - ptlrpcd_partner_group_size = 1; - break; - case 3: /* PDB_POLICY_PAIR */ - ptlrpcd_partner_group_size = 2; - break; - case 4: /* PDB_POLICY_NEIGHBOR */ -#ifdef CONFIG_NUMA - ptlrpcd_partner_group_size = -1; /* CPT */ -#else - ptlrpcd_partner_group_size = 3; /* Triplets */ -#endif - break; - default: /* Illegal value, use the default. */ - ptlrpcd_partner_group_size = 2; - break; - } - CWARN("Setting ptlrpcd_partner_group_size = %d\n", - ptlrpcd_partner_group_size); - } else { - CWARN("ptlrpcd_partner_group_size is also set!\n"); - } - } - - if (ptlrpcd_partner_group_size == 0) - ptlrpcd_partner_group_size = 2; - else if (ptlrpcd_partner_group_size < 0) - ptlrpcd_partner_group_size = -1; - else if (ptlrpcd_per_cpt_max > 0 && - ptlrpcd_partner_group_size > ptlrpcd_per_cpt_max) - ptlrpcd_partner_group_size = ptlrpcd_per_cpt_max; - - /* - * Start the recovery thread first. - */ - set_bit(LIOD_RECOVERY, &ptlrpcd_rcv.pc_flags); - ptlrpcd_ctl_init(&ptlrpcd_rcv, -1, CFS_CPT_ANY); - rc = ptlrpcd_start(&ptlrpcd_rcv); - if (rc < 0) - goto out; - - for (i = 0; i < ncpts; i++) { - if (!cpts) - cpt = i; - else - cpt = cpts[i]; - - nthreads = cfs_cpt_weight(cptable, cpt); - if (ptlrpcd_per_cpt_max > 0 && ptlrpcd_per_cpt_max < nthreads) - nthreads = ptlrpcd_per_cpt_max; - if (nthreads < 2) - nthreads = 2; - - if (ptlrpcd_partner_group_size <= 0) { - groupsize = nthreads; - } else if (nthreads <= ptlrpcd_partner_group_size) { - groupsize = nthreads; - } else { - groupsize = ptlrpcd_partner_group_size; - if (nthreads % groupsize != 0) - nthreads += groupsize - (nthreads % groupsize); - } - - size = offsetof(struct ptlrpcd, pd_threads[nthreads]); - pd = kzalloc_node(size, GFP_NOFS, - cfs_cpt_spread_node(cfs_cpt_tab, cpt)); - if (!pd) { - rc = -ENOMEM; - goto out; - } - pd->pd_size = size; - pd->pd_index = i; - pd->pd_cpt = cpt; - pd->pd_cursor = 0; - pd->pd_nthreads = nthreads; - pd->pd_groupsize = groupsize; - ptlrpcds[i] = pd; - - /* - * The ptlrpcd threads in a partner group can access - * each other's struct ptlrpcd_ctl, so these must be - * initialized before any thread is started. 
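- * For example, with nthreads = 4 and pd_groupsize = 2 the loop
- * below yields the partner pairs {0, 1} and {2, 3}:
- * ptlrpcd_partners() links thread i with every other thread in
- * [first, first + groupsize), where first = i - i % groupsize.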
- */ - for (j = 0; j < nthreads; j++) { - ptlrpcd_ctl_init(&pd->pd_threads[j], j, cpt); - rc = ptlrpcd_partners(pd, j); - if (rc < 0) - goto out; - } - - /* XXX: We start nthreads ptlrpc daemons. - * Each of them can process any non-recovery - * async RPC to improve overall async RPC - * efficiency. - * - * But there are some issues with async I/O RPCs - * and async non-I/O RPCs processed in the same - * set under some cases. The ptlrpcd may be - * blocked by some async I/O RPC(s), then will - * cause other async non-I/O RPC(s) can not be - * processed in time. - * - * Maybe we should distinguish blocked async RPCs - * from non-blocked async RPCs, and process them - * in different ptlrpcd sets to avoid unnecessary - * dependency. But how to distribute async RPCs - * load among all the ptlrpc daemons becomes - * another trouble. - */ - for (j = 0; j < nthreads; j++) { - rc = ptlrpcd_start(&pd->pd_threads[j]); - if (rc < 0) - goto out; - } - } -out: - if (rc != 0) - ptlrpcd_fini(); - - return rc; -} - -int ptlrpcd_addref(void) -{ - int rc = 0; - - mutex_lock(&ptlrpcd_mutex); - if (++ptlrpcd_users == 1) { - rc = ptlrpcd_init(); - if (rc < 0) - ptlrpcd_users--; - } - mutex_unlock(&ptlrpcd_mutex); - return rc; -} -EXPORT_SYMBOL(ptlrpcd_addref); - -void ptlrpcd_decref(void) -{ - mutex_lock(&ptlrpcd_mutex); - if (--ptlrpcd_users == 0) - ptlrpcd_fini(); - mutex_unlock(&ptlrpcd_mutex); -} -EXPORT_SYMBOL(ptlrpcd_decref); -/** @} ptlrpcd */ diff --git a/drivers/staging/lustre/lustre/ptlrpc/recover.c b/drivers/staging/lustre/lustre/ptlrpc/recover.c deleted file mode 100644 index 2ea0a7ff87dd5bbc238fd1942c5c43355bb90863..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/recover.c +++ /dev/null @@ -1,374 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/recover.c - * - * Author: Mike Shaver - */ - -#define DEBUG_SUBSYSTEM S_RPC -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -/** - * Start recovery on disconnected import. - * This is done by just attempting a connect - */ -void ptlrpc_initiate_recovery(struct obd_import *imp) -{ - CDEBUG(D_HA, "%s: starting recovery\n", obd2cli_tgt(imp->imp_obd)); - ptlrpc_connect_import(imp); -} - -/** - * Identify what request from replay list needs to be replayed next - * (based on what we have already replayed) and send it to server. 
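- * Replay order: committed open requests on imp_committed_list are
- * walked first via imp_replay_cursor; only when those are exhausted
- * is imp_replay_list scanned for the first request with a transno
- * beyond imp_last_replay_transno.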
- */ -int ptlrpc_replay_next(struct obd_import *imp, int *inflight) -{ - int rc = 0; - struct ptlrpc_request *req = NULL, *pos; - __u64 last_transno; - - *inflight = 0; - - /* It might have committed some after we last spoke, so make sure we - * get rid of them now. - */ - spin_lock(&imp->imp_lock); - imp->imp_last_transno_checked = 0; - ptlrpc_free_committed(imp); - last_transno = imp->imp_last_replay_transno; - - CDEBUG(D_HA, "import %p from %s committed %llu last %llu\n", - imp, obd2cli_tgt(imp->imp_obd), - imp->imp_peer_committed_transno, last_transno); - - /* Replay all the committed open requests on committed_list first */ - if (!list_empty(&imp->imp_committed_list)) { - req = list_last_entry(&imp->imp_committed_list, - struct ptlrpc_request, rq_replay_list); - - /* The last request on committed_list hasn't been replayed */ - if (req->rq_transno > last_transno) { - if (!imp->imp_resend_replay || - imp->imp_replay_cursor == &imp->imp_committed_list) - imp->imp_replay_cursor = imp->imp_replay_cursor->next; - - while (imp->imp_replay_cursor != - &imp->imp_committed_list) { - req = list_entry(imp->imp_replay_cursor, - struct ptlrpc_request, - rq_replay_list); - if (req->rq_transno > last_transno) - break; - - req = NULL; - LASSERT(!list_empty(imp->imp_replay_cursor)); - imp->imp_replay_cursor = - imp->imp_replay_cursor->next; - } - } else { - /* All requests on committed_list have been replayed */ - imp->imp_replay_cursor = &imp->imp_committed_list; - req = NULL; - } - } - - /* All the requests in committed list have been replayed, let's replay - * the imp_replay_list - */ - if (!req) { - struct ptlrpc_request *tmp; - list_for_each_entry_safe(tmp, pos, &imp->imp_replay_list, - rq_replay_list) { - if (tmp->rq_transno > last_transno) { - req = tmp; - break; - } - } - } - - /* If need to resend the last sent transno (because a reconnect - * has occurred), then stop on the matching req and send it again. - * If, however, the last sent transno has been committed then we - * continue replay from the next request. - */ - if (req && imp->imp_resend_replay) - lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); - - /* The resend replay request may have been removed from the - * unreplied list. - */ - if (req && imp->imp_resend_replay && - list_empty(&req->rq_unreplied_list)) { - ptlrpc_add_unreplied(req); - imp->imp_known_replied_xid = ptlrpc_known_replied_xid(imp); - } - - imp->imp_resend_replay = 0; - spin_unlock(&imp->imp_lock); - - if (req) { - /* The request should have been added back in unreplied list - * by ptlrpc_prepare_replay(). - */ - LASSERT(!list_empty(&req->rq_unreplied_list)); - - rc = ptlrpc_replay_req(req); - if (rc) { - CERROR("recovery replay error %d for req %llu\n", - rc, req->rq_xid); - return rc; - } - *inflight = 1; - } - return rc; -} - -/** - * Schedule resending of request on sending_list. This is done after - * we completed replaying of requests and locks. - */ -int ptlrpc_resend(struct obd_import *imp) -{ - struct ptlrpc_request *req, *next; - - /* As long as we're in recovery, nothing should be added to the sending - * list, so we don't need to hold the lock during this iteration and - * resend process. - */ - /* Well... what if lctl recover is called twice at the same time? 
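- * The checks below cover that case: the list walk is in fact done
- * under imp_lock, so racing callers are serialized, and a caller
- * that arrives once the import has left LUSTRE_IMP_RECOVER returns
- * -1 without resending anything.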
- */ - spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_RECOVER) { - spin_unlock(&imp->imp_lock); - return -1; - } - - list_for_each_entry_safe(req, next, &imp->imp_sending_list, rq_list) { - LASSERTF((long)req > PAGE_SIZE && req != LP_POISON, - "req %p bad\n", req); - LASSERTF(req->rq_type != LI_POISON, "req %p freed\n", req); - - /* - * If the request is allowed to be sent during replay and it - * is not timeout yet, then it does not need to be resent. - */ - if (!ptlrpc_no_resend(req) && - (req->rq_timedout || !req->rq_allow_replay)) - ptlrpc_resend_req(req); - } - spin_unlock(&imp->imp_lock); - - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT, 2); - return 0; -} - -/** - * Go through all requests in delayed list and wake their threads - * for resending - */ -void ptlrpc_wake_delayed(struct obd_import *imp) -{ - struct ptlrpc_request *req, *pos; - - spin_lock(&imp->imp_lock); - list_for_each_entry_safe(req, pos, &imp->imp_delayed_list, rq_list) { - DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); - ptlrpc_client_wake_req(req); - } - spin_unlock(&imp->imp_lock); -} - -void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req) -{ - struct obd_import *imp = failed_req->rq_import; - - CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n", - imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid); - - if (ptlrpc_set_import_discon(imp, - lustre_msg_get_conn_cnt(failed_req->rq_reqmsg))) { - if (!imp->imp_replayable) { - CDEBUG(D_HA, "import %s@%s for %s not replayable, auto-deactivating\n", - obd2cli_tgt(imp->imp_obd), - imp->imp_connection->c_remote_uuid.uuid, - imp->imp_obd->obd_name); - ptlrpc_deactivate_import(imp); - } - /* to control recovery via lctl {disable|enable}_recovery */ - if (imp->imp_deactive == 0) - ptlrpc_connect_import(imp); - } - - /* Wait for recovery to complete and resend. If evicted, then - * this request will be errored out later. - */ - spin_lock(&failed_req->rq_lock); - if (!failed_req->rq_no_resend) - failed_req->rq_resend = 1; - spin_unlock(&failed_req->rq_lock); -} - -/** - * Administratively active/deactive a client. - * This should only be called by the ioctl interface, currently - * - the lctl deactivate and activate commands - * - echo 0/1 >> /sys/fs/lustre/osc/XXX/active - * - client umount -f (ll_umount_begin) - */ -int ptlrpc_set_import_active(struct obd_import *imp, int active) -{ - struct obd_device *obd = imp->imp_obd; - int rc = 0; - - LASSERT(obd); - - /* When deactivating, mark import invalid, and abort in-flight - * requests. 
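- * Deactivation is ordered: imp_deactive is set first (under
- * imp_lock), the IMP_EVENT_DEACTIVATE event is delivered, and only
- * then is ptlrpc_invalidate_import() called on the import.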
- */ - if (!active) { - LCONSOLE_WARN("setting import %s INACTIVE by administrator request\n", - obd2cli_tgt(imp->imp_obd)); - - /* set before invalidate to avoid messages about imp_inval - * set without imp_deactive in ptlrpc_import_delay_req - */ - spin_lock(&imp->imp_lock); - imp->imp_deactive = 1; - spin_unlock(&imp->imp_lock); - - obd_import_event(imp->imp_obd, imp, IMP_EVENT_DEACTIVATE); - - ptlrpc_invalidate_import(imp); - } - - /* When activating, mark import valid, and attempt recovery */ - if (active) { - CDEBUG(D_HA, "setting import %s VALID\n", - obd2cli_tgt(imp->imp_obd)); - - spin_lock(&imp->imp_lock); - imp->imp_deactive = 0; - spin_unlock(&imp->imp_lock); - obd_import_event(imp->imp_obd, imp, IMP_EVENT_ACTIVATE); - - rc = ptlrpc_recover_import(imp, NULL, 0); - } - - return rc; -} -EXPORT_SYMBOL(ptlrpc_set_import_active); - -/* Attempt to reconnect an import */ -int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async) -{ - int rc = 0; - - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_NEW || imp->imp_deactive || - atomic_read(&imp->imp_inval_count)) - rc = -EINVAL; - spin_unlock(&imp->imp_lock); - if (rc) - goto out; - - /* force import to be disconnected. */ - ptlrpc_set_import_discon(imp, 0); - - if (new_uuid) { - struct obd_uuid uuid; - - /* intruct import to use new uuid */ - obd_str2uuid(&uuid, new_uuid); - rc = import_set_conn_priority(imp, &uuid); - if (rc) - goto out; - } - - /* Check if reconnect is already in progress */ - spin_lock(&imp->imp_lock); - if (imp->imp_state != LUSTRE_IMP_DISCON) { - imp->imp_force_verify = 1; - rc = -EALREADY; - } - spin_unlock(&imp->imp_lock); - if (rc) - goto out; - - rc = ptlrpc_connect_import(imp); - if (rc) - goto out; - - if (!async) { - CDEBUG(D_HA, "%s: recovery started, waiting %u seconds\n", - obd2cli_tgt(imp->imp_obd), obd_timeout); - - rc = wait_event_idle_timeout(imp->imp_recovery_waitq, - !ptlrpc_import_in_recovery(imp), - obd_timeout * HZ); - CDEBUG(D_HA, "%s: recovery finished\n", - obd2cli_tgt(imp->imp_obd)); - rc = rc ? 0 : -ETIMEDOUT; - } - -out: - return rc; -} -EXPORT_SYMBOL(ptlrpc_recover_import); - -int ptlrpc_import_in_recovery(struct obd_import *imp) -{ - int in_recovery = 1; - - spin_lock(&imp->imp_lock); - if (imp->imp_state == LUSTRE_IMP_FULL || - imp->imp_state == LUSTRE_IMP_CLOSED || - imp->imp_state == LUSTRE_IMP_DISCON || - imp->imp_obd->obd_no_recov) - in_recovery = 0; - spin_unlock(&imp->imp_lock); - - return in_recovery; -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c deleted file mode 100644 index e193f3346e6fe87c163ebf6be7e90eb6e5effec3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec.c +++ /dev/null @@ -1,2379 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/sec.c - * - * Author: Eric Mei - */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -/*********************************************** - * policy registers * - ***********************************************/ - -static rwlock_t policy_lock; -static struct ptlrpc_sec_policy *policies[SPTLRPC_POLICY_MAX] = { - NULL, -}; - -int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy) -{ - __u16 number = policy->sp_policy; - - LASSERT(policy->sp_name); - LASSERT(policy->sp_cops); - LASSERT(policy->sp_sops); - - if (number >= SPTLRPC_POLICY_MAX) - return -EINVAL; - - write_lock(&policy_lock); - if (unlikely(policies[number])) { - write_unlock(&policy_lock); - return -EALREADY; - } - policies[number] = policy; - write_unlock(&policy_lock); - - CDEBUG(D_SEC, "%s: registered\n", policy->sp_name); - return 0; -} -EXPORT_SYMBOL(sptlrpc_register_policy); - -int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy) -{ - __u16 number = policy->sp_policy; - - LASSERT(number < SPTLRPC_POLICY_MAX); - - write_lock(&policy_lock); - if (unlikely(!policies[number])) { - write_unlock(&policy_lock); - CERROR("%s: already unregistered\n", policy->sp_name); - return -EINVAL; - } - - LASSERT(policies[number] == policy); - policies[number] = NULL; - write_unlock(&policy_lock); - - CDEBUG(D_SEC, "%s: unregistered\n", policy->sp_name); - return 0; -} -EXPORT_SYMBOL(sptlrpc_unregister_policy); - -static -struct ptlrpc_sec_policy *sptlrpc_wireflavor2policy(__u32 flavor) -{ - static DEFINE_MUTEX(load_mutex); - static atomic_t loaded = ATOMIC_INIT(0); - struct ptlrpc_sec_policy *policy; - __u16 number = SPTLRPC_FLVR_POLICY(flavor); - __u16 flag = 0; - - if (number >= SPTLRPC_POLICY_MAX) - return NULL; - - while (1) { - read_lock(&policy_lock); - policy = policies[number]; - if (policy && !try_module_get(policy->sp_owner)) - policy = NULL; - if (!policy) - flag = atomic_read(&loaded); - read_unlock(&policy_lock); - - if (policy || flag != 0 || - number != SPTLRPC_POLICY_GSS) - break; - - /* try to load gss module, once */ - mutex_lock(&load_mutex); - if (atomic_read(&loaded) == 0) { - if (request_module("ptlrpc_gss") == 0) - CDEBUG(D_SEC, - "module ptlrpc_gss loaded on demand\n"); - else - CERROR("Unable to load module ptlrpc_gss\n"); - - atomic_set(&loaded, 1); - } - mutex_unlock(&load_mutex); - } - - return policy; -} - -__u32 sptlrpc_name2flavor_base(const char *name) -{ - if (!strcmp(name, "null")) - return SPTLRPC_FLVR_NULL; - if (!strcmp(name, "plain")) - return SPTLRPC_FLVR_PLAIN; - if (!strcmp(name, "krb5n")) - return SPTLRPC_FLVR_KRB5N; - if (!strcmp(name, "krb5a")) - return SPTLRPC_FLVR_KRB5A; - if (!strcmp(name, "krb5i")) - return SPTLRPC_FLVR_KRB5I; - if (!strcmp(name, "krb5p")) - return SPTLRPC_FLVR_KRB5P; - - return SPTLRPC_FLVR_INVALID; -} -EXPORT_SYMBOL(sptlrpc_name2flavor_base); - -const char *sptlrpc_flavor2name_base(__u32 flvr) -{ - __u32 base = 
SPTLRPC_FLVR_BASE(flvr); - - if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) - return "null"; - else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN)) - return "plain"; - else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5N)) - return "krb5n"; - else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5A)) - return "krb5a"; - else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5I)) - return "krb5i"; - else if (base == SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_KRB5P)) - return "krb5p"; - - CERROR("invalid wire flavor 0x%x\n", flvr); - return "invalid"; -} -EXPORT_SYMBOL(sptlrpc_flavor2name_base); - -char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, - char *buf, int bufsize) -{ - if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) - snprintf(buf, bufsize, "hash:%s", - sptlrpc_get_hash_name(sf->u_bulk.hash.hash_alg)); - else - snprintf(buf, bufsize, "%s", - sptlrpc_flavor2name_base(sf->sf_rpc)); - - buf[bufsize - 1] = '\0'; - return buf; -} -EXPORT_SYMBOL(sptlrpc_flavor2name_bulk); - -char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize) -{ - strlcpy(buf, sptlrpc_flavor2name_base(sf->sf_rpc), bufsize); - - /* - * currently we don't support customized bulk specification for - * flavors other than plain - */ - if (SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN) { - char bspec[16]; - - bspec[0] = '-'; - sptlrpc_flavor2name_bulk(sf, &bspec[1], sizeof(bspec) - 1); - strlcat(buf, bspec, bufsize); - } - - return buf; -} -EXPORT_SYMBOL(sptlrpc_flavor2name); - -static char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize) -{ - buf[0] = '\0'; - - if (flags & PTLRPC_SEC_FL_REVERSE) - strlcat(buf, "reverse,", bufsize); - if (flags & PTLRPC_SEC_FL_ROOTONLY) - strlcat(buf, "rootonly,", bufsize); - if (flags & PTLRPC_SEC_FL_UDESC) - strlcat(buf, "udesc,", bufsize); - if (flags & PTLRPC_SEC_FL_BULK) - strlcat(buf, "bulk,", bufsize); - if (buf[0] == '\0') - strlcat(buf, "-,", bufsize); - - return buf; -} - -/************************************************** - * client context APIs * - **************************************************/ - -static -struct ptlrpc_cli_ctx *get_my_ctx(struct ptlrpc_sec *sec) -{ - struct vfs_cred vcred; - int create = 1, remove_dead = 1; - - LASSERT(sec); - LASSERT(sec->ps_policy->sp_cops->lookup_ctx); - - if (sec->ps_flvr.sf_flags & (PTLRPC_SEC_FL_REVERSE | - PTLRPC_SEC_FL_ROOTONLY)) { - vcred.vc_uid = 0; - vcred.vc_gid = 0; - if (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE) { - create = 0; - remove_dead = 0; - } - } else { - vcred.vc_uid = from_kuid(&init_user_ns, current_uid()); - vcred.vc_gid = from_kgid(&init_user_ns, current_gid()); - } - - return sec->ps_policy->sp_cops->lookup_ctx(sec, &vcred, - create, remove_dead); -} - -struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx) -{ - atomic_inc(&ctx->cc_refcount); - return ctx; -} -EXPORT_SYMBOL(sptlrpc_cli_ctx_get); - -void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync) -{ - struct ptlrpc_sec *sec = ctx->cc_sec; - - LASSERT(sec); - LASSERT_ATOMIC_POS(&ctx->cc_refcount); - - if (!atomic_dec_and_test(&ctx->cc_refcount)) - return; - - sec->ps_policy->sp_cops->release_ctx(sec, ctx, sync); -} -EXPORT_SYMBOL(sptlrpc_cli_ctx_put); - -static int import_sec_check_expire(struct obd_import *imp) -{ - int adapt = 0; - - spin_lock(&imp->imp_lock); - if (imp->imp_sec_expire && - imp->imp_sec_expire < ktime_get_real_seconds()) { - adapt = 1; - imp->imp_sec_expire = 0; - } - spin_unlock(&imp->imp_lock); - - if (!adapt) - return 0; - - CDEBUG(D_SEC, "found delayed sec 
adapt expired, do it now\n"); - return sptlrpc_import_sec_adapt(imp, NULL, NULL); -} - -/** - * Get and validate the client side ptlrpc security facilities from - * \a imp. There is a race condition on client reconnect when the import is - * being destroyed while there are outstanding client bound requests. In - * this case do not output any error messages if import secuity is not - * found. - * - * \param[in] imp obd import associated with client - * \param[out] sec client side ptlrpc security - * - * \retval 0 if security retrieved successfully - * \retval -ve errno if there was a problem - */ -static int import_sec_validate_get(struct obd_import *imp, - struct ptlrpc_sec **sec) -{ - int rc; - - if (unlikely(imp->imp_sec_expire)) { - rc = import_sec_check_expire(imp); - if (rc) - return rc; - } - - *sec = sptlrpc_import_sec_ref(imp); - if (!*sec) { - CERROR("import %p (%s) with no sec\n", - imp, ptlrpc_import_state_name(imp->imp_state)); - return -EACCES; - } - - if (unlikely((*sec)->ps_dying)) { - CERROR("attempt to use dying sec %p\n", sec); - sptlrpc_sec_put(*sec); - return -EACCES; - } - - return 0; -} - -/** - * Given a \a req, find or allocate a appropriate context for it. - * \pre req->rq_cli_ctx == NULL. - * - * \retval 0 succeed, and req->rq_cli_ctx is set. - * \retval -ev error number, and req->rq_cli_ctx == NULL. - */ -int sptlrpc_req_get_ctx(struct ptlrpc_request *req) -{ - struct obd_import *imp = req->rq_import; - struct ptlrpc_sec *sec; - int rc; - - LASSERT(!req->rq_cli_ctx); - LASSERT(imp); - - rc = import_sec_validate_get(imp, &sec); - if (rc) - return rc; - - req->rq_cli_ctx = get_my_ctx(sec); - - sptlrpc_sec_put(sec); - - if (!req->rq_cli_ctx) { - CERROR("req %p: fail to get context\n", req); - return -ECONNREFUSED; - } - - return 0; -} - -/** - * Drop the context for \a req. - * \pre req->rq_cli_ctx != NULL. - * \post req->rq_cli_ctx == NULL. - * - * If \a sync == 0, this function should return quickly without sleep; - * otherwise it might trigger and wait for the whole process of sending - * an context-destroying rpc to server. - */ -void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync) -{ - LASSERT(req); - LASSERT(req->rq_cli_ctx); - - /* request might be asked to release earlier while still - * in the context waiting list. 
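- * (i.e. still chained on ctx->cc_req_list via rq_ctx_chain); in
- * that case unlink it under cc_lock before the context reference
- * is dropped.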
- */ - if (!list_empty(&req->rq_ctx_chain)) { - spin_lock(&req->rq_cli_ctx->cc_lock); - list_del_init(&req->rq_ctx_chain); - spin_unlock(&req->rq_cli_ctx->cc_lock); - } - - sptlrpc_cli_ctx_put(req->rq_cli_ctx, sync); - req->rq_cli_ctx = NULL; -} - -static -int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, - struct ptlrpc_cli_ctx *oldctx, - struct ptlrpc_cli_ctx *newctx) -{ - struct sptlrpc_flavor old_flvr; - char *reqmsg = NULL; /* to workaround old gcc */ - int reqmsg_size; - int rc = 0; - - LASSERT(req->rq_reqmsg); - LASSERT(req->rq_reqlen); - LASSERT(req->rq_replen); - - CDEBUG(D_SEC, "req %p: switch ctx %p(%u->%s) -> %p(%u->%s), switch sec %p(%s) -> %p(%s)\n", - req, - oldctx, oldctx->cc_vcred.vc_uid, sec2target_str(oldctx->cc_sec), - newctx, newctx->cc_vcred.vc_uid, sec2target_str(newctx->cc_sec), - oldctx->cc_sec, oldctx->cc_sec->ps_policy->sp_name, - newctx->cc_sec, newctx->cc_sec->ps_policy->sp_name); - - /* save flavor */ - old_flvr = req->rq_flvr; - - /* save request message */ - reqmsg_size = req->rq_reqlen; - if (reqmsg_size != 0) { - reqmsg = kvzalloc(reqmsg_size, GFP_NOFS); - if (!reqmsg) - return -ENOMEM; - memcpy(reqmsg, req->rq_reqmsg, reqmsg_size); - } - - /* release old req/rep buf */ - req->rq_cli_ctx = oldctx; - sptlrpc_cli_free_reqbuf(req); - sptlrpc_cli_free_repbuf(req); - req->rq_cli_ctx = newctx; - - /* recalculate the flavor */ - sptlrpc_req_set_flavor(req, 0); - - /* alloc new request buffer - * we don't need to alloc reply buffer here, leave it to the - * rest procedure of ptlrpc - */ - if (reqmsg_size != 0) { - rc = sptlrpc_cli_alloc_reqbuf(req, reqmsg_size); - if (!rc) { - LASSERT(req->rq_reqmsg); - memcpy(req->rq_reqmsg, reqmsg, reqmsg_size); - } else { - CWARN("failed to alloc reqbuf: %d\n", rc); - req->rq_flvr = old_flvr; - } - - kvfree(reqmsg); - } - return rc; -} - -/** - * If current context of \a req is dead somehow, e.g. we just switched flavor - * thus marked original contexts dead, we'll find a new context for it. if - * no switch is needed, \a req will end up with the same context. - * - * \note a request must have a context, to keep other parts of code happy. - * In any case of failure during the switching, we must restore the old one. - */ -static int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) -{ - struct ptlrpc_cli_ctx *oldctx = req->rq_cli_ctx; - struct ptlrpc_cli_ctx *newctx; - int rc; - - LASSERT(oldctx); - - sptlrpc_cli_ctx_get(oldctx); - sptlrpc_req_put_ctx(req, 0); - - rc = sptlrpc_req_get_ctx(req); - if (unlikely(rc)) { - LASSERT(!req->rq_cli_ctx); - - /* restore old ctx */ - req->rq_cli_ctx = oldctx; - return rc; - } - - newctx = req->rq_cli_ctx; - LASSERT(newctx); - - if (unlikely(newctx == oldctx && - test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags))) { - /* - * still get the old dead ctx, usually means system too busy - */ - CDEBUG(D_SEC, - "ctx (%p, fl %lx) doesn't switch, relax a little bit\n", - newctx, newctx->cc_flags); - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(msecs_to_jiffies(MSEC_PER_SEC)); - } else if (unlikely(!test_bit(PTLRPC_CTX_UPTODATE_BIT, &newctx->cc_flags))) { - /* - * new ctx not up to date yet - */ - CDEBUG(D_SEC, - "ctx (%p, fl %lx) doesn't switch, not up to date yet\n", - newctx, newctx->cc_flags); - } else { - /* - * it's possible newctx == oldctx if we're switching - * subflavor with the same sec. 
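- * sptlrpc_req_ctx_switch() copes with that case too: it saves the
- * request message, frees the buffers owned by the old context,
- * recomputes the flavor and reallocates the buffers under the new
- * context.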
- */ - rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); - if (rc) { - /* restore old ctx */ - sptlrpc_req_put_ctx(req, 0); - req->rq_cli_ctx = oldctx; - return rc; - } - - LASSERT(req->rq_cli_ctx == newctx); - } - - sptlrpc_cli_ctx_put(oldctx, 1); - return 0; -} - -static -int ctx_check_refresh(struct ptlrpc_cli_ctx *ctx) -{ - if (cli_ctx_is_refreshed(ctx)) - return 1; - return 0; -} - -static -int ctx_refresh_timeout(struct ptlrpc_request *req) -{ - int rc; - - /* conn_cnt is needed in expire_one_request */ - lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); - - rc = ptlrpc_expire_one_request(req, 1); - /* if we started recovery, we should mark this ctx dead; otherwise - * in case of lgssd died nobody would retire this ctx, following - * connecting will still find the same ctx thus cause deadlock. - * there's an assumption that expire time of the request should be - * later than the context refresh expire time. - */ - if (rc == 0) - req->rq_cli_ctx->cc_ops->force_die(req->rq_cli_ctx, 0); - return rc; -} - -static -void ctx_refresh_interrupt(struct ptlrpc_request *req) -{ - spin_lock(&req->rq_lock); - req->rq_intr = 1; - spin_unlock(&req->rq_lock); -} - -static -void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) -{ - spin_lock(&ctx->cc_lock); - if (!list_empty(&req->rq_ctx_chain)) - list_del_init(&req->rq_ctx_chain); - spin_unlock(&ctx->cc_lock); -} - -/** - * To refresh the context of \req, if it's not up-to-date. - * \param timeout - * - < 0: don't wait - * - = 0: wait until success or fatal error occur - * - > 0: timeout value (in seconds) - * - * The status of the context could be subject to be changed by other threads - * at any time. We allow this race, but once we return with 0, the caller will - * suppose it's uptodated and keep using it until the owning rpc is done. - * - * \retval 0 only if the context is uptodated. - * \retval -ev error number. - */ -int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - struct ptlrpc_sec *sec; - int rc; - - LASSERT(ctx); - - if (req->rq_ctx_init || req->rq_ctx_fini) - return 0; - - /* - * during the process a request's context might change type even - * (e.g. from gss ctx to null ctx), so each loop we need to re-check - * everything - */ -again: - rc = import_sec_validate_get(req->rq_import, &sec); - if (rc) - return rc; - - if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) { - CDEBUG(D_SEC, "req %p: flavor has changed %x -> %x\n", - req, req->rq_flvr.sf_rpc, sec->ps_flvr.sf_rpc); - req_off_ctx_list(req, ctx); - sptlrpc_req_replace_dead_ctx(req); - ctx = req->rq_cli_ctx; - } - sptlrpc_sec_put(sec); - - if (cli_ctx_is_eternal(ctx)) - return 0; - - if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { - LASSERT(ctx->cc_ops->refresh); - ctx->cc_ops->refresh(ctx); - } - LASSERT(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags) == 0); - - LASSERT(ctx->cc_ops->validate); - if (ctx->cc_ops->validate(ctx) == 0) { - req_off_ctx_list(req, ctx); - return 0; - } - - if (unlikely(test_bit(PTLRPC_CTX_ERROR_BIT, &ctx->cc_flags))) { - spin_lock(&req->rq_lock); - req->rq_err = 1; - spin_unlock(&req->rq_lock); - req_off_ctx_list(req, ctx); - return -EPERM; - } - - /* - * There's a subtle issue for resending RPCs, suppose following - * situation: - * 1. the request was sent to server. - * 2. recovery was kicked start, after finished the request was - * marked as resent. - * 3. resend the request. - * 4. 
old reply from server received, we accept and verify the reply. - * this has to be success, otherwise the error will be aware - * by application. - * 5. new reply from server received, dropped by LNet. - * - * Note the xid of old & new request is the same. We can't simply - * change xid for the resent request because the server replies on - * it for reply reconstruction. - * - * Commonly the original context should be uptodate because we - * have a expiry nice time; server will keep its context because - * we at least hold a ref of old context which prevent context - * destroying RPC being sent. So server still can accept the request - * and finish the RPC. But if that's not the case: - * 1. If server side context has been trimmed, a NO_CONTEXT will - * be returned, gss_cli_ctx_verify/unseal will switch to new - * context by force. - * 2. Current context never be refreshed, then we are fine: we - * never really send request with old context before. - */ - if (test_bit(PTLRPC_CTX_UPTODATE_BIT, &ctx->cc_flags) && - unlikely(req->rq_reqmsg) && - lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { - req_off_ctx_list(req, ctx); - return 0; - } - - if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { - req_off_ctx_list(req, ctx); - /* - * don't switch ctx if import was deactivated - */ - if (req->rq_import->imp_deactive) { - spin_lock(&req->rq_lock); - req->rq_err = 1; - spin_unlock(&req->rq_lock); - return -EINTR; - } - - rc = sptlrpc_req_replace_dead_ctx(req); - if (rc) { - LASSERT(ctx == req->rq_cli_ctx); - CERROR("req %p: failed to replace dead ctx %p: %d\n", - req, ctx, rc); - spin_lock(&req->rq_lock); - req->rq_err = 1; - spin_unlock(&req->rq_lock); - return rc; - } - - ctx = req->rq_cli_ctx; - goto again; - } - - /* - * Now we're sure this context is during upcall, add myself into - * waiting list - */ - spin_lock(&ctx->cc_lock); - if (list_empty(&req->rq_ctx_chain)) - list_add(&req->rq_ctx_chain, &ctx->cc_req_list); - spin_unlock(&ctx->cc_lock); - - if (timeout < 0) - return -EWOULDBLOCK; - - /* Clear any flags that may be present from previous sends */ - LASSERT(req->rq_receiving_reply == 0); - spin_lock(&req->rq_lock); - req->rq_err = 0; - req->rq_timedout = 0; - req->rq_resend = 0; - req->rq_restart = 0; - spin_unlock(&req->rq_lock); - - rc = wait_event_idle_timeout(req->rq_reply_waitq, - ctx_check_refresh(ctx), - timeout * HZ); - if (rc == 0 && ctx_refresh_timeout(req) == 0) { - /* Keep waiting, but enable some signals */ - rc = l_wait_event_abortable(req->rq_reply_waitq, - ctx_check_refresh(ctx)); - if (rc == 0) - rc = 1; - } - - if (rc > 0) - /* condition is true */ - rc = 0; - else if (rc == 0) - /* Timed out */ - rc = -ETIMEDOUT; - else { - /* Aborted by signal */ - rc = -EINTR; - ctx_refresh_interrupt(req); - } - - /* - * following cases could lead us here: - * - successfully refreshed; - * - interrupted; - * - timedout, and we don't want recover from the failure; - * - timedout, and waked up upon recovery finished; - * - someone else mark this ctx dead by force; - * - someone invalidate the req and call ptlrpc_client_wake_req(), - * e.g. ptlrpc_abort_inflight(); - */ - if (!cli_ctx_is_refreshed(ctx)) { - /* timed out or interrupted */ - req_off_ctx_list(req, ctx); - - LASSERT(rc != 0); - return rc; - } - - goto again; -} - -/** - * Initialize flavor settings for \a req, according to \a opcode. 
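- * Bulk opcodes (OST_READ, OST_WRITE, ...) set rq_bulk_read or
- * rq_bulk_write, while SEC_CTX_INIT and SEC_CTX_FINI force the
- * SPTLRPC_SVC_NULL and SPTLRPC_SVC_INTG service levels respectively.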
- * - * \note this could be called in two situations: - * - new request from ptlrpc_pre_req(), with proper @opcode - * - old request which changed ctx in the middle, with @opcode == 0 - */ -void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode) -{ - struct ptlrpc_sec *sec; - - LASSERT(req->rq_import); - LASSERT(req->rq_cli_ctx); - LASSERT(req->rq_cli_ctx->cc_sec); - LASSERT(req->rq_bulk_read == 0 || req->rq_bulk_write == 0); - - /* special security flags according to opcode */ - switch (opcode) { - case OST_READ: - case MDS_READPAGE: - case MGS_CONFIG_READ: - case OBD_IDX_READ: - req->rq_bulk_read = 1; - break; - case OST_WRITE: - case MDS_WRITEPAGE: - req->rq_bulk_write = 1; - break; - case SEC_CTX_INIT: - req->rq_ctx_init = 1; - break; - case SEC_CTX_FINI: - req->rq_ctx_fini = 1; - break; - case 0: - /* init/fini rpc won't be resend, so can't be here */ - LASSERT(req->rq_ctx_init == 0); - LASSERT(req->rq_ctx_fini == 0); - - /* cleanup flags, which should be recalculated */ - req->rq_pack_udesc = 0; - req->rq_pack_bulk = 0; - break; - } - - sec = req->rq_cli_ctx->cc_sec; - - spin_lock(&sec->ps_lock); - req->rq_flvr = sec->ps_flvr; - spin_unlock(&sec->ps_lock); - - /* force SVC_NULL for context initiation rpc, SVC_INTG for context - * destruction rpc - */ - if (unlikely(req->rq_ctx_init)) - flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_NULL); - else if (unlikely(req->rq_ctx_fini)) - flvr_set_svc(&req->rq_flvr.sf_rpc, SPTLRPC_SVC_INTG); - - /* user descriptor flag, null security can't do it anyway */ - if ((sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_UDESC) && - (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL)) - req->rq_pack_udesc = 1; - - /* bulk security flag */ - if ((req->rq_bulk_read || req->rq_bulk_write) && - sptlrpc_flavor_has_bulk(&req->rq_flvr)) - req->rq_pack_bulk = 1; -} - -void sptlrpc_request_out_callback(struct ptlrpc_request *req) -{ - if (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc) != SPTLRPC_SVC_PRIV) - return; - - LASSERT(req->rq_clrbuf); - if (req->rq_pool || !req->rq_reqbuf) - return; - - kvfree(req->rq_reqbuf); - req->rq_reqbuf = NULL; - req->rq_reqbuf_len = 0; -} - -/** - * Given an import \a imp, check whether current user has a valid context - * or not. We may create a new context and try to refresh it, and try - * repeatedly try in case of non-fatal errors. Return 0 means success. - */ -int sptlrpc_import_check_ctx(struct obd_import *imp) -{ - struct ptlrpc_sec *sec; - struct ptlrpc_cli_ctx *ctx; - struct ptlrpc_request *req = NULL; - int rc; - - might_sleep(); - - sec = sptlrpc_import_sec_ref(imp); - ctx = get_my_ctx(sec); - sptlrpc_sec_put(sec); - - if (!ctx) - return -ENOMEM; - - if (cli_ctx_is_eternal(ctx) || - ctx->cc_ops->validate(ctx) == 0) { - sptlrpc_cli_ctx_put(ctx, 1); - return 0; - } - - if (cli_ctx_is_error(ctx)) { - sptlrpc_cli_ctx_put(ctx, 1); - return -EACCES; - } - - req = ptlrpc_request_cache_alloc(GFP_NOFS); - if (!req) - return -ENOMEM; - - ptlrpc_cli_req_init(req); - atomic_set(&req->rq_refcount, 10000); - - req->rq_import = imp; - req->rq_flvr = sec->ps_flvr; - req->rq_cli_ctx = ctx; - - rc = sptlrpc_req_refresh_ctx(req, 0); - LASSERT(list_empty(&req->rq_ctx_chain)); - sptlrpc_cli_ctx_put(req->rq_cli_ctx, 1); - ptlrpc_request_cache_free(req); - - return rc; -} - -/** - * Used by ptlrpc client, to perform the pre-defined security transformation - * upon the request message of \a req. After this function called, - * req->rq_reqmsg is still accessible as clear text. 
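- * The transformation is chosen by the service level of the flavor:
- * SVC_NULL/AUTH/INTG requests are signed (cc_ops->sign) while
- * SVC_PRIV requests are sealed (cc_ops->seal); bulk data, if any,
- * is wrapped first.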
- */ -int sptlrpc_cli_wrap_request(struct ptlrpc_request *req) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - int rc = 0; - - LASSERT(ctx); - LASSERT(ctx->cc_sec); - LASSERT(req->rq_reqbuf || req->rq_clrbuf); - - /* we wrap bulk request here because now we can be sure - * the context is uptodate. - */ - if (req->rq_bulk) { - rc = sptlrpc_cli_wrap_bulk(req, req->rq_bulk); - if (rc) - return rc; - } - - switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { - case SPTLRPC_SVC_NULL: - case SPTLRPC_SVC_AUTH: - case SPTLRPC_SVC_INTG: - LASSERT(ctx->cc_ops->sign); - rc = ctx->cc_ops->sign(ctx, req); - break; - case SPTLRPC_SVC_PRIV: - LASSERT(ctx->cc_ops->seal); - rc = ctx->cc_ops->seal(ctx, req); - break; - default: - LBUG(); - } - - if (rc == 0) { - LASSERT(req->rq_reqdata_len); - LASSERT(req->rq_reqdata_len % 8 == 0); - LASSERT(req->rq_reqdata_len <= req->rq_reqbuf_len); - } - - return rc; -} - -static int do_cli_unwrap_reply(struct ptlrpc_request *req) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - int rc; - - LASSERT(ctx); - LASSERT(ctx->cc_sec); - LASSERT(req->rq_repbuf); - LASSERT(req->rq_repdata); - LASSERT(!req->rq_repmsg); - - req->rq_rep_swab_mask = 0; - - rc = __lustre_unpack_msg(req->rq_repdata, req->rq_repdata_len); - switch (rc) { - case 1: - lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); - case 0: - break; - default: - CERROR("failed unpack reply: x%llu\n", req->rq_xid); - return -EPROTO; - } - - if (req->rq_repdata_len < sizeof(struct lustre_msg)) { - CERROR("replied data length %d too small\n", - req->rq_repdata_len); - return -EPROTO; - } - - if (SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr) != - SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)) { - CERROR("reply policy %u doesn't match request policy %u\n", - SPTLRPC_FLVR_POLICY(req->rq_repdata->lm_secflvr), - SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc)); - return -EPROTO; - } - - switch (SPTLRPC_FLVR_SVC(req->rq_flvr.sf_rpc)) { - case SPTLRPC_SVC_NULL: - case SPTLRPC_SVC_AUTH: - case SPTLRPC_SVC_INTG: - LASSERT(ctx->cc_ops->verify); - rc = ctx->cc_ops->verify(ctx, req); - break; - case SPTLRPC_SVC_PRIV: - LASSERT(ctx->cc_ops->unseal); - rc = ctx->cc_ops->unseal(ctx, req); - break; - default: - LBUG(); - } - LASSERT(rc || req->rq_repmsg || req->rq_resend); - - if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL && - !req->rq_ctx_init) - req->rq_rep_swab_mask = 0; - return rc; -} - -/** - * Used by ptlrpc client, to perform security transformation upon the reply - * message of \a req. After return successfully, req->rq_repmsg points to - * the reply message in clear text. - * - * \pre the reply buffer should have been un-posted from LNet, so nothing is - * going to change. - */ -int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req) -{ - LASSERT(req->rq_repbuf); - LASSERT(!req->rq_repdata); - LASSERT(!req->rq_repmsg); - LASSERT(req->rq_reply_off + req->rq_nob_received <= req->rq_repbuf_len); - - if (req->rq_reply_off == 0 && - (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { - CERROR("real reply with offset 0\n"); - return -EPROTO; - } - - if (req->rq_reply_off % 8 != 0) { - CERROR("reply at odd offset %u\n", req->rq_reply_off); - return -EPROTO; - } - - req->rq_repdata = (struct lustre_msg *) - (req->rq_repbuf + req->rq_reply_off); - req->rq_repdata_len = req->rq_nob_received; - - return do_cli_unwrap_reply(req); -} - -/** - * Used by ptlrpc client, to perform security transformation upon the early - * reply message of \a req. 
We expect the rq_reply_off is 0, and - * rq_nob_received is the early reply size. - * - * Because the receive buffer might be still posted, the reply data might be - * changed at any time, no matter we're holding rq_lock or not. For this reason - * we allocate a separate ptlrpc_request and reply buffer for early reply - * processing. - * - * \retval 0 success, \a req_ret is filled with a duplicated ptlrpc_request. - * Later the caller must call sptlrpc_cli_finish_early_reply() on the returned - * \a *req_ret to release it. - * \retval -ev error number, and \a req_ret will not be set. - */ -int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, - struct ptlrpc_request **req_ret) -{ - struct ptlrpc_request *early_req; - char *early_buf; - int early_bufsz, early_size; - int rc; - - early_req = ptlrpc_request_cache_alloc(GFP_NOFS); - if (!early_req) - return -ENOMEM; - - ptlrpc_cli_req_init(early_req); - - early_size = req->rq_nob_received; - early_bufsz = size_roundup_power2(early_size); - early_buf = kvzalloc(early_bufsz, GFP_NOFS); - if (!early_buf) { - rc = -ENOMEM; - goto err_req; - } - - /* sanity checkings and copy data out, do it inside spinlock */ - spin_lock(&req->rq_lock); - - if (req->rq_replied) { - spin_unlock(&req->rq_lock); - rc = -EALREADY; - goto err_buf; - } - - LASSERT(req->rq_repbuf); - LASSERT(!req->rq_repdata); - LASSERT(!req->rq_repmsg); - - if (req->rq_reply_off != 0) { - CERROR("early reply with offset %u\n", req->rq_reply_off); - spin_unlock(&req->rq_lock); - rc = -EPROTO; - goto err_buf; - } - - if (req->rq_nob_received != early_size) { - /* even another early arrived the size should be the same */ - CERROR("data size has changed from %u to %u\n", - early_size, req->rq_nob_received); - spin_unlock(&req->rq_lock); - rc = -EINVAL; - goto err_buf; - } - - if (req->rq_nob_received < sizeof(struct lustre_msg)) { - CERROR("early reply length %d too small\n", - req->rq_nob_received); - spin_unlock(&req->rq_lock); - rc = -EALREADY; - goto err_buf; - } - - memcpy(early_buf, req->rq_repbuf, early_size); - spin_unlock(&req->rq_lock); - - early_req->rq_cli_ctx = sptlrpc_cli_ctx_get(req->rq_cli_ctx); - early_req->rq_flvr = req->rq_flvr; - early_req->rq_repbuf = early_buf; - early_req->rq_repbuf_len = early_bufsz; - early_req->rq_repdata = (struct lustre_msg *)early_buf; - early_req->rq_repdata_len = early_size; - early_req->rq_early = 1; - early_req->rq_reqmsg = req->rq_reqmsg; - - rc = do_cli_unwrap_reply(early_req); - if (rc) { - DEBUG_REQ(D_ADAPTTO, early_req, - "error %d unwrap early reply", rc); - goto err_ctx; - } - - LASSERT(early_req->rq_repmsg); - *req_ret = early_req; - return 0; - -err_ctx: - sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); -err_buf: - kvfree(early_buf); -err_req: - ptlrpc_request_cache_free(early_req); - return rc; -} - -/** - * Used by ptlrpc client, to release a processed early reply \a early_req. - * - * \pre \a early_req was obtained from calling sptlrpc_cli_unwrap_early_reply(). - */ -void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req) -{ - LASSERT(early_req->rq_repbuf); - LASSERT(early_req->rq_repdata); - LASSERT(early_req->rq_repmsg); - - sptlrpc_cli_ctx_put(early_req->rq_cli_ctx, 1); - kvfree(early_req->rq_repbuf); - ptlrpc_request_cache_free(early_req); -} - -/************************************************** - * sec ID * - **************************************************/ - -/* - * "fixed" sec (e.g. 
null) use sec_id < 0 - */ -static atomic_t sptlrpc_sec_id = ATOMIC_INIT(1); - -int sptlrpc_get_next_secid(void) -{ - return atomic_inc_return(&sptlrpc_sec_id); -} -EXPORT_SYMBOL(sptlrpc_get_next_secid); - -/************************************************** - * client side high-level security APIs * - **************************************************/ - -static int sec_cop_flush_ctx_cache(struct ptlrpc_sec *sec, uid_t uid, - int grace, int force) -{ - struct ptlrpc_sec_policy *policy = sec->ps_policy; - - LASSERT(policy->sp_cops); - LASSERT(policy->sp_cops->flush_ctx_cache); - - return policy->sp_cops->flush_ctx_cache(sec, uid, grace, force); -} - -static void sec_cop_destroy_sec(struct ptlrpc_sec *sec) -{ - struct ptlrpc_sec_policy *policy = sec->ps_policy; - - LASSERT_ATOMIC_ZERO(&sec->ps_refcount); - LASSERT_ATOMIC_ZERO(&sec->ps_nctx); - LASSERT(policy->sp_cops->destroy_sec); - - CDEBUG(D_SEC, "%s@%p: being destroyed\n", sec->ps_policy->sp_name, sec); - - policy->sp_cops->destroy_sec(sec); - sptlrpc_policy_put(policy); -} - -static void sptlrpc_sec_kill(struct ptlrpc_sec *sec) -{ - LASSERT_ATOMIC_POS(&sec->ps_refcount); - - if (sec->ps_policy->sp_cops->kill_sec) { - sec->ps_policy->sp_cops->kill_sec(sec); - - sec_cop_flush_ctx_cache(sec, -1, 1, 1); - } -} - -static struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec) -{ - if (sec) - atomic_inc(&sec->ps_refcount); - - return sec; -} - -void sptlrpc_sec_put(struct ptlrpc_sec *sec) -{ - if (sec) { - LASSERT_ATOMIC_POS(&sec->ps_refcount); - - if (atomic_dec_and_test(&sec->ps_refcount)) { - sptlrpc_gc_del_sec(sec); - sec_cop_destroy_sec(sec); - } - } -} -EXPORT_SYMBOL(sptlrpc_sec_put); - -/* - * policy module is responsible for taking reference of import - */ -static -struct ptlrpc_sec *sptlrpc_sec_create(struct obd_import *imp, - struct ptlrpc_svc_ctx *svc_ctx, - struct sptlrpc_flavor *sf, - enum lustre_sec_part sp) -{ - struct ptlrpc_sec_policy *policy; - struct ptlrpc_sec *sec; - char str[32]; - - if (svc_ctx) { - LASSERT(imp->imp_dlm_fake == 1); - - CDEBUG(D_SEC, "%s %s: reverse sec using flavor %s\n", - imp->imp_obd->obd_type->typ_name, - imp->imp_obd->obd_name, - sptlrpc_flavor2name(sf, str, sizeof(str))); - - policy = sptlrpc_policy_get(svc_ctx->sc_policy); - sf->sf_flags |= PTLRPC_SEC_FL_REVERSE | PTLRPC_SEC_FL_ROOTONLY; - } else { - LASSERT(imp->imp_dlm_fake == 0); - - CDEBUG(D_SEC, "%s %s: select security flavor %s\n", - imp->imp_obd->obd_type->typ_name, - imp->imp_obd->obd_name, - sptlrpc_flavor2name(sf, str, sizeof(str))); - - policy = sptlrpc_wireflavor2policy(sf->sf_rpc); - if (!policy) { - CERROR("invalid flavor 0x%x\n", sf->sf_rpc); - return NULL; - } - } - - sec = policy->sp_cops->create_sec(imp, svc_ctx, sf); - if (sec) { - atomic_inc(&sec->ps_refcount); - - sec->ps_part = sp; - - if (sec->ps_gc_interval && policy->sp_cops->gc_ctx) - sptlrpc_gc_add_sec(sec); - } else { - sptlrpc_policy_put(policy); - } - - return sec; -} - -struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp) -{ - struct ptlrpc_sec *sec; - - spin_lock(&imp->imp_lock); - sec = sptlrpc_sec_get(imp->imp_sec); - spin_unlock(&imp->imp_lock); - - return sec; -} -EXPORT_SYMBOL(sptlrpc_import_sec_ref); - -static void sptlrpc_import_sec_install(struct obd_import *imp, - struct ptlrpc_sec *sec) -{ - struct ptlrpc_sec *old_sec; - - LASSERT_ATOMIC_POS(&sec->ps_refcount); - - spin_lock(&imp->imp_lock); - old_sec = imp->imp_sec; - imp->imp_sec = sec; - spin_unlock(&imp->imp_lock); - - if (old_sec) { - sptlrpc_sec_kill(old_sec); - - /* balance the ref 
taken by this import */ - sptlrpc_sec_put(old_sec); - } -} - -static inline -int flavor_equal(struct sptlrpc_flavor *sf1, struct sptlrpc_flavor *sf2) -{ - return (memcmp(sf1, sf2, sizeof(*sf1)) == 0); -} - -static inline -void flavor_copy(struct sptlrpc_flavor *dst, struct sptlrpc_flavor *src) -{ - *dst = *src; -} - -static void sptlrpc_import_sec_adapt_inplace(struct obd_import *imp, - struct ptlrpc_sec *sec, - struct sptlrpc_flavor *sf) -{ - char str1[32], str2[32]; - - if (sec->ps_flvr.sf_flags != sf->sf_flags) - CDEBUG(D_SEC, "changing sec flags: %s -> %s\n", - sptlrpc_secflags2str(sec->ps_flvr.sf_flags, - str1, sizeof(str1)), - sptlrpc_secflags2str(sf->sf_flags, - str2, sizeof(str2))); - - spin_lock(&sec->ps_lock); - flavor_copy(&sec->ps_flvr, sf); - spin_unlock(&sec->ps_lock); -} - -/** - * To get an appropriate ptlrpc_sec for the \a imp, according to the current - * configuration. Upon called, imp->imp_sec may or may not be NULL. - * - * - regular import: \a svc_ctx should be NULL and \a flvr is ignored; - * - reverse import: \a svc_ctx and \a flvr are obtained from incoming request. - */ -int sptlrpc_import_sec_adapt(struct obd_import *imp, - struct ptlrpc_svc_ctx *svc_ctx, - struct sptlrpc_flavor *flvr) -{ - struct ptlrpc_connection *conn; - struct sptlrpc_flavor sf; - struct ptlrpc_sec *sec, *newsec; - enum lustre_sec_part sp; - char str[24]; - int rc = 0; - - might_sleep(); - - if (!imp) - return 0; - - conn = imp->imp_connection; - - if (!svc_ctx) { - struct client_obd *cliobd = &imp->imp_obd->u.cli; - /* - * normal import, determine flavor from rule set, except - * for mgc the flavor is predetermined. - */ - if (cliobd->cl_sp_me == LUSTRE_SP_MGC) - sf = cliobd->cl_flvr_mgc; - else - sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, - cliobd->cl_sp_to, - &cliobd->cl_target_uuid, - conn->c_self, &sf); - - sp = imp->imp_obd->u.cli.cl_sp_me; - } else { - /* reverse import, determine flavor from incoming request */ - sf = *flvr; - - if (sf.sf_rpc != SPTLRPC_FLVR_NULL) - sf.sf_flags = PTLRPC_SEC_FL_REVERSE | - PTLRPC_SEC_FL_ROOTONLY; - - sp = sptlrpc_target_sec_part(imp->imp_obd); - } - - sec = sptlrpc_import_sec_ref(imp); - if (sec) { - char str2[24]; - - if (flavor_equal(&sf, &sec->ps_flvr)) - goto out; - - CDEBUG(D_SEC, "import %s->%s: changing flavor %s -> %s\n", - imp->imp_obd->obd_name, - obd_uuid2str(&conn->c_remote_uuid), - sptlrpc_flavor2name(&sec->ps_flvr, str, sizeof(str)), - sptlrpc_flavor2name(&sf, str2, sizeof(str2))); - - if (SPTLRPC_FLVR_POLICY(sf.sf_rpc) == - SPTLRPC_FLVR_POLICY(sec->ps_flvr.sf_rpc) && - SPTLRPC_FLVR_MECH(sf.sf_rpc) == - SPTLRPC_FLVR_MECH(sec->ps_flvr.sf_rpc)) { - sptlrpc_import_sec_adapt_inplace(imp, sec, &sf); - goto out; - } - } else if (SPTLRPC_FLVR_BASE(sf.sf_rpc) != - SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_NULL)) { - CDEBUG(D_SEC, "import %s->%s netid %x: select flavor %s\n", - imp->imp_obd->obd_name, - obd_uuid2str(&conn->c_remote_uuid), - LNET_NIDNET(conn->c_self), - sptlrpc_flavor2name(&sf, str, sizeof(str))); - } - - mutex_lock(&imp->imp_sec_mutex); - - newsec = sptlrpc_sec_create(imp, svc_ctx, &sf, sp); - if (newsec) { - sptlrpc_import_sec_install(imp, newsec); - } else { - CERROR("import %s->%s: failed to create new sec\n", - imp->imp_obd->obd_name, - obd_uuid2str(&conn->c_remote_uuid)); - rc = -EPERM; - } - - mutex_unlock(&imp->imp_sec_mutex); -out: - sptlrpc_sec_put(sec); - return rc; -} - -void sptlrpc_import_sec_put(struct obd_import *imp) -{ - if (imp->imp_sec) { - sptlrpc_sec_kill(imp->imp_sec); - - sptlrpc_sec_put(imp->imp_sec); - imp->imp_sec 
= NULL; - } -} - -static void import_flush_ctx_common(struct obd_import *imp, - uid_t uid, int grace, int force) -{ - struct ptlrpc_sec *sec; - - if (!imp) - return; - - sec = sptlrpc_import_sec_ref(imp); - if (!sec) - return; - - sec_cop_flush_ctx_cache(sec, uid, grace, force); - sptlrpc_sec_put(sec); -} - -void sptlrpc_import_flush_my_ctx(struct obd_import *imp) -{ - import_flush_ctx_common(imp, from_kuid(&init_user_ns, current_uid()), - 1, 1); -} -EXPORT_SYMBOL(sptlrpc_import_flush_my_ctx); - -void sptlrpc_import_flush_all_ctx(struct obd_import *imp) -{ - import_flush_ctx_common(imp, -1, 1, 1); -} -EXPORT_SYMBOL(sptlrpc_import_flush_all_ctx); - -/** - * Used by ptlrpc client to allocate request buffer of \a req. Upon return - * successfully, req->rq_reqmsg points to a buffer with size \a msgsize. - */ -int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - struct ptlrpc_sec_policy *policy; - int rc; - - LASSERT(ctx); - LASSERT(ctx->cc_sec); - LASSERT(ctx->cc_sec->ps_policy); - LASSERT(!req->rq_reqmsg); - LASSERT_ATOMIC_POS(&ctx->cc_refcount); - - policy = ctx->cc_sec->ps_policy; - rc = policy->sp_cops->alloc_reqbuf(ctx->cc_sec, req, msgsize); - if (!rc) { - LASSERT(req->rq_reqmsg); - LASSERT(req->rq_reqbuf || req->rq_clrbuf); - - /* zeroing preallocated buffer */ - if (req->rq_pool) - memset(req->rq_reqmsg, 0, msgsize); - } - - return rc; -} - -/** - * Used by ptlrpc client to free request buffer of \a req. After this - * req->rq_reqmsg is set to NULL and should not be accessed anymore. - */ -void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - struct ptlrpc_sec_policy *policy; - - LASSERT(ctx); - LASSERT(ctx->cc_sec); - LASSERT(ctx->cc_sec->ps_policy); - LASSERT_ATOMIC_POS(&ctx->cc_refcount); - - if (!req->rq_reqbuf && !req->rq_clrbuf) - return; - - policy = ctx->cc_sec->ps_policy; - policy->sp_cops->free_reqbuf(ctx->cc_sec, req); - req->rq_reqmsg = NULL; -} - -/* - * NOTE caller must guarantee the buffer size is enough for the enlargement - */ -void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, - int segment, int newsize) -{ - void *src, *dst; - int oldsize, oldmsg_size, movesize; - - LASSERT(segment < msg->lm_bufcount); - LASSERT(msg->lm_buflens[segment] <= newsize); - - if (msg->lm_buflens[segment] == newsize) - return; - - /* nothing to do if we are enlarging the last segment */ - if (segment == msg->lm_bufcount - 1) { - msg->lm_buflens[segment] = newsize; - return; - } - - oldsize = msg->lm_buflens[segment]; - - src = lustre_msg_buf(msg, segment + 1, 0); - msg->lm_buflens[segment] = newsize; - dst = lustre_msg_buf(msg, segment + 1, 0); - msg->lm_buflens[segment] = oldsize; - - /* move from segment + 1 to end segment */ - LASSERT(msg->lm_magic == LUSTRE_MSG_MAGIC_V2); - oldmsg_size = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); - movesize = oldmsg_size - ((unsigned long)src - (unsigned long)msg); - LASSERT(movesize >= 0); - - if (movesize) - memmove(dst, src, movesize); - - /* note we don't clear the ares where old data live, not secret */ - - /* finally set new segment size */ - msg->lm_buflens[segment] = newsize; -} -EXPORT_SYMBOL(_sptlrpc_enlarge_msg_inplace); - -/** - * Used by ptlrpc client to enlarge the \a segment of request message pointed - * by req->rq_reqmsg to size \a newsize, all previously filled-in data will be - * preserved after the enlargement. this must be called after original request - * buffer being allocated. 
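/*
 * Minimal user-space sketch of the enlarge-in-place idea in
 * _sptlrpc_enlarge_msg_inplace() above, outside of any Lustre types:
 * segments live back to back in one flat buffer, so growing a middle
 * segment means memmove()ing everything after it toward the end.  Sizes
 * are simplified; lustre_msg additionally 8-byte-aligns its segments.
 */
#include <stdio.h>
#include <string.h>

/* grow segment `seg` of `count` packed segments to newlen;
 * lens[] holds each segment's length, buf must be large enough */
static void enlarge_inplace(char *buf, size_t *lens, int count,
			    int seg, size_t newlen)
{
	size_t off = 0, tail = 0;
	int i;

	for (i = 0; i <= seg; i++)
		off += lens[i];		/* start of the tail segments */
	for (i = seg + 1; i < count; i++)
		tail += lens[i];	/* bytes that must move       */

	memmove(buf + off + (newlen - lens[seg]), buf + off, tail);
	lens[seg] = newlen;
}

int main(void)
{
	char buf[64] = "AAABBCCCC";		/* segments: 3 + 2 + 4 bytes */
	size_t lens[3] = { 3, 2, 4 };

	enlarge_inplace(buf, lens, 3, 1, 5);	/* grow the middle one to 5 */
	/* "CCCC" now starts 3 bytes later; the gap holds stale bytes,
	 * which is fine -- the caller rewrites the grown segment anyway */
	printf("%.4s\n", buf + lens[0] + lens[1]);
	return 0;
}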
- * - * \note after this be called, rq_reqmsg and rq_reqlen might have been changed, - * so caller should refresh its local pointers if needed. - */ -int sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, - int segment, int newsize) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - struct ptlrpc_sec_cops *cops; - struct lustre_msg *msg = req->rq_reqmsg; - - LASSERT(ctx); - LASSERT(msg); - LASSERT(msg->lm_bufcount > segment); - LASSERT(msg->lm_buflens[segment] <= newsize); - - if (msg->lm_buflens[segment] == newsize) - return 0; - - cops = ctx->cc_sec->ps_policy->sp_cops; - LASSERT(cops->enlarge_reqbuf); - return cops->enlarge_reqbuf(ctx->cc_sec, req, segment, newsize); -} -EXPORT_SYMBOL(sptlrpc_cli_enlarge_reqbuf); - -/** - * Used by ptlrpc client to allocate reply buffer of \a req. - * - * \note After this, req->rq_repmsg is still not accessible. - */ -int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - struct ptlrpc_sec_policy *policy; - - LASSERT(ctx); - LASSERT(ctx->cc_sec); - LASSERT(ctx->cc_sec->ps_policy); - - if (req->rq_repbuf) - return 0; - - policy = ctx->cc_sec->ps_policy; - return policy->sp_cops->alloc_repbuf(ctx->cc_sec, req, msgsize); -} - -/** - * Used by ptlrpc client to free reply buffer of \a req. After this - * req->rq_repmsg is set to NULL and should not be accessed anymore. - */ -void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req) -{ - struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; - struct ptlrpc_sec_policy *policy; - - LASSERT(ctx); - LASSERT(ctx->cc_sec); - LASSERT(ctx->cc_sec->ps_policy); - LASSERT_ATOMIC_POS(&ctx->cc_refcount); - - if (!req->rq_repbuf) - return; - LASSERT(req->rq_repbuf_len); - - policy = ctx->cc_sec->ps_policy; - policy->sp_cops->free_repbuf(ctx->cc_sec, req); - req->rq_repmsg = NULL; -} - -static int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, - struct ptlrpc_svc_ctx *ctx) -{ - struct ptlrpc_sec_policy *policy = ctx->sc_policy; - - if (!policy->sp_sops->install_rctx) - return 0; - return policy->sp_sops->install_rctx(imp, ctx); -} - -/**************************************** - * server side security * - ****************************************/ - -static int flavor_allowed(struct sptlrpc_flavor *exp, - struct ptlrpc_request *req) -{ - struct sptlrpc_flavor *flvr = &req->rq_flvr; - - if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) - return 1; - - if ((req->rq_ctx_init || req->rq_ctx_fini) && - SPTLRPC_FLVR_POLICY(exp->sf_rpc) == - SPTLRPC_FLVR_POLICY(flvr->sf_rpc) && - SPTLRPC_FLVR_MECH(exp->sf_rpc) == SPTLRPC_FLVR_MECH(flvr->sf_rpc)) - return 1; - - return 0; -} - -#define EXP_FLVR_UPDATE_EXPIRE (OBD_TIMEOUT_DEFAULT + 10) - -/** - * Given an export \a exp, check whether the flavor of incoming \a req - * is allowed by the export \a exp. Main logic is about taking care of - * changing configurations. Return 0 means success. - */ -int sptlrpc_target_export_check(struct obd_export *exp, - struct ptlrpc_request *req) -{ - struct sptlrpc_flavor flavor; - - if (!exp) - return 0; - - /* client side export has no imp_reverse, skip - * FIXME maybe we should check flavor this as well??? - */ - if (!exp->exp_imp_reverse) - return 0; - - /* don't care about ctx fini rpc */ - if (req->rq_ctx_fini) - return 0; - - spin_lock(&exp->exp_lock); - - /* if flavor just changed (exp->exp_flvr_changed != 0), we wait for - * the first req with the new flavor, then treat it as current flavor, - * adapt reverse sec according to it. 
- * note the first rpc with new flavor might not be with root ctx, in - * which case delay the sec_adapt by leaving exp_flvr_adapt == 1. - */ - if (unlikely(exp->exp_flvr_changed) && - flavor_allowed(&exp->exp_flvr_old[1], req)) { - /* make the new flavor as "current", and old ones as - * about-to-expire - */ - CDEBUG(D_SEC, "exp %p: just changed: %x->%x\n", exp, - exp->exp_flvr.sf_rpc, exp->exp_flvr_old[1].sf_rpc); - flavor = exp->exp_flvr_old[1]; - exp->exp_flvr_old[1] = exp->exp_flvr_old[0]; - exp->exp_flvr_expire[1] = exp->exp_flvr_expire[0]; - exp->exp_flvr_old[0] = exp->exp_flvr; - exp->exp_flvr_expire[0] = ktime_get_real_seconds() + - EXP_FLVR_UPDATE_EXPIRE; - exp->exp_flvr = flavor; - - /* flavor change finished */ - exp->exp_flvr_changed = 0; - LASSERT(exp->exp_flvr_adapt == 1); - - /* if it's gss, we only interested in root ctx init */ - if (req->rq_auth_gss && - !(req->rq_ctx_init && - (req->rq_auth_usr_root || req->rq_auth_usr_mdt || - req->rq_auth_usr_ost))) { - spin_unlock(&exp->exp_lock); - CDEBUG(D_SEC, "is good but not root(%d:%d:%d:%d:%d)\n", - req->rq_auth_gss, req->rq_ctx_init, - req->rq_auth_usr_root, req->rq_auth_usr_mdt, - req->rq_auth_usr_ost); - return 0; - } - - exp->exp_flvr_adapt = 0; - spin_unlock(&exp->exp_lock); - - return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, - req->rq_svc_ctx, &flavor); - } - - /* if it equals to the current flavor, we accept it, but need to - * dealing with reverse sec/ctx - */ - if (likely(flavor_allowed(&exp->exp_flvr, req))) { - /* most cases should return here, we only interested in - * gss root ctx init - */ - if (!req->rq_auth_gss || !req->rq_ctx_init || - (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && - !req->rq_auth_usr_ost)) { - spin_unlock(&exp->exp_lock); - return 0; - } - - /* if flavor just changed, we should not proceed, just leave - * it and current flavor will be discovered and replaced - * shortly, and let _this_ rpc pass through - */ - if (exp->exp_flvr_changed) { - LASSERT(exp->exp_flvr_adapt); - spin_unlock(&exp->exp_lock); - return 0; - } - - if (exp->exp_flvr_adapt) { - exp->exp_flvr_adapt = 0; - CDEBUG(D_SEC, "exp %p (%x|%x|%x): do delayed adapt\n", - exp, exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, - exp->exp_flvr_old[1].sf_rpc); - flavor = exp->exp_flvr; - spin_unlock(&exp->exp_lock); - - return sptlrpc_import_sec_adapt(exp->exp_imp_reverse, - req->rq_svc_ctx, - &flavor); - } else { - CDEBUG(D_SEC, "exp %p (%x|%x|%x): is current flavor, install rvs ctx\n", - exp, exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, - exp->exp_flvr_old[1].sf_rpc); - spin_unlock(&exp->exp_lock); - - return sptlrpc_svc_install_rvs_ctx(exp->exp_imp_reverse, - req->rq_svc_ctx); - } - } - - if (exp->exp_flvr_expire[0]) { - if (exp->exp_flvr_expire[0] >= ktime_get_real_seconds()) { - if (flavor_allowed(&exp->exp_flvr_old[0], req)) { - CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the middle one (%lld)\n", exp, - exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, - exp->exp_flvr_old[1].sf_rpc, - (s64)(exp->exp_flvr_expire[0] - - ktime_get_real_seconds())); - spin_unlock(&exp->exp_lock); - return 0; - } - } else { - CDEBUG(D_SEC, "mark middle expired\n"); - exp->exp_flvr_expire[0] = 0; - } - CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match middle\n", exp, - exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, - req->rq_flvr.sf_rpc); - } - - /* now it doesn't match the current flavor, the only chance we can - * accept it is match the old flavors which is not expired. 
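/*
 * Sketch (plain C, illustrative names) of the flavor-history rotation
 * performed above when a configuration change is observed: the new flavor
 * becomes current, and the previous current and middle flavors shift into
 * a two-slot "old" history, each stamped with an expiry time after which
 * requests still using them are rejected.
 */
#include <stdio.h>
#include <time.h>

#define UPDATE_EXPIRE 60	/* grace period in seconds (illustrative) */

struct flvr_hist {
	unsigned int cur;	/* current flavor              */
	unsigned int old[2];	/* [0] = middle, [1] = oldest  */
	time_t expire[2];	/* when each old slot expires  */
};

static void rotate(struct flvr_hist *h, unsigned int new_flvr)
{
	h->old[1] = h->old[0];
	h->expire[1] = h->expire[0];
	h->old[0] = h->cur;
	h->expire[0] = time(NULL) + UPDATE_EXPIRE;
	h->cur = new_flvr;
}

int main(void)
{
	struct flvr_hist h = { .cur = 0x1 };

	rotate(&h, 0x2);
	rotate(&h, 0x3);
	printf("cur %#x, middle %#x, oldest %#x\n",
	       h.cur, h.old[0], h.old[1]);
	return 0;
}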
- */ - if (exp->exp_flvr_changed == 0 && exp->exp_flvr_expire[1]) { - if (exp->exp_flvr_expire[1] >= ktime_get_real_seconds()) { - if (flavor_allowed(&exp->exp_flvr_old[1], req)) { - CDEBUG(D_SEC, "exp %p (%x|%x|%x): match the oldest one (%lld)\n", - exp, - exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, - exp->exp_flvr_old[1].sf_rpc, - (s64)(exp->exp_flvr_expire[1] - - ktime_get_real_seconds())); - spin_unlock(&exp->exp_lock); - return 0; - } - } else { - CDEBUG(D_SEC, "mark oldest expired\n"); - exp->exp_flvr_expire[1] = 0; - } - CDEBUG(D_SEC, "exp %p (%x|%x|%x): %x not match found\n", - exp, exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, exp->exp_flvr_old[1].sf_rpc, - req->rq_flvr.sf_rpc); - } else { - CDEBUG(D_SEC, "exp %p (%x|%x|%x): skip the last one\n", - exp, exp->exp_flvr.sf_rpc, exp->exp_flvr_old[0].sf_rpc, - exp->exp_flvr_old[1].sf_rpc); - } - - spin_unlock(&exp->exp_lock); - - CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u|%u) with unauthorized flavor %x, expect %x|%x(%+lld)|%x(%+lld)\n", - exp, exp->exp_obd->obd_name, - req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, - req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_auth_usr_ost, - req->rq_flvr.sf_rpc, - exp->exp_flvr.sf_rpc, - exp->exp_flvr_old[0].sf_rpc, - exp->exp_flvr_expire[0] ? - (s64)(exp->exp_flvr_expire[0] - ktime_get_real_seconds()) : 0, - exp->exp_flvr_old[1].sf_rpc, - exp->exp_flvr_expire[1] ? - (s64)(exp->exp_flvr_expire[1] - ktime_get_real_seconds()) : 0); - return -EACCES; -} -EXPORT_SYMBOL(sptlrpc_target_export_check); - -static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) -{ - /* peer's claim is unreliable unless gss is being used */ - if (!req->rq_auth_gss || svc_rc == SECSVC_DROP) - return svc_rc; - - switch (req->rq_sp_from) { - case LUSTRE_SP_CLI: - if (req->rq_auth_usr_mdt || req->rq_auth_usr_ost) { - DEBUG_REQ(D_ERROR, req, "faked source CLI"); - svc_rc = SECSVC_DROP; - } - break; - case LUSTRE_SP_MDT: - if (!req->rq_auth_usr_mdt) { - DEBUG_REQ(D_ERROR, req, "faked source MDT"); - svc_rc = SECSVC_DROP; - } - break; - case LUSTRE_SP_OST: - if (!req->rq_auth_usr_ost) { - DEBUG_REQ(D_ERROR, req, "faked source OST"); - svc_rc = SECSVC_DROP; - } - break; - case LUSTRE_SP_MGS: - case LUSTRE_SP_MGC: - if (!req->rq_auth_usr_root && !req->rq_auth_usr_mdt && - !req->rq_auth_usr_ost) { - DEBUG_REQ(D_ERROR, req, "faked source MGC/MGS"); - svc_rc = SECSVC_DROP; - } - break; - case LUSTRE_SP_ANY: - default: - DEBUG_REQ(D_ERROR, req, "invalid source %u", req->rq_sp_from); - svc_rc = SECSVC_DROP; - } - - return svc_rc; -} - -/** - * Used by ptlrpc server, to perform transformation upon request message of - * incoming \a req. This must be the first thing to do with a incoming - * request in ptlrpc layer. - * - * \retval SECSVC_OK success, and req->rq_reqmsg point to request message in - * clear text, size is req->rq_reqlen; also req->rq_svc_ctx is set. - * \retval SECSVC_COMPLETE success, the request has been fully processed, and - * reply message has been prepared. - * \retval SECSVC_DROP failed, this request should be dropped. 
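/*
 * Standalone sketch of the source sanity check in sptlrpc_svc_check_from()
 * above: the peer's claimed role is only trusted when the authenticated
 * (GSS) identity backs it up, otherwise the request is dropped.  The enum
 * and flag names here are illustrative, not the Lustre definitions.
 */
#include <stdio.h>

enum role { SP_CLI, SP_MDT, SP_OST };

struct auth { int is_mdt; int is_ost; };

static int claim_ok(enum role claimed, const struct auth *a)
{
	switch (claimed) {
	case SP_CLI:
		return !a->is_mdt && !a->is_ost; /* servers can't pose as clients */
	case SP_MDT:
		return a->is_mdt;
	case SP_OST:
		return a->is_ost;
	default:
		return 0;			/* unknown role: drop */
	}
}

int main(void)
{
	struct auth a = { .is_mdt = 1, .is_ost = 0 };

	printf("MDT claim ok: %d\n", claim_ok(SP_MDT, &a));
	printf("CLI claim ok: %d\n", claim_ok(SP_CLI, &a)); /* faked */
	return 0;
}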
- */ -int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req) -{ - struct ptlrpc_sec_policy *policy; - struct lustre_msg *msg = req->rq_reqbuf; - int rc; - - LASSERT(msg); - LASSERT(!req->rq_reqmsg); - LASSERT(!req->rq_repmsg); - LASSERT(!req->rq_svc_ctx); - - req->rq_req_swab_mask = 0; - - rc = __lustre_unpack_msg(msg, req->rq_reqdata_len); - switch (rc) { - case 1: - lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); - case 0: - break; - default: - CERROR("error unpacking request from %s x%llu\n", - libcfs_id2str(req->rq_peer), req->rq_xid); - return SECSVC_DROP; - } - - req->rq_flvr.sf_rpc = WIRE_FLVR(msg->lm_secflvr); - req->rq_sp_from = LUSTRE_SP_ANY; - req->rq_auth_uid = -1; - req->rq_auth_mapped_uid = -1; - - policy = sptlrpc_wireflavor2policy(req->rq_flvr.sf_rpc); - if (!policy) { - CERROR("unsupported rpc flavor %x\n", req->rq_flvr.sf_rpc); - return SECSVC_DROP; - } - - LASSERT(policy->sp_sops->accept); - rc = policy->sp_sops->accept(req); - sptlrpc_policy_put(policy); - LASSERT(req->rq_reqmsg || rc != SECSVC_OK); - LASSERT(req->rq_svc_ctx || rc == SECSVC_DROP); - - /* - * if it's not null flavor (which means embedded packing msg), - * reset the swab mask for the coming inner msg unpacking. - */ - if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) - req->rq_req_swab_mask = 0; - - /* sanity check for the request source */ - rc = sptlrpc_svc_check_from(req, rc); - return rc; -} - -/** - * Used by ptlrpc server, to allocate reply buffer for \a req. If succeed, - * req->rq_reply_state is set, and req->rq_reply_state->rs_msg point to - * a buffer of \a msglen size. - */ -int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen) -{ - struct ptlrpc_sec_policy *policy; - struct ptlrpc_reply_state *rs; - int rc; - - LASSERT(req->rq_svc_ctx); - LASSERT(req->rq_svc_ctx->sc_policy); - - policy = req->rq_svc_ctx->sc_policy; - LASSERT(policy->sp_sops->alloc_rs); - - rc = policy->sp_sops->alloc_rs(req, msglen); - if (unlikely(rc == -ENOMEM)) { - struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; - - if (svcpt->scp_service->srv_max_reply_size < - msglen + sizeof(struct ptlrpc_reply_state)) { - /* Just return failure if the size is too big */ - CERROR("size of message is too big (%zd), %d allowed\n", - msglen + sizeof(struct ptlrpc_reply_state), - svcpt->scp_service->srv_max_reply_size); - return -ENOMEM; - } - - /* failed alloc, try emergency pool */ - rs = lustre_get_emerg_rs(svcpt); - if (!rs) - return -ENOMEM; - - req->rq_reply_state = rs; - rc = policy->sp_sops->alloc_rs(req, msglen); - if (rc) { - lustre_put_emerg_rs(rs); - req->rq_reply_state = NULL; - } - } - - LASSERT(rc != 0 || - (req->rq_reply_state && req->rq_reply_state->rs_msg)); - - return rc; -} - -/** - * Used by ptlrpc server, to perform transformation upon reply message. - * - * \post req->rq_reply_off is set to appropriate server-controlled reply offset. - * \post req->rq_repmsg and req->rq_reply_state->rs_msg becomes inaccessible. - */ -int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req) -{ - struct ptlrpc_sec_policy *policy; - int rc; - - LASSERT(req->rq_svc_ctx); - LASSERT(req->rq_svc_ctx->sc_policy); - - policy = req->rq_svc_ctx->sc_policy; - LASSERT(policy->sp_sops->authorize); - - rc = policy->sp_sops->authorize(req); - LASSERT(rc || req->rq_reply_state->rs_repdata_len); - - return rc; -} - -/** - * Used by ptlrpc server, to free reply_state. 
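/*
 * Sketch of the allocation fallback in sptlrpc_svc_alloc_rs() above (not
 * the real API): a normal allocation is tried first; on ENOMEM a small
 * preallocated emergency buffer keeps reply processing alive so the
 * server can still answer under memory pressure.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char emerg_buf[4096];
static int emerg_busy;

static void *alloc_reply(size_t len, int *prealloc)
{
	void *p = malloc(len);

	if (p) {
		*prealloc = 0;
		return p;
	}
	if (len <= sizeof(emerg_buf) && !emerg_busy) {
		emerg_busy = 1;		/* would be lock-protected in-kernel */
		*prealloc = 1;
		return emerg_buf;
	}
	return NULL;			/* too big even for the emergency pool */
}

static void free_reply(void *p, int prealloc)
{
	if (prealloc)
		emerg_busy = 0;		/* return the emergency buffer */
	else
		free(p);
}

int main(void)
{
	int prealloc;
	void *rs = alloc_reply(1024, &prealloc);

	if (!rs)
		return 1;
	memset(rs, 0, 1024);
	free_reply(rs, prealloc);
	return 0;
}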
- */ -void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs) -{ - struct ptlrpc_sec_policy *policy; - unsigned int prealloc; - - LASSERT(rs->rs_svc_ctx); - LASSERT(rs->rs_svc_ctx->sc_policy); - - policy = rs->rs_svc_ctx->sc_policy; - LASSERT(policy->sp_sops->free_rs); - - prealloc = rs->rs_prealloc; - policy->sp_sops->free_rs(rs); - - if (prealloc) - lustre_put_emerg_rs(rs); -} - -void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req) -{ - struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; - - if (ctx) - atomic_inc(&ctx->sc_refcount); -} - -void sptlrpc_svc_ctx_decref(struct ptlrpc_request *req) -{ - struct ptlrpc_svc_ctx *ctx = req->rq_svc_ctx; - - if (!ctx) - return; - - LASSERT_ATOMIC_POS(&ctx->sc_refcount); - if (atomic_dec_and_test(&ctx->sc_refcount)) { - if (ctx->sc_policy->sp_sops->free_ctx) - ctx->sc_policy->sp_sops->free_ctx(ctx); - } - req->rq_svc_ctx = NULL; -} - -/**************************************** - * bulk security * - ****************************************/ - -/** - * Perform transformation upon bulk data pointed by \a desc. This is called - * before transforming the request message. - */ -int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc) -{ - struct ptlrpc_cli_ctx *ctx; - - LASSERT(req->rq_bulk_read || req->rq_bulk_write); - - if (!req->rq_pack_bulk) - return 0; - - ctx = req->rq_cli_ctx; - if (ctx->cc_ops->wrap_bulk) - return ctx->cc_ops->wrap_bulk(ctx, req, desc); - return 0; -} -EXPORT_SYMBOL(sptlrpc_cli_wrap_bulk); - -/** - * This is called after unwrap the reply message. - * return nob of actual plain text size received, or error code. - */ -int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc, - int nob) -{ - struct ptlrpc_cli_ctx *ctx; - int rc; - - LASSERT(req->rq_bulk_read && !req->rq_bulk_write); - - if (!req->rq_pack_bulk) - return desc->bd_nob_transferred; - - ctx = req->rq_cli_ctx; - if (ctx->cc_ops->unwrap_bulk) { - rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); - if (rc < 0) - return rc; - } - return desc->bd_nob_transferred; -} -EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_read); - -/** - * This is called after unwrap the reply message. - * return 0 for success or error code. - */ -int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc) -{ - struct ptlrpc_cli_ctx *ctx; - int rc; - - LASSERT(!req->rq_bulk_read && req->rq_bulk_write); - - if (!req->rq_pack_bulk) - return 0; - - ctx = req->rq_cli_ctx; - if (ctx->cc_ops->unwrap_bulk) { - rc = ctx->cc_ops->unwrap_bulk(ctx, req, desc); - if (rc < 0) - return rc; - } - - /* - * if everything is going right, nob should equals to nob_transferred. - * in case of privacy mode, nob_transferred needs to be adjusted. 
- */ - if (desc->bd_nob != desc->bd_nob_transferred) { - CERROR("nob %d doesn't match transferred nob %d\n", - desc->bd_nob, desc->bd_nob_transferred); - return -EPROTO; - } - - return 0; -} -EXPORT_SYMBOL(sptlrpc_cli_unwrap_bulk_write); - -/**************************************** - * user descriptor helpers * - ****************************************/ - -int sptlrpc_current_user_desc_size(void) -{ - int ngroups; - - ngroups = current_ngroups; - - if (ngroups > LUSTRE_MAX_GROUPS) - ngroups = LUSTRE_MAX_GROUPS; - return sptlrpc_user_desc_size(ngroups); -} -EXPORT_SYMBOL(sptlrpc_current_user_desc_size); - -int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) -{ - struct ptlrpc_user_desc *pud; - - pud = lustre_msg_buf(msg, offset, 0); - - if (!pud) - return -EINVAL; - - pud->pud_uid = from_kuid(&init_user_ns, current_uid()); - pud->pud_gid = from_kgid(&init_user_ns, current_gid()); - pud->pud_fsuid = from_kuid(&init_user_ns, current_fsuid()); - pud->pud_fsgid = from_kgid(&init_user_ns, current_fsgid()); - pud->pud_cap = current_cap().cap[0]; - pud->pud_ngroups = (msg->lm_buflens[offset] - sizeof(*pud)) / 4; - - task_lock(current); - if (pud->pud_ngroups > current_ngroups) - pud->pud_ngroups = current_ngroups; - memcpy(pud->pud_groups, current_cred()->group_info->gid, - pud->pud_ngroups * sizeof(__u32)); - task_unlock(current); - - return 0; -} -EXPORT_SYMBOL(sptlrpc_pack_user_desc); - -int sptlrpc_unpack_user_desc(struct lustre_msg *msg, int offset, int swabbed) -{ - struct ptlrpc_user_desc *pud; - int i; - - pud = lustre_msg_buf(msg, offset, sizeof(*pud)); - if (!pud) - return -EINVAL; - - if (swabbed) { - __swab32s(&pud->pud_uid); - __swab32s(&pud->pud_gid); - __swab32s(&pud->pud_fsuid); - __swab32s(&pud->pud_fsgid); - __swab32s(&pud->pud_cap); - __swab32s(&pud->pud_ngroups); - } - - if (pud->pud_ngroups > LUSTRE_MAX_GROUPS) { - CERROR("%u groups is too large\n", pud->pud_ngroups); - return -EINVAL; - } - - if (sizeof(*pud) + pud->pud_ngroups * sizeof(__u32) > - msg->lm_buflens[offset]) { - CERROR("%u groups are claimed but bufsize only %u\n", - pud->pud_ngroups, msg->lm_buflens[offset]); - return -EINVAL; - } - - if (swabbed) { - for (i = 0; i < pud->pud_ngroups; i++) - __swab32s(&pud->pud_groups[i]); - } - - return 0; -} -EXPORT_SYMBOL(sptlrpc_unpack_user_desc); - -/**************************************** - * misc helpers * - ****************************************/ - -const char *sec2target_str(struct ptlrpc_sec *sec) -{ - if (!sec || !sec->ps_import || !sec->ps_import->imp_obd) - return "*"; - if (sec_is_reverse(sec)) - return "c"; - return obd_uuid2str(&sec->ps_import->imp_obd->u.cli.cl_target_uuid); -} -EXPORT_SYMBOL(sec2target_str); - -/* - * return true if the bulk data is protected - */ -bool sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr) -{ - switch (SPTLRPC_FLVR_BULK_SVC(flvr->sf_rpc)) { - case SPTLRPC_BULK_SVC_INTG: - case SPTLRPC_BULK_SVC_PRIV: - return true; - default: - return false; - } -} -EXPORT_SYMBOL(sptlrpc_flavor_has_bulk); - -/**************************************** - * crypto API helper/alloc blkciper * - ****************************************/ - -/**************************************** - * initialize/finalize * - ****************************************/ - -int sptlrpc_init(void) -{ - int rc; - - rwlock_init(&policy_lock); - - rc = sptlrpc_gc_init(); - if (rc) - goto out; - - rc = sptlrpc_conf_init(); - if (rc) - goto out_gc; - - rc = sptlrpc_enc_pool_init(); - if (rc) - goto out_conf; - - rc = sptlrpc_null_init(); - if (rc) - goto out_pool; 
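/*
 * Sketch of the defensive unpacking done by sptlrpc_unpack_user_desc()
 * above: a wire structure carries a self-describing count, so the count
 * must be bounds-checked against both a hard limit and the actual buffer
 * size before the variable-length part is trusted.  The struct layout and
 * limit here are illustrative, not the Lustre wire format.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define MAX_GROUPS 64

struct user_desc {
	uint32_t uid;
	uint32_t ngroups;
	uint32_t groups[];	/* ngroups entries follow on the wire */
};

static int unpack_user_desc(const void *buf, size_t buflen)
{
	struct user_desc ud;

	if (buflen < sizeof(ud))
		return -1;
	memcpy(&ud, buf, sizeof(ud));	/* fixed-size header first */

	if (ud.ngroups > MAX_GROUPS)
		return -1;		/* claim exceeds the hard limit   */
	if (sizeof(ud) + (size_t)ud.ngroups * sizeof(uint32_t) > buflen)
		return -1;		/* claim exceeds what was sent    */
	return 0;
}

int main(void)
{
	uint32_t wire[2 + 3] = { 1000, 3, 4, 5, 6 };

	printf("ok: %d\n", unpack_user_desc(wire, sizeof(wire)));
	wire[1] = 1000;			/* lie about the group count */
	printf("ok: %d\n", unpack_user_desc(wire, sizeof(wire)));
	return 0;
}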
- - rc = sptlrpc_plain_init(); - if (rc) - goto out_null; - - sptlrpc_lproc_init(); - - return 0; - -out_null: - sptlrpc_null_fini(); -out_pool: - sptlrpc_enc_pool_fini(); -out_conf: - sptlrpc_conf_fini(); -out_gc: - sptlrpc_gc_fini(); -out: - return rc; -} - -void sptlrpc_fini(void) -{ - sptlrpc_lproc_fini(); - sptlrpc_plain_fini(); - sptlrpc_null_fini(); - sptlrpc_enc_pool_fini(); - sptlrpc_conf_fini(); - sptlrpc_gc_fini(); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c b/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c deleted file mode 100644 index 625b9520d78f6eb021bb4912424579197027ea54..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_bulk.c +++ /dev/null @@ -1,572 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/sec_bulk.c - * - * Author: Eric Mei - */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -/**************************************** - * bulk encryption page pools * - ****************************************/ - -#define POINTERS_PER_PAGE (PAGE_SIZE / sizeof(void *)) -#define PAGES_PER_POOL (POINTERS_PER_PAGE) - -#define IDLE_IDX_MAX (100) -#define IDLE_IDX_WEIGHT (3) - -#define CACHE_QUIESCENT_PERIOD (20) - -static struct ptlrpc_enc_page_pool { - /* - * constants - */ - unsigned long epp_max_pages; /* maximum pages can hold, const */ - unsigned int epp_max_pools; /* number of pools, const */ - - /* - * wait queue in case of not enough free pages. - */ - wait_queue_head_t epp_waitq; /* waiting threads */ - unsigned int epp_waitqlen; /* wait queue length */ - unsigned long epp_pages_short; /* # of pages wanted of in-q users */ - unsigned int epp_growing:1; /* during adding pages */ - - /* - * indicating how idle the pools are, from 0 to MAX_IDLE_IDX - * this is counted based on each time when getting pages from - * the pools, not based on time. which means in case that system - * is idled for a while but the idle_idx might still be low if no - * activities happened in the pools. 
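/*
 * The sptlrpc_init()/sptlrpc_fini() pair above uses the classic kernel
 * goto-unwind pattern.  A user-space sketch of the same shape (subsystem
 * names invented): each successfully initialized stage gets exactly one
 * matching teardown label, executed in reverse order on failure.
 */
#include <stdio.h>

static int init_a(void) { return 0; }
static void fini_a(void) { puts("fini a"); }
static int init_b(void) { return 0; }
static void fini_b(void) { puts("fini b"); }
static int init_c(void) { return -1; }	/* pretend stage c fails */

static int subsys_init(void)
{
	int rc;

	rc = init_a();
	if (rc)
		goto out;
	rc = init_b();
	if (rc)
		goto out_a;
	rc = init_c();
	if (rc)
		goto out_b;
	return 0;

out_b:
	fini_b();	/* unwind in strict reverse order */
out_a:
	fini_a();
out:
	return rc;
}

int main(void)
{
	printf("init rc = %d\n", subsys_init());
	return 0;
}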
- */ - unsigned long epp_idle_idx; - - /* last shrink time due to mem tight */ - time64_t epp_last_shrink; - time64_t epp_last_access; - - /* - * in-pool pages bookkeeping - */ - spinlock_t epp_lock; /* protect following fields */ - unsigned long epp_total_pages; /* total pages in pools */ - unsigned long epp_free_pages; /* current pages available */ - - /* - * statistics - */ - unsigned long epp_st_max_pages; /* # of pages ever reached */ - unsigned int epp_st_grows; /* # of grows */ - unsigned int epp_st_grow_fails; /* # of add pages failures */ - unsigned int epp_st_shrinks; /* # of shrinks */ - unsigned long epp_st_access; /* # of access */ - unsigned long epp_st_missings; /* # of cache missing */ - unsigned long epp_st_lowfree; /* lowest free pages reached */ - unsigned int epp_st_max_wqlen; /* highest waitqueue length */ - unsigned long epp_st_max_wait; /* in jiffies */ - unsigned long epp_st_outofmem; /* # of out of mem requests */ - /* - * pointers to pools - */ - struct page ***epp_pools; -} page_pools; - -/* - * /sys/kernel/debug/lustre/sptlrpc/encrypt_page_pools - */ -int sptlrpc_proc_enc_pool_seq_show(struct seq_file *m, void *v) -{ - spin_lock(&page_pools.epp_lock); - - seq_printf(m, - "physical pages: %lu\n" - "pages per pool: %lu\n" - "max pages: %lu\n" - "max pools: %u\n" - "total pages: %lu\n" - "total free: %lu\n" - "idle index: %lu/100\n" - "last shrink: %lds\n" - "last access: %lds\n" - "max pages reached: %lu\n" - "grows: %u\n" - "grows failure: %u\n" - "shrinks: %u\n" - "cache access: %lu\n" - "cache missing: %lu\n" - "low free mark: %lu\n" - "max waitqueue depth: %u\n" - "max wait time: %ld/%lu\n" - "out of mem: %lu\n", - totalram_pages, - PAGES_PER_POOL, - page_pools.epp_max_pages, - page_pools.epp_max_pools, - page_pools.epp_total_pages, - page_pools.epp_free_pages, - page_pools.epp_idle_idx, - (long)(ktime_get_seconds() - page_pools.epp_last_shrink), - (long)(ktime_get_seconds() - page_pools.epp_last_access), - page_pools.epp_st_max_pages, - page_pools.epp_st_grows, - page_pools.epp_st_grow_fails, - page_pools.epp_st_shrinks, - page_pools.epp_st_access, - page_pools.epp_st_missings, - page_pools.epp_st_lowfree, - page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, - msecs_to_jiffies(MSEC_PER_SEC), - page_pools.epp_st_outofmem); - - spin_unlock(&page_pools.epp_lock); - - return 0; -} - -static void enc_pools_release_free_pages(long npages) -{ - int p_idx, g_idx; - int p_idx_max1, p_idx_max2; - - LASSERT(npages > 0); - LASSERT(npages <= page_pools.epp_free_pages); - LASSERT(page_pools.epp_free_pages <= page_pools.epp_total_pages); - - /* max pool index before the release */ - p_idx_max2 = (page_pools.epp_total_pages - 1) / PAGES_PER_POOL; - - page_pools.epp_free_pages -= npages; - page_pools.epp_total_pages -= npages; - - /* max pool index after the release */ - p_idx_max1 = page_pools.epp_total_pages == 0 ? 
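/*
 * Tiny sketch of the two-level page bookkeeping used throughout
 * sec_bulk.c above: pages live in an array of "pools", each pool being
 * one page's worth of page pointers (512 on a 64-bit box with 4K pages),
 * and a flat free-page counter maps to a (pool, slot) pair by division
 * and modulo, exactly as in the p_idx/g_idx arithmetic above.
 */
#include <stdio.h>

#define PAGES_PER_POOL 512	/* PAGE_SIZE / sizeof(void *) */

int main(void)
{
	unsigned long free_pages = 1300;	/* next free slot, flat     */
	unsigned long p_idx = free_pages / PAGES_PER_POOL;	/* pool */
	unsigned long g_idx = free_pages % PAGES_PER_POOL;	/* slot */

	printf("next page goes to epp_pools[%lu][%lu]\n", p_idx, g_idx);
	return 0;
}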
-1 : - ((page_pools.epp_total_pages - 1) / PAGES_PER_POOL); - - p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; - g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - LASSERT(page_pools.epp_pools[p_idx]); - - while (npages--) { - LASSERT(page_pools.epp_pools[p_idx]); - LASSERT(page_pools.epp_pools[p_idx][g_idx]); - - __free_page(page_pools.epp_pools[p_idx][g_idx]); - page_pools.epp_pools[p_idx][g_idx] = NULL; - - if (++g_idx == PAGES_PER_POOL) { - p_idx++; - g_idx = 0; - } - } - - /* free unused pools */ - while (p_idx_max1 < p_idx_max2) { - LASSERT(page_pools.epp_pools[p_idx_max2]); - kfree(page_pools.epp_pools[p_idx_max2]); - page_pools.epp_pools[p_idx_max2] = NULL; - p_idx_max2--; - } -} - -/* - * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. - */ -static unsigned long enc_pools_shrink_count(struct shrinker *s, - struct shrink_control *sc) -{ - /* - * if no pool access for a long time, we consider it's fully idle. - * a little race here is fine. - */ - if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > - CACHE_QUIESCENT_PERIOD)) { - spin_lock(&page_pools.epp_lock); - page_pools.epp_idle_idx = IDLE_IDX_MAX; - spin_unlock(&page_pools.epp_lock); - } - - LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); - return max((int)page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES, 0) * - (IDLE_IDX_MAX - page_pools.epp_idle_idx) / IDLE_IDX_MAX; -} - -/* - * we try to keep at least PTLRPC_MAX_BRW_PAGES pages in the pool. - */ -static unsigned long enc_pools_shrink_scan(struct shrinker *s, - struct shrink_control *sc) -{ - spin_lock(&page_pools.epp_lock); - sc->nr_to_scan = min_t(unsigned long, sc->nr_to_scan, - page_pools.epp_free_pages - PTLRPC_MAX_BRW_PAGES); - if (sc->nr_to_scan > 0) { - enc_pools_release_free_pages(sc->nr_to_scan); - CDEBUG(D_SEC, "released %ld pages, %ld left\n", - (long)sc->nr_to_scan, page_pools.epp_free_pages); - - page_pools.epp_st_shrinks++; - page_pools.epp_last_shrink = ktime_get_seconds(); - } - spin_unlock(&page_pools.epp_lock); - - /* - * if no pool access for a long time, we consider it's fully idle. - * a little race here is fine. - */ - if (unlikely(ktime_get_seconds() - page_pools.epp_last_access > - CACHE_QUIESCENT_PERIOD)) { - spin_lock(&page_pools.epp_lock); - page_pools.epp_idle_idx = IDLE_IDX_MAX; - spin_unlock(&page_pools.epp_lock); - } - - LASSERT(page_pools.epp_idle_idx <= IDLE_IDX_MAX); - return sc->nr_to_scan; -} - -static inline -int npages_to_npools(unsigned long npages) -{ - return (int)DIV_ROUND_UP(npages, PAGES_PER_POOL); -} - -/* - * return how many pages cleaned up. 
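/*
 * User-space sketch of the shrinker arithmetic in enc_pools_shrink_count()
 * above, with toy values: the count reported to the VM is the free-page
 * excess over a reserved floor (PTLRPC_MAX_BRW_PAGES in the code), scaled
 * by the same (IDLE_IDX_MAX - idle_idx) / IDLE_IDX_MAX factor.
 */
#include <stdio.h>

#define IDLE_IDX_MAX	100
#define RESERVED	256	/* stands in for PTLRPC_MAX_BRW_PAGES */

static long shrink_count(long free_pages, long idle_idx)
{
	long excess = free_pages - RESERVED;

	if (excess < 0)
		excess = 0;	/* never shrink below the reserve */
	return excess * (IDLE_IDX_MAX - idle_idx) / IDLE_IDX_MAX;
}

int main(void)
{
	printf("idx 0  : %ld reclaimable\n", shrink_count(1000, 0));
	printf("idx 100: %ld reclaimable\n", shrink_count(1000, IDLE_IDX_MAX));
	return 0;
}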
- */ -static unsigned long enc_pools_cleanup(struct page ***pools, int npools) -{ - unsigned long cleaned = 0; - int i, j; - - for (i = 0; i < npools; i++) { - if (pools[i]) { - for (j = 0; j < PAGES_PER_POOL; j++) { - if (pools[i][j]) { - __free_page(pools[i][j]); - cleaned++; - } - } - kfree(pools[i]); - pools[i] = NULL; - } - } - - return cleaned; -} - -static inline void enc_pools_wakeup(void) -{ - assert_spin_locked(&page_pools.epp_lock); - - if (unlikely(page_pools.epp_waitqlen)) { - LASSERT(waitqueue_active(&page_pools.epp_waitq)); - wake_up_all(&page_pools.epp_waitq); - } -} - -/* - * Export the number of free pages in the pool - */ -int get_free_pages_in_pool(void) -{ - return page_pools.epp_free_pages; -} - -/* - * Let outside world know if enc_pool full capacity is reached - */ -int pool_is_at_full_capacity(void) -{ - return (page_pools.epp_total_pages == page_pools.epp_max_pages); -} - -void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc) -{ - int p_idx, g_idx; - int i; - - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - - if (!GET_ENC_KIOV(desc)) - return; - - LASSERT(desc->bd_iov_count > 0); - - spin_lock(&page_pools.epp_lock); - - p_idx = page_pools.epp_free_pages / PAGES_PER_POOL; - g_idx = page_pools.epp_free_pages % PAGES_PER_POOL; - - LASSERT(page_pools.epp_free_pages + desc->bd_iov_count <= - page_pools.epp_total_pages); - LASSERT(page_pools.epp_pools[p_idx]); - - for (i = 0; i < desc->bd_iov_count; i++) { - LASSERT(BD_GET_ENC_KIOV(desc, i).bv_page); - LASSERT(g_idx != 0 || page_pools.epp_pools[p_idx]); - LASSERT(!page_pools.epp_pools[p_idx][g_idx]); - - page_pools.epp_pools[p_idx][g_idx] = - BD_GET_ENC_KIOV(desc, i).bv_page; - - if (++g_idx == PAGES_PER_POOL) { - p_idx++; - g_idx = 0; - } - } - - page_pools.epp_free_pages += desc->bd_iov_count; - - enc_pools_wakeup(); - - spin_unlock(&page_pools.epp_lock); - - kfree(GET_ENC_KIOV(desc)); - GET_ENC_KIOV(desc) = NULL; -} - -static inline void enc_pools_alloc(void) -{ - LASSERT(page_pools.epp_max_pools); - page_pools.epp_pools = - kvzalloc(page_pools.epp_max_pools * - sizeof(*page_pools.epp_pools), - GFP_KERNEL); -} - -static inline void enc_pools_free(void) -{ - LASSERT(page_pools.epp_max_pools); - LASSERT(page_pools.epp_pools); - - kvfree(page_pools.epp_pools); -} - -static struct shrinker pools_shrinker = { - .count_objects = enc_pools_shrink_count, - .scan_objects = enc_pools_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - -int sptlrpc_enc_pool_init(void) -{ - int rc; - - /* - * maximum capacity is 1/8 of total physical memory. - * is the 1/8 a good number? 
- */ - page_pools.epp_max_pages = totalram_pages / 8; - page_pools.epp_max_pools = npages_to_npools(page_pools.epp_max_pages); - - init_waitqueue_head(&page_pools.epp_waitq); - page_pools.epp_waitqlen = 0; - page_pools.epp_pages_short = 0; - - page_pools.epp_growing = 0; - - page_pools.epp_idle_idx = 0; - page_pools.epp_last_shrink = ktime_get_seconds(); - page_pools.epp_last_access = ktime_get_seconds(); - - spin_lock_init(&page_pools.epp_lock); - page_pools.epp_total_pages = 0; - page_pools.epp_free_pages = 0; - - page_pools.epp_st_max_pages = 0; - page_pools.epp_st_grows = 0; - page_pools.epp_st_grow_fails = 0; - page_pools.epp_st_shrinks = 0; - page_pools.epp_st_access = 0; - page_pools.epp_st_missings = 0; - page_pools.epp_st_lowfree = 0; - page_pools.epp_st_max_wqlen = 0; - page_pools.epp_st_max_wait = 0; - page_pools.epp_st_outofmem = 0; - - enc_pools_alloc(); - if (!page_pools.epp_pools) - return -ENOMEM; - - rc = register_shrinker(&pools_shrinker); - if (rc) - enc_pools_free(); - - return rc; -} - -void sptlrpc_enc_pool_fini(void) -{ - unsigned long cleaned, npools; - - LASSERT(page_pools.epp_pools); - LASSERT(page_pools.epp_total_pages == page_pools.epp_free_pages); - - unregister_shrinker(&pools_shrinker); - - npools = npages_to_npools(page_pools.epp_total_pages); - cleaned = enc_pools_cleanup(page_pools.epp_pools, npools); - LASSERT(cleaned == page_pools.epp_total_pages); - - enc_pools_free(); - - if (page_pools.epp_st_access > 0) { - CDEBUG(D_SEC, - "max pages %lu, grows %u, grow fails %u, shrinks %u, access %lu, missing %lu, max qlen %u, max wait %ld/%ld, out of mem %lu\n", - page_pools.epp_st_max_pages, page_pools.epp_st_grows, - page_pools.epp_st_grow_fails, - page_pools.epp_st_shrinks, page_pools.epp_st_access, - page_pools.epp_st_missings, page_pools.epp_st_max_wqlen, - page_pools.epp_st_max_wait, - msecs_to_jiffies(MSEC_PER_SEC), - page_pools.epp_st_outofmem); - } -} - -static int cfs_hash_alg_id[] = { - [BULK_HASH_ALG_NULL] = CFS_HASH_ALG_NULL, - [BULK_HASH_ALG_ADLER32] = CFS_HASH_ALG_ADLER32, - [BULK_HASH_ALG_CRC32] = CFS_HASH_ALG_CRC32, - [BULK_HASH_ALG_MD5] = CFS_HASH_ALG_MD5, - [BULK_HASH_ALG_SHA1] = CFS_HASH_ALG_SHA1, - [BULK_HASH_ALG_SHA256] = CFS_HASH_ALG_SHA256, - [BULK_HASH_ALG_SHA384] = CFS_HASH_ALG_SHA384, - [BULK_HASH_ALG_SHA512] = CFS_HASH_ALG_SHA512, -}; - -const char *sptlrpc_get_hash_name(__u8 hash_alg) -{ - return cfs_crypto_hash_name(cfs_hash_alg_id[hash_alg]); -} - -__u8 sptlrpc_get_hash_alg(const char *algname) -{ - return cfs_crypto_hash_alg(algname); -} - -int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed) -{ - struct ptlrpc_bulk_sec_desc *bsd; - int size = msg->lm_buflens[offset]; - - bsd = lustre_msg_buf(msg, offset, sizeof(*bsd)); - if (!bsd) { - CERROR("Invalid bulk sec desc: size %d\n", size); - return -EINVAL; - } - - if (swabbed) - __swab32s(&bsd->bsd_nob); - - if (unlikely(bsd->bsd_version != 0)) { - CERROR("Unexpected version %u\n", bsd->bsd_version); - return -EPROTO; - } - - if (unlikely(bsd->bsd_type >= SPTLRPC_BULK_MAX)) { - CERROR("Invalid type %u\n", bsd->bsd_type); - return -EPROTO; - } - - /* FIXME more sanity check here */ - - if (unlikely(bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && - bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG && - bsd->bsd_svc != SPTLRPC_BULK_SVC_PRIV)) { - CERROR("Invalid svc %u\n", bsd->bsd_svc); - return -EPROTO; - } - - return 0; -} -EXPORT_SYMBOL(bulk_sec_desc_unpack); - -int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, - void *buf, int buflen) -{ - struct ahash_request 
*hdesc; - int hashsize; - unsigned int bufsize; - int i, err; - - LASSERT(alg > BULK_HASH_ALG_NULL && alg < BULK_HASH_ALG_MAX); - LASSERT(buflen >= 4); - - hdesc = cfs_crypto_hash_init(cfs_hash_alg_id[alg], NULL, 0); - if (IS_ERR(hdesc)) { - CERROR("Unable to initialize checksum hash %s\n", - cfs_crypto_hash_name(cfs_hash_alg_id[alg])); - return PTR_ERR(hdesc); - } - - hashsize = cfs_crypto_hash_digestsize(cfs_hash_alg_id[alg]); - - for (i = 0; i < desc->bd_iov_count; i++) { - cfs_crypto_hash_update_page(hdesc, - BD_GET_KIOV(desc, i).bv_page, - BD_GET_KIOV(desc, i).bv_offset & - ~PAGE_MASK, - BD_GET_KIOV(desc, i).bv_len); - } - - if (hashsize > buflen) { - unsigned char hashbuf[CFS_CRYPTO_HASH_DIGESTSIZE_MAX]; - - bufsize = sizeof(hashbuf); - LASSERTF(bufsize >= hashsize, "bufsize = %u < hashsize %u\n", - bufsize, hashsize); - err = cfs_crypto_hash_final(hdesc, hashbuf, &bufsize); - memcpy(buf, hashbuf, buflen); - } else { - bufsize = buflen; - err = cfs_crypto_hash_final(hdesc, buf, &bufsize); - } - - return err; -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c b/drivers/staging/lustre/lustre/ptlrpc/sec_config.c deleted file mode 100644 index 2389f9a8f53477b8fa1aec726f8e6eaa22acf3aa..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_config.c +++ /dev/null @@ -1,850 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
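/*
 * Sketch of the digest-truncation trick at the end of
 * sptlrpc_get_bulk_checksum() above: when the caller's buffer is smaller
 * than the algorithm's digest, the digest is computed into a max-sized
 * scratch buffer and only the first buflen bytes are copied out.  The
 * "hash" below is a toy stand-in, not a real digest.
 */
#include <stdio.h>
#include <string.h>

#define DIGEST_MAX 64

/* toy digest: repeated xor over hashsize output bytes, purely illustrative */
static void toy_hash(const void *data, size_t len,
		     unsigned char *out, size_t hashsize)
{
	const unsigned char *p = data;
	size_t i;

	memset(out, 0, hashsize);
	for (i = 0; i < len; i++)
		out[i % hashsize] ^= p[i];
}

static void checksum(const void *data, size_t len,
		     void *buf, size_t buflen, size_t hashsize)
{
	if (hashsize > buflen) {
		unsigned char scratch[DIGEST_MAX];

		toy_hash(data, len, scratch, hashsize);
		memcpy(buf, scratch, buflen);	/* truncated digest */
	} else {
		toy_hash(data, len, buf, hashsize);
	}
}

int main(void)
{
	unsigned char sum[4];

	checksum("hello", 5, sum, sizeof(sum), 20);
	printf("%02x%02x%02x%02x\n", sum[0], sum[1], sum[2], sum[3]);
	return 0;
}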
- */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd) -{ - const char *type = obd->obd_type->typ_name; - - if (!strcmp(type, LUSTRE_MDT_NAME)) - return LUSTRE_SP_MDT; - if (!strcmp(type, LUSTRE_OST_NAME)) - return LUSTRE_SP_OST; - if (!strcmp(type, LUSTRE_MGS_NAME)) - return LUSTRE_SP_MGS; - - CERROR("unknown target %p(%s)\n", obd, type); - return LUSTRE_SP_ANY; -} - -/**************************************** - * user supplied flavor string parsing * - ****************************************/ - -/* - * format: [-] - */ -int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) -{ - char buf[32]; - char *bulk, *alg; - - memset(flvr, 0, sizeof(*flvr)); - - if (!str || str[0] == '\0') { - flvr->sf_rpc = SPTLRPC_FLVR_INVALID; - return 0; - } - - strlcpy(buf, str, sizeof(buf)); - - bulk = strchr(buf, '-'); - if (bulk) - *bulk++ = '\0'; - - flvr->sf_rpc = sptlrpc_name2flavor_base(buf); - if (flvr->sf_rpc == SPTLRPC_FLVR_INVALID) - goto err_out; - - /* - * currently only base flavor "plain" can have bulk specification. - */ - if (flvr->sf_rpc == SPTLRPC_FLVR_PLAIN) { - flvr->u_bulk.hash.hash_alg = BULK_HASH_ALG_ADLER32; - if (bulk) { - /* - * format: plain-hash: - */ - alg = strchr(bulk, ':'); - if (!alg) - goto err_out; - *alg++ = '\0'; - - if (strcmp(bulk, "hash")) - goto err_out; - - flvr->u_bulk.hash.hash_alg = sptlrpc_get_hash_alg(alg); - if (flvr->u_bulk.hash.hash_alg >= BULK_HASH_ALG_MAX) - goto err_out; - } - - if (flvr->u_bulk.hash.hash_alg == BULK_HASH_ALG_NULL) - flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_NULL); - else - flvr_set_bulk_svc(&flvr->sf_rpc, SPTLRPC_BULK_SVC_INTG); - } else { - if (bulk) - goto err_out; - } - - flvr->sf_flags = 0; - return 0; - -err_out: - CERROR("invalid flavor string: %s\n", str); - return -EINVAL; -} -EXPORT_SYMBOL(sptlrpc_parse_flavor); - -/**************************************** - * configure rules * - ****************************************/ - -static void get_default_flavor(struct sptlrpc_flavor *sf) -{ - memset(sf, 0, sizeof(*sf)); - - sf->sf_rpc = SPTLRPC_FLVR_NULL; - sf->sf_flags = 0; -} - -static void sptlrpc_rule_init(struct sptlrpc_rule *rule) -{ - rule->sr_netid = LNET_NIDNET(LNET_NID_ANY); - rule->sr_from = LUSTRE_SP_ANY; - rule->sr_to = LUSTRE_SP_ANY; - rule->sr_padding = 0; - - get_default_flavor(&rule->sr_flvr); -} - -/* - * format: network[.direction]=flavor - */ -static int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) -{ - char *flavor, *dir; - int rc; - - sptlrpc_rule_init(rule); - - flavor = strchr(param, '='); - if (!flavor) { - CERROR("invalid param, no '='\n"); - return -EINVAL; - } - *flavor++ = '\0'; - - dir = strchr(param, '.'); - if (dir) - *dir++ = '\0'; - - /* 1.1 network */ - if (strcmp(param, "default")) { - rule->sr_netid = libcfs_str2net(param); - if (rule->sr_netid == LNET_NIDNET(LNET_NID_ANY)) { - CERROR("invalid network name: %s\n", param); - return -EINVAL; - } - } - - /* 1.2 direction */ - if (dir) { - if (!strcmp(dir, "mdt2ost")) { - rule->sr_from = LUSTRE_SP_MDT; - rule->sr_to = LUSTRE_SP_OST; - } else if (!strcmp(dir, "mdt2mdt")) { - rule->sr_from = LUSTRE_SP_MDT; - rule->sr_to = LUSTRE_SP_MDT; - } else if (!strcmp(dir, "cli2ost")) { - rule->sr_from = LUSTRE_SP_CLI; - rule->sr_to = LUSTRE_SP_OST; - } else if (!strcmp(dir, "cli2mdt")) { - rule->sr_from = LUSTRE_SP_CLI; - rule->sr_to = LUSTRE_SP_MDT; - 
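/*
 * Standalone sketch of the flavor-string grammar handled by
 * sptlrpc_parse_flavor() above: "<base>[-hash:<alg>]", e.g.
 * "plain-hash:sha1".  Parsing splits on '-' and ':' in place, mirroring
 * the code's strchr()-based approach; flavor validation is omitted.
 */
#include <stdio.h>
#include <string.h>

static int parse_flavor(char *str, char **base, char **alg)
{
	char *bulk;

	*base = str;
	*alg = NULL;

	bulk = strchr(str, '-');
	if (!bulk)
		return 0;		/* base flavor only */
	*bulk++ = '\0';

	*alg = strchr(bulk, ':');
	if (!*alg)
		return -1;		/* bulk part must be "hash:<alg>" */
	*(*alg)++ = '\0';

	return strcmp(bulk, "hash") ? -1 : 0;
}

int main(void)
{
	char s[] = "plain-hash:sha1";
	char *base, *alg;

	if (parse_flavor(s, &base, &alg) == 0)
		printf("base=%s alg=%s\n", base, alg ? alg : "(default)");
	return 0;
}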
} else { - CERROR("invalid rule dir segment: %s\n", dir); - return -EINVAL; - } - } - - /* 2.1 flavor */ - rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); - if (rc) - return -EINVAL; - - return 0; -} - -static void sptlrpc_rule_set_free(struct sptlrpc_rule_set *rset) -{ - LASSERT(rset->srs_nslot || - (rset->srs_nrule == 0 && !rset->srs_rules)); - - if (rset->srs_nslot) { - kfree(rset->srs_rules); - sptlrpc_rule_set_init(rset); - } -} - -/* - * return 0 if the rule set could accommodate one more rule. - */ -static int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset) -{ - struct sptlrpc_rule *rules; - int nslot; - - might_sleep(); - - if (rset->srs_nrule < rset->srs_nslot) - return 0; - - nslot = rset->srs_nslot + 8; - - /* better use realloc() if available */ - rules = kcalloc(nslot, sizeof(*rset->srs_rules), GFP_NOFS); - if (!rules) - return -ENOMEM; - - if (rset->srs_nrule) { - LASSERT(rset->srs_nslot && rset->srs_rules); - memcpy(rules, rset->srs_rules, - rset->srs_nrule * sizeof(*rset->srs_rules)); - - kfree(rset->srs_rules); - } - - rset->srs_rules = rules; - rset->srs_nslot = nslot; - return 0; -} - -static inline int rule_spec_dir(struct sptlrpc_rule *rule) -{ - return (rule->sr_from != LUSTRE_SP_ANY || - rule->sr_to != LUSTRE_SP_ANY); -} - -static inline int rule_spec_net(struct sptlrpc_rule *rule) -{ - return (rule->sr_netid != LNET_NIDNET(LNET_NID_ANY)); -} - -static inline int rule_match_dir(struct sptlrpc_rule *r1, - struct sptlrpc_rule *r2) -{ - return (r1->sr_from == r2->sr_from && r1->sr_to == r2->sr_to); -} - -static inline int rule_match_net(struct sptlrpc_rule *r1, - struct sptlrpc_rule *r2) -{ - return (r1->sr_netid == r2->sr_netid); -} - -/* - * merge @rule into @rset. - * the @rset slots might be expanded. - */ -static int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, - struct sptlrpc_rule *rule) -{ - struct sptlrpc_rule *p = rset->srs_rules; - int spec_dir, spec_net; - int rc, n, match = 0; - - might_sleep(); - - spec_net = rule_spec_net(rule); - spec_dir = rule_spec_dir(rule); - - for (n = 0; n < rset->srs_nrule; n++) { - p = &rset->srs_rules[n]; - - /* test network match, if failed: - * - spec rule: skip rules which is also spec rule match, until - * we hit a wild rule, which means no more chance - * - wild rule: skip until reach the one which is also wild - * and matches - */ - if (!rule_match_net(p, rule)) { - if (spec_net) { - if (rule_spec_net(p)) - continue; - else - break; - } else { - continue; - } - } - - /* test dir match, same logic as net matching */ - if (!rule_match_dir(p, rule)) { - if (spec_dir) { - if (rule_spec_dir(p)) - continue; - else - break; - } else { - continue; - } - } - - /* find a match */ - match = 1; - break; - } - - if (match) { - LASSERT(n >= 0 && n < rset->srs_nrule); - - if (rule->sr_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { - /* remove this rule */ - if (n < rset->srs_nrule - 1) - memmove(&rset->srs_rules[n], - &rset->srs_rules[n + 1], - (rset->srs_nrule - n - 1) * - sizeof(*rule)); - rset->srs_nrule--; - } else { - /* override the rule */ - memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); - } - } else { - LASSERT(n >= 0 && n <= rset->srs_nrule); - - if (rule->sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { - rc = sptlrpc_rule_set_expand(rset); - if (rc) - return rc; - - if (n < rset->srs_nrule) - memmove(&rset->srs_rules[n + 1], - &rset->srs_rules[n], - (rset->srs_nrule - n) * sizeof(*rule)); - memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); - rset->srs_nrule++; - } else { - CDEBUG(D_CONFIG, "ignore the unmatched 
deletion\n"); - } - } - - return 0; -} - -/** - * given from/to/nid, determine a matching flavor in ruleset. - * return 1 if a match found, otherwise return 0. - */ -static int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, - enum lustre_sec_part from, - enum lustre_sec_part to, - lnet_nid_t nid, - struct sptlrpc_flavor *sf) -{ - struct sptlrpc_rule *r; - int n; - - for (n = 0; n < rset->srs_nrule; n++) { - r = &rset->srs_rules[n]; - - if (LNET_NIDNET(nid) != LNET_NIDNET(LNET_NID_ANY) && - r->sr_netid != LNET_NIDNET(LNET_NID_ANY) && - LNET_NIDNET(nid) != r->sr_netid) - continue; - - if (from != LUSTRE_SP_ANY && r->sr_from != LUSTRE_SP_ANY && - from != r->sr_from) - continue; - - if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && - to != r->sr_to) - continue; - - *sf = r->sr_flvr; - return 1; - } - - return 0; -} - -/********************************** - * sptlrpc configuration support * - **********************************/ - -struct sptlrpc_conf_tgt { - struct list_head sct_list; - char sct_name[MAX_OBD_NAME]; - struct sptlrpc_rule_set sct_rset; -}; - -struct sptlrpc_conf { - struct list_head sc_list; - char sc_fsname[MTI_NAME_MAXLEN]; - unsigned int sc_modified; /* modified during updating */ - unsigned int sc_updated:1, /* updated copy from MGS */ - sc_local:1; /* local copy from target */ - struct sptlrpc_rule_set sc_rset; /* fs general rules */ - struct list_head sc_tgts; /* target-specific rules */ -}; - -static struct mutex sptlrpc_conf_lock; -static LIST_HEAD(sptlrpc_confs); - -static inline int is_hex(char c) -{ - return ((c >= '0' && c <= '9') || - (c >= 'a' && c <= 'f')); -} - -static void target2fsname(const char *tgt, char *fsname, int buflen) -{ - const char *ptr; - int len; - - ptr = strrchr(tgt, '-'); - if (ptr) { - if ((strncmp(ptr, "-MDT", 4) != 0 && - strncmp(ptr, "-OST", 4) != 0) || - !is_hex(ptr[4]) || !is_hex(ptr[5]) || - !is_hex(ptr[6]) || !is_hex(ptr[7])) - ptr = NULL; - } - - /* if we didn't find the pattern, treat the whole string as fsname */ - if (!ptr) - len = strlen(tgt); - else - len = ptr - tgt; - - len = min(len, buflen - 1); - memcpy(fsname, tgt, len); - fsname[len] = '\0'; -} - -static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) -{ - struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; - - sptlrpc_rule_set_free(&conf->sc_rset); - - list_for_each_entry_safe(conf_tgt, conf_tgt_next, - &conf->sc_tgts, sct_list) { - sptlrpc_rule_set_free(&conf_tgt->sct_rset); - list_del(&conf_tgt->sct_list); - kfree(conf_tgt); - } - LASSERT(list_empty(&conf->sc_tgts)); - - conf->sc_updated = 0; - conf->sc_local = 0; -} - -static void sptlrpc_conf_free(struct sptlrpc_conf *conf) -{ - CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); - - sptlrpc_conf_free_rsets(conf); - list_del(&conf->sc_list); - kfree(conf); -} - -static -struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, - const char *name, - int create) -{ - struct sptlrpc_conf_tgt *conf_tgt; - - list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { - if (strcmp(conf_tgt->sct_name, name) == 0) - return conf_tgt; - } - - if (!create) - return NULL; - - conf_tgt = kzalloc(sizeof(*conf_tgt), GFP_NOFS); - if (conf_tgt) { - strlcpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); - sptlrpc_rule_set_init(&conf_tgt->sct_rset); - list_add(&conf_tgt->sct_list, &conf->sc_tgts); - } - - return conf_tgt; -} - -static -struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, - int create) -{ - struct sptlrpc_conf *conf; - size_t len; - - list_for_each_entry(conf, 
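/*
 * Sketch of the target-name convention parsed by target2fsname() above:
 * Lustre target names look like "<fsname>-MDTxxxx" or "<fsname>-OSTxxxx"
 * with a four-digit hex index, and everything before that suffix is the
 * filesystem name.  Note the hex check here is slightly looser than the
 * code's lowercase-only is_hex().
 */
#include <stdio.h>
#include <string.h>
#include <ctype.h>

static int hex(char c)
{
	return isxdigit((unsigned char)c);
}

static void target2fsname(const char *tgt, char *fsname, size_t buflen)
{
	const char *ptr = strrchr(tgt, '-');
	size_t len;

	if (ptr && (!strncmp(ptr, "-MDT", 4) || !strncmp(ptr, "-OST", 4)) &&
	    hex(ptr[4]) && hex(ptr[5]) && hex(ptr[6]) && hex(ptr[7]))
		len = ptr - tgt;	/* strip "-MDTxxxx"/"-OSTxxxx"        */
	else
		len = strlen(tgt);	/* no suffix: whole string is fsname */

	if (len > buflen - 1)
		len = buflen - 1;
	memcpy(fsname, tgt, len);
	fsname[len] = '\0';
}

int main(void)
{
	char fs[16];

	target2fsname("lustre-OST0001", fs, sizeof(fs));
	printf("%s\n", fs);		/* -> "lustre" */
	return 0;
}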
&sptlrpc_confs, sc_list) { - if (strcmp(conf->sc_fsname, fsname) == 0) - return conf; - } - - if (!create) - return NULL; - - conf = kzalloc(sizeof(*conf), GFP_NOFS); - if (!conf) - return NULL; - - len = strlcpy(conf->sc_fsname, fsname, sizeof(conf->sc_fsname)); - if (len >= sizeof(conf->sc_fsname)) { - kfree(conf); - return NULL; - } - sptlrpc_rule_set_init(&conf->sc_rset); - INIT_LIST_HEAD(&conf->sc_tgts); - list_add(&conf->sc_list, &sptlrpc_confs); - - CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); - return conf; -} - -/** - * caller must hold conf_lock already. - */ -static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, - const char *target, - struct sptlrpc_rule *rule) -{ - struct sptlrpc_conf_tgt *conf_tgt; - struct sptlrpc_rule_set *rule_set; - - /* fsname == target means general rules for the whole fs */ - if (strcmp(conf->sc_fsname, target) == 0) { - rule_set = &conf->sc_rset; - } else { - conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); - if (conf_tgt) { - rule_set = &conf_tgt->sct_rset; - } else { - CERROR("out of memory, can't merge rule!\n"); - return -ENOMEM; - } - } - - return sptlrpc_rule_set_merge(rule_set, rule); -} - -/** - * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we - * find one through the target name in the record inside conf_lock; - * otherwise means caller already hold conf_lock. - */ -static int __sptlrpc_process_config(struct lustre_cfg *lcfg, - struct sptlrpc_conf *conf) -{ - char *target, *param; - char fsname[MTI_NAME_MAXLEN]; - struct sptlrpc_rule rule; - int rc; - - target = lustre_cfg_string(lcfg, 1); - if (!target) { - CERROR("missing target name\n"); - return -EINVAL; - } - - param = lustre_cfg_string(lcfg, 2); - if (!param) { - CERROR("missing parameter\n"); - return -EINVAL; - } - - CDEBUG(D_SEC, "processing rule: %s.%s\n", target, param); - - /* parse rule to make sure the format is correct */ - if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { - CERROR("Invalid sptlrpc parameter: %s\n", param); - return -EINVAL; - } - param += sizeof(PARAM_SRPC_FLVR) - 1; - - rc = sptlrpc_parse_rule(param, &rule); - if (rc) - return -EINVAL; - - if (!conf) { - target2fsname(target, fsname, sizeof(fsname)); - - mutex_lock(&sptlrpc_conf_lock); - conf = sptlrpc_conf_get(fsname, 0); - if (!conf) { - CERROR("can't find conf\n"); - rc = -ENOMEM; - } else { - rc = sptlrpc_conf_merge_rule(conf, target, &rule); - } - mutex_unlock(&sptlrpc_conf_lock); - } else { - LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); - rc = sptlrpc_conf_merge_rule(conf, target, &rule); - } - - if (rc == 0) - conf->sc_modified++; - - return rc; -} - -int sptlrpc_process_config(struct lustre_cfg *lcfg) -{ - return __sptlrpc_process_config(lcfg, NULL); -} -EXPORT_SYMBOL(sptlrpc_process_config); - -static int logname2fsname(const char *logname, char *buf, int buflen) -{ - char *ptr; - int len; - - ptr = strrchr(logname, '-'); - if (!ptr || strcmp(ptr, "-sptlrpc")) { - CERROR("%s is not a sptlrpc config log\n", logname); - return -EINVAL; - } - - len = min((int)(ptr - logname), buflen - 1); - - memcpy(buf, logname, len); - buf[len] = '\0'; - return 0; -} - -void sptlrpc_conf_log_update_begin(const char *logname) -{ - struct sptlrpc_conf *conf; - char fsname[16]; - - if (logname2fsname(logname, fsname, sizeof(fsname))) - return; - - mutex_lock(&sptlrpc_conf_lock); - - conf = sptlrpc_conf_get(fsname, 0); - if (conf) { - if (conf->sc_local) { - LASSERT(conf->sc_updated == 0); - sptlrpc_conf_free_rsets(conf); - } - conf->sc_modified = 0; - } - - 
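/*
 * Sketch of the parameter-prefix handling in __sptlrpc_process_config()
 * above (the prefix string below is invented for the example): config
 * parameters arrive as "<prefix><value>", the prefix is verified with
 * strncmp() and then skipped with the sizeof(literal) - 1 idiom so only
 * the value is handed to the rule parser.
 */
#include <stdio.h>
#include <string.h>

#define PARAM_PREFIX "srpc.flavor."	/* illustrative prefix */

int main(void)
{
	const char *param = "srpc.flavor.default=plain";

	if (strncmp(param, PARAM_PREFIX, sizeof(PARAM_PREFIX) - 1) != 0) {
		fprintf(stderr, "invalid parameter: %s\n", param);
		return 1;
	}
	param += sizeof(PARAM_PREFIX) - 1;	/* skip prefix, keep value */
	printf("rule body: %s\n", param);	/* -> "default=plain"      */
	return 0;
}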
mutex_unlock(&sptlrpc_conf_lock); -} -EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); - -/** - * mark a config log has been updated - */ -void sptlrpc_conf_log_update_end(const char *logname) -{ - struct sptlrpc_conf *conf; - char fsname[16]; - - if (logname2fsname(logname, fsname, sizeof(fsname))) - return; - - mutex_lock(&sptlrpc_conf_lock); - - conf = sptlrpc_conf_get(fsname, 0); - if (conf) { - /* - * if original state is not updated, make sure the - * modified counter > 0 to enforce updating local copy. - */ - if (conf->sc_updated == 0) - conf->sc_modified++; - - conf->sc_updated = 1; - } - - mutex_unlock(&sptlrpc_conf_lock); -} -EXPORT_SYMBOL(sptlrpc_conf_log_update_end); - -void sptlrpc_conf_log_start(const char *logname) -{ - char fsname[16]; - - if (logname2fsname(logname, fsname, sizeof(fsname))) - return; - - mutex_lock(&sptlrpc_conf_lock); - sptlrpc_conf_get(fsname, 1); - mutex_unlock(&sptlrpc_conf_lock); -} -EXPORT_SYMBOL(sptlrpc_conf_log_start); - -void sptlrpc_conf_log_stop(const char *logname) -{ - struct sptlrpc_conf *conf; - char fsname[16]; - - if (logname2fsname(logname, fsname, sizeof(fsname))) - return; - - mutex_lock(&sptlrpc_conf_lock); - conf = sptlrpc_conf_get(fsname, 0); - if (conf) - sptlrpc_conf_free(conf); - mutex_unlock(&sptlrpc_conf_lock); -} -EXPORT_SYMBOL(sptlrpc_conf_log_stop); - -static inline void flavor_set_flags(struct sptlrpc_flavor *sf, - enum lustre_sec_part from, - enum lustre_sec_part to, - unsigned int fl_udesc) -{ - /* - * null flavor doesn't need to set any flavor, and in fact - * we'd better not do that because everybody share a single sec. - */ - if (sf->sf_rpc == SPTLRPC_FLVR_NULL) - return; - - if (from == LUSTRE_SP_MDT) { - /* MDT->MDT; MDT->OST */ - sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; - } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { - /* CLI->OST */ - sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; - } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { - /* CLI->MDT */ - if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) - sf->sf_flags |= PTLRPC_SEC_FL_UDESC; - } -} - -void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, - enum lustre_sec_part to, - struct obd_uuid *target, - lnet_nid_t nid, - struct sptlrpc_flavor *sf) -{ - struct sptlrpc_conf *conf; - struct sptlrpc_conf_tgt *conf_tgt; - char name[MTI_NAME_MAXLEN]; - int len, rc = 0; - - target2fsname(target->uuid, name, sizeof(name)); - - mutex_lock(&sptlrpc_conf_lock); - - conf = sptlrpc_conf_get(name, 0); - if (!conf) - goto out; - - /* convert uuid name (supposed end with _UUID) to target name */ - len = strlen(target->uuid); - LASSERT(len > 5); - memcpy(name, target->uuid, len - 5); - name[len - 5] = '\0'; - - conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); - if (conf_tgt) { - rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, - from, to, nid, sf); - if (rc) - goto out; - } - - rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); -out: - mutex_unlock(&sptlrpc_conf_lock); - - if (rc == 0) - get_default_flavor(sf); - - flavor_set_flags(sf, from, to, 1); -} - -#define SEC_ADAPT_DELAY (10) - -/** - * called by client devices, notify the sptlrpc config has changed and - * do import_sec_adapt later. 
- */ -void sptlrpc_conf_client_adapt(struct obd_device *obd) -{ - struct obd_import *imp; - - LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || - strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0); - CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); - - /* serialize with connect/disconnect import */ - down_read_nested(&obd->u.cli.cl_sem, OBD_CLI_SEM_MDCOSC); - - imp = obd->u.cli.cl_import; - if (imp) { - spin_lock(&imp->imp_lock); - if (imp->imp_sec) - imp->imp_sec_expire = ktime_get_real_seconds() + - SEC_ADAPT_DELAY; - spin_unlock(&imp->imp_lock); - } - - up_read(&obd->u.cli.cl_sem); -} -EXPORT_SYMBOL(sptlrpc_conf_client_adapt); - -int sptlrpc_conf_init(void) -{ - mutex_init(&sptlrpc_conf_lock); - return 0; -} - -void sptlrpc_conf_fini(void) -{ - struct sptlrpc_conf *conf, *conf_next; - - mutex_lock(&sptlrpc_conf_lock); - list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) { - sptlrpc_conf_free(conf); - } - LASSERT(list_empty(&sptlrpc_confs)); - mutex_unlock(&sptlrpc_conf_lock); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c deleted file mode 100644 index 2c8bad7b78776b6332234eb7296ef8e0fd93d37b..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_gc.c +++ /dev/null @@ -1,190 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/ptlrpc/sec_gc.c - * - * Author: Eric Mei - */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include - -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -#define SEC_GC_INTERVAL (30 * 60) - -static struct mutex sec_gc_mutex; -static LIST_HEAD(sec_gc_list); -static spinlock_t sec_gc_list_lock; - -static LIST_HEAD(sec_gc_ctx_list); -static spinlock_t sec_gc_ctx_list_lock; - -static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); - -void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) -{ - LASSERT(sec->ps_policy->sp_cops->gc_ctx); - LASSERT(sec->ps_gc_interval > 0); - LASSERT(list_empty(&sec->ps_gc_list)); - - sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; - - spin_lock(&sec_gc_list_lock); - list_add_tail(&sec->ps_gc_list, &sec_gc_list); - spin_unlock(&sec_gc_list_lock); - - CDEBUG(D_SEC, "added sec %p(%s)\n", sec, sec->ps_policy->sp_name); -} - -void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) -{ - if (list_empty(&sec->ps_gc_list)) - return; - - might_sleep(); - - /* signal before list_del to make iteration in gc thread safe */ - atomic_inc(&sec_gc_wait_del); - - spin_lock(&sec_gc_list_lock); - list_del_init(&sec->ps_gc_list); - spin_unlock(&sec_gc_list_lock); - - /* barrier */ - mutex_lock(&sec_gc_mutex); - mutex_unlock(&sec_gc_mutex); - - atomic_dec(&sec_gc_wait_del); - - CDEBUG(D_SEC, "del sec %p(%s)\n", sec, sec->ps_policy->sp_name); -} - -static void sec_process_ctx_list(void) -{ - struct ptlrpc_cli_ctx *ctx; - - spin_lock(&sec_gc_ctx_list_lock); - - while (!list_empty(&sec_gc_ctx_list)) { - ctx = list_entry(sec_gc_ctx_list.next, - struct ptlrpc_cli_ctx, cc_gc_chain); - list_del_init(&ctx->cc_gc_chain); - spin_unlock(&sec_gc_ctx_list_lock); - - LASSERT(ctx->cc_sec); - LASSERT(atomic_read(&ctx->cc_refcount) == 1); - CDEBUG(D_SEC, "gc pick up ctx %p(%u->%s)\n", - ctx, ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec)); - sptlrpc_cli_ctx_put(ctx, 1); - - spin_lock(&sec_gc_ctx_list_lock); - } - - spin_unlock(&sec_gc_ctx_list_lock); -} - -static void sec_do_gc(struct ptlrpc_sec *sec) -{ - LASSERT(sec->ps_policy->sp_cops->gc_ctx); - - if (unlikely(sec->ps_gc_next == 0)) { - CDEBUG(D_SEC, "sec %p(%s) has 0 gc time\n", - sec, sec->ps_policy->sp_name); - return; - } - - CDEBUG(D_SEC, "check on sec %p(%s)\n", sec, sec->ps_policy->sp_name); - - if (sec->ps_gc_next > ktime_get_real_seconds()) - return; - - sec->ps_policy->sp_cops->gc_ctx(sec); - sec->ps_gc_next = ktime_get_real_seconds() + sec->ps_gc_interval; -} - -static void sec_gc_main(struct work_struct *ws); -static DECLARE_DELAYED_WORK(sec_gc_work, sec_gc_main); - -static void sec_gc_main(struct work_struct *ws) -{ - struct ptlrpc_sec *sec; - - sec_process_ctx_list(); -again: - /* go through sec list do gc. - * FIXME here we iterate through the whole list each time which - * is not optimal. we perhaps want to use balanced binary tree - * to trace each sec as order of expiry time. - * another issue here is we wakeup as fixed interval instead of - * according to each sec's expiry time - */ - mutex_lock(&sec_gc_mutex); - list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { - /* if someone is waiting to be deleted, let it - * proceed as soon as possible. 
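 * This is the other half of the empty mutex_lock/mutex_unlock pair in
 * sptlrpc_gc_del_sec() above: the walker drops sec_gc_mutex and starts
 * over, so a deleter blocked on that mutex gets through knowing the
 * walker can no longer be looking at its sec.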
- */ - if (atomic_read(&sec_gc_wait_del)) { - CDEBUG(D_SEC, "deletion pending, start over\n"); - mutex_unlock(&sec_gc_mutex); - goto again; - } - - sec_do_gc(sec); - } - mutex_unlock(&sec_gc_mutex); - - /* check ctx list again before sleep */ - sec_process_ctx_list(); - schedule_delayed_work(&sec_gc_work, SEC_GC_INTERVAL * HZ); -} - -int sptlrpc_gc_init(void) -{ - mutex_init(&sec_gc_mutex); - spin_lock_init(&sec_gc_list_lock); - spin_lock_init(&sec_gc_ctx_list_lock); - - schedule_delayed_work(&sec_gc_work, 0); - return 0; -} - -void sptlrpc_gc_fini(void) -{ - cancel_delayed_work_sync(&sec_gc_work); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c b/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c deleted file mode 100644 index 2bb75ebd5d984bd346a8a182a19d5a365370873d..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_lproc.c +++ /dev/null @@ -1,170 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/ptlrpc/sec_lproc.c - * - * Author: Eric Mei - */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -static char *sec_flags2str(unsigned long flags, char *buf, int bufsize) -{ - buf[0] = '\0'; - - if (flags & PTLRPC_SEC_FL_REVERSE) - strlcat(buf, "reverse,", bufsize); - if (flags & PTLRPC_SEC_FL_ROOTONLY) - strlcat(buf, "rootonly,", bufsize); - if (flags & PTLRPC_SEC_FL_UDESC) - strlcat(buf, "udesc,", bufsize); - if (flags & PTLRPC_SEC_FL_BULK) - strlcat(buf, "bulk,", bufsize); - if (buf[0] == '\0') - strlcat(buf, "-,", bufsize); - - return buf; -} - -static int sptlrpc_info_lprocfs_seq_show(struct seq_file *seq, void *v) -{ - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - struct ptlrpc_sec *sec = NULL; - char str[32]; - - LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || - strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || - strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); - - if (cli->cl_import) - sec = sptlrpc_import_sec_ref(cli->cl_import); - if (!sec) - goto out; - - sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str)); - - seq_printf(seq, "rpc flavor: %s\n", - sptlrpc_flavor2name_base(sec->ps_flvr.sf_rpc)); - seq_printf(seq, "bulk flavor: %s\n", - sptlrpc_flavor2name_bulk(&sec->ps_flvr, str, sizeof(str))); - seq_printf(seq, "flags: %s\n", - sec_flags2str(sec->ps_flvr.sf_flags, str, sizeof(str))); - seq_printf(seq, "id: %d\n", sec->ps_id); - seq_printf(seq, "refcount: %d\n", - atomic_read(&sec->ps_refcount)); - seq_printf(seq, "nctx: %d\n", atomic_read(&sec->ps_nctx)); - seq_printf(seq, "gc internal %ld\n", sec->ps_gc_interval); - seq_printf(seq, "gc next %lld\n", - sec->ps_gc_interval ? 
- (s64)(sec->ps_gc_next - ktime_get_real_seconds()) : 0ll); - - sptlrpc_sec_put(sec); -out: - return 0; -} - -LPROC_SEQ_FOPS_RO(sptlrpc_info_lprocfs); - -static int sptlrpc_ctxs_lprocfs_seq_show(struct seq_file *seq, void *v) -{ - struct obd_device *dev = seq->private; - struct client_obd *cli = &dev->u.cli; - struct ptlrpc_sec *sec = NULL; - - LASSERT(strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) == 0 || - strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || - strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) == 0); - - if (cli->cl_import) - sec = sptlrpc_import_sec_ref(cli->cl_import); - if (!sec) - goto out; - - if (sec->ps_policy->sp_cops->display) - sec->ps_policy->sp_cops->display(sec, seq); - - sptlrpc_sec_put(sec); -out: - return 0; -} - -LPROC_SEQ_FOPS_RO(sptlrpc_ctxs_lprocfs); - -int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev) -{ - if (strcmp(dev->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 && - strcmp(dev->obd_type->typ_name, LUSTRE_MDC_NAME) != 0 && - strcmp(dev->obd_type->typ_name, LUSTRE_MGC_NAME) != 0) { - CERROR("can't register lproc for obd type %s\n", - dev->obd_type->typ_name); - return -EINVAL; - } - - debugfs_create_file("srpc_info", 0444, dev->obd_debugfs_entry, dev, - &sptlrpc_info_lprocfs_fops); - debugfs_create_file("srpc_contexts", 0444, dev->obd_debugfs_entry, dev, - &sptlrpc_ctxs_lprocfs_fops); - - return 0; -} -EXPORT_SYMBOL(sptlrpc_lprocfs_cliobd_attach); - -LPROC_SEQ_FOPS_RO(sptlrpc_proc_enc_pool); -static struct lprocfs_vars sptlrpc_lprocfs_vars[] = { - { "encrypt_page_pools", &sptlrpc_proc_enc_pool_fops }, - { NULL } -}; - -static struct dentry *sptlrpc_debugfs_dir; - -void sptlrpc_lproc_init(void) -{ - sptlrpc_debugfs_dir = debugfs_create_dir("sptlrpc", debugfs_lustre_root); - ldebugfs_add_vars(sptlrpc_debugfs_dir, sptlrpc_lprocfs_vars, NULL); -} - -void sptlrpc_lproc_fini(void) -{ - debugfs_remove_recursive(sptlrpc_debugfs_dir); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c b/drivers/staging/lustre/lustre/ptlrpc/sec_null.c deleted file mode 100644 index ecc387d1b9b4e789525d6423bac1de5b05dd7bed..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_null.c +++ /dev/null @@ -1,459 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2012, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- * - * lustre/ptlrpc/sec_null.c - * - * Author: Eric Mei - */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include -#include -#include -#include -#include - -#include "ptlrpc_internal.h" - -static struct ptlrpc_sec_policy null_policy; -static struct ptlrpc_sec null_sec; -static struct ptlrpc_cli_ctx null_cli_ctx; -static struct ptlrpc_svc_ctx null_svc_ctx; - -/* - * we can temporarily use the topmost 8-bits of lm_secflvr to identify - * the source sec part. - */ -static inline -void null_encode_sec_part(struct lustre_msg *msg, enum lustre_sec_part sp) -{ - msg->lm_secflvr |= (((__u32)sp) & 0xFF) << 24; -} - -static inline -enum lustre_sec_part null_decode_sec_part(struct lustre_msg *msg) -{ - return (msg->lm_secflvr >> 24) & 0xFF; -} - -static int null_ctx_refresh(struct ptlrpc_cli_ctx *ctx) -{ - /* should never reach here */ - LBUG(); - return 0; -} - -static -int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) -{ - req->rq_reqbuf->lm_secflvr = SPTLRPC_FLVR_NULL; - - if (!req->rq_import->imp_dlm_fake) { - struct obd_device *obd = req->rq_import->imp_obd; - - null_encode_sec_part(req->rq_reqbuf, - obd->u.cli.cl_sp_me); - } - req->rq_reqdata_len = req->rq_reqlen; - return 0; -} - -static -int null_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) -{ - __u32 cksums, cksumc; - - LASSERT(req->rq_repdata); - - req->rq_repmsg = req->rq_repdata; - req->rq_replen = req->rq_repdata_len; - - if (req->rq_early) { - cksums = lustre_msg_get_cksum(req->rq_repdata); - cksumc = lustre_msg_calc_cksum(req->rq_repmsg); - if (cksumc != cksums) { - CDEBUG(D_SEC, - "early reply checksum mismatch: %08x != %08x\n", - cksumc, cksums); - return -EINVAL; - } - } - - return 0; -} - -static -struct ptlrpc_sec *null_create_sec(struct obd_import *imp, - struct ptlrpc_svc_ctx *svc_ctx, - struct sptlrpc_flavor *sf) -{ - LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_NULL); - - /* general layer has take a module reference for us, because we never - * really destroy the sec, simply release the reference here. 
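 * In other words: the lookup path pinned null_policy's module on our
 * behalf, but &null_sec is a static singleton with no destroy path
 * that could drop the pin later, so it is dropped right away instead.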
- */ - sptlrpc_policy_put(&null_policy); - return &null_sec; -} - -static -void null_destroy_sec(struct ptlrpc_sec *sec) -{ - LASSERT(sec == &null_sec); -} - -static -struct ptlrpc_cli_ctx *null_lookup_ctx(struct ptlrpc_sec *sec, - struct vfs_cred *vcred, - int create, int remove_dead) -{ - atomic_inc(&null_cli_ctx.cc_refcount); - return &null_cli_ctx; -} - -static -int null_flush_ctx_cache(struct ptlrpc_sec *sec, - uid_t uid, - int grace, int force) -{ - return 0; -} - -static -int null_alloc_reqbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int msgsize) -{ - if (!req->rq_reqbuf) { - int alloc_size = size_roundup_power2(msgsize); - - LASSERT(!req->rq_pool); - req->rq_reqbuf = kvzalloc(alloc_size, GFP_NOFS); - if (!req->rq_reqbuf) - return -ENOMEM; - - req->rq_reqbuf_len = alloc_size; - } else { - LASSERT(req->rq_pool); - LASSERT(req->rq_reqbuf_len >= msgsize); - memset(req->rq_reqbuf, 0, msgsize); - } - - req->rq_reqmsg = req->rq_reqbuf; - return 0; -} - -static -void null_free_reqbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req) -{ - if (!req->rq_pool) { - LASSERTF(req->rq_reqmsg == req->rq_reqbuf, - "req %p: reqmsg %p is not reqbuf %p in null sec\n", - req, req->rq_reqmsg, req->rq_reqbuf); - LASSERTF(req->rq_reqbuf_len >= req->rq_reqlen, - "req %p: reqlen %d should smaller than buflen %d\n", - req, req->rq_reqlen, req->rq_reqbuf_len); - - kvfree(req->rq_reqbuf); - req->rq_reqbuf = NULL; - req->rq_reqbuf_len = 0; - } -} - -static -int null_alloc_repbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int msgsize) -{ - /* add space for early replied */ - msgsize += lustre_msg_early_size(); - - msgsize = size_roundup_power2(msgsize); - - req->rq_repbuf = kvzalloc(msgsize, GFP_NOFS); - if (!req->rq_repbuf) - return -ENOMEM; - - req->rq_repbuf_len = msgsize; - return 0; -} - -static -void null_free_repbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req) -{ - LASSERT(req->rq_repbuf); - - kvfree(req->rq_repbuf); - req->rq_repbuf = NULL; - req->rq_repbuf_len = 0; -} - -static -int null_enlarge_reqbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int segment, int newsize) -{ - struct lustre_msg *newbuf; - struct lustre_msg *oldbuf = req->rq_reqmsg; - int oldsize, newmsg_size, alloc_size; - - LASSERT(req->rq_reqbuf); - LASSERT(req->rq_reqbuf == req->rq_reqmsg); - LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); - LASSERT(req->rq_reqlen == lustre_packed_msg_size(oldbuf)); - - /* compute new message size */ - oldsize = req->rq_reqbuf->lm_buflens[segment]; - req->rq_reqbuf->lm_buflens[segment] = newsize; - newmsg_size = lustre_packed_msg_size(oldbuf); - req->rq_reqbuf->lm_buflens[segment] = oldsize; - - /* request from pool should always have enough buffer */ - LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newmsg_size); - - if (req->rq_reqbuf_len < newmsg_size) { - alloc_size = size_roundup_power2(newmsg_size); - - newbuf = kvzalloc(alloc_size, GFP_NOFS); - if (!newbuf) - return -ENOMEM; - - /* Must lock this, so that otherwise unprotected change of - * rq_reqmsg is not racing with parallel processing of - * imp_replay_list traversing threads. 
See LU-3333 - * This is a bandaid at best, we really need to deal with this - * in request enlarging code before unpacking that's already - * there - */ - if (req->rq_import) - spin_lock(&req->rq_import->imp_lock); - memcpy(newbuf, req->rq_reqbuf, req->rq_reqlen); - - kvfree(req->rq_reqbuf); - req->rq_reqbuf = newbuf; - req->rq_reqmsg = newbuf; - req->rq_reqbuf_len = alloc_size; - - if (req->rq_import) - spin_unlock(&req->rq_import->imp_lock); - } - - _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); - req->rq_reqlen = newmsg_size; - - return 0; -} - -static struct ptlrpc_svc_ctx null_svc_ctx = { - .sc_refcount = ATOMIC_INIT(1), - .sc_policy = &null_policy, -}; - -static -int null_accept(struct ptlrpc_request *req) -{ - LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == - SPTLRPC_POLICY_NULL); - - if (req->rq_flvr.sf_rpc != SPTLRPC_FLVR_NULL) { - CERROR("Invalid rpc flavor 0x%x\n", req->rq_flvr.sf_rpc); - return SECSVC_DROP; - } - - req->rq_sp_from = null_decode_sec_part(req->rq_reqbuf); - - req->rq_reqmsg = req->rq_reqbuf; - req->rq_reqlen = req->rq_reqdata_len; - - req->rq_svc_ctx = &null_svc_ctx; - atomic_inc(&req->rq_svc_ctx->sc_refcount); - - return SECSVC_OK; -} - -static -int null_alloc_rs(struct ptlrpc_request *req, int msgsize) -{ - struct ptlrpc_reply_state *rs; - int rs_size = sizeof(*rs) + msgsize; - - LASSERT(msgsize % 8 == 0); - - rs = req->rq_reply_state; - - if (rs) { - /* pre-allocated */ - LASSERT(rs->rs_size >= rs_size); - } else { - rs = kvzalloc(rs_size, GFP_NOFS); - if (!rs) - return -ENOMEM; - - rs->rs_size = rs_size; - } - - rs->rs_svc_ctx = req->rq_svc_ctx; - atomic_inc(&req->rq_svc_ctx->sc_refcount); - - rs->rs_repbuf = (struct lustre_msg *)(rs + 1); - rs->rs_repbuf_len = rs_size - sizeof(*rs); - rs->rs_msg = rs->rs_repbuf; - - req->rq_reply_state = rs; - return 0; -} - -static -void null_free_rs(struct ptlrpc_reply_state *rs) -{ - LASSERT_ATOMIC_GT(&rs->rs_svc_ctx->sc_refcount, 1); - atomic_dec(&rs->rs_svc_ctx->sc_refcount); - - if (!rs->rs_prealloc) - kvfree(rs); -} - -static -int null_authorize(struct ptlrpc_request *req) -{ - struct ptlrpc_reply_state *rs = req->rq_reply_state; - - LASSERT(rs); - - rs->rs_repbuf->lm_secflvr = SPTLRPC_FLVR_NULL; - rs->rs_repdata_len = req->rq_replen; - - if (likely(req->rq_packed_final)) { - if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) - req->rq_reply_off = lustre_msg_early_size(); - else - req->rq_reply_off = 0; - } else { - __u32 cksum; - - cksum = lustre_msg_calc_cksum(rs->rs_repbuf); - lustre_msg_set_cksum(rs->rs_repbuf, cksum); - req->rq_reply_off = 0; - } - - return 0; -} - -static struct ptlrpc_ctx_ops null_ctx_ops = { - .refresh = null_ctx_refresh, - .sign = null_ctx_sign, - .verify = null_ctx_verify, -}; - -static struct ptlrpc_sec_cops null_sec_cops = { - .create_sec = null_create_sec, - .destroy_sec = null_destroy_sec, - .lookup_ctx = null_lookup_ctx, - .flush_ctx_cache = null_flush_ctx_cache, - .alloc_reqbuf = null_alloc_reqbuf, - .alloc_repbuf = null_alloc_repbuf, - .free_reqbuf = null_free_reqbuf, - .free_repbuf = null_free_repbuf, - .enlarge_reqbuf = null_enlarge_reqbuf, -}; - -static struct ptlrpc_sec_sops null_sec_sops = { - .accept = null_accept, - .alloc_rs = null_alloc_rs, - .authorize = null_authorize, - .free_rs = null_free_rs, -}; - -static struct ptlrpc_sec_policy null_policy = { - .sp_owner = THIS_MODULE, - .sp_name = "sec.null", - .sp_policy = SPTLRPC_POLICY_NULL, - .sp_cops = &null_sec_cops, - .sp_sops = &null_sec_sops, -}; - -static void null_init_internal(void) -{ 
- static HLIST_HEAD(__list); - - null_sec.ps_policy = &null_policy; - atomic_set(&null_sec.ps_refcount, 1); /* always busy */ - null_sec.ps_id = -1; - null_sec.ps_import = NULL; - null_sec.ps_flvr.sf_rpc = SPTLRPC_FLVR_NULL; - null_sec.ps_flvr.sf_flags = 0; - null_sec.ps_part = LUSTRE_SP_ANY; - null_sec.ps_dying = 0; - spin_lock_init(&null_sec.ps_lock); - atomic_set(&null_sec.ps_nctx, 1); /* for "null_cli_ctx" */ - INIT_LIST_HEAD(&null_sec.ps_gc_list); - null_sec.ps_gc_interval = 0; - null_sec.ps_gc_next = 0; - - hlist_add_head(&null_cli_ctx.cc_cache, &__list); - atomic_set(&null_cli_ctx.cc_refcount, 1); /* for hash */ - null_cli_ctx.cc_sec = &null_sec; - null_cli_ctx.cc_ops = &null_ctx_ops; - null_cli_ctx.cc_expire = 0; - null_cli_ctx.cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_ETERNAL | - PTLRPC_CTX_UPTODATE; - null_cli_ctx.cc_vcred.vc_uid = 0; - spin_lock_init(&null_cli_ctx.cc_lock); - INIT_LIST_HEAD(&null_cli_ctx.cc_req_list); - INIT_LIST_HEAD(&null_cli_ctx.cc_gc_chain); -} - -int sptlrpc_null_init(void) -{ - int rc; - - null_init_internal(); - - rc = sptlrpc_register_policy(&null_policy); - if (rc) - CERROR("failed to register %s: %d\n", null_policy.sp_name, rc); - - return rc; -} - -void sptlrpc_null_fini(void) -{ - int rc; - - rc = sptlrpc_unregister_policy(&null_policy); - if (rc) - CERROR("failed to unregister %s: %d\n", - null_policy.sp_name, rc); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c b/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c deleted file mode 100644 index ec3d9af76b17a89befff4a37d3c54f1978b1eba8..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/sec_plain.c +++ /dev/null @@ -1,1023 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/ptlrpc/sec_plain.c - * - * Author: Eric Mei - */ - -#define DEBUG_SUBSYSTEM S_SEC - -#include -#include -#include -#include -#include -#include "ptlrpc_internal.h" - -struct plain_sec { - struct ptlrpc_sec pls_base; - rwlock_t pls_lock; - struct ptlrpc_cli_ctx *pls_ctx; -}; - -static inline struct plain_sec *sec2plsec(struct ptlrpc_sec *sec) -{ - return container_of(sec, struct plain_sec, pls_base); -} - -static struct ptlrpc_sec_policy plain_policy; -static struct ptlrpc_ctx_ops plain_ctx_ops; -static struct ptlrpc_svc_ctx plain_svc_ctx; - -static unsigned int plain_at_offset; - -/* - * for simplicity, plain policy rpc use fixed layout. 
- */ -#define PLAIN_PACK_SEGMENTS (4) - -#define PLAIN_PACK_HDR_OFF (0) -#define PLAIN_PACK_MSG_OFF (1) -#define PLAIN_PACK_USER_OFF (2) -#define PLAIN_PACK_BULK_OFF (3) - -#define PLAIN_FL_USER (0x01) -#define PLAIN_FL_BULK (0x02) - -struct plain_header { - __u8 ph_ver; /* 0 */ - __u8 ph_flags; - __u8 ph_sp; /* source */ - __u8 ph_bulk_hash_alg; /* complete flavor desc */ - __u8 ph_pad[4]; -}; - -struct plain_bulk_token { - __u8 pbt_hash[8]; -}; - -#define PLAIN_BSD_SIZE \ - (sizeof(struct ptlrpc_bulk_sec_desc) + sizeof(struct plain_bulk_token)) - -/**************************************** - * bulk checksum helpers * - ****************************************/ - -static int plain_unpack_bsd(struct lustre_msg *msg, int swabbed) -{ - struct ptlrpc_bulk_sec_desc *bsd; - - if (bulk_sec_desc_unpack(msg, PLAIN_PACK_BULK_OFF, swabbed)) - return -EPROTO; - - bsd = lustre_msg_buf(msg, PLAIN_PACK_BULK_OFF, PLAIN_BSD_SIZE); - if (!bsd) { - CERROR("bulk sec desc has short size %d\n", - lustre_msg_buflen(msg, PLAIN_PACK_BULK_OFF)); - return -EPROTO; - } - - if (bsd->bsd_svc != SPTLRPC_BULK_SVC_NULL && - bsd->bsd_svc != SPTLRPC_BULK_SVC_INTG) { - CERROR("invalid bulk svc %u\n", bsd->bsd_svc); - return -EPROTO; - } - - return 0; -} - -static int plain_generate_bulk_csum(struct ptlrpc_bulk_desc *desc, - __u8 hash_alg, - struct plain_bulk_token *token) -{ - if (hash_alg == BULK_HASH_ALG_NULL) - return 0; - - memset(token->pbt_hash, 0, sizeof(token->pbt_hash)); - return sptlrpc_get_bulk_checksum(desc, hash_alg, token->pbt_hash, - sizeof(token->pbt_hash)); -} - -static int plain_verify_bulk_csum(struct ptlrpc_bulk_desc *desc, - __u8 hash_alg, - struct plain_bulk_token *tokenr) -{ - struct plain_bulk_token tokenv; - int rc; - - if (hash_alg == BULK_HASH_ALG_NULL) - return 0; - - memset(&tokenv.pbt_hash, 0, sizeof(tokenv.pbt_hash)); - rc = sptlrpc_get_bulk_checksum(desc, hash_alg, tokenv.pbt_hash, - sizeof(tokenv.pbt_hash)); - if (rc) - return rc; - - if (memcmp(tokenr->pbt_hash, tokenv.pbt_hash, sizeof(tokenr->pbt_hash))) - return -EACCES; - return 0; -} - -static void corrupt_bulk_data(struct ptlrpc_bulk_desc *desc) -{ - char *ptr; - unsigned int off, i; - - LASSERT(ptlrpc_is_bulk_desc_kiov(desc->bd_type)); - - for (i = 0; i < desc->bd_iov_count; i++) { - if (!BD_GET_KIOV(desc, i).bv_len) - continue; - - ptr = kmap(BD_GET_KIOV(desc, i).bv_page); - off = BD_GET_KIOV(desc, i).bv_offset & ~PAGE_MASK; - ptr[off] ^= 0x1; - kunmap(BD_GET_KIOV(desc, i).bv_page); - return; - } -} - -/**************************************** - * cli_ctx apis * - ****************************************/ - -static -int plain_ctx_refresh(struct ptlrpc_cli_ctx *ctx) -{ - /* should never reach here */ - LBUG(); - return 0; -} - -static -int plain_ctx_validate(struct ptlrpc_cli_ctx *ctx) -{ - return 0; -} - -static -int plain_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) -{ - struct lustre_msg *msg = req->rq_reqbuf; - struct plain_header *phdr; - - msg->lm_secflvr = req->rq_flvr.sf_rpc; - - phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); - phdr->ph_ver = 0; - phdr->ph_flags = 0; - phdr->ph_sp = ctx->cc_sec->ps_part; - phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; - - if (req->rq_pack_udesc) - phdr->ph_flags |= PLAIN_FL_USER; - if (req->rq_pack_bulk) - phdr->ph_flags |= PLAIN_FL_BULK; - - req->rq_reqdata_len = lustre_msg_size_v2(msg->lm_bufcount, - msg->lm_buflens); - return 0; -} - -static -int plain_ctx_verify(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) -{ - struct lustre_msg *msg = 
req->rq_repdata; - struct plain_header *phdr; - __u32 cksum; - int swabbed; - - if (msg->lm_bufcount != PLAIN_PACK_SEGMENTS) { - CERROR("unexpected reply buf count %u\n", msg->lm_bufcount); - return -EPROTO; - } - - swabbed = ptlrpc_rep_need_swab(req); - - phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); - if (!phdr) { - CERROR("missing plain header\n"); - return -EPROTO; - } - - if (phdr->ph_ver != 0) { - CERROR("Invalid header version\n"); - return -EPROTO; - } - - /* expect no user desc in reply */ - if (phdr->ph_flags & PLAIN_FL_USER) { - CERROR("Unexpected udesc flag in reply\n"); - return -EPROTO; - } - - if (phdr->ph_bulk_hash_alg != req->rq_flvr.u_bulk.hash.hash_alg) { - CERROR("reply bulk flavor %u != %u\n", phdr->ph_bulk_hash_alg, - req->rq_flvr.u_bulk.hash.hash_alg); - return -EPROTO; - } - - if (unlikely(req->rq_early)) { - unsigned int hsize = 4; - - cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, - lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, - 0), - lustre_msg_buflen(msg, - PLAIN_PACK_MSG_OFF), - NULL, 0, (unsigned char *)&cksum, - &hsize); - if (cksum != msg->lm_cksum) { - CDEBUG(D_SEC, - "early reply checksum mismatch: %08x != %08x\n", - cpu_to_le32(cksum), msg->lm_cksum); - return -EINVAL; - } - } else { - /* whether we sent with bulk or not, we expect the same - * in reply, except for early reply - */ - if (!req->rq_early && - !equi(req->rq_pack_bulk == 1, - phdr->ph_flags & PLAIN_FL_BULK)) { - CERROR("%s bulk checksum in reply\n", - req->rq_pack_bulk ? "Missing" : "Unexpected"); - return -EPROTO; - } - - if (phdr->ph_flags & PLAIN_FL_BULK) { - if (plain_unpack_bsd(msg, swabbed)) - return -EPROTO; - } - } - - req->rq_repmsg = lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); - req->rq_replen = lustre_msg_buflen(msg, PLAIN_PACK_MSG_OFF); - return 0; -} - -static -int plain_cli_wrap_bulk(struct ptlrpc_cli_ctx *ctx, - struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc) -{ - struct ptlrpc_bulk_sec_desc *bsd; - struct plain_bulk_token *token; - int rc; - - LASSERT(req->rq_pack_bulk); - LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); - - bsd = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); - token = (struct plain_bulk_token *)bsd->bsd_data; - - bsd->bsd_version = 0; - bsd->bsd_flags = 0; - bsd->bsd_type = SPTLRPC_BULK_DEFAULT; - bsd->bsd_svc = SPTLRPC_FLVR_BULK_SVC(req->rq_flvr.sf_rpc); - - if (bsd->bsd_svc == SPTLRPC_BULK_SVC_NULL) - return 0; - - if (req->rq_bulk_read) - return 0; - - rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, - token); - if (rc) { - CERROR("bulk write: failed to compute checksum: %d\n", rc); - } else { - /* - * for sending we only compute the wrong checksum instead - * of corrupting the data so it is still correct on a redo - */ - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_SEND) && - req->rq_flvr.u_bulk.hash.hash_alg != BULK_HASH_ALG_NULL) - token->pbt_hash[0] ^= 0x1; - } - - return rc; -} - -static -int plain_cli_unwrap_bulk(struct ptlrpc_cli_ctx *ctx, - struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc) -{ - struct ptlrpc_bulk_sec_desc *bsdv; - struct plain_bulk_token *tokenv; - int rc; - int i, nob; - - LASSERT(req->rq_pack_bulk); - LASSERT(req->rq_reqbuf->lm_bufcount == PLAIN_PACK_SEGMENTS); - LASSERT(req->rq_repdata->lm_bufcount == PLAIN_PACK_SEGMENTS); - - bsdv = lustre_msg_buf(req->rq_repdata, PLAIN_PACK_BULK_OFF, 0); - tokenv = (struct plain_bulk_token *)bsdv->bsd_data; - - if (req->rq_bulk_write) { - if (bsdv->bsd_flags & BSD_FL_ERR) - return -EIO; - return 0; - } - - /* fix the 
actual data size */ - for (i = 0, nob = 0; i < desc->bd_iov_count; i++) { - struct bio_vec bv_desc = BD_GET_KIOV(desc, i); - - if (bv_desc.bv_len + nob > desc->bd_nob_transferred) - bv_desc.bv_len = desc->bd_nob_transferred - nob; - nob += bv_desc.bv_len; - } - - rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, - tokenv); - if (rc) - CERROR("bulk read: client verify failed: %d\n", rc); - - return rc; -} - -/**************************************** - * sec apis * - ****************************************/ - -static -struct ptlrpc_cli_ctx *plain_sec_install_ctx(struct plain_sec *plsec) -{ - struct ptlrpc_cli_ctx *ctx, *ctx_new; - - ctx_new = kzalloc(sizeof(*ctx_new), GFP_NOFS); - - write_lock(&plsec->pls_lock); - - ctx = plsec->pls_ctx; - if (ctx) { - atomic_inc(&ctx->cc_refcount); - - kfree(ctx_new); - } else if (ctx_new) { - ctx = ctx_new; - - atomic_set(&ctx->cc_refcount, 1); /* for cache */ - ctx->cc_sec = &plsec->pls_base; - ctx->cc_ops = &plain_ctx_ops; - ctx->cc_expire = 0; - ctx->cc_flags = PTLRPC_CTX_CACHED | PTLRPC_CTX_UPTODATE; - ctx->cc_vcred.vc_uid = 0; - spin_lock_init(&ctx->cc_lock); - INIT_LIST_HEAD(&ctx->cc_req_list); - INIT_LIST_HEAD(&ctx->cc_gc_chain); - - plsec->pls_ctx = ctx; - atomic_inc(&plsec->pls_base.ps_nctx); - atomic_inc(&plsec->pls_base.ps_refcount); - - atomic_inc(&ctx->cc_refcount); /* for caller */ - } - - write_unlock(&plsec->pls_lock); - - return ctx; -} - -static -void plain_destroy_sec(struct ptlrpc_sec *sec) -{ - struct plain_sec *plsec = sec2plsec(sec); - - LASSERT(sec->ps_policy == &plain_policy); - LASSERT(sec->ps_import); - LASSERT(atomic_read(&sec->ps_refcount) == 0); - LASSERT(atomic_read(&sec->ps_nctx) == 0); - LASSERT(!plsec->pls_ctx); - - class_import_put(sec->ps_import); - - kfree(plsec); -} - -static -void plain_kill_sec(struct ptlrpc_sec *sec) -{ - sec->ps_dying = 1; -} - -static -struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, - struct ptlrpc_svc_ctx *svc_ctx, - struct sptlrpc_flavor *sf) -{ - struct plain_sec *plsec; - struct ptlrpc_sec *sec; - struct ptlrpc_cli_ctx *ctx; - - LASSERT(SPTLRPC_FLVR_POLICY(sf->sf_rpc) == SPTLRPC_POLICY_PLAIN); - - plsec = kzalloc(sizeof(*plsec), GFP_NOFS); - if (!plsec) - return NULL; - - /* - * initialize plain_sec - */ - rwlock_init(&plsec->pls_lock); - plsec->pls_ctx = NULL; - - sec = &plsec->pls_base; - sec->ps_policy = &plain_policy; - atomic_set(&sec->ps_refcount, 0); - atomic_set(&sec->ps_nctx, 0); - sec->ps_id = sptlrpc_get_next_secid(); - sec->ps_import = class_import_get(imp); - sec->ps_flvr = *sf; - spin_lock_init(&sec->ps_lock); - INIT_LIST_HEAD(&sec->ps_gc_list); - sec->ps_gc_interval = 0; - sec->ps_gc_next = 0; - - /* install ctx immediately if this is a reverse sec */ - if (svc_ctx) { - ctx = plain_sec_install_ctx(plsec); - if (!ctx) { - plain_destroy_sec(sec); - return NULL; - } - sptlrpc_cli_ctx_put(ctx, 1); - } - - return sec; -} - -static -struct ptlrpc_cli_ctx *plain_lookup_ctx(struct ptlrpc_sec *sec, - struct vfs_cred *vcred, - int create, int remove_dead) -{ - struct plain_sec *plsec = sec2plsec(sec); - struct ptlrpc_cli_ctx *ctx; - - read_lock(&plsec->pls_lock); - ctx = plsec->pls_ctx; - if (ctx) - atomic_inc(&ctx->cc_refcount); - read_unlock(&plsec->pls_lock); - - if (unlikely(!ctx)) - ctx = plain_sec_install_ctx(plsec); - - return ctx; -} - -static -void plain_release_ctx(struct ptlrpc_sec *sec, - struct ptlrpc_cli_ctx *ctx, int sync) -{ - LASSERT(atomic_read(&sec->ps_refcount) > 0); - LASSERT(atomic_read(&sec->ps_nctx) > 0); - 
LASSERT(atomic_read(&ctx->cc_refcount) == 0); - LASSERT(ctx->cc_sec == sec); - - kfree(ctx); - - atomic_dec(&sec->ps_nctx); - sptlrpc_sec_put(sec); -} - -static -int plain_flush_ctx_cache(struct ptlrpc_sec *sec, - uid_t uid, int grace, int force) -{ - struct plain_sec *plsec = sec2plsec(sec); - struct ptlrpc_cli_ctx *ctx; - - /* do nothing unless caller want to flush for 'all' */ - if (uid != -1) - return 0; - - write_lock(&plsec->pls_lock); - ctx = plsec->pls_ctx; - plsec->pls_ctx = NULL; - write_unlock(&plsec->pls_lock); - - if (ctx) - sptlrpc_cli_ctx_put(ctx, 1); - return 0; -} - -static -int plain_alloc_reqbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int msgsize) -{ - __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; - int alloc_len; - - buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); - buflens[PLAIN_PACK_MSG_OFF] = msgsize; - - if (req->rq_pack_udesc) - buflens[PLAIN_PACK_USER_OFF] = sptlrpc_current_user_desc_size(); - - if (req->rq_pack_bulk) { - LASSERT(req->rq_bulk_read || req->rq_bulk_write); - buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; - } - - alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); - - if (!req->rq_reqbuf) { - LASSERT(!req->rq_pool); - - alloc_len = size_roundup_power2(alloc_len); - req->rq_reqbuf = kvzalloc(alloc_len, GFP_NOFS); - if (!req->rq_reqbuf) - return -ENOMEM; - - req->rq_reqbuf_len = alloc_len; - } else { - LASSERT(req->rq_pool); - LASSERT(req->rq_reqbuf_len >= alloc_len); - memset(req->rq_reqbuf, 0, alloc_len); - } - - lustre_init_msg_v2(req->rq_reqbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); - req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0); - - if (req->rq_pack_udesc) { - int rc = sptlrpc_pack_user_desc(req->rq_reqbuf, - PLAIN_PACK_USER_OFF); - if (rc < 0) - return rc; - } - - return 0; -} - -static -void plain_free_reqbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req) -{ - if (!req->rq_pool) { - kvfree(req->rq_reqbuf); - req->rq_reqbuf = NULL; - req->rq_reqbuf_len = 0; - } -} - -static -int plain_alloc_repbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int msgsize) -{ - __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; - int alloc_len; - - buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); - buflens[PLAIN_PACK_MSG_OFF] = msgsize; - - if (req->rq_pack_bulk) { - LASSERT(req->rq_bulk_read || req->rq_bulk_write); - buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; - } - - alloc_len = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); - - /* add space for early reply */ - alloc_len += plain_at_offset; - - alloc_len = size_roundup_power2(alloc_len); - - req->rq_repbuf = kvzalloc(alloc_len, GFP_NOFS); - if (!req->rq_repbuf) - return -ENOMEM; - - req->rq_repbuf_len = alloc_len; - return 0; -} - -static -void plain_free_repbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req) -{ - kvfree(req->rq_repbuf); - req->rq_repbuf = NULL; - req->rq_repbuf_len = 0; -} - -static -int plain_enlarge_reqbuf(struct ptlrpc_sec *sec, - struct ptlrpc_request *req, - int segment, int newsize) -{ - struct lustre_msg *newbuf; - int oldsize; - int newmsg_size, newbuf_size; - - LASSERT(req->rq_reqbuf); - LASSERT(req->rq_reqbuf_len >= req->rq_reqlen); - LASSERT(lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, 0) == - req->rq_reqmsg); - - /* compute new embedded msg size. 
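 * Both computations below use the same trick: temporarily write the
 * new length into lm_buflens, let lustre_msg_size_v2() total it up,
 * then restore the old length, first for the embedded message and then
 * once more one level up for the wrapper buffer.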
*/ - oldsize = req->rq_reqmsg->lm_buflens[segment]; - req->rq_reqmsg->lm_buflens[segment] = newsize; - newmsg_size = lustre_msg_size_v2(req->rq_reqmsg->lm_bufcount, - req->rq_reqmsg->lm_buflens); - req->rq_reqmsg->lm_buflens[segment] = oldsize; - - /* compute new wrapper msg size. */ - oldsize = req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF]; - req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = newmsg_size; - newbuf_size = lustre_msg_size_v2(req->rq_reqbuf->lm_bufcount, - req->rq_reqbuf->lm_buflens); - req->rq_reqbuf->lm_buflens[PLAIN_PACK_MSG_OFF] = oldsize; - - /* request from pool should always have enough buffer */ - LASSERT(!req->rq_pool || req->rq_reqbuf_len >= newbuf_size); - - if (req->rq_reqbuf_len < newbuf_size) { - newbuf_size = size_roundup_power2(newbuf_size); - - newbuf = kvzalloc(newbuf_size, GFP_NOFS); - if (!newbuf) - return -ENOMEM; - - /* Must lock this, so that otherwise unprotected change of - * rq_reqmsg is not racing with parallel processing of - * imp_replay_list traversing threads. See LU-3333 - * This is a bandaid at best, we really need to deal with this - * in request enlarging code before unpacking that's already - * there - */ - if (req->rq_import) - spin_lock(&req->rq_import->imp_lock); - - memcpy(newbuf, req->rq_reqbuf, req->rq_reqbuf_len); - - kvfree(req->rq_reqbuf); - req->rq_reqbuf = newbuf; - req->rq_reqbuf_len = newbuf_size; - req->rq_reqmsg = lustre_msg_buf(req->rq_reqbuf, - PLAIN_PACK_MSG_OFF, 0); - - if (req->rq_import) - spin_unlock(&req->rq_import->imp_lock); - } - - _sptlrpc_enlarge_msg_inplace(req->rq_reqbuf, PLAIN_PACK_MSG_OFF, - newmsg_size); - _sptlrpc_enlarge_msg_inplace(req->rq_reqmsg, segment, newsize); - - req->rq_reqlen = newmsg_size; - return 0; -} - -/**************************************** - * service apis * - ****************************************/ - -static struct ptlrpc_svc_ctx plain_svc_ctx = { - .sc_refcount = ATOMIC_INIT(1), - .sc_policy = &plain_policy, -}; - -static -int plain_accept(struct ptlrpc_request *req) -{ - struct lustre_msg *msg = req->rq_reqbuf; - struct plain_header *phdr; - int swabbed; - - LASSERT(SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) == - SPTLRPC_POLICY_PLAIN); - - if (SPTLRPC_FLVR_BASE(req->rq_flvr.sf_rpc) != - SPTLRPC_FLVR_BASE(SPTLRPC_FLVR_PLAIN) || - SPTLRPC_FLVR_BULK_TYPE(req->rq_flvr.sf_rpc) != - SPTLRPC_FLVR_BULK_TYPE(SPTLRPC_FLVR_PLAIN)) { - CERROR("Invalid rpc flavor %x\n", req->rq_flvr.sf_rpc); - return SECSVC_DROP; - } - - if (msg->lm_bufcount < PLAIN_PACK_SEGMENTS) { - CERROR("unexpected request buf count %u\n", msg->lm_bufcount); - return SECSVC_DROP; - } - - swabbed = ptlrpc_req_need_swab(req); - - phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, sizeof(*phdr)); - if (!phdr) { - CERROR("missing plain header\n"); - return -EPROTO; - } - - if (phdr->ph_ver != 0) { - CERROR("Invalid header version\n"); - return -EPROTO; - } - - if (phdr->ph_bulk_hash_alg >= BULK_HASH_ALG_MAX) { - CERROR("invalid hash algorithm: %u\n", phdr->ph_bulk_hash_alg); - return -EPROTO; - } - - req->rq_sp_from = phdr->ph_sp; - req->rq_flvr.u_bulk.hash.hash_alg = phdr->ph_bulk_hash_alg; - - if (phdr->ph_flags & PLAIN_FL_USER) { - if (sptlrpc_unpack_user_desc(msg, PLAIN_PACK_USER_OFF, - swabbed)) { - CERROR("Mal-formed user descriptor\n"); - return SECSVC_DROP; - } - - req->rq_pack_udesc = 1; - req->rq_user_desc = lustre_msg_buf(msg, PLAIN_PACK_USER_OFF, 0); - } - - if (phdr->ph_flags & PLAIN_FL_BULK) { - if (plain_unpack_bsd(msg, swabbed)) - return SECSVC_DROP; - - req->rq_pack_bulk = 1; - } - - req->rq_reqmsg = 
lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, 0); - req->rq_reqlen = msg->lm_buflens[PLAIN_PACK_MSG_OFF]; - - req->rq_svc_ctx = &plain_svc_ctx; - atomic_inc(&req->rq_svc_ctx->sc_refcount); - - return SECSVC_OK; -} - -static -int plain_alloc_rs(struct ptlrpc_request *req, int msgsize) -{ - struct ptlrpc_reply_state *rs; - __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; - int rs_size = sizeof(*rs); - - LASSERT(msgsize % 8 == 0); - - buflens[PLAIN_PACK_HDR_OFF] = sizeof(struct plain_header); - buflens[PLAIN_PACK_MSG_OFF] = msgsize; - - if (req->rq_pack_bulk && (req->rq_bulk_read || req->rq_bulk_write)) - buflens[PLAIN_PACK_BULK_OFF] = PLAIN_BSD_SIZE; - - rs_size += lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); - - rs = req->rq_reply_state; - - if (rs) { - /* pre-allocated */ - LASSERT(rs->rs_size >= rs_size); - } else { - rs = kvzalloc(rs_size, GFP_NOFS); - if (!rs) - return -ENOMEM; - - rs->rs_size = rs_size; - } - - rs->rs_svc_ctx = req->rq_svc_ctx; - atomic_inc(&req->rq_svc_ctx->sc_refcount); - rs->rs_repbuf = (struct lustre_msg *)(rs + 1); - rs->rs_repbuf_len = rs_size - sizeof(*rs); - - lustre_init_msg_v2(rs->rs_repbuf, PLAIN_PACK_SEGMENTS, buflens, NULL); - rs->rs_msg = lustre_msg_buf_v2(rs->rs_repbuf, PLAIN_PACK_MSG_OFF, 0); - - req->rq_reply_state = rs; - return 0; -} - -static -void plain_free_rs(struct ptlrpc_reply_state *rs) -{ - LASSERT(atomic_read(&rs->rs_svc_ctx->sc_refcount) > 1); - atomic_dec(&rs->rs_svc_ctx->sc_refcount); - - if (!rs->rs_prealloc) - kvfree(rs); -} - -static -int plain_authorize(struct ptlrpc_request *req) -{ - struct ptlrpc_reply_state *rs = req->rq_reply_state; - struct lustre_msg_v2 *msg = rs->rs_repbuf; - struct plain_header *phdr; - int len; - - LASSERT(rs); - LASSERT(msg); - - if (req->rq_replen != msg->lm_buflens[PLAIN_PACK_MSG_OFF]) - len = lustre_shrink_msg(msg, PLAIN_PACK_MSG_OFF, - req->rq_replen, 1); - else - len = lustre_msg_size_v2(msg->lm_bufcount, msg->lm_buflens); - - msg->lm_secflvr = req->rq_flvr.sf_rpc; - - phdr = lustre_msg_buf(msg, PLAIN_PACK_HDR_OFF, 0); - phdr->ph_ver = 0; - phdr->ph_flags = 0; - phdr->ph_bulk_hash_alg = req->rq_flvr.u_bulk.hash.hash_alg; - - if (req->rq_pack_bulk) - phdr->ph_flags |= PLAIN_FL_BULK; - - rs->rs_repdata_len = len; - - if (likely(req->rq_packed_final)) { - if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) - req->rq_reply_off = plain_at_offset; - else - req->rq_reply_off = 0; - } else { - unsigned int hsize = 4; - - cfs_crypto_hash_digest(CFS_HASH_ALG_CRC32, - lustre_msg_buf(msg, PLAIN_PACK_MSG_OFF, - 0), - lustre_msg_buflen(msg, - PLAIN_PACK_MSG_OFF), - NULL, 0, (unsigned char *)&msg->lm_cksum, - &hsize); - req->rq_reply_off = 0; - } - - return 0; -} - -static -int plain_svc_unwrap_bulk(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc) -{ - struct ptlrpc_reply_state *rs = req->rq_reply_state; - struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; - struct plain_bulk_token *tokenr; - int rc; - - LASSERT(req->rq_bulk_write); - LASSERT(req->rq_pack_bulk); - - bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); - tokenr = (struct plain_bulk_token *)bsdr->bsd_data; - bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); - - bsdv->bsd_version = 0; - bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; - bsdv->bsd_svc = bsdr->bsd_svc; - bsdv->bsd_flags = 0; - - if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) - return 0; - - rc = plain_verify_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, - tokenr); - if (rc) { - bsdv->bsd_flags |= BSD_FL_ERR; - CERROR("bulk write: server verify failed: %d\n", rc); 
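		/*
		 * (BSD_FL_ERR was latched into the reply's bulk descriptor
		 *  above, so the client side, in plain_cli_unwrap_bulk(),
		 *  reports -EIO for this write instead of trusting the
		 *  transferred data.)
		 */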
- } - - return rc; -} - -static -int plain_svc_wrap_bulk(struct ptlrpc_request *req, - struct ptlrpc_bulk_desc *desc) -{ - struct ptlrpc_reply_state *rs = req->rq_reply_state; - struct ptlrpc_bulk_sec_desc *bsdr, *bsdv; - struct plain_bulk_token *tokenv; - int rc; - - LASSERT(req->rq_bulk_read); - LASSERT(req->rq_pack_bulk); - - bsdr = lustre_msg_buf(req->rq_reqbuf, PLAIN_PACK_BULK_OFF, 0); - bsdv = lustre_msg_buf(rs->rs_repbuf, PLAIN_PACK_BULK_OFF, 0); - tokenv = (struct plain_bulk_token *)bsdv->bsd_data; - - bsdv->bsd_version = 0; - bsdv->bsd_type = SPTLRPC_BULK_DEFAULT; - bsdv->bsd_svc = bsdr->bsd_svc; - bsdv->bsd_flags = 0; - - if (bsdr->bsd_svc == SPTLRPC_BULK_SVC_NULL) - return 0; - - rc = plain_generate_bulk_csum(desc, req->rq_flvr.u_bulk.hash.hash_alg, - tokenv); - if (rc) { - CERROR("bulk read: server failed to compute checksum: %d\n", - rc); - } else { - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_CHECKSUM_RECEIVE)) - corrupt_bulk_data(desc); - } - - return rc; -} - -static struct ptlrpc_ctx_ops plain_ctx_ops = { - .refresh = plain_ctx_refresh, - .validate = plain_ctx_validate, - .sign = plain_ctx_sign, - .verify = plain_ctx_verify, - .wrap_bulk = plain_cli_wrap_bulk, - .unwrap_bulk = plain_cli_unwrap_bulk, -}; - -static struct ptlrpc_sec_cops plain_sec_cops = { - .create_sec = plain_create_sec, - .destroy_sec = plain_destroy_sec, - .kill_sec = plain_kill_sec, - .lookup_ctx = plain_lookup_ctx, - .release_ctx = plain_release_ctx, - .flush_ctx_cache = plain_flush_ctx_cache, - .alloc_reqbuf = plain_alloc_reqbuf, - .free_reqbuf = plain_free_reqbuf, - .alloc_repbuf = plain_alloc_repbuf, - .free_repbuf = plain_free_repbuf, - .enlarge_reqbuf = plain_enlarge_reqbuf, -}; - -static struct ptlrpc_sec_sops plain_sec_sops = { - .accept = plain_accept, - .alloc_rs = plain_alloc_rs, - .authorize = plain_authorize, - .free_rs = plain_free_rs, - .unwrap_bulk = plain_svc_unwrap_bulk, - .wrap_bulk = plain_svc_wrap_bulk, -}; - -static struct ptlrpc_sec_policy plain_policy = { - .sp_owner = THIS_MODULE, - .sp_name = "plain", - .sp_policy = SPTLRPC_POLICY_PLAIN, - .sp_cops = &plain_sec_cops, - .sp_sops = &plain_sec_sops, -}; - -int sptlrpc_plain_init(void) -{ - __u32 buflens[PLAIN_PACK_SEGMENTS] = { 0, }; - int rc; - - buflens[PLAIN_PACK_MSG_OFF] = lustre_msg_early_size(); - plain_at_offset = lustre_msg_size_v2(PLAIN_PACK_SEGMENTS, buflens); - - rc = sptlrpc_register_policy(&plain_policy); - if (rc) - CERROR("failed to register: %d\n", rc); - - return rc; -} - -void sptlrpc_plain_fini(void) -{ - int rc; - - rc = sptlrpc_unregister_policy(&plain_policy); - if (rc) - CERROR("cannot unregister: %d\n", rc); -} diff --git a/drivers/staging/lustre/lustre/ptlrpc/service.c b/drivers/staging/lustre/lustre/ptlrpc/service.c deleted file mode 100644 index 3fd8c746f4606df9a2849de95f8d58cab58655b3..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/service.c +++ /dev/null @@ -1,2807 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2010, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#define DEBUG_SUBSYSTEM S_RPC - -#include -#include -#include -#include -#include -#include -#include "ptlrpc_internal.h" -#include -#include - -/* The following are visible and mutable through /sys/module/ptlrpc */ -int test_req_buffer_pressure; -module_param(test_req_buffer_pressure, int, 0444); -MODULE_PARM_DESC(test_req_buffer_pressure, "set non-zero to put pressure on request buffer pools"); -module_param(at_min, int, 0644); -MODULE_PARM_DESC(at_min, "Adaptive timeout minimum (sec)"); -module_param(at_max, int, 0644); -MODULE_PARM_DESC(at_max, "Adaptive timeout maximum (sec)"); -module_param(at_history, int, 0644); -MODULE_PARM_DESC(at_history, - "Adaptive timeouts remember the slowest event that took place within this period (sec)"); -module_param(at_early_margin, int, 0644); -MODULE_PARM_DESC(at_early_margin, "How soon before an RPC deadline to send an early reply"); -module_param(at_extra, int, 0644); -MODULE_PARM_DESC(at_extra, "How much extra time to give with each early reply"); - -/* forward ref */ -static int ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt); -static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req); -static void ptlrpc_at_remove_timed(struct ptlrpc_request *req); - -/** Holds a list of all PTLRPC services */ -LIST_HEAD(ptlrpc_all_services); -/** Used to protect the \e ptlrpc_all_services list */ -struct mutex ptlrpc_all_services_mutex; - -static struct ptlrpc_request_buffer_desc * -ptlrpc_alloc_rqbd(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_service *svc = svcpt->scp_service; - struct ptlrpc_request_buffer_desc *rqbd; - - rqbd = kzalloc_node(sizeof(*rqbd), GFP_NOFS, - cfs_cpt_spread_node(svc->srv_cptable, - svcpt->scp_cpt)); - if (!rqbd) - return NULL; - - rqbd->rqbd_svcpt = svcpt; - rqbd->rqbd_refcount = 0; - rqbd->rqbd_cbid.cbid_fn = request_in_callback; - rqbd->rqbd_cbid.cbid_arg = rqbd; - INIT_LIST_HEAD(&rqbd->rqbd_reqs); - rqbd->rqbd_buffer = kvzalloc_node(svc->srv_buf_size, GFP_KERNEL, - cfs_cpt_spread_node(svc->srv_cptable, - svcpt->scp_cpt)); - - if (!rqbd->rqbd_buffer) { - kfree(rqbd); - return NULL; - } - - spin_lock(&svcpt->scp_lock); - list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); - svcpt->scp_nrqbds_total++; - spin_unlock(&svcpt->scp_lock); - - return rqbd; -} - -static void -ptlrpc_free_rqbd(struct ptlrpc_request_buffer_desc *rqbd) -{ - struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; - - LASSERT(rqbd->rqbd_refcount == 0); - LASSERT(list_empty(&rqbd->rqbd_reqs)); - - spin_lock(&svcpt->scp_lock); - list_del(&rqbd->rqbd_list); - svcpt->scp_nrqbds_total--; - spin_unlock(&svcpt->scp_lock); - - kvfree(rqbd->rqbd_buffer); - kfree(rqbd); -} - -static int -ptlrpc_grow_req_bufs(struct ptlrpc_service_part *svcpt, int post) -{ - struct ptlrpc_service *svc = svcpt->scp_service; - struct ptlrpc_request_buffer_desc *rqbd; - int rc = 0; - int i; - - if (svcpt->scp_rqbd_allocating) - goto 
try_post; - - spin_lock(&svcpt->scp_lock); - /* check again with lock */ - if (svcpt->scp_rqbd_allocating) { - /* NB: we might allow more than one thread in the future */ - LASSERT(svcpt->scp_rqbd_allocating == 1); - spin_unlock(&svcpt->scp_lock); - goto try_post; - } - - svcpt->scp_rqbd_allocating++; - spin_unlock(&svcpt->scp_lock); - - for (i = 0; i < svc->srv_nbuf_per_group; i++) { - /* NB: another thread might have recycled enough rqbds, we - * need to make sure it wouldn't over-allocate, see LU-1212. - */ - if (svcpt->scp_nrqbds_posted >= svc->srv_nbuf_per_group) - break; - - rqbd = ptlrpc_alloc_rqbd(svcpt); - - if (!rqbd) { - CERROR("%s: Can't allocate request buffer\n", - svc->srv_name); - rc = -ENOMEM; - break; - } - } - - spin_lock(&svcpt->scp_lock); - - LASSERT(svcpt->scp_rqbd_allocating == 1); - svcpt->scp_rqbd_allocating--; - - spin_unlock(&svcpt->scp_lock); - - CDEBUG(D_RPCTRACE, - "%s: allocate %d new %d-byte reqbufs (%d/%d left), rc = %d\n", - svc->srv_name, i, svc->srv_buf_size, svcpt->scp_nrqbds_posted, - svcpt->scp_nrqbds_total, rc); - - try_post: - if (post && rc == 0) - rc = ptlrpc_server_post_idle_rqbds(svcpt); - - return rc; -} - -struct ptlrpc_hr_partition; - -struct ptlrpc_hr_thread { - int hrt_id; /* thread ID */ - spinlock_t hrt_lock; - wait_queue_head_t hrt_waitq; - struct list_head hrt_queue; /* RS queue */ - struct ptlrpc_hr_partition *hrt_partition; -}; - -struct ptlrpc_hr_partition { - /* # of started threads */ - atomic_t hrp_nstarted; - /* # of stopped threads */ - atomic_t hrp_nstopped; - /* cpu partition id */ - int hrp_cpt; - /* round-robin rotor for choosing thread */ - int hrp_rotor; - /* total number of threads on this partition */ - int hrp_nthrs; - /* threads table */ - struct ptlrpc_hr_thread *hrp_thrs; -}; - -#define HRT_RUNNING 0 -#define HRT_STOPPING 1 - -struct ptlrpc_hr_service { - /* CPU partition table, it's just cfs_cpt_tab for now */ - struct cfs_cpt_table *hr_cpt_table; - /** controller sleep waitq */ - wait_queue_head_t hr_waitq; - unsigned int hr_stopping; - /** roundrobin rotor for non-affinity service */ - unsigned int hr_rotor; - /* partition data */ - struct ptlrpc_hr_partition **hr_partitions; -}; - -/** reply handling service. */ -static struct ptlrpc_hr_service ptlrpc_hr; - -/** - * Choose an hr thread to dispatch requests to. 
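 * Selection is two-level: when the service shares the reply handler's
 * CPT table, the partition matching the request's CPT is used so the
 * reply is handled CPU-locally; otherwise a partition is picked by a
 * global round-robin rotor. A second rotor then round-robins among
 * that partition's threads.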
- */ -static struct ptlrpc_hr_thread * -ptlrpc_hr_select(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_hr_partition *hrp; - unsigned int rotor; - - if (svcpt->scp_cpt >= 0 && - svcpt->scp_service->srv_cptable == ptlrpc_hr.hr_cpt_table) { - /* directly match partition */ - hrp = ptlrpc_hr.hr_partitions[svcpt->scp_cpt]; - - } else { - rotor = ptlrpc_hr.hr_rotor++; - rotor %= cfs_cpt_number(ptlrpc_hr.hr_cpt_table); - - hrp = ptlrpc_hr.hr_partitions[rotor]; - } - - rotor = hrp->hrp_rotor++; - return &hrp->hrp_thrs[rotor % hrp->hrp_nthrs]; -} - -/** - * Put reply state into a queue for processing because we received - * ACK from the client - */ -void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs) -{ - struct ptlrpc_hr_thread *hrt; - - LASSERT(list_empty(&rs->rs_list)); - - hrt = ptlrpc_hr_select(rs->rs_svcpt); - - spin_lock(&hrt->hrt_lock); - list_add_tail(&rs->rs_list, &hrt->hrt_queue); - spin_unlock(&hrt->hrt_lock); - - wake_up(&hrt->hrt_waitq); -} - -void -ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs) -{ - assert_spin_locked(&rs->rs_svcpt->scp_rep_lock); - assert_spin_locked(&rs->rs_lock); - LASSERT(rs->rs_difficult); - rs->rs_scheduled_ever = 1; /* flag any notification attempt */ - - if (rs->rs_scheduled) { /* being set up or already notified */ - return; - } - - rs->rs_scheduled = 1; - list_del_init(&rs->rs_list); - ptlrpc_dispatch_difficult_reply(rs); -} -EXPORT_SYMBOL(ptlrpc_schedule_difficult_reply); - -static int -ptlrpc_server_post_idle_rqbds(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_request_buffer_desc *rqbd; - int rc; - int posted = 0; - - for (;;) { - spin_lock(&svcpt->scp_lock); - - if (list_empty(&svcpt->scp_rqbd_idle)) { - spin_unlock(&svcpt->scp_lock); - return posted; - } - - rqbd = list_entry(svcpt->scp_rqbd_idle.next, - struct ptlrpc_request_buffer_desc, - rqbd_list); - list_del(&rqbd->rqbd_list); - - /* assume we will post successfully */ - svcpt->scp_nrqbds_posted++; - list_add(&rqbd->rqbd_list, &svcpt->scp_rqbd_posted); - - spin_unlock(&svcpt->scp_lock); - - rc = ptlrpc_register_rqbd(rqbd); - if (rc != 0) - break; - - posted = 1; - } - - spin_lock(&svcpt->scp_lock); - - svcpt->scp_nrqbds_posted--; - list_del(&rqbd->rqbd_list); - list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); - - /* Don't complain if no request buffers are posted right now; LNET - * won't drop requests because we set the portal lazy! - */ - - spin_unlock(&svcpt->scp_lock); - - return -1; -} - -static void ptlrpc_at_timer(struct timer_list *t) -{ - struct ptlrpc_service_part *svcpt; - - svcpt = from_timer(svcpt, t, scp_at_timer); - - svcpt->scp_at_check = 1; - svcpt->scp_at_checktime = jiffies; - wake_up(&svcpt->scp_waitq); -} - -static void -ptlrpc_server_nthreads_check(struct ptlrpc_service *svc, - struct ptlrpc_service_conf *conf) -{ - struct ptlrpc_service_thr_conf *tc = &conf->psc_thr; - unsigned int init; - unsigned int total; - unsigned int nthrs; - int weight; - - /* - * Common code for estimating & validating threads number. - * CPT affinity service could have percpt thread-pool instead - * of a global thread-pool, which means user might not always - * get the threads number they give it in conf::tc_nthrs_user - * even they did set. It's because we need to validate threads - * number for each CPT to guarantee each pool will have enough - * threads to keep the service healthy. 
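 * A worked example under assumed numbers: with tc_nthrs_base = 8, a
 * single partition spanning 16 cores, a permissive tc_nthrs_max and
 * tc_thr_factor left at zero, the single-partition boost below adds
 * (8 >> 1) + (8 >> 2) + (8 >> 3) = 7 extra threads, giving nthrs = 15,
 * which stays strictly below 2 * tc_nthrs_base as the comment on that
 * loop promises.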
- */ - init = PTLRPC_NTHRS_INIT + (svc->srv_ops.so_hpreq_handler != NULL); - init = max_t(int, init, tc->tc_nthrs_init); - - /* NB: please see comments in lustre_lnet.h for definition - * details of these members - */ - LASSERT(tc->tc_nthrs_max != 0); - - if (tc->tc_nthrs_user != 0) { - /* In case there is a reason to test a service with many - * threads, we give a less strict check here, it can - * be up to 8 * nthrs_max - */ - total = min(tc->tc_nthrs_max * 8, tc->tc_nthrs_user); - nthrs = total / svc->srv_ncpts; - init = max(init, nthrs); - goto out; - } - - total = tc->tc_nthrs_max; - if (tc->tc_nthrs_base == 0) { - /* don't care about base threads number per partition, - * this is most for non-affinity service - */ - nthrs = total / svc->srv_ncpts; - goto out; - } - - nthrs = tc->tc_nthrs_base; - if (svc->srv_ncpts == 1) { - int i; - - /* NB: Increase the base number if it's single partition - * and total number of cores/HTs is larger or equal to 4. - * result will always < 2 * nthrs_base - */ - weight = cfs_cpt_weight(svc->srv_cptable, CFS_CPT_ANY); - for (i = 1; (weight >> (i + 1)) != 0 && /* >= 4 cores/HTs */ - (tc->tc_nthrs_base >> i) != 0; i++) - nthrs += tc->tc_nthrs_base >> i; - } - - if (tc->tc_thr_factor != 0) { - int factor = tc->tc_thr_factor; - const int fade = 4; - - /* - * User wants to increase number of threads with for - * each CPU core/HT, most likely the factor is larger then - * one thread/core because service threads are supposed to - * be blocked by lock or wait for IO. - */ - /* - * Amdahl's law says that adding processors wouldn't give - * a linear increasing of parallelism, so it's nonsense to - * have too many threads no matter how many cores/HTs - * there are. - */ - /* weight is # of HTs */ - if (cpumask_weight(topology_sibling_cpumask(0)) > 1) { - /* depress thread factor for hyper-thread */ - factor = factor - (factor >> 1) + (factor >> 3); - } - - weight = cfs_cpt_weight(svc->srv_cptable, 0); - LASSERT(weight > 0); - - for (; factor > 0 && weight > 0; factor--, weight -= fade) - nthrs += min(weight, fade) * factor; - } - - if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { - nthrs = max(tc->tc_nthrs_base, - tc->tc_nthrs_max / svc->srv_ncpts); - } - out: - nthrs = max(nthrs, tc->tc_nthrs_init); - svc->srv_nthrs_cpt_limit = nthrs; - svc->srv_nthrs_cpt_init = init; - - if (nthrs * svc->srv_ncpts > tc->tc_nthrs_max) { - CDEBUG(D_OTHER, "%s: This service may have more threads (%d) than the given soft limit (%d)\n", - svc->srv_name, nthrs * svc->srv_ncpts, - tc->tc_nthrs_max); - } -} - -/** - * Initialize percpt data for a service - */ -static int -ptlrpc_service_part_init(struct ptlrpc_service *svc, - struct ptlrpc_service_part *svcpt, int cpt) -{ - struct ptlrpc_at_array *array; - int size; - int index; - int rc; - - svcpt->scp_cpt = cpt; - INIT_LIST_HEAD(&svcpt->scp_threads); - - /* rqbd and incoming request queue */ - spin_lock_init(&svcpt->scp_lock); - INIT_LIST_HEAD(&svcpt->scp_rqbd_idle); - INIT_LIST_HEAD(&svcpt->scp_rqbd_posted); - INIT_LIST_HEAD(&svcpt->scp_req_incoming); - init_waitqueue_head(&svcpt->scp_waitq); - /* history request & rqbd list */ - INIT_LIST_HEAD(&svcpt->scp_hist_reqs); - INIT_LIST_HEAD(&svcpt->scp_hist_rqbds); - - /* active requests and hp requests */ - spin_lock_init(&svcpt->scp_req_lock); - - /* reply states */ - spin_lock_init(&svcpt->scp_rep_lock); - INIT_LIST_HEAD(&svcpt->scp_rep_active); - INIT_LIST_HEAD(&svcpt->scp_rep_idle); - init_waitqueue_head(&svcpt->scp_rep_waitq); - atomic_set(&svcpt->scp_nreps_difficult, 0); - - /* 
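
The sizing logic above is easier to follow as a pure function. This sketch restates the heuristic (not the exact kernel code): a geometric bonus when a single partition has at least 4 cores/HTs, a thread factor that is damped on hyper-threaded siblings and fades as cores are consumed, and a final clamp against the global maximum:

static int estimate_nthrs(int base, int weight /* cores in the CPT */,
			  int factor, int ncpts, int nthrs_max,
			  int hyperthreaded)
{
	const int fade = 4;
	int nthrs = base;
	int i;

	/* single partition: bonus is a geometric series, always < 2 * base */
	if (ncpts == 1)
		for (i = 1; (weight >> (i + 1)) != 0 && (base >> i) != 0; i++)
			nthrs += base >> i;

	if (hyperthreaded)	/* depress the factor for HT siblings */
		factor = factor - (factor >> 1) + (factor >> 3);

	/* Amdahl-style fade: each extra core contributes less */
	for (; factor > 0 && weight > 0; factor--, weight -= fade)
		nthrs += (weight < fade ? weight : fade) * factor;

	if (nthrs * ncpts > nthrs_max)
		nthrs = nthrs_max / ncpts > base ? nthrs_max / ncpts : base;

	return nthrs;
}
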
adaptive timeout */ - spin_lock_init(&svcpt->scp_at_lock); - array = &svcpt->scp_at_array; - - size = at_est2timeout(at_max); - array->paa_size = size; - array->paa_count = 0; - array->paa_deadline = -1; - - /* allocate memory for scp_at_array (ptlrpc_at_array) */ - array->paa_reqs_array = - kzalloc_node(sizeof(struct list_head) * size, GFP_NOFS, - cfs_cpt_spread_node(svc->srv_cptable, cpt)); - if (!array->paa_reqs_array) - return -ENOMEM; - - for (index = 0; index < size; index++) - INIT_LIST_HEAD(&array->paa_reqs_array[index]); - - array->paa_reqs_count = - kzalloc_node(sizeof(__u32) * size, GFP_NOFS, - cfs_cpt_spread_node(svc->srv_cptable, cpt)); - if (!array->paa_reqs_count) - goto free_reqs_array; - - timer_setup(&svcpt->scp_at_timer, ptlrpc_at_timer, 0); - - /* At SOW, service time should be quick; 10s seems generous. If client - * timeout is less than this, we'll be sending an early reply. - */ - at_init(&svcpt->scp_at_estimate, 10, 0); - - /* assign this before call ptlrpc_grow_req_bufs */ - svcpt->scp_service = svc; - /* Now allocate the request buffers, but don't post them now */ - rc = ptlrpc_grow_req_bufs(svcpt, 0); - /* We shouldn't be under memory pressure at startup, so - * fail if we can't allocate all our buffers at this time. - */ - if (rc != 0) - goto free_reqs_count; - - return 0; - -free_reqs_count: - kfree(array->paa_reqs_count); - array->paa_reqs_count = NULL; -free_reqs_array: - kfree(array->paa_reqs_array); - array->paa_reqs_array = NULL; - - return -ENOMEM; -} - -/** - * Initialize service on a given portal. - * This includes starting serving threads , allocating and posting rqbds and - * so on. - */ -struct ptlrpc_service * -ptlrpc_register_service(struct ptlrpc_service_conf *conf, - struct kset *parent, - struct dentry *debugfs_entry) -{ - struct ptlrpc_service_cpt_conf *cconf = &conf->psc_cpt; - struct ptlrpc_service *service; - struct ptlrpc_service_part *svcpt; - struct cfs_cpt_table *cptable; - __u32 *cpts = NULL; - int ncpts; - int cpt; - int rc; - int i; - - LASSERT(conf->psc_buf.bc_nbufs > 0); - LASSERT(conf->psc_buf.bc_buf_size >= - conf->psc_buf.bc_req_max_size + SPTLRPC_MAX_PAYLOAD); - LASSERT(conf->psc_thr.tc_ctx_tags != 0); - - cptable = cconf->cc_cptable; - if (!cptable) - cptable = cfs_cpt_tab; - - if (!conf->psc_thr.tc_cpu_affinity) { - ncpts = 1; - } else { - ncpts = cfs_cpt_number(cptable); - if (cconf->cc_pattern) { - struct cfs_expr_list *el; - - rc = cfs_expr_list_parse(cconf->cc_pattern, - strlen(cconf->cc_pattern), - 0, ncpts - 1, &el); - if (rc != 0) { - CERROR("%s: invalid CPT pattern string: %s", - conf->psc_name, cconf->cc_pattern); - return ERR_PTR(-EINVAL); - } - - rc = cfs_expr_list_values(el, ncpts, &cpts); - cfs_expr_list_free(el); - if (rc <= 0) { - CERROR("%s: failed to parse CPT array %s: %d\n", - conf->psc_name, cconf->cc_pattern, rc); - kfree(cpts); - return ERR_PTR(rc < 0 ? rc : -EINVAL); - } - ncpts = rc; - } - } - - service = kzalloc(offsetof(struct ptlrpc_service, srv_parts[ncpts]), - GFP_NOFS); - if (!service) { - kfree(cpts); - return ERR_PTR(-ENOMEM); - } - - service->srv_cptable = cptable; - service->srv_cpts = cpts; - service->srv_ncpts = ncpts; - - service->srv_cpt_bits = 0; /* it's zero already, easy to read... 
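
The free_reqs_count/free_reqs_array labels above form the standard kernel cleanup ladder: one unwind label per successful allocation, taken in reverse order. The same shape in a userspace sketch (fields hypothetical):

#include <stdlib.h>

struct at_array_sketch {
	void *reqs_array;
	void *reqs_count;
};

static int at_array_sketch_init(struct at_array_sketch *a, size_t size)
{
	a->reqs_array = calloc(size, sizeof(void *));
	if (!a->reqs_array)
		return -1;

	a->reqs_count = calloc(size, sizeof(unsigned int));
	if (!a->reqs_count)
		goto free_reqs_array;	/* undo only what succeeded */

	return 0;

free_reqs_array:
	free(a->reqs_array);
	a->reqs_array = NULL;
	return -1;
}
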
*/ - while ((1 << service->srv_cpt_bits) < cfs_cpt_number(cptable)) - service->srv_cpt_bits++; - - /* public members */ - spin_lock_init(&service->srv_lock); - service->srv_name = conf->psc_name; - service->srv_watchdog_factor = conf->psc_watchdog_factor; - INIT_LIST_HEAD(&service->srv_list); /* for safety of cleanup */ - - /* buffer configuration */ - service->srv_nbuf_per_group = test_req_buffer_pressure ? - 1 : conf->psc_buf.bc_nbufs; - service->srv_max_req_size = conf->psc_buf.bc_req_max_size + - SPTLRPC_MAX_PAYLOAD; - service->srv_buf_size = conf->psc_buf.bc_buf_size; - service->srv_rep_portal = conf->psc_buf.bc_rep_portal; - service->srv_req_portal = conf->psc_buf.bc_req_portal; - - /* Increase max reply size to next power of two */ - service->srv_max_reply_size = 1; - while (service->srv_max_reply_size < - conf->psc_buf.bc_rep_max_size + SPTLRPC_MAX_PAYLOAD) - service->srv_max_reply_size <<= 1; - - service->srv_thread_name = conf->psc_thr.tc_thr_name; - service->srv_ctx_tags = conf->psc_thr.tc_ctx_tags; - service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; - service->srv_ops = conf->psc_ops; - - for (i = 0; i < ncpts; i++) { - if (!conf->psc_thr.tc_cpu_affinity) - cpt = CFS_CPT_ANY; - else - cpt = cpts ? cpts[i] : i; - - svcpt = kzalloc_node(sizeof(*svcpt), GFP_NOFS, - cfs_cpt_spread_node(cptable, cpt)); - if (!svcpt) { - rc = -ENOMEM; - goto failed; - } - - service->srv_parts[i] = svcpt; - rc = ptlrpc_service_part_init(service, svcpt, cpt); - if (rc != 0) - goto failed; - } - - ptlrpc_server_nthreads_check(service, conf); - - rc = LNetSetLazyPortal(service->srv_req_portal); - LASSERT(rc == 0); - - mutex_lock(&ptlrpc_all_services_mutex); - list_add(&service->srv_list, &ptlrpc_all_services); - mutex_unlock(&ptlrpc_all_services_mutex); - - if (parent) { - rc = ptlrpc_sysfs_register_service(parent, service); - if (rc) - goto failed; - } - - if (!IS_ERR_OR_NULL(debugfs_entry)) - ptlrpc_ldebugfs_register_service(debugfs_entry, service); - - rc = ptlrpc_service_nrs_setup(service); - if (rc != 0) - goto failed; - - CDEBUG(D_NET, "%s: Started, listening on portal %d\n", - service->srv_name, service->srv_req_portal); - - rc = ptlrpc_start_threads(service); - if (rc != 0) { - CERROR("Failed to start threads for service %s: %d\n", - service->srv_name, rc); - goto failed; - } - - return service; -failed: - ptlrpc_unregister_service(service); - return ERR_PTR(rc); -} -EXPORT_SYMBOL(ptlrpc_register_service); - -/** - * to actually free the request, must be called without holding svc_lock. - * note it's caller's responsibility to unlink req->rq_list. - */ -static void ptlrpc_server_free_request(struct ptlrpc_request *req) -{ - LASSERT(atomic_read(&req->rq_refcount) == 0); - LASSERT(list_empty(&req->rq_timed_list)); - - /* DEBUG_REQ() assumes the reply state of a request with a valid - * ref will not be destroyed until that reference is dropped. - */ - ptlrpc_req_drop_rs(req); - - sptlrpc_svc_ctx_decref(req); - - if (req != &req->rq_rqbd->rqbd_req) { - /* NB request buffers use an embedded - * req if the incoming req unlinked the - * MD; this isn't one of them! - */ - ptlrpc_request_cache_free(req); - } -} - -/** - * drop a reference count of the request. if it reaches 0, we either - * put it into history list, or free it immediately. 
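
The two loops above are both ceiling computations: srv_cpt_bits is ceil(log2(ncpts)), and srv_max_reply_size is rounded up to the next power of two so reply buffers tile cleanly. Equivalent standalone helpers (a sketch):

/* smallest number of bits b such that (1 << b) >= n */
static unsigned int ceil_log2(unsigned int n)
{
	unsigned int bits = 0;

	while ((1u << bits) < n)
		bits++;
	return bits;
}

/* smallest power of two >= n */
static unsigned int next_pow2_geq(unsigned int n)
{
	unsigned int v = 1;

	while (v < n)
		v <<= 1;
	return v;
}
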
- */ -static void ptlrpc_server_drop_request(struct ptlrpc_request *req) -{ - struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; - struct ptlrpc_service_part *svcpt = rqbd->rqbd_svcpt; - struct ptlrpc_service *svc = svcpt->scp_service; - int refcount; - - if (!atomic_dec_and_test(&req->rq_refcount)) - return; - - if (req->rq_at_linked) { - spin_lock(&svcpt->scp_at_lock); - /* recheck with lock, in case it's unlinked by - * ptlrpc_at_check_timed() - */ - if (likely(req->rq_at_linked)) - ptlrpc_at_remove_timed(req); - spin_unlock(&svcpt->scp_at_lock); - } - - LASSERT(list_empty(&req->rq_timed_list)); - - /* finalize request */ - if (req->rq_export) { - class_export_put(req->rq_export); - req->rq_export = NULL; - } - - spin_lock(&svcpt->scp_lock); - - list_add(&req->rq_list, &rqbd->rqbd_reqs); - - refcount = --(rqbd->rqbd_refcount); - if (refcount == 0) { - /* request buffer is now idle: add to history */ - list_del(&rqbd->rqbd_list); - - list_add_tail(&rqbd->rqbd_list, &svcpt->scp_hist_rqbds); - svcpt->scp_hist_nrqbds++; - - /* cull some history? - * I expect only about 1 or 2 rqbds need to be recycled here - */ - while (svcpt->scp_hist_nrqbds > svc->srv_hist_nrqbds_cpt_max) { - rqbd = list_entry(svcpt->scp_hist_rqbds.next, - struct ptlrpc_request_buffer_desc, - rqbd_list); - - list_del(&rqbd->rqbd_list); - svcpt->scp_hist_nrqbds--; - - /* remove rqbd's reqs from svc's req history while - * I've got the service lock - */ - list_for_each_entry(req, &rqbd->rqbd_reqs, rq_list) { - /* Track the highest culled req seq */ - if (req->rq_history_seq > - svcpt->scp_hist_seq_culled) { - svcpt->scp_hist_seq_culled = - req->rq_history_seq; - } - list_del(&req->rq_history_list); - } - - spin_unlock(&svcpt->scp_lock); - - while ((req = list_first_entry_or_null( - &rqbd->rqbd_reqs, - struct ptlrpc_request, rq_list))) { - list_del(&req->rq_list); - ptlrpc_server_free_request(req); - } - - spin_lock(&svcpt->scp_lock); - /* - * now all reqs including the embedded req has been - * disposed, schedule request buffer for re-use. - */ - LASSERT(atomic_read(&rqbd->rqbd_req.rq_refcount) == - 0); - list_add_tail(&rqbd->rqbd_list, &svcpt->scp_rqbd_idle); - } - - spin_unlock(&svcpt->scp_lock); - } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { - /* If we are low on memory, we are not interested in history */ - list_del(&req->rq_list); - list_del_init(&req->rq_history_list); - - /* Track the highest culled req seq */ - if (req->rq_history_seq > svcpt->scp_hist_seq_culled) - svcpt->scp_hist_seq_culled = req->rq_history_seq; - - spin_unlock(&svcpt->scp_lock); - - ptlrpc_server_free_request(req); - } else { - spin_unlock(&svcpt->scp_lock); - } -} - -/** - * to finish a request: stop sending more early replies, and release - * the request. - */ -static void ptlrpc_server_finish_request(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req) -{ - ptlrpc_server_hpreq_fini(req); - - if (req->rq_session.lc_thread) { - lu_context_exit(&req->rq_session); - lu_context_fini(&req->rq_session); - } - - ptlrpc_server_drop_request(req); -} - -/** - * to finish a active request: stop sending more early replies, and release - * the request. should be called after we finished handling the request. 
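
ptlrpc_server_drop_request() above performs the expensive teardown only on the final reference; every other caller pays a single atomic decrement. A sketch of that pattern using C11 atomics in place of atomic_dec_and_test() (the req type is hypothetical):

#include <stdatomic.h>
#include <stdbool.h>

struct req_sketch {
	atomic_int refcount;
};

/* returns true when the caller dropped the last reference */
static bool req_put(struct req_sketch *r)
{
	/* fetch_sub returns the old value; old == 1 means we hit zero */
	if (atomic_fetch_sub_explicit(&r->refcount, 1,
				      memory_order_acq_rel) != 1)
		return false;

	/* last reference: finalize, cull history, recycle the buffer */
	return true;
}
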
- */ -static void ptlrpc_server_finish_active_request( - struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req) -{ - spin_lock(&svcpt->scp_req_lock); - ptlrpc_nrs_req_stop_nolock(req); - svcpt->scp_nreqs_active--; - if (req->rq_hp) - svcpt->scp_nhreqs_active--; - spin_unlock(&svcpt->scp_req_lock); - - ptlrpc_nrs_req_finalize(req); - - if (req->rq_export) - class_export_rpc_dec(req->rq_export); - - ptlrpc_server_finish_request(svcpt, req); -} - -/** - * Sanity check request \a req. - * Return 0 if all is ok, error code otherwise. - */ -static int ptlrpc_check_req(struct ptlrpc_request *req) -{ - struct obd_device *obd = req->rq_export->exp_obd; - int rc = 0; - - if (unlikely(lustre_msg_get_conn_cnt(req->rq_reqmsg) < - req->rq_export->exp_conn_cnt)) { - DEBUG_REQ(D_RPCTRACE, req, - "DROPPING req from old connection %d < %d", - lustre_msg_get_conn_cnt(req->rq_reqmsg), - req->rq_export->exp_conn_cnt); - return -EEXIST; - } - if (unlikely(!obd || obd->obd_fail)) { - /* - * Failing over, don't handle any more reqs, send - * error response instead. - */ - CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", - req, obd ? obd->obd_name : "unknown"); - rc = -ENODEV; - } else if (lustre_msg_get_flags(req->rq_reqmsg) & - (MSG_REPLAY | MSG_REQ_REPLAY_DONE)) { - DEBUG_REQ(D_ERROR, req, "Invalid replay without recovery"); - class_fail_export(req->rq_export); - rc = -ENODEV; - } else if (lustre_msg_get_transno(req->rq_reqmsg) != 0) { - DEBUG_REQ(D_ERROR, req, - "Invalid req with transno %llu without recovery", - lustre_msg_get_transno(req->rq_reqmsg)); - class_fail_export(req->rq_export); - rc = -ENODEV; - } - - if (unlikely(rc < 0)) { - req->rq_status = rc; - ptlrpc_error(req); - } - return rc; -} - -static void ptlrpc_at_set_timer(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_at_array *array = &svcpt->scp_at_array; - __s32 next; - - if (array->paa_count == 0) { - del_timer(&svcpt->scp_at_timer); - return; - } - - /* Set timer for closest deadline */ - next = (__s32)(array->paa_deadline - ktime_get_real_seconds() - - at_early_margin); - if (next <= 0) { - ptlrpc_at_timer(&svcpt->scp_at_timer); - } else { - mod_timer(&svcpt->scp_at_timer, jiffies + next * HZ); - CDEBUG(D_INFO, "armed %s at %+ds\n", - svcpt->scp_service->srv_name, next); - } -} - -/* Add rpc to early reply check list */ -static int ptlrpc_at_add_timed(struct ptlrpc_request *req) -{ - struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; - struct ptlrpc_at_array *array = &svcpt->scp_at_array; - struct ptlrpc_request *rq = NULL; - __u32 index; - - if (AT_OFF) - return 0; - - if (req->rq_no_reply) - return 0; - - if ((lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) == 0) - return -ENOSYS; - - spin_lock(&svcpt->scp_at_lock); - LASSERT(list_empty(&req->rq_timed_list)); - - div_u64_rem(req->rq_deadline, array->paa_size, &index); - if (array->paa_reqs_count[index] > 0) { - /* latest rpcs will have the latest deadlines in the list, - * so search backward. 
- */ - list_for_each_entry_reverse(rq, &array->paa_reqs_array[index], - rq_timed_list) { - if (req->rq_deadline >= rq->rq_deadline) { - list_add(&req->rq_timed_list, - &rq->rq_timed_list); - break; - } - } - } - - /* Add the request at the head of the list */ - if (list_empty(&req->rq_timed_list)) - list_add(&req->rq_timed_list, &array->paa_reqs_array[index]); - - spin_lock(&req->rq_lock); - req->rq_at_linked = 1; - spin_unlock(&req->rq_lock); - req->rq_at_index = index; - array->paa_reqs_count[index]++; - array->paa_count++; - if (array->paa_count == 1 || array->paa_deadline > req->rq_deadline) { - array->paa_deadline = req->rq_deadline; - ptlrpc_at_set_timer(svcpt); - } - spin_unlock(&svcpt->scp_at_lock); - - return 0; -} - -static void -ptlrpc_at_remove_timed(struct ptlrpc_request *req) -{ - struct ptlrpc_at_array *array; - - array = &req->rq_rqbd->rqbd_svcpt->scp_at_array; - - /* NB: must call with hold svcpt::scp_at_lock */ - LASSERT(!list_empty(&req->rq_timed_list)); - list_del_init(&req->rq_timed_list); - - spin_lock(&req->rq_lock); - req->rq_at_linked = 0; - spin_unlock(&req->rq_lock); - - array->paa_reqs_count[req->rq_at_index]--; - array->paa_count--; -} - -/* - * Attempt to extend the request deadline by sending an early reply to the - * client. - */ -static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req) -{ - struct ptlrpc_service_part *svcpt = req->rq_rqbd->rqbd_svcpt; - struct ptlrpc_request *reqcopy; - struct lustre_msg *reqmsg; - long olddl = req->rq_deadline - ktime_get_real_seconds(); - time64_t newdl; - int rc; - - /* deadline is when the client expects us to reply, margin is the - * difference between clients' and servers' expectations - */ - DEBUG_REQ(D_ADAPTTO, req, - "%ssending early reply (deadline %+lds, margin %+lds) for %d+%d", - AT_OFF ? "AT off - not " : "", - olddl, olddl - at_get(&svcpt->scp_at_estimate), - at_get(&svcpt->scp_at_estimate), at_extra); - - if (AT_OFF) - return 0; - - if (olddl < 0) { - DEBUG_REQ(D_WARNING, req, "Already past deadline (%+lds), not sending early reply. Consider increasing at_early_margin (%d)?", - olddl, at_early_margin); - - /* Return an error so we're not re-added to the timed list. */ - return -ETIMEDOUT; - } - - if (!(lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT)) { - DEBUG_REQ(D_INFO, req, "Wanted to ask client for more time, but no AT support"); - return -ENOSYS; - } - - /* - * We want to extend the request deadline by at_extra seconds, - * so we set our service estimate to reflect how much time has - * passed since this request arrived plus an additional - * at_extra seconds. The client will calculate the new deadline - * based on this service estimate (plus some additional time to - * account for network latency). 
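
The insert above scans its deadline bucket from the tail because requests mostly arrive in deadline order, so the first comparison usually ends the walk. A sketch with a circular doubly linked list (the head sentinel is assumed initialized so head->next == head->prev == head):

#include <stdint.h>

struct tnode {
	struct tnode *prev, *next;
	int64_t deadline;
};

/* keep the bucket sorted by deadline, scanning backward from the tail */
static void bucket_insert(struct tnode *head, struct tnode *n)
{
	struct tnode *pos;

	for (pos = head->prev; pos != head; pos = pos->prev)
		if (n->deadline >= pos->deadline)
			break;

	/* link n after pos (pos == head means: insert at the front) */
	n->prev = pos;
	n->next = pos->next;
	pos->next->prev = n;
	pos->next = n;
}
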
See ptlrpc_at_recv_early_reply - */ - at_measured(&svcpt->scp_at_estimate, at_extra + - ktime_get_real_seconds() - req->rq_arrival_time.tv_sec); - newdl = req->rq_arrival_time.tv_sec + at_get(&svcpt->scp_at_estimate); - - /* Check to see if we've actually increased the deadline - - * we may be past adaptive_max - */ - if (req->rq_deadline >= newdl) { - DEBUG_REQ(D_WARNING, req, "Couldn't add any time (%ld/%lld), not sending early reply\n", - olddl, newdl - ktime_get_real_seconds()); - return -ETIMEDOUT; - } - - reqcopy = ptlrpc_request_cache_alloc(GFP_NOFS); - if (!reqcopy) - return -ENOMEM; - reqmsg = kvzalloc(req->rq_reqlen, GFP_NOFS); - if (!reqmsg) { - rc = -ENOMEM; - goto out_free; - } - - *reqcopy = *req; - reqcopy->rq_reply_state = NULL; - reqcopy->rq_rep_swab_mask = 0; - reqcopy->rq_pack_bulk = 0; - reqcopy->rq_pack_udesc = 0; - reqcopy->rq_packed_final = 0; - sptlrpc_svc_ctx_addref(reqcopy); - /* We only need the reqmsg for the magic */ - reqcopy->rq_reqmsg = reqmsg; - memcpy(reqmsg, req->rq_reqmsg, req->rq_reqlen); - - LASSERT(atomic_read(&req->rq_refcount)); - /** if it is last refcount then early reply isn't needed */ - if (atomic_read(&req->rq_refcount) == 1) { - DEBUG_REQ(D_ADAPTTO, reqcopy, "Normal reply already sent out, abort sending early reply\n"); - rc = -EINVAL; - goto out; - } - - /* Connection ref */ - reqcopy->rq_export = class_conn2export( - lustre_msg_get_handle(reqcopy->rq_reqmsg)); - if (!reqcopy->rq_export) { - rc = -ENODEV; - goto out; - } - - /* RPC ref */ - class_export_rpc_inc(reqcopy->rq_export); - if (reqcopy->rq_export->exp_obd && - reqcopy->rq_export->exp_obd->obd_fail) { - rc = -ENODEV; - goto out_put; - } - - rc = lustre_pack_reply_flags(reqcopy, 1, NULL, NULL, LPRFL_EARLY_REPLY); - if (rc) - goto out_put; - - rc = ptlrpc_send_reply(reqcopy, PTLRPC_REPLY_EARLY); - - if (!rc) { - /* Adjust our own deadline to what we told the client */ - req->rq_deadline = newdl; - req->rq_early_count++; /* number sent, server side */ - } else { - DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); - } - - /* Free the (early) reply state from lustre_pack_reply. - * (ptlrpc_send_reply takes it's own rs ref, so this is safe here) - */ - ptlrpc_req_drop_rs(reqcopy); - -out_put: - class_export_rpc_dec(reqcopy->rq_export); - class_export_put(reqcopy->rq_export); -out: - sptlrpc_svc_ctx_decref(reqcopy); - kvfree(reqmsg); -out_free: - ptlrpc_request_cache_free(reqcopy); - return rc; -} - -/* Send early replies to everybody expiring within at_early_margin - * asking for at_extra time - */ -static void ptlrpc_at_check_timed(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_at_array *array = &svcpt->scp_at_array; - struct ptlrpc_request *rq, *n; - struct list_head work_list; - __u32 index, count; - time64_t deadline; - time64_t now = ktime_get_real_seconds(); - long delay; - int first, counter = 0; - - spin_lock(&svcpt->scp_at_lock); - if (svcpt->scp_at_check == 0) { - spin_unlock(&svcpt->scp_at_lock); - return; - } - delay = jiffies - svcpt->scp_at_checktime; - svcpt->scp_at_check = 0; - - if (array->paa_count == 0) { - spin_unlock(&svcpt->scp_at_lock); - return; - } - - /* The timer went off, but maybe the nearest rpc already completed. */ - first = array->paa_deadline - now; - if (first > at_early_margin) { - /* We've still got plenty of time. Reset the timer. */ - ptlrpc_at_set_timer(svcpt); - spin_unlock(&svcpt->scp_at_lock); - return; - } - - /* We're close to a timeout, and we don't know how much longer the - * server will take. 
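
The arithmetic above is the heart of the early reply: feed "elapsed + at_extra" into the service estimate, recompute the deadline from the arrival time, and bail out when that would not actually buy time. Isolated as a sketch, with the estimate modelled as a simple running maximum (cruder than the real at_measured() history window):

#include <stdint.h>

static int64_t max64(int64_t a, int64_t b) { return a > b ? a : b; }

/* returns 0 and a usable *newdl, or -1 when no time can be added */
static int early_reply_newdl(int64_t now, int64_t arrival,
			     int64_t old_deadline, int at_extra,
			     int64_t *estimate, int64_t *newdl)
{
	*estimate = max64(*estimate, at_extra + (now - arrival));
	*newdl = arrival + *estimate;

	return old_deadline >= *newdl ? -1 /* would not extend */ : 0;
}
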
Send early replies to everyone expiring soon. - */ - INIT_LIST_HEAD(&work_list); - deadline = -1; - div_u64_rem(array->paa_deadline, array->paa_size, &index); - count = array->paa_count; - while (count > 0) { - count -= array->paa_reqs_count[index]; - list_for_each_entry_safe(rq, n, &array->paa_reqs_array[index], - rq_timed_list) { - if (rq->rq_deadline > now + at_early_margin) { - /* update the earliest deadline */ - if (deadline == -1 || - rq->rq_deadline < deadline) - deadline = rq->rq_deadline; - break; - } - - ptlrpc_at_remove_timed(rq); - /** - * ptlrpc_server_drop_request() may drop - * refcount to 0 already. Let's check this and - * don't add entry to work_list - */ - if (likely(atomic_inc_not_zero(&rq->rq_refcount))) - list_add(&rq->rq_timed_list, &work_list); - counter++; - } - - if (++index >= array->paa_size) - index = 0; - } - array->paa_deadline = deadline; - /* we have a new earliest deadline, restart the timer */ - ptlrpc_at_set_timer(svcpt); - - spin_unlock(&svcpt->scp_at_lock); - - CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early replies\n", - first, at_extra, counter); - if (first < 0) { - /* We're already past request deadlines before we even get a - * chance to send early replies - */ - LCONSOLE_WARN("%s: This server is not able to keep up with request traffic (cpu-bound).\n", - svcpt->scp_service->srv_name); - CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, delay=%ld(jiff)\n", - counter, svcpt->scp_nreqs_incoming, - svcpt->scp_nreqs_active, - at_get(&svcpt->scp_at_estimate), delay); - } - - /* we took additional refcount so entries can't be deleted from list, no - * locking is needed - */ - while (!list_empty(&work_list)) { - rq = list_entry(work_list.next, struct ptlrpc_request, - rq_timed_list); - list_del_init(&rq->rq_timed_list); - - if (ptlrpc_at_send_early_reply(rq) == 0) - ptlrpc_at_add_timed(rq); - - ptlrpc_server_drop_request(rq); - } -} - -/** - * Put the request to the export list if the request may become - * a high priority one. - */ -static int ptlrpc_server_hpreq_init(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req) -{ - int rc = 0; - - if (svcpt->scp_service->srv_ops.so_hpreq_handler) { - rc = svcpt->scp_service->srv_ops.so_hpreq_handler(req); - if (rc < 0) - return rc; - LASSERT(rc == 0); - } - if (req->rq_export && req->rq_ops) { - /* Perform request specific check. We should do this check - * before the request is added into exp_hp_rpcs list otherwise - * it may hit swab race at LU-1044. - */ - if (req->rq_ops->hpreq_check) { - rc = req->rq_ops->hpreq_check(req); - if (rc == -ESTALE) { - req->rq_status = rc; - ptlrpc_error(req); - } - /** can only return error, - * 0 for normal request, - * or 1 for high priority request - */ - LASSERT(rc <= 1); - } - - spin_lock_bh(&req->rq_export->exp_rpc_lock); - list_add(&req->rq_exp_list, &req->rq_export->exp_hp_rpcs); - spin_unlock_bh(&req->rq_export->exp_rpc_lock); - } - - ptlrpc_nrs_req_initialize(svcpt, req, rc); - - return rc; -} - -/** Remove the request from the export list. */ -static void ptlrpc_server_hpreq_fini(struct ptlrpc_request *req) -{ - if (req->rq_export && req->rq_ops) { - /* refresh lock timeout again so that client has more - * room to send lock cancel RPC. 
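
The atomic_inc_not_zero() call below in the bucket walk is what keeps ptlrpc_at_check_timed() from resurrecting a request whose last reference is already being dropped: a new reference is taken only if at least one still exists. Its usual implementation shape, sketched as a C11 compare-exchange loop:

#include <stdatomic.h>
#include <stdbool.h>

static bool inc_not_zero(atomic_int *ref)
{
	int old = atomic_load_explicit(ref, memory_order_relaxed);

	do {
		if (old == 0)
			return false;	/* object is already dying */
	} while (!atomic_compare_exchange_weak_explicit(
			ref, &old, old + 1,
			memory_order_acquire, memory_order_relaxed));

	return true;
}
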
- */ - if (req->rq_ops->hpreq_fini) - req->rq_ops->hpreq_fini(req); - - spin_lock_bh(&req->rq_export->exp_rpc_lock); - list_del_init(&req->rq_exp_list); - spin_unlock_bh(&req->rq_export->exp_rpc_lock); - } -} - -static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt, - struct ptlrpc_request *req) -{ - int rc; - - rc = ptlrpc_server_hpreq_init(svcpt, req); - if (rc < 0) - return rc; - - ptlrpc_nrs_req_add(svcpt, req, !!rc); - - return 0; -} - -/** - * Allow to handle high priority request - * User can call it w/o any lock but need to hold - * ptlrpc_service_part::scp_req_lock to get reliable result - */ -static bool ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt, - bool force) -{ - int running = svcpt->scp_nthrs_running; - - if (!nrs_svcpt_has_hp(svcpt)) - return false; - - if (force) - return true; - - if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && - CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { - /* leave just 1 thread for normal RPCs */ - running = PTLRPC_NTHRS_INIT; - if (svcpt->scp_service->srv_ops.so_hpreq_handler) - running += 1; - } - - if (svcpt->scp_nreqs_active >= running - 1) - return false; - - if (svcpt->scp_nhreqs_active == 0) - return true; - - return !ptlrpc_nrs_req_pending_nolock(svcpt, false) || - svcpt->scp_hreq_count < svcpt->scp_service->srv_hpreq_ratio; -} - -static bool ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt, - bool force) -{ - return ptlrpc_server_allow_high(svcpt, force) && - ptlrpc_nrs_req_pending_nolock(svcpt, true); -} - -/** - * Only allow normal priority requests on a service that has a high-priority - * queue if forced (i.e. cleanup), if there are other high priority requests - * already being processed (i.e. those threads can service more high-priority - * requests), or if there are enough idle threads that a later thread can do - * a high priority request. - * User can call it w/o any lock but need to hold - * ptlrpc_service_part::scp_req_lock to get reliable result - */ -static bool ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt, - bool force) -{ - int running = svcpt->scp_nthrs_running; - - if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL && - CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) { - /* leave just 1 thread for normal RPCs */ - running = PTLRPC_NTHRS_INIT; - if (svcpt->scp_service->srv_ops.so_hpreq_handler) - running += 1; - } - - if (force || - svcpt->scp_nreqs_active < running - 2) - return true; - - if (svcpt->scp_nreqs_active >= running - 1) - return false; - - return svcpt->scp_nhreqs_active > 0 || !nrs_svcpt_has_hp(svcpt); -} - -static bool ptlrpc_server_normal_pending(struct ptlrpc_service_part *svcpt, - bool force) -{ - return ptlrpc_server_allow_normal(svcpt, force) && - ptlrpc_nrs_req_pending_nolock(svcpt, false); -} - -/** - * Returns true if there are requests available in incoming - * request queue for processing and it is allowed to fetch them. - * User can call it w/o any lock but need to hold ptlrpc_service::scp_req_lock - * to get reliable result - * \see ptlrpc_server_allow_normal - * \see ptlrpc_server_allow high - */ -static inline bool -ptlrpc_server_request_pending(struct ptlrpc_service_part *svcpt, bool force) -{ - return ptlrpc_server_high_pending(svcpt, force) || - ptlrpc_server_normal_pending(svcpt, force); -} - -/** - * Fetch a request for processing from queue of unprocessed requests. - * Favors high-priority requests. - * Returns a pointer to fetched request. 
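
The two admission checks below mirror each other, and srv_hpreq_ratio bounds how many high-priority requests may run back-to-back while normal work waits. Condensed into a sketch with hypothetical fields, the policy reads roughly as:

#include <stdbool.h>

struct sched_sketch {
	int running, active, hp_active, hreq_streak;
	int hp_ratio;
	bool normal_pending;
};

static bool allow_high(const struct sched_sketch *s)
{
	if (s->active >= s->running - 1)
		return false;		/* keep one thread for normal */
	if (s->hp_active == 0)
		return true;
	return !s->normal_pending || s->hreq_streak < s->hp_ratio;
}

static bool allow_normal(const struct sched_sketch *s)
{
	if (s->active < s->running - 2)
		return true;		/* plenty of slack */
	if (s->active >= s->running - 1)
		return false;
	return s->hp_active > 0;	/* HP side is already being served
					 * (or the service has no HP queue) */
}
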
- */ -static struct ptlrpc_request * -ptlrpc_server_request_get(struct ptlrpc_service_part *svcpt, bool force) -{ - struct ptlrpc_request *req = NULL; - - spin_lock(&svcpt->scp_req_lock); - - if (ptlrpc_server_high_pending(svcpt, force)) { - req = ptlrpc_nrs_req_get_nolock(svcpt, true, force); - if (req) { - svcpt->scp_hreq_count++; - goto got_request; - } - } - - if (ptlrpc_server_normal_pending(svcpt, force)) { - req = ptlrpc_nrs_req_get_nolock(svcpt, false, force); - if (req) { - svcpt->scp_hreq_count = 0; - goto got_request; - } - } - - spin_unlock(&svcpt->scp_req_lock); - return NULL; - -got_request: - svcpt->scp_nreqs_active++; - if (req->rq_hp) - svcpt->scp_nhreqs_active++; - - spin_unlock(&svcpt->scp_req_lock); - - if (likely(req->rq_export)) - class_export_rpc_inc(req->rq_export); - - return req; -} - -/** - * Handle freshly incoming reqs, add to timed early reply list, - * pass on to regular request queue. - * All incoming requests pass through here before getting into - * ptlrpc_server_handle_req later on. - */ -static int -ptlrpc_server_handle_req_in(struct ptlrpc_service_part *svcpt, - struct ptlrpc_thread *thread) -{ - struct ptlrpc_service *svc = svcpt->scp_service; - struct ptlrpc_request *req; - __u32 deadline; - int rc; - - spin_lock(&svcpt->scp_lock); - if (list_empty(&svcpt->scp_req_incoming)) { - spin_unlock(&svcpt->scp_lock); - return 0; - } - - req = list_entry(svcpt->scp_req_incoming.next, - struct ptlrpc_request, rq_list); - list_del_init(&req->rq_list); - svcpt->scp_nreqs_incoming--; - /* Consider this still a "queued" request as far as stats are - * concerned - */ - spin_unlock(&svcpt->scp_lock); - - /* go through security check/transform */ - rc = sptlrpc_svc_unwrap_request(req); - switch (rc) { - case SECSVC_OK: - break; - case SECSVC_COMPLETE: - target_send_reply(req, 0, OBD_FAIL_MDS_ALL_REPLY_NET); - goto err_req; - case SECSVC_DROP: - goto err_req; - default: - LBUG(); - } - - /* - * for null-flavored rpc, msg has been unpacked by sptlrpc, although - * redo it wouldn't be harmful. 
- */ - if (SPTLRPC_FLVR_POLICY(req->rq_flvr.sf_rpc) != SPTLRPC_POLICY_NULL) { - rc = ptlrpc_unpack_req_msg(req, req->rq_reqlen); - if (rc != 0) { - CERROR("error unpacking request: ptl %d from %s x%llu\n", - svc->srv_req_portal, libcfs_id2str(req->rq_peer), - req->rq_xid); - goto err_req; - } - } - - rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); - if (rc) { - CERROR("error unpacking ptlrpc body: ptl %d from %s x%llu\n", - svc->srv_req_portal, libcfs_id2str(req->rq_peer), - req->rq_xid); - goto err_req; - } - - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DROP_REQ_OPC) && - lustre_msg_get_opc(req->rq_reqmsg) == cfs_fail_val) { - CERROR("drop incoming rpc opc %u, x%llu\n", - cfs_fail_val, req->rq_xid); - goto err_req; - } - - rc = -EINVAL; - if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { - CERROR("wrong packet type received (type=%u) from %s\n", - lustre_msg_get_type(req->rq_reqmsg), - libcfs_id2str(req->rq_peer)); - goto err_req; - } - - switch (lustre_msg_get_opc(req->rq_reqmsg)) { - case MDS_WRITEPAGE: - case OST_WRITE: - req->rq_bulk_write = 1; - break; - case MDS_READPAGE: - case OST_READ: - case MGS_CONFIG_READ: - req->rq_bulk_read = 1; - break; - } - - CDEBUG(D_RPCTRACE, "got req x%llu\n", req->rq_xid); - - req->rq_export = class_conn2export( - lustre_msg_get_handle(req->rq_reqmsg)); - if (req->rq_export) { - rc = ptlrpc_check_req(req); - if (rc == 0) { - rc = sptlrpc_target_export_check(req->rq_export, req); - if (rc) - DEBUG_REQ(D_ERROR, req, "DROPPING req with illegal security flavor,"); - } - - if (rc) - goto err_req; - } - - /* req_in handling should/must be fast */ - if (ktime_get_real_seconds() - req->rq_arrival_time.tv_sec > 5) - DEBUG_REQ(D_WARNING, req, "Slow req_in handling %llds", - (s64)(ktime_get_real_seconds() - - req->rq_arrival_time.tv_sec)); - - /* Set rpc server deadline and add it to the timed list */ - deadline = (lustre_msghdr_get_flags(req->rq_reqmsg) & - MSGHDR_AT_SUPPORT) ? - /* The max time the client expects us to take */ - lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; - req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; - if (unlikely(deadline == 0)) { - DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); - goto err_req; - } - - req->rq_svc_thread = thread; - if (thread) { - /* initialize request session, it is needed for request - * processing by target - */ - rc = lu_context_init(&req->rq_session, - LCT_SERVER_SESSION | LCT_NOREF); - if (rc) { - CERROR("%s: failure to initialize session: rc = %d\n", - thread->t_name, rc); - goto err_req; - } - req->rq_session.lc_thread = thread; - lu_context_enter(&req->rq_session); - req->rq_svc_thread->t_env->le_ses = &req->rq_session; - } - - ptlrpc_at_add_timed(req); - - /* Move it over to the request processing queue */ - rc = ptlrpc_server_request_add(svcpt, req); - if (rc) - goto err_req; - - wake_up(&svcpt->scp_waitq); - return 1; - -err_req: - ptlrpc_server_finish_request(svcpt, req); - - return 1; -} - -/** - * Main incoming request handling logic. - * Calls handler function from service to do actual processing. 
- */ -static int -ptlrpc_server_handle_request(struct ptlrpc_service_part *svcpt, - struct ptlrpc_thread *thread) -{ - struct ptlrpc_service *svc = svcpt->scp_service; - struct ptlrpc_request *request; - struct timespec64 work_start; - struct timespec64 work_end; - struct timespec64 timediff; - struct timespec64 arrived; - unsigned long timediff_usecs; - unsigned long arrived_usecs; - int fail_opc = 0; - - request = ptlrpc_server_request_get(svcpt, false); - if (!request) - return 0; - - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) - fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; - else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) - fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; - - if (unlikely(fail_opc)) { - if (request->rq_export && request->rq_ops) - OBD_FAIL_TIMEOUT(fail_opc, 4); - } - - ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); - - if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) - libcfs_debug_dumplog(); - - ktime_get_real_ts64(&work_start); - timediff = timespec64_sub(work_start, request->rq_arrival_time); - timediff_usecs = timediff.tv_sec * USEC_PER_SEC + - timediff.tv_nsec / NSEC_PER_USEC; - if (likely(svc->srv_stats)) { - lprocfs_counter_add(svc->srv_stats, PTLRPC_REQWAIT_CNTR, - timediff_usecs); - lprocfs_counter_add(svc->srv_stats, PTLRPC_REQQDEPTH_CNTR, - svcpt->scp_nreqs_incoming); - lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, - svcpt->scp_nreqs_active); - lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, - at_get(&svcpt->scp_at_estimate)); - } - - if (likely(request->rq_export)) { - if (unlikely(ptlrpc_check_req(request))) - goto put_conn; - } - - /* Discard requests queued for longer than the deadline. - * The deadline is increased if we send an early reply. - */ - if (ktime_get_real_seconds() > request->rq_deadline) { - DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s: deadline %lld:%llds ago\n", - libcfs_id2str(request->rq_peer), - request->rq_deadline - - request->rq_arrival_time.tv_sec, - ktime_get_real_seconds() - request->rq_deadline); - goto put_conn; - } - - CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d\n", - current->comm, - (request->rq_export ? - (char *)request->rq_export->exp_client_uuid.uuid : "0"), - (request->rq_export ? 
- atomic_read(&request->rq_export->exp_refcount) : -99), - lustre_msg_get_status(request->rq_reqmsg), request->rq_xid, - libcfs_id2str(request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg)); - - if (lustre_msg_get_opc(request->rq_reqmsg) != OBD_PING) - CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, cfs_fail_val); - - CDEBUG(D_NET, "got req %llu\n", request->rq_xid); - - /* re-assign request and sesson thread to the current one */ - request->rq_svc_thread = thread; - if (thread) { - LASSERT(request->rq_session.lc_thread); - request->rq_session.lc_thread = thread; - request->rq_session.lc_cookie = 0x55; - thread->t_env->le_ses = &request->rq_session; - } - svc->srv_ops.so_req_handler(request); - - ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); - -put_conn: - if (unlikely(ktime_get_real_seconds() > request->rq_deadline)) { - DEBUG_REQ(D_WARNING, request, - "Request took longer than estimated (%lld:%llds); " - "client may timeout.", - (s64)request->rq_deadline - - request->rq_arrival_time.tv_sec, - (s64)ktime_get_real_seconds() - request->rq_deadline); - } - - ktime_get_real_ts64(&work_end); - timediff = timespec64_sub(work_end, work_start); - timediff_usecs = timediff.tv_sec * USEC_PER_SEC + - timediff.tv_nsec / NSEC_PER_USEC; - arrived = timespec64_sub(work_end, request->rq_arrival_time); - arrived_usecs = arrived.tv_sec * USEC_PER_SEC + - arrived.tv_nsec / NSEC_PER_USEC; - CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc %s:%s+%d:%d:x%llu:%s:%d Request processed in %ldus (%ldus total) trans %llu rc %d/%d\n", - current->comm, - (request->rq_export ? - (char *)request->rq_export->exp_client_uuid.uuid : "0"), - (request->rq_export ? - atomic_read(&request->rq_export->exp_refcount) : -99), - lustre_msg_get_status(request->rq_reqmsg), - request->rq_xid, - libcfs_id2str(request->rq_peer), - lustre_msg_get_opc(request->rq_reqmsg), - timediff_usecs, - arrived_usecs, - (request->rq_repmsg ? - lustre_msg_get_transno(request->rq_repmsg) : - request->rq_transno), - request->rq_status, - (request->rq_repmsg ? - lustre_msg_get_status(request->rq_repmsg) : -999)); - if (likely(svc->srv_stats && request->rq_reqmsg)) { - __u32 op = lustre_msg_get_opc(request->rq_reqmsg); - int opc = opcode_offset(op); - - if (opc > 0 && !(op == LDLM_ENQUEUE || op == MDS_REINT)) { - LASSERT(opc < LUSTRE_MAX_OPCODES); - lprocfs_counter_add(svc->srv_stats, - opc + EXTRA_MAX_OPCODES, - timediff_usecs); - } - } - if (unlikely(request->rq_early_count)) { - DEBUG_REQ(D_ADAPTTO, request, - "sent %d early replies before finishing in %llds", - request->rq_early_count, - (s64)work_end.tv_sec - - request->rq_arrival_time.tv_sec); - } - - ptlrpc_server_finish_active_request(svcpt, request); - - return 1; -} - -/** - * An internal function to process a single reply state object. - */ -static int -ptlrpc_handle_rs(struct ptlrpc_reply_state *rs) -{ - struct ptlrpc_service_part *svcpt = rs->rs_svcpt; - struct ptlrpc_service *svc = svcpt->scp_service; - struct obd_export *exp; - int nlocks; - int been_handled; - - exp = rs->rs_export; - - LASSERT(rs->rs_difficult); - LASSERT(rs->rs_scheduled); - LASSERT(list_empty(&rs->rs_list)); - - spin_lock(&exp->exp_lock); - /* Noop if removed already */ - list_del_init(&rs->rs_exp_list); - spin_unlock(&exp->exp_lock); - - /* The disk commit callback holds exp_uncommitted_replies_lock while it - * iterates over newly committed replies, removing them from - * exp_uncommitted_replies. It then drops this lock and schedules the - * replies it found for handling here. 
- * - * We can avoid contention for exp_uncommitted_replies_lock between the - * HRT threads and further commit callbacks by checking rs_committed - * which is set in the commit callback while it holds both - * rs_lock and exp_uncommitted_reples. - * - * If we see rs_committed clear, the commit callback _may_ not have - * handled this reply yet and we race with it to grab - * exp_uncommitted_replies_lock before removing the reply from - * exp_uncommitted_replies. Note that if we lose the race and the - * reply has already been removed, list_del_init() is a noop. - * - * If we see rs_committed set, we know the commit callback is handling, - * or has handled this reply since store reordering might allow us to - * see rs_committed set out of sequence. But since this is done - * holding rs_lock, we can be sure it has all completed once we hold - * rs_lock, which we do right next. - */ - if (!rs->rs_committed) { - spin_lock(&exp->exp_uncommitted_replies_lock); - list_del_init(&rs->rs_obd_list); - spin_unlock(&exp->exp_uncommitted_replies_lock); - } - - spin_lock(&rs->rs_lock); - - been_handled = rs->rs_handled; - rs->rs_handled = 1; - - nlocks = rs->rs_nlocks; /* atomic "steal", but */ - rs->rs_nlocks = 0; /* locks still on rs_locks! */ - - if (nlocks == 0 && !been_handled) { - /* If we see this, we should already have seen the warning - * in mds_steal_ack_locks() - */ - CDEBUG(D_HA, "All locks stolen from rs %p x%lld.t%lld o%d NID %s\n", - rs, - rs->rs_xid, rs->rs_transno, rs->rs_opc, - libcfs_nid2str(exp->exp_connection->c_peer.nid)); - } - - if ((!been_handled && rs->rs_on_net) || nlocks > 0) { - spin_unlock(&rs->rs_lock); - - if (!been_handled && rs->rs_on_net) { - LNetMDUnlink(rs->rs_md_h); - /* Ignore return code; we're racing with completion */ - } - - while (nlocks-- > 0) - ldlm_lock_decref(&rs->rs_locks[nlocks], - rs->rs_modes[nlocks]); - - spin_lock(&rs->rs_lock); - } - - rs->rs_scheduled = 0; - - if (!rs->rs_on_net) { - /* Off the net */ - spin_unlock(&rs->rs_lock); - - class_export_put(exp); - rs->rs_export = NULL; - ptlrpc_rs_decref(rs); - if (atomic_dec_and_test(&svcpt->scp_nreps_difficult) && - svc->srv_is_stopping) - wake_up_all(&svcpt->scp_waitq); - return 1; - } - - /* still on the net; callback will schedule */ - spin_unlock(&rs->rs_lock); - return 1; -} - -static void -ptlrpc_check_rqbd_pool(struct ptlrpc_service_part *svcpt) -{ - int avail = svcpt->scp_nrqbds_posted; - int low_water = test_req_buffer_pressure ? 0 : - svcpt->scp_service->srv_nbuf_per_group / 2; - - /* NB I'm not locking; just looking. */ - - /* CAVEAT EMPTOR: We might be allocating buffers here because we've - * allowed the request history to grow out of control. We could put a - * sanity check on that here and cull some history if we need the - * space. 
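
The rs_nlocks handling above is a common "steal under the lock" shape: take the whole batch while the lock is held, clear the shared counter, then do the heavy per-item work unlocked. In a userspace sketch (types hypothetical; a mutex for rs_lock):

#include <pthread.h>

#define MAX_LOCKS 8

struct rs_sketch {
	pthread_mutex_t lock;
	int nlocks;
	int locks[MAX_LOCKS];
};

static void put_locks(struct rs_sketch *rs, void (*decref)(int))
{
	int batch[MAX_LOCKS];
	int n, i;

	pthread_mutex_lock(&rs->lock);
	n = rs->nlocks;			/* atomic "steal"... */
	rs->nlocks = 0;			/* ...no new claims possible */
	for (i = 0; i < n; i++)
		batch[i] = rs->locks[i];
	pthread_mutex_unlock(&rs->lock);

	while (n-- > 0)
		decref(batch[n]);	/* heavy work, lock dropped */
}
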
- */ - - if (avail <= low_water) - ptlrpc_grow_req_bufs(svcpt, 1); - - if (svcpt->scp_service->srv_stats) { - lprocfs_counter_add(svcpt->scp_service->srv_stats, - PTLRPC_REQBUF_AVAIL_CNTR, avail); - } -} - -static inline int -ptlrpc_threads_enough(struct ptlrpc_service_part *svcpt) -{ - return svcpt->scp_nreqs_active < - svcpt->scp_nthrs_running - 1 - - (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL); -} - -/** - * allowed to create more threads - * user can call it w/o any lock but need to hold - * ptlrpc_service_part::scp_lock to get reliable result - */ -static inline int -ptlrpc_threads_increasable(struct ptlrpc_service_part *svcpt) -{ - return svcpt->scp_nthrs_running + - svcpt->scp_nthrs_starting < - svcpt->scp_service->srv_nthrs_cpt_limit; -} - -/** - * too many requests and allowed to create more threads - */ -static inline int -ptlrpc_threads_need_create(struct ptlrpc_service_part *svcpt) -{ - return !ptlrpc_threads_enough(svcpt) && - ptlrpc_threads_increasable(svcpt); -} - -static inline int -ptlrpc_thread_stopping(struct ptlrpc_thread *thread) -{ - return thread_is_stopping(thread) || - thread->t_svcpt->scp_service->srv_is_stopping; -} - -static inline int -ptlrpc_rqbd_pending(struct ptlrpc_service_part *svcpt) -{ - return !list_empty(&svcpt->scp_rqbd_idle) && - svcpt->scp_rqbd_timeout == 0; -} - -static inline int -ptlrpc_at_check(struct ptlrpc_service_part *svcpt) -{ - return svcpt->scp_at_check; -} - -/** - * requests wait on preprocessing - * user can call it w/o any lock but need to hold - * ptlrpc_service_part::scp_lock to get reliable result - */ -static inline int -ptlrpc_server_request_incoming(struct ptlrpc_service_part *svcpt) -{ - return !list_empty(&svcpt->scp_req_incoming); -} - -/* We perfer lifo queuing, but kernel doesn't provide that yet. */ -#ifndef wait_event_idle_exclusive_lifo -#define wait_event_idle_exclusive_lifo wait_event_idle_exclusive -#define wait_event_idle_exclusive_lifo_timeout wait_event_idle_exclusive_timeout -#endif - -static __attribute__((__noinline__)) int -ptlrpc_wait_event(struct ptlrpc_service_part *svcpt, - struct ptlrpc_thread *thread) -{ - /* Don't exit while there are replies to be handled */ - - /* XXX: Add this back when libcfs watchdog is merged upstream - lc_watchdog_disable(thread->t_watchdog); - */ - - cond_resched(); - - if (svcpt->scp_rqbd_timeout == 0) - wait_event_idle_exclusive_lifo( - svcpt->scp_waitq, - ptlrpc_thread_stopping(thread) || - ptlrpc_server_request_incoming(svcpt) || - ptlrpc_server_request_pending(svcpt, - false) || - ptlrpc_rqbd_pending(svcpt) || - ptlrpc_at_check(svcpt)); - else if (0 == wait_event_idle_exclusive_lifo_timeout( - svcpt->scp_waitq, - ptlrpc_thread_stopping(thread) || - ptlrpc_server_request_incoming(svcpt) || - ptlrpc_server_request_pending(svcpt, - false) || - ptlrpc_rqbd_pending(svcpt) || - ptlrpc_at_check(svcpt), - svcpt->scp_rqbd_timeout)) - svcpt->scp_rqbd_timeout = 0; - - if (ptlrpc_thread_stopping(thread)) - return -EINTR; - - /* - lc_watchdog_touch(thread->t_watchdog, - ptlrpc_server_get_timeout(svcpt)); - */ - return 0; -} - -/** - * Main thread body for service threads. - * Waits in a loop waiting for new requests to process to appear. - * Every time an incoming requests is added to its queue, a waitq - * is woken up and one of the threads will handle it. 
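
The #ifndef fallback above is a plain compatibility shim: when the kernel does not yet provide the LIFO wait variants, alias them to the FIFO ones so the file still builds. LIFO is merely preferred because the most recently run thread has the warmest caches and surplus threads stay asleep. The idiom in miniature (names hypothetical):

#ifndef wait_for_work_lifo
/* no LIFO flavour available: fall back to FIFO, semantics unchanged */
#define wait_for_work_lifo(q, cond)		wait_for_work(q, cond)
#define wait_for_work_lifo_timeout(q, cond, t)	\
	wait_for_work_timeout(q, cond, t)
#endif
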
- */ -static int ptlrpc_main(void *arg) -{ - struct ptlrpc_thread *thread = arg; - struct ptlrpc_service_part *svcpt = thread->t_svcpt; - struct ptlrpc_service *svc = svcpt->scp_service; - struct ptlrpc_reply_state *rs; - struct group_info *ginfo = NULL; - struct lu_env *env; - int counter = 0, rc = 0; - - thread->t_pid = current->pid; - unshare_fs_struct(); - - /* NB: we will call cfs_cpt_bind() for all threads, because we - * might want to run lustre server only on a subset of system CPUs, - * in that case ->scp_cpt is CFS_CPT_ANY - */ - rc = cfs_cpt_bind(svc->srv_cptable, svcpt->scp_cpt); - if (rc != 0) { - CWARN("%s: failed to bind %s on CPT %d\n", - svc->srv_name, thread->t_name, svcpt->scp_cpt); - } - - ginfo = groups_alloc(0); - if (!ginfo) { - rc = -ENOMEM; - goto out; - } - - set_current_groups(ginfo); - put_group_info(ginfo); - - if (svc->srv_ops.so_thr_init) { - rc = svc->srv_ops.so_thr_init(thread); - if (rc) - goto out; - } - - env = kzalloc(sizeof(*env), GFP_KERNEL); - if (!env) { - rc = -ENOMEM; - goto out_srv_fini; - } - - rc = lu_context_init(&env->le_ctx, - svc->srv_ctx_tags | LCT_REMEMBER | LCT_NOREF); - if (rc) - goto out_srv_fini; - - thread->t_env = env; - env->le_ctx.lc_thread = thread; - env->le_ctx.lc_cookie = 0x6; - - while (!list_empty(&svcpt->scp_rqbd_idle)) { - rc = ptlrpc_server_post_idle_rqbds(svcpt); - if (rc >= 0) - continue; - - CERROR("Failed to post rqbd for %s on CPT %d: %d\n", - svc->srv_name, svcpt->scp_cpt, rc); - goto out_srv_fini; - } - - /* Alloc reply state structure for this one */ - rs = kvzalloc(svc->srv_max_reply_size, GFP_KERNEL); - if (!rs) { - rc = -ENOMEM; - goto out_srv_fini; - } - - spin_lock(&svcpt->scp_lock); - - LASSERT(thread_is_starting(thread)); - thread_clear_flags(thread, SVC_STARTING); - - LASSERT(svcpt->scp_nthrs_starting == 1); - svcpt->scp_nthrs_starting--; - - /* SVC_STOPPING may already be set here if someone else is trying - * to stop the service while this new thread has been dynamically - * forked. We still set SVC_RUNNING to let our creator know that - * we are now running, however we will exit as soon as possible - */ - thread_add_flags(thread, SVC_RUNNING); - svcpt->scp_nthrs_running++; - spin_unlock(&svcpt->scp_lock); - - /* wake up our creator in case he's still waiting. */ - wake_up(&thread->t_ctl_waitq); - - /* - thread->t_watchdog = lc_watchdog_add(ptlrpc_server_get_timeout(svcpt), - NULL, NULL); - */ - - spin_lock(&svcpt->scp_rep_lock); - list_add(&rs->rs_list, &svcpt->scp_rep_idle); - wake_up(&svcpt->scp_rep_waitq); - spin_unlock(&svcpt->scp_rep_lock); - - CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, - svcpt->scp_nthrs_running); - - /* XXX maintain a list of all managed devices: insert here */ - while (!ptlrpc_thread_stopping(thread)) { - if (ptlrpc_wait_event(svcpt, thread)) - break; - - ptlrpc_check_rqbd_pool(svcpt); - - if (ptlrpc_threads_need_create(svcpt)) { - /* Ignore return code - we tried... 
*/ - ptlrpc_start_thread(svcpt, 0); - } - - /* Process all incoming reqs before handling any */ - if (ptlrpc_server_request_incoming(svcpt)) { - lu_context_enter(&env->le_ctx); - env->le_ses = NULL; - ptlrpc_server_handle_req_in(svcpt, thread); - lu_context_exit(&env->le_ctx); - - /* but limit ourselves in case of flood */ - if (counter++ < 100) - continue; - counter = 0; - } - - if (ptlrpc_at_check(svcpt)) - ptlrpc_at_check_timed(svcpt); - - if (ptlrpc_server_request_pending(svcpt, false)) { - lu_context_enter(&env->le_ctx); - ptlrpc_server_handle_request(svcpt, thread); - lu_context_exit(&env->le_ctx); - } - - if (ptlrpc_rqbd_pending(svcpt) && - ptlrpc_server_post_idle_rqbds(svcpt) < 0) { - /* I just failed to repost request buffers. - * Wait for a timeout (unless something else - * happens) before I try again - */ - svcpt->scp_rqbd_timeout = HZ / 10; - CDEBUG(D_RPCTRACE, "Posted buffers: %d\n", - svcpt->scp_nrqbds_posted); - } - } - - /* - lc_watchdog_delete(thread->t_watchdog); - thread->t_watchdog = NULL; - */ - -out_srv_fini: - /* - * deconstruct service specific state created by ptlrpc_start_thread() - */ - if (svc->srv_ops.so_thr_done) - svc->srv_ops.so_thr_done(thread); - - if (env) { - lu_context_fini(&env->le_ctx); - kfree(env); - } -out: - CDEBUG(D_RPCTRACE, "service thread [ %p : %u ] %d exiting: rc %d\n", - thread, thread->t_pid, thread->t_id, rc); - - spin_lock(&svcpt->scp_lock); - if (thread_test_and_clear_flags(thread, SVC_STARTING)) - svcpt->scp_nthrs_starting--; - - if (thread_test_and_clear_flags(thread, SVC_RUNNING)) { - /* must know immediately */ - svcpt->scp_nthrs_running--; - } - - thread->t_id = rc; - thread_add_flags(thread, SVC_STOPPED); - - wake_up(&thread->t_ctl_waitq); - spin_unlock(&svcpt->scp_lock); - - return rc; -} - -static int hrt_dont_sleep(struct ptlrpc_hr_thread *hrt, - struct list_head *replies) -{ - int result; - - spin_lock(&hrt->hrt_lock); - - list_splice_init(&hrt->hrt_queue, replies); - result = ptlrpc_hr.hr_stopping || !list_empty(replies); - - spin_unlock(&hrt->hrt_lock); - return result; -} - -/** - * Main body of "handle reply" function. 
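
hrt_dont_sleep() below splices the whole shared queue onto a caller-private list while holding hrt_lock, so producers are blocked only for a pointer swap and the replies are processed with no lock held. The same drain idiom, sketched for a singly linked queue:

#include <pthread.h>
#include <stddef.h>

struct qnode {
	struct qnode *next;
};

struct queue {
	pthread_mutex_t lock;
	struct qnode *head;	/* NULL when empty */
};

/* steal the entire queue; the caller walks the batch unlocked */
static struct qnode *queue_drain(struct queue *q)
{
	struct qnode *batch;

	pthread_mutex_lock(&q->lock);
	batch = q->head;
	q->head = NULL;		/* queue is instantly empty again */
	pthread_mutex_unlock(&q->lock);

	return batch;
}
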
- * It processes acked reply states - */ -static int ptlrpc_hr_main(void *arg) -{ - struct ptlrpc_hr_thread *hrt = arg; - struct ptlrpc_hr_partition *hrp = hrt->hrt_partition; - LIST_HEAD(replies); - char threadname[20]; - int rc; - - snprintf(threadname, sizeof(threadname), "ptlrpc_hr%02d_%03d", - hrp->hrp_cpt, hrt->hrt_id); - unshare_fs_struct(); - - rc = cfs_cpt_bind(ptlrpc_hr.hr_cpt_table, hrp->hrp_cpt); - if (rc != 0) { - CWARN("Failed to bind %s on CPT %d of CPT table %p: rc = %d\n", - threadname, hrp->hrp_cpt, ptlrpc_hr.hr_cpt_table, rc); - } - - atomic_inc(&hrp->hrp_nstarted); - wake_up(&ptlrpc_hr.hr_waitq); - - while (!ptlrpc_hr.hr_stopping) { - wait_event_idle(hrt->hrt_waitq, hrt_dont_sleep(hrt, &replies)); - - while (!list_empty(&replies)) { - struct ptlrpc_reply_state *rs; - - rs = list_entry(replies.prev, struct ptlrpc_reply_state, - rs_list); - list_del_init(&rs->rs_list); - ptlrpc_handle_rs(rs); - } - } - - atomic_inc(&hrp->hrp_nstopped); - wake_up(&ptlrpc_hr.hr_waitq); - - return 0; -} - -static void ptlrpc_stop_hr_threads(void) -{ - struct ptlrpc_hr_partition *hrp; - int i; - int j; - - ptlrpc_hr.hr_stopping = 1; - - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { - if (!hrp->hrp_thrs) - continue; /* uninitialized */ - for (j = 0; j < hrp->hrp_nthrs; j++) - wake_up_all(&hrp->hrp_thrs[j].hrt_waitq); - } - - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { - if (!hrp->hrp_thrs) - continue; /* uninitialized */ - wait_event(ptlrpc_hr.hr_waitq, - atomic_read(&hrp->hrp_nstopped) == - atomic_read(&hrp->hrp_nstarted)); - } -} - -static int ptlrpc_start_hr_threads(void) -{ - struct ptlrpc_hr_partition *hrp; - int i; - int j; - - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { - int rc = 0; - - for (j = 0; j < hrp->hrp_nthrs; j++) { - struct ptlrpc_hr_thread *hrt = &hrp->hrp_thrs[j]; - struct task_struct *task; - - task = kthread_run(ptlrpc_hr_main, - &hrp->hrp_thrs[j], - "ptlrpc_hr%02d_%03d", - hrp->hrp_cpt, hrt->hrt_id); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - break; - } - } - wait_event(ptlrpc_hr.hr_waitq, - atomic_read(&hrp->hrp_nstarted) == j); - - if (rc < 0) { - CERROR("cannot start reply handler thread %d:%d: rc = %d\n", - i, j, rc); - ptlrpc_stop_hr_threads(); - return rc; - } - } - return 0; -} - -static void ptlrpc_svcpt_stop_threads(struct ptlrpc_service_part *svcpt) -{ - struct ptlrpc_thread *thread; - LIST_HEAD(zombie); - - CDEBUG(D_INFO, "Stopping threads for service %s\n", - svcpt->scp_service->srv_name); - - spin_lock(&svcpt->scp_lock); - /* let the thread know that we would like it to stop asap */ - list_for_each_entry(thread, &svcpt->scp_threads, t_link) { - CDEBUG(D_INFO, "Stopping thread %s #%u\n", - svcpt->scp_service->srv_thread_name, thread->t_id); - thread_add_flags(thread, SVC_STOPPING); - } - - wake_up_all(&svcpt->scp_waitq); - - while (!list_empty(&svcpt->scp_threads)) { - thread = list_entry(svcpt->scp_threads.next, - struct ptlrpc_thread, t_link); - if (thread_is_stopped(thread)) { - list_del(&thread->t_link); - list_add(&thread->t_link, &zombie); - continue; - } - spin_unlock(&svcpt->scp_lock); - - CDEBUG(D_INFO, "waiting for stopping-thread %s #%u\n", - svcpt->scp_service->srv_thread_name, thread->t_id); - wait_event_idle(thread->t_ctl_waitq, - thread_is_stopped(thread)); - - spin_lock(&svcpt->scp_lock); - } - - spin_unlock(&svcpt->scp_lock); - - while (!list_empty(&zombie)) { - thread = list_entry(zombie.next, - struct ptlrpc_thread, t_link); - list_del(&thread->t_link); - kfree(thread); - } -} - -/** - * Stops all threads 
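
The stop path above is a counting rendezvous: publish a stop flag, wake everyone, then wait until hrp_nstopped catches up with hrp_nstarted. A compressed sketch, using a single pthread condvar where the kernel code uses separate per-thread and controller waitqueues (struct hypothetical; each worker increments nstopped and signals waitq as it exits):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct hr_sketch {
	atomic_int nstarted, nstopped;
	atomic_bool stopping;
	pthread_mutex_t lock;
	pthread_cond_t waitq;
};

static void hr_stop(struct hr_sketch *hr)
{
	atomic_store(&hr->stopping, true);

	pthread_mutex_lock(&hr->lock);
	pthread_cond_broadcast(&hr->waitq);	/* kick sleeping workers */
	while (atomic_load(&hr->nstopped) != atomic_load(&hr->nstarted))
		pthread_cond_wait(&hr->waitq, &hr->lock);
	pthread_mutex_unlock(&hr->lock);
}
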
of a particular service \a svc - */ -static void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - int i; - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (svcpt->scp_service) - ptlrpc_svcpt_stop_threads(svcpt); - } -} - -int ptlrpc_start_threads(struct ptlrpc_service *svc) -{ - int rc = 0; - int i; - int j; - - /* We require 2 threads min, see note in ptlrpc_server_handle_request */ - LASSERT(svc->srv_nthrs_cpt_init >= PTLRPC_NTHRS_INIT); - - for (i = 0; i < svc->srv_ncpts; i++) { - for (j = 0; j < svc->srv_nthrs_cpt_init; j++) { - rc = ptlrpc_start_thread(svc->srv_parts[i], 1); - if (rc == 0) - continue; - - if (rc != -EMFILE) - goto failed; - /* We have enough threads, don't start more. b=15759 */ - break; - } - } - - return 0; - failed: - CERROR("cannot start %s thread #%d_%d: rc %d\n", - svc->srv_thread_name, i, j, rc); - ptlrpc_stop_all_threads(svc); - return rc; -} - -int ptlrpc_start_thread(struct ptlrpc_service_part *svcpt, int wait) -{ - struct ptlrpc_thread *thread; - struct ptlrpc_service *svc; - struct task_struct *task; - int rc; - - svc = svcpt->scp_service; - - CDEBUG(D_RPCTRACE, "%s[%d] started %d min %d max %d\n", - svc->srv_name, svcpt->scp_cpt, svcpt->scp_nthrs_running, - svc->srv_nthrs_cpt_init, svc->srv_nthrs_cpt_limit); - - again: - if (unlikely(svc->srv_is_stopping)) - return -ESRCH; - - if (!ptlrpc_threads_increasable(svcpt) || - (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && - svcpt->scp_nthrs_running == svc->srv_nthrs_cpt_init - 1)) - return -EMFILE; - - thread = kzalloc_node(sizeof(*thread), GFP_NOFS, - cfs_cpt_spread_node(svc->srv_cptable, - svcpt->scp_cpt)); - if (!thread) - return -ENOMEM; - init_waitqueue_head(&thread->t_ctl_waitq); - - spin_lock(&svcpt->scp_lock); - if (!ptlrpc_threads_increasable(svcpt)) { - spin_unlock(&svcpt->scp_lock); - kfree(thread); - return -EMFILE; - } - - if (svcpt->scp_nthrs_starting != 0) { - /* serialize starting because some modules (obdfilter) - * might require unique and contiguous t_id - */ - LASSERT(svcpt->scp_nthrs_starting == 1); - spin_unlock(&svcpt->scp_lock); - kfree(thread); - if (wait) { - CDEBUG(D_INFO, "Waiting for creating thread %s #%d\n", - svc->srv_thread_name, svcpt->scp_thr_nextid); - schedule(); - goto again; - } - - CDEBUG(D_INFO, "Creating thread %s #%d race, retry later\n", - svc->srv_thread_name, svcpt->scp_thr_nextid); - return -EAGAIN; - } - - svcpt->scp_nthrs_starting++; - thread->t_id = svcpt->scp_thr_nextid++; - thread_add_flags(thread, SVC_STARTING); - thread->t_svcpt = svcpt; - - list_add(&thread->t_link, &svcpt->scp_threads); - spin_unlock(&svcpt->scp_lock); - - if (svcpt->scp_cpt >= 0) { - snprintf(thread->t_name, sizeof(thread->t_name), "%s%02d_%03d", - svc->srv_thread_name, svcpt->scp_cpt, thread->t_id); - } else { - snprintf(thread->t_name, sizeof(thread->t_name), "%s_%04d", - svc->srv_thread_name, thread->t_id); - } - - CDEBUG(D_RPCTRACE, "starting thread '%s'\n", thread->t_name); - task = kthread_run(ptlrpc_main, thread, "%s", thread->t_name); - if (IS_ERR(task)) { - rc = PTR_ERR(task); - CERROR("cannot start thread '%s': rc = %d\n", - thread->t_name, rc); - spin_lock(&svcpt->scp_lock); - --svcpt->scp_nthrs_starting; - if (thread_is_stopping(thread)) { - /* this ptlrpc_thread is being handled - * by ptlrpc_svcpt_stop_threads now - */ - thread_add_flags(thread, SVC_STOPPED); - wake_up(&thread->t_ctl_waitq); - spin_unlock(&svcpt->scp_lock); - } else { - list_del(&thread->t_link); - spin_unlock(&svcpt->scp_lock); - kfree(thread); - } - 
return rc; - } - - if (!wait) - return 0; - - wait_event_idle(thread->t_ctl_waitq, - thread_is_running(thread) || thread_is_stopped(thread)); - - rc = thread_is_stopped(thread) ? thread->t_id : 0; - return rc; -} - -int ptlrpc_hr_init(void) -{ - struct ptlrpc_hr_partition *hrp; - struct ptlrpc_hr_thread *hrt; - int rc; - int i; - int j; - int weight; - - memset(&ptlrpc_hr, 0, sizeof(ptlrpc_hr)); - ptlrpc_hr.hr_cpt_table = cfs_cpt_tab; - - ptlrpc_hr.hr_partitions = cfs_percpt_alloc(ptlrpc_hr.hr_cpt_table, - sizeof(*hrp)); - if (!ptlrpc_hr.hr_partitions) - return -ENOMEM; - - init_waitqueue_head(&ptlrpc_hr.hr_waitq); - - weight = cpumask_weight(topology_sibling_cpumask(0)); - - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { - hrp->hrp_cpt = i; - - atomic_set(&hrp->hrp_nstarted, 0); - atomic_set(&hrp->hrp_nstopped, 0); - - hrp->hrp_nthrs = cfs_cpt_weight(ptlrpc_hr.hr_cpt_table, i); - hrp->hrp_nthrs /= weight; - if (hrp->hrp_nthrs == 0) - hrp->hrp_nthrs = 1; - - hrp->hrp_thrs = - kzalloc_node(hrp->hrp_nthrs * sizeof(*hrt), GFP_NOFS, - cfs_cpt_spread_node(ptlrpc_hr.hr_cpt_table, - i)); - if (!hrp->hrp_thrs) { - rc = -ENOMEM; - goto out; - } - - for (j = 0; j < hrp->hrp_nthrs; j++) { - hrt = &hrp->hrp_thrs[j]; - - hrt->hrt_id = j; - hrt->hrt_partition = hrp; - init_waitqueue_head(&hrt->hrt_waitq); - spin_lock_init(&hrt->hrt_lock); - INIT_LIST_HEAD(&hrt->hrt_queue); - } - } - - rc = ptlrpc_start_hr_threads(); -out: - if (rc != 0) - ptlrpc_hr_fini(); - return rc; -} - -void ptlrpc_hr_fini(void) -{ - struct ptlrpc_hr_partition *hrp; - int i; - - if (!ptlrpc_hr.hr_partitions) - return; - - ptlrpc_stop_hr_threads(); - - cfs_percpt_for_each(hrp, i, ptlrpc_hr.hr_partitions) { - kfree(hrp->hrp_thrs); - } - - cfs_percpt_free(ptlrpc_hr.hr_partitions); - ptlrpc_hr.hr_partitions = NULL; -} - -/** - * Wait until all already scheduled replies are processed. - */ -static void ptlrpc_wait_replies(struct ptlrpc_service_part *svcpt) -{ - while (1) { - int rc; - - rc = wait_event_idle_timeout( - svcpt->scp_waitq, - atomic_read(&svcpt->scp_nreps_difficult) == 0, - 10 * HZ); - if (rc > 0) - break; - CWARN("Unexpectedly long timeout %s %p\n", - svcpt->scp_service->srv_name, svcpt->scp_service); - } -} - -static void -ptlrpc_service_del_atimer(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - int i; - - /* early disarm AT timer... */ - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (svcpt->scp_service) - del_timer(&svcpt->scp_at_timer); - } -} - -static void -ptlrpc_service_unlink_rqbd(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - struct ptlrpc_request_buffer_desc *rqbd; - int cnt; - int rc; - int i; - - /* All history will be culled when the next request buffer is - * freed in ptlrpc_service_purge_all() - */ - svc->srv_hist_nrqbds_cpt_max = 0; - - rc = LNetClearLazyPortal(svc->srv_req_portal); - LASSERT(rc == 0); - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (!svcpt->scp_service) - break; - - /* Unlink all the request buffers. 
This forces a 'final' - * event with its 'unlink' flag set for each posted rqbd - */ - list_for_each_entry(rqbd, &svcpt->scp_rqbd_posted, - rqbd_list) { - rc = LNetMDUnlink(rqbd->rqbd_md_h); - LASSERT(rc == 0 || rc == -ENOENT); - } - } - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (!svcpt->scp_service) - break; - - /* Wait for the network to release any buffers - * it's currently filling - */ - spin_lock(&svcpt->scp_lock); - while (svcpt->scp_nrqbds_posted != 0) { - spin_unlock(&svcpt->scp_lock); - /* Network access will complete in finite time but - * the HUGE timeout lets us CWARN for visibility - * of sluggish LNDs - */ - cnt = 0; - while (cnt < LONG_UNLINK && - (rc = wait_event_idle_timeout(svcpt->scp_waitq, - svcpt->scp_nrqbds_posted == 0, - HZ)) == 0) - cnt++; - if (rc == 0) { - CWARN("Service %s waiting for request buffers\n", - svcpt->scp_service->srv_name); - } - spin_lock(&svcpt->scp_lock); - } - spin_unlock(&svcpt->scp_lock); - } -} - -static void -ptlrpc_service_purge_all(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - struct ptlrpc_request_buffer_desc *rqbd; - struct ptlrpc_request *req; - struct ptlrpc_reply_state *rs; - int i; - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (!svcpt->scp_service) - break; - - spin_lock(&svcpt->scp_rep_lock); - while (!list_empty(&svcpt->scp_rep_active)) { - rs = list_entry(svcpt->scp_rep_active.next, - struct ptlrpc_reply_state, rs_list); - spin_lock(&rs->rs_lock); - ptlrpc_schedule_difficult_reply(rs); - spin_unlock(&rs->rs_lock); - } - spin_unlock(&svcpt->scp_rep_lock); - - /* purge the request queue. NB No new replies (rqbds - * all unlinked) and no service threads, so I'm the only - * thread noodling the request queue now - */ - while (!list_empty(&svcpt->scp_req_incoming)) { - req = list_entry(svcpt->scp_req_incoming.next, - struct ptlrpc_request, rq_list); - - list_del(&req->rq_list); - svcpt->scp_nreqs_incoming--; - ptlrpc_server_finish_request(svcpt, req); - } - - while (ptlrpc_server_request_pending(svcpt, true)) { - req = ptlrpc_server_request_get(svcpt, true); - ptlrpc_server_finish_active_request(svcpt, req); - } - - LASSERT(list_empty(&svcpt->scp_rqbd_posted)); - LASSERT(svcpt->scp_nreqs_incoming == 0); - LASSERT(svcpt->scp_nreqs_active == 0); - /* history should have been culled by - * ptlrpc_server_finish_request - */ - LASSERT(svcpt->scp_hist_nrqbds == 0); - - /* Now free all the request buffers since nothing - * references them any more... 
- */ - - while (!list_empty(&svcpt->scp_rqbd_idle)) { - rqbd = list_entry(svcpt->scp_rqbd_idle.next, - struct ptlrpc_request_buffer_desc, - rqbd_list); - ptlrpc_free_rqbd(rqbd); - } - ptlrpc_wait_replies(svcpt); - - while (!list_empty(&svcpt->scp_rep_idle)) { - rs = list_entry(svcpt->scp_rep_idle.next, - struct ptlrpc_reply_state, - rs_list); - list_del(&rs->rs_list); - kvfree(rs); - } - } -} - -static void -ptlrpc_service_free(struct ptlrpc_service *svc) -{ - struct ptlrpc_service_part *svcpt; - struct ptlrpc_at_array *array; - int i; - - ptlrpc_service_for_each_part(svcpt, i, svc) { - if (!svcpt->scp_service) - break; - - /* In case somebody rearmed this in the meantime */ - del_timer(&svcpt->scp_at_timer); - array = &svcpt->scp_at_array; - - kfree(array->paa_reqs_array); - array->paa_reqs_array = NULL; - kfree(array->paa_reqs_count); - array->paa_reqs_count = NULL; - } - - ptlrpc_service_for_each_part(svcpt, i, svc) - kfree(svcpt); - - if (svc->srv_cpts) - cfs_expr_list_values_free(svc->srv_cpts, svc->srv_ncpts); - - kfree(svc); -} - -int ptlrpc_unregister_service(struct ptlrpc_service *service) -{ - CDEBUG(D_NET, "%s: tearing down\n", service->srv_name); - - service->srv_is_stopping = 1; - - mutex_lock(&ptlrpc_all_services_mutex); - list_del_init(&service->srv_list); - mutex_unlock(&ptlrpc_all_services_mutex); - - ptlrpc_service_del_atimer(service); - ptlrpc_stop_all_threads(service); - - ptlrpc_service_unlink_rqbd(service); - ptlrpc_service_purge_all(service); - ptlrpc_service_nrs_cleanup(service); - - ptlrpc_lprocfs_unregister_service(service); - ptlrpc_sysfs_unregister_service(service); - - ptlrpc_service_free(service); - - return 0; -} -EXPORT_SYMBOL(ptlrpc_unregister_service); diff --git a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c b/drivers/staging/lustre/lustre/ptlrpc/wiretest.c deleted file mode 100644 index f9394c3e1ee232b143cbe145465d836ed25ff389..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/lustre/ptlrpc/wiretest.c +++ /dev/null @@ -1,4210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.gnu.org/licenses/gpl-2.0.html - * - * GPL HEADER END - */ -/* - * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. - * Use is subject to license terms. - * - * Copyright (c) 2011, 2015, Intel Corporation. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. 
- */
-
-#define DEBUG_SUBSYSTEM S_RPC
-
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-
-#include <obd_support.h>
-#include <obd_class.h>
-#include <lustre_net.h>
-#include <lustre_disk.h>
-#include "ptlrpc_internal.h"
-
-void lustre_assert_wire_constants(void)
-{
- /* Wire protocol assertions generated by 'wirecheck'
- * (make -C lustre/utils newwiretest)
- * running on Linux centos6-bis 2.6.32-358.0.1.el6-head
- * #3 SMP Wed Apr 17 17:37:43 CEST 2013
- * with gcc version 4.4.6 20110731 (Red Hat 4.4.6-3) (GCC)
- */
-
- /* Constants... */
- LASSERTF(PTL_RPC_MSG_REQUEST == 4711, "found %lld\n",
- (long long)PTL_RPC_MSG_REQUEST);
- LASSERTF(PTL_RPC_MSG_ERR == 4712, "found %lld\n",
- (long long)PTL_RPC_MSG_ERR);
- LASSERTF(PTL_RPC_MSG_REPLY == 4713, "found %lld\n",
- (long long)PTL_RPC_MSG_REPLY);
- LASSERTF(MDS_DIR_END_OFF == 0xfffffffffffffffeULL, "found 0x%.16llxULL\n",
- MDS_DIR_END_OFF);
- LASSERTF(DEAD_HANDLE_MAGIC == 0xdeadbeefcafebabeULL, "found 0x%.16llxULL\n",
- DEAD_HANDLE_MAGIC);
- BUILD_BUG_ON(MTI_NAME_MAXLEN != 64);
- LASSERTF(OST_REPLY == 0, "found %lld\n",
- (long long)OST_REPLY);
- LASSERTF(OST_GETATTR == 1, "found %lld\n",
- (long long)OST_GETATTR);
- LASSERTF(OST_SETATTR == 2, "found %lld\n",
- (long long)OST_SETATTR);
- LASSERTF(OST_READ == 3, "found %lld\n",
- (long long)OST_READ);
- LASSERTF(OST_WRITE == 4, "found %lld\n",
- (long long)OST_WRITE);
- LASSERTF(OST_CREATE == 5, "found %lld\n",
- (long long)OST_CREATE);
- LASSERTF(OST_DESTROY == 6, "found %lld\n",
- (long long)OST_DESTROY);
- LASSERTF(OST_GET_INFO == 7, "found %lld\n",
- (long long)OST_GET_INFO);
- LASSERTF(OST_CONNECT == 8, "found %lld\n",
- (long long)OST_CONNECT);
- LASSERTF(OST_DISCONNECT == 9, "found %lld\n",
- (long long)OST_DISCONNECT);
- LASSERTF(OST_PUNCH == 10, "found %lld\n",
- (long long)OST_PUNCH);
- LASSERTF(OST_OPEN == 11, "found %lld\n",
- (long long)OST_OPEN);
- LASSERTF(OST_CLOSE == 12, "found %lld\n",
- (long long)OST_CLOSE);
- LASSERTF(OST_STATFS == 13, "found %lld\n",
- (long long)OST_STATFS);
- LASSERTF(OST_SYNC == 16, "found %lld\n",
- (long long)OST_SYNC);
- LASSERTF(OST_SET_INFO == 17, "found %lld\n",
- (long long)OST_SET_INFO);
- LASSERTF(OST_QUOTACHECK == 18, "found %lld\n",
- (long long)OST_QUOTACHECK);
- LASSERTF(OST_QUOTACTL == 19, "found %lld\n",
- (long long)OST_QUOTACTL);
- LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, "found %lld\n",
- (long long)OST_QUOTA_ADJUST_QUNIT);
- LASSERTF(OST_LAST_OPC == 21, "found %lld\n",
- (long long)OST_LAST_OPC);
- LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL, "found 0x%.16llxULL\n",
- OBD_OBJECT_EOF);
- LASSERTF(OST_MIN_PRECREATE == 32, "found %lld\n",
- (long long)OST_MIN_PRECREATE);
- LASSERTF(OST_MAX_PRECREATE == 20000, "found %lld\n",
- (long long)OST_MAX_PRECREATE);
- LASSERTF(OST_LVB_ERR_INIT == 0xffbadbad80000000ULL, "found 0x%.16llxULL\n",
- OST_LVB_ERR_INIT);
- LASSERTF(OST_LVB_ERR_MASK == 0xffbadbad00000000ULL, "found 0x%.16llxULL\n",
- OST_LVB_ERR_MASK);
- LASSERTF(MDS_FIRST_OPC == 33, "found %lld\n",
- (long long)MDS_FIRST_OPC);
- LASSERTF(MDS_GETATTR == 33, "found %lld\n",
- (long long)MDS_GETATTR);
- LASSERTF(MDS_GETATTR_NAME == 34, "found %lld\n",
- (long long)MDS_GETATTR_NAME);
- LASSERTF(MDS_CLOSE == 35, "found %lld\n",
- (long long)MDS_CLOSE);
- LASSERTF(MDS_REINT == 36, "found %lld\n",
- (long long)MDS_REINT);
- LASSERTF(MDS_READPAGE == 37, "found %lld\n",
- (long long)MDS_READPAGE);
- LASSERTF(MDS_CONNECT == 38, "found %lld\n",
- (long long)MDS_CONNECT);
- LASSERTF(MDS_DISCONNECT == 39, "found %lld\n",
- (long long)MDS_DISCONNECT);
- LASSERTF(MDS_GETSTATUS == 40, "found %lld\n",
- (long long)MDS_GETSTATUS);
- 
LASSERTF(MDS_STATFS == 41, "found %lld\n", - (long long)MDS_STATFS); - LASSERTF(MDS_PIN == 42, "found %lld\n", - (long long)MDS_PIN); - LASSERTF(MDS_UNPIN == 43, "found %lld\n", - (long long)MDS_UNPIN); - LASSERTF(MDS_SYNC == 44, "found %lld\n", - (long long)MDS_SYNC); - LASSERTF(MDS_DONE_WRITING == 45, "found %lld\n", - (long long)MDS_DONE_WRITING); - LASSERTF(MDS_SET_INFO == 46, "found %lld\n", - (long long)MDS_SET_INFO); - LASSERTF(MDS_QUOTACHECK == 47, "found %lld\n", - (long long)MDS_QUOTACHECK); - LASSERTF(MDS_QUOTACTL == 48, "found %lld\n", - (long long)MDS_QUOTACTL); - LASSERTF(MDS_GETXATTR == 49, "found %lld\n", - (long long)MDS_GETXATTR); - LASSERTF(MDS_SETXATTR == 50, "found %lld\n", - (long long)MDS_SETXATTR); - LASSERTF(MDS_WRITEPAGE == 51, "found %lld\n", - (long long)MDS_WRITEPAGE); - LASSERTF(MDS_IS_SUBDIR == 52, "found %lld\n", - (long long)MDS_IS_SUBDIR); - LASSERTF(MDS_GET_INFO == 53, "found %lld\n", - (long long)MDS_GET_INFO); - LASSERTF(MDS_HSM_STATE_GET == 54, "found %lld\n", - (long long)MDS_HSM_STATE_GET); - LASSERTF(MDS_HSM_STATE_SET == 55, "found %lld\n", - (long long)MDS_HSM_STATE_SET); - LASSERTF(MDS_HSM_ACTION == 56, "found %lld\n", - (long long)MDS_HSM_ACTION); - LASSERTF(MDS_HSM_PROGRESS == 57, "found %lld\n", - (long long)MDS_HSM_PROGRESS); - LASSERTF(MDS_HSM_REQUEST == 58, "found %lld\n", - (long long)MDS_HSM_REQUEST); - LASSERTF(MDS_HSM_CT_REGISTER == 59, "found %lld\n", - (long long)MDS_HSM_CT_REGISTER); - LASSERTF(MDS_HSM_CT_UNREGISTER == 60, "found %lld\n", - (long long)MDS_HSM_CT_UNREGISTER); - LASSERTF(MDS_SWAP_LAYOUTS == 61, "found %lld\n", - (long long)MDS_SWAP_LAYOUTS); - LASSERTF(MDS_LAST_OPC == 62, "found %lld\n", - (long long)MDS_LAST_OPC); - LASSERTF(REINT_SETATTR == 1, "found %lld\n", - (long long)REINT_SETATTR); - LASSERTF(REINT_CREATE == 2, "found %lld\n", - (long long)REINT_CREATE); - LASSERTF(REINT_LINK == 3, "found %lld\n", - (long long)REINT_LINK); - LASSERTF(REINT_UNLINK == 4, "found %lld\n", - (long long)REINT_UNLINK); - LASSERTF(REINT_RENAME == 5, "found %lld\n", - (long long)REINT_RENAME); - LASSERTF(REINT_OPEN == 6, "found %lld\n", - (long long)REINT_OPEN); - LASSERTF(REINT_SETXATTR == 7, "found %lld\n", - (long long)REINT_SETXATTR); - LASSERTF(REINT_RMENTRY == 8, "found %lld\n", - (long long)REINT_RMENTRY); - LASSERTF(REINT_MIGRATE == 9, "found %lld\n", - (long long)REINT_MIGRATE); - LASSERTF(REINT_MAX == 10, "found %lld\n", - (long long)REINT_MAX); - LASSERTF(DISP_IT_EXECD == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)DISP_IT_EXECD); - LASSERTF(DISP_LOOKUP_EXECD == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)DISP_LOOKUP_EXECD); - LASSERTF(DISP_LOOKUP_NEG == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned int)DISP_LOOKUP_NEG); - LASSERTF(DISP_LOOKUP_POS == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned int)DISP_LOOKUP_POS); - LASSERTF(DISP_OPEN_CREATE == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned int)DISP_OPEN_CREATE); - LASSERTF(DISP_OPEN_OPEN == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned int)DISP_OPEN_OPEN); - LASSERTF(DISP_ENQ_COMPLETE == 0x00400000UL, "found 0x%.8xUL\n", - (unsigned int)DISP_ENQ_COMPLETE); - LASSERTF(DISP_ENQ_OPEN_REF == 0x00800000UL, "found 0x%.8xUL\n", - (unsigned int)DISP_ENQ_OPEN_REF); - LASSERTF(DISP_ENQ_CREATE_REF == 0x01000000UL, "found 0x%.8xUL\n", - (unsigned int)DISP_ENQ_CREATE_REF); - LASSERTF(DISP_OPEN_LOCK == 0x02000000UL, "found 0x%.8xUL\n", - (unsigned int)DISP_OPEN_LOCK); - LASSERTF(MDS_STATUS_CONN == 1, "found %lld\n", - (long long)MDS_STATUS_CONN); - LASSERTF(MDS_STATUS_LOV == 
2, "found %lld\n", - (long long)MDS_STATUS_LOV); - LASSERTF(MDS_ATTR_MODE == 0x0000000000000001ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_MODE); - LASSERTF(MDS_ATTR_UID == 0x0000000000000002ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_UID); - LASSERTF(MDS_ATTR_GID == 0x0000000000000004ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_GID); - LASSERTF(MDS_ATTR_SIZE == 0x0000000000000008ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_SIZE); - LASSERTF(MDS_ATTR_ATIME == 0x0000000000000010ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_ATIME); - LASSERTF(MDS_ATTR_MTIME == 0x0000000000000020ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_MTIME); - LASSERTF(MDS_ATTR_CTIME == 0x0000000000000040ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_CTIME); - LASSERTF(MDS_ATTR_ATIME_SET == 0x0000000000000080ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_ATIME_SET); - LASSERTF(MDS_ATTR_MTIME_SET == 0x0000000000000100ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_MTIME_SET); - LASSERTF(MDS_ATTR_FORCE == 0x0000000000000200ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_FORCE); - LASSERTF(MDS_ATTR_ATTR_FLAG == 0x0000000000000400ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_ATTR_FLAG); - LASSERTF(MDS_ATTR_KILL_SUID == 0x0000000000000800ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_KILL_SUID); - LASSERTF(MDS_ATTR_KILL_SGID == 0x0000000000001000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_KILL_SGID); - LASSERTF(MDS_ATTR_CTIME_SET == 0x0000000000002000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_CTIME_SET); - LASSERTF(MDS_ATTR_FROM_OPEN == 0x0000000000004000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_FROM_OPEN); - LASSERTF(MDS_ATTR_BLOCKS == 0x0000000000008000ULL, "found 0x%.16llxULL\n", - (long long)MDS_ATTR_BLOCKS); - LASSERTF(FLD_QUERY == 900, "found %lld\n", - (long long)FLD_QUERY); - LASSERTF(FLD_FIRST_OPC == 900, "found %lld\n", - (long long)FLD_FIRST_OPC); - LASSERTF(FLD_READ == 901, "found %lld\n", - (long long)FLD_READ); - LASSERTF(FLD_LAST_OPC == 902, "found %lld\n", - (long long)FLD_LAST_OPC); - LASSERTF(SEQ_QUERY == 700, "found %lld\n", - (long long)SEQ_QUERY); - LASSERTF(SEQ_FIRST_OPC == 700, "found %lld\n", - (long long)SEQ_FIRST_OPC); - LASSERTF(SEQ_LAST_OPC == 701, "found %lld\n", - (long long)SEQ_LAST_OPC); - LASSERTF(SEQ_ALLOC_SUPER == 0, "found %lld\n", - (long long)SEQ_ALLOC_SUPER); - LASSERTF(SEQ_ALLOC_META == 1, "found %lld\n", - (long long)SEQ_ALLOC_META); - LASSERTF(LDLM_ENQUEUE == 101, "found %lld\n", - (long long)LDLM_ENQUEUE); - LASSERTF(LDLM_CONVERT == 102, "found %lld\n", - (long long)LDLM_CONVERT); - LASSERTF(LDLM_CANCEL == 103, "found %lld\n", - (long long)LDLM_CANCEL); - LASSERTF(LDLM_BL_CALLBACK == 104, "found %lld\n", - (long long)LDLM_BL_CALLBACK); - LASSERTF(LDLM_CP_CALLBACK == 105, "found %lld\n", - (long long)LDLM_CP_CALLBACK); - LASSERTF(LDLM_GL_CALLBACK == 106, "found %lld\n", - (long long)LDLM_GL_CALLBACK); - LASSERTF(LDLM_SET_INFO == 107, "found %lld\n", - (long long)LDLM_SET_INFO); - LASSERTF(LDLM_LAST_OPC == 108, "found %lld\n", - (long long)LDLM_LAST_OPC); - LASSERTF(LCK_MINMODE == 0, "found %lld\n", - (long long)LCK_MINMODE); - LASSERTF(LCK_EX == 1, "found %lld\n", - (long long)LCK_EX); - LASSERTF(LCK_PW == 2, "found %lld\n", - (long long)LCK_PW); - LASSERTF(LCK_PR == 4, "found %lld\n", - (long long)LCK_PR); - LASSERTF(LCK_CW == 8, "found %lld\n", - (long long)LCK_CW); - LASSERTF(LCK_CR == 16, "found %lld\n", - (long long)LCK_CR); - LASSERTF(LCK_NL == 32, "found %lld\n", - (long long)LCK_NL); - 
LASSERTF(LCK_GROUP == 64, "found %lld\n", - (long long)LCK_GROUP); - LASSERTF(LCK_COS == 128, "found %lld\n", - (long long)LCK_COS); - LASSERTF(LCK_MAXMODE == 129, "found %lld\n", - (long long)LCK_MAXMODE); - LASSERTF(LCK_MODE_NUM == 8, "found %lld\n", - (long long)LCK_MODE_NUM); - BUILD_BUG_ON(LDLM_PLAIN != 10); - BUILD_BUG_ON(LDLM_EXTENT != 11); - BUILD_BUG_ON(LDLM_FLOCK != 12); - BUILD_BUG_ON(LDLM_IBITS != 13); - BUILD_BUG_ON(LDLM_MAX_TYPE != 14); - BUILD_BUG_ON(LUSTRE_RES_ID_SEQ_OFF != 0); - BUILD_BUG_ON(LUSTRE_RES_ID_VER_OID_OFF != 1); - BUILD_BUG_ON(LUSTRE_RES_ID_QUOTA_SEQ_OFF != 2); - BUILD_BUG_ON(LUSTRE_RES_ID_QUOTA_VER_OID_OFF != 3); - BUILD_BUG_ON(LUSTRE_RES_ID_HSH_OFF != 3); - LASSERTF(OBD_PING == 400, "found %lld\n", - (long long)OBD_PING); - LASSERTF(OBD_LOG_CANCEL == 401, "found %lld\n", - (long long)OBD_LOG_CANCEL); - LASSERTF(OBD_QC_CALLBACK == 402, "found %lld\n", - (long long)OBD_QC_CALLBACK); - LASSERTF(OBD_IDX_READ == 403, "found %lld\n", - (long long)OBD_IDX_READ); - LASSERTF(OBD_LAST_OPC == 404, "found %lld\n", - (long long)OBD_LAST_OPC); - LASSERTF(QUOTA_DQACQ == 601, "found %lld\n", - (long long)QUOTA_DQACQ); - LASSERTF(QUOTA_DQREL == 602, "found %lld\n", - (long long)QUOTA_DQREL); - LASSERTF(QUOTA_LAST_OPC == 603, "found %lld\n", - (long long)QUOTA_LAST_OPC); - LASSERTF(MGS_CONNECT == 250, "found %lld\n", - (long long)MGS_CONNECT); - LASSERTF(MGS_DISCONNECT == 251, "found %lld\n", - (long long)MGS_DISCONNECT); - LASSERTF(MGS_EXCEPTION == 252, "found %lld\n", - (long long)MGS_EXCEPTION); - LASSERTF(MGS_TARGET_REG == 253, "found %lld\n", - (long long)MGS_TARGET_REG); - LASSERTF(MGS_TARGET_DEL == 254, "found %lld\n", - (long long)MGS_TARGET_DEL); - LASSERTF(MGS_SET_INFO == 255, "found %lld\n", - (long long)MGS_SET_INFO); - LASSERTF(MGS_LAST_OPC == 257, "found %lld\n", - (long long)MGS_LAST_OPC); - LASSERTF(SEC_CTX_INIT == 801, "found %lld\n", - (long long)SEC_CTX_INIT); - LASSERTF(SEC_CTX_INIT_CONT == 802, "found %lld\n", - (long long)SEC_CTX_INIT_CONT); - LASSERTF(SEC_CTX_FINI == 803, "found %lld\n", - (long long)SEC_CTX_FINI); - LASSERTF(SEC_LAST_OPC == 804, "found %lld\n", - (long long)SEC_LAST_OPC); - /* Sizes and Offsets */ - - /* Checks for struct obd_uuid */ - LASSERTF((int)sizeof(struct obd_uuid) == 40, "found %lld\n", - (long long)(int)sizeof(struct obd_uuid)); - - /* Checks for struct lu_seq_range */ - LASSERTF((int)sizeof(struct lu_seq_range) == 24, "found %lld\n", - (long long)(int)sizeof(struct lu_seq_range)); - LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, "found %lld\n", - (long long)(int)offsetof(struct lu_seq_range, lsr_start)); - LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_start)); - LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, "found %lld\n", - (long long)(int)offsetof(struct lu_seq_range, lsr_end)); - LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_end)); - LASSERTF((int)offsetof(struct lu_seq_range, lsr_index) == 16, "found %lld\n", - (long long)(int)offsetof(struct lu_seq_range, lsr_index)); - LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_index) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_index)); - LASSERTF((int)offsetof(struct lu_seq_range, lsr_flags) == 20, "found %lld\n", - (long long)(int)offsetof(struct lu_seq_range, lsr_flags)); - LASSERTF((int)sizeof(((struct lu_seq_range *)0)->lsr_flags) 
== 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_seq_range *)0)->lsr_flags)); - LASSERTF(LU_SEQ_RANGE_MDT == 0, "found %lld\n", - (long long)LU_SEQ_RANGE_MDT); - LASSERTF(LU_SEQ_RANGE_OST == 1, "found %lld\n", - (long long)LU_SEQ_RANGE_OST); - - /* Checks for struct lustre_mdt_attrs */ - LASSERTF((int)sizeof(struct lustre_mdt_attrs) == 24, "found %lld\n", - (long long)(int)sizeof(struct lustre_mdt_attrs)); - LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_compat) == 0, "found %lld\n", - (long long)(int)offsetof(struct lustre_mdt_attrs, lma_compat)); - LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_compat)); - LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_incompat) == 4, "found %lld\n", - (long long)(int)offsetof(struct lustre_mdt_attrs, lma_incompat)); - LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_incompat)); - LASSERTF((int)offsetof(struct lustre_mdt_attrs, lma_self_fid) == 8, "found %lld\n", - (long long)(int)offsetof(struct lustre_mdt_attrs, lma_self_fid)); - LASSERTF((int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lustre_mdt_attrs *)0)->lma_self_fid)); - LASSERTF(LMAI_RELEASED == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)LMAI_RELEASED); - LASSERTF(LMAC_HSM == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)LMAC_HSM); - LASSERTF(LMAC_NOT_IN_OI == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned int)LMAC_NOT_IN_OI); - LASSERTF(LMAC_FID_ON_OST == 0x00000008UL, "found 0x%.8xUL\n", - (unsigned int)LMAC_FID_ON_OST); - - /* Checks for struct ost_id */ - LASSERTF((int)sizeof(struct ost_id) == 16, "found %lld\n", - (long long)(int)sizeof(struct ost_id)); - LASSERTF((int)offsetof(struct ost_id, oi) == 0, "found %lld\n", - (long long)(int)offsetof(struct ost_id, oi)); - LASSERTF((int)sizeof(((struct ost_id *)0)->oi) == 16, "found %lld\n", - (long long)(int)sizeof(((struct ost_id *)0)->oi)); - LASSERTF(LUSTRE_FID_INIT_OID == 1, "found %lld\n", - (long long)LUSTRE_FID_INIT_OID); - LASSERTF(FID_SEQ_OST_MDT0 == 0, "found %lld\n", - (long long)FID_SEQ_OST_MDT0); - LASSERTF(FID_SEQ_LLOG == 1, "found %lld\n", - (long long)FID_SEQ_LLOG); - LASSERTF(FID_SEQ_ECHO == 2, "found %lld\n", - (long long)FID_SEQ_ECHO); - LASSERTF(FID_SEQ_OST_MDT1 == 3, "found %lld\n", - (long long)FID_SEQ_OST_MDT1); - LASSERTF(FID_SEQ_OST_MAX == 9, "found %lld\n", - (long long)FID_SEQ_OST_MAX); - LASSERTF(FID_SEQ_RSVD == 11, "found %lld\n", - (long long)FID_SEQ_RSVD); - LASSERTF(FID_SEQ_IGIF == 12, "found %lld\n", - (long long)FID_SEQ_IGIF); - LASSERTF(FID_SEQ_IGIF_MAX == 0x00000000ffffffffULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_IGIF_MAX); - LASSERTF(FID_SEQ_IDIF == 0x0000000100000000ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_IDIF); - LASSERTF(FID_SEQ_IDIF_MAX == 0x00000001ffffffffULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_IDIF_MAX); - LASSERTF(FID_SEQ_START == 0x0000000200000000ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_START); - LASSERTF(FID_SEQ_LOCAL_FILE == 0x0000000200000001ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_LOCAL_FILE); - LASSERTF(FID_SEQ_DOT_LUSTRE == 0x0000000200000002ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_DOT_LUSTRE); - LASSERTF(FID_SEQ_SPECIAL == 0x0000000200000004ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_SPECIAL); - LASSERTF(FID_SEQ_QUOTA == 
0x0000000200000005ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_QUOTA); - LASSERTF(FID_SEQ_QUOTA_GLB == 0x0000000200000006ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_QUOTA_GLB); - LASSERTF(FID_SEQ_ROOT == 0x0000000200000007ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_ROOT); - LASSERTF(FID_SEQ_NORMAL == 0x0000000200000400ULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_NORMAL); - LASSERTF(FID_SEQ_LOV_DEFAULT == 0xffffffffffffffffULL, "found 0x%.16llxULL\n", - (long long)FID_SEQ_LOV_DEFAULT); - LASSERTF(FID_OID_SPECIAL_BFL == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)FID_OID_SPECIAL_BFL); - LASSERTF(FID_OID_DOT_LUSTRE == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)FID_OID_DOT_LUSTRE); - LASSERTF(FID_OID_DOT_LUSTRE_OBF == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)FID_OID_DOT_LUSTRE_OBF); - - /* Checks for struct lu_dirent */ - LASSERTF((int)sizeof(struct lu_dirent) == 32, "found %lld\n", - (long long)(int)sizeof(struct lu_dirent)); - LASSERTF((int)offsetof(struct lu_dirent, lde_fid) == 0, "found %lld\n", - (long long)(int)offsetof(struct lu_dirent, lde_fid)); - LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirent *)0)->lde_fid)); - LASSERTF((int)offsetof(struct lu_dirent, lde_hash) == 16, "found %lld\n", - (long long)(int)offsetof(struct lu_dirent, lde_hash)); - LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_hash) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirent *)0)->lde_hash)); - LASSERTF((int)offsetof(struct lu_dirent, lde_reclen) == 24, "found %lld\n", - (long long)(int)offsetof(struct lu_dirent, lde_reclen)); - LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_reclen) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirent *)0)->lde_reclen)); - LASSERTF((int)offsetof(struct lu_dirent, lde_namelen) == 26, "found %lld\n", - (long long)(int)offsetof(struct lu_dirent, lde_namelen)); - LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_namelen) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirent *)0)->lde_namelen)); - LASSERTF((int)offsetof(struct lu_dirent, lde_attrs) == 28, "found %lld\n", - (long long)(int)offsetof(struct lu_dirent, lde_attrs)); - LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_attrs) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirent *)0)->lde_attrs)); - LASSERTF((int)offsetof(struct lu_dirent, lde_name[0]) == 32, "found %lld\n", - (long long)(int)offsetof(struct lu_dirent, lde_name[0])); - LASSERTF((int)sizeof(((struct lu_dirent *)0)->lde_name[0]) == 1, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirent *)0)->lde_name[0])); - LASSERTF(LUDA_FID == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)LUDA_FID); - LASSERTF(LUDA_TYPE == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)LUDA_TYPE); - LASSERTF(LUDA_64BITHASH == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned int)LUDA_64BITHASH); - - /* Checks for struct luda_type */ - LASSERTF((int)sizeof(struct luda_type) == 2, "found %lld\n", - (long long)(int)sizeof(struct luda_type)); - LASSERTF((int)offsetof(struct luda_type, lt_type) == 0, "found %lld\n", - (long long)(int)offsetof(struct luda_type, lt_type)); - LASSERTF((int)sizeof(((struct luda_type *)0)->lt_type) == 2, "found %lld\n", - (long long)(int)sizeof(((struct luda_type *)0)->lt_type)); - - /* Checks for struct lu_dirpage */ - LASSERTF((int)sizeof(struct lu_dirpage) == 24, "found %lld\n", - (long long)(int)sizeof(struct lu_dirpage)); - LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_start) 
== 0, "found %lld\n", - (long long)(int)offsetof(struct lu_dirpage, ldp_hash_start)); - LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_start)); - LASSERTF((int)offsetof(struct lu_dirpage, ldp_hash_end) == 8, "found %lld\n", - (long long)(int)offsetof(struct lu_dirpage, ldp_hash_end)); - LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_hash_end)); - LASSERTF((int)offsetof(struct lu_dirpage, ldp_flags) == 16, "found %lld\n", - (long long)(int)offsetof(struct lu_dirpage, ldp_flags)); - LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_flags)); - LASSERTF((int)offsetof(struct lu_dirpage, ldp_pad0) == 20, "found %lld\n", - (long long)(int)offsetof(struct lu_dirpage, ldp_pad0)); - LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_pad0) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_pad0)); - LASSERTF((int)offsetof(struct lu_dirpage, ldp_entries[0]) == 24, "found %lld\n", - (long long)(int)offsetof(struct lu_dirpage, ldp_entries[0])); - LASSERTF((int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0]) == 32, "found %lld\n", - (long long)(int)sizeof(((struct lu_dirpage *)0)->ldp_entries[0])); - LASSERTF(LDF_EMPTY == 1, "found %lld\n", - (long long)LDF_EMPTY); - LASSERTF(LDF_COLLIDE == 2, "found %lld\n", - (long long)LDF_COLLIDE); - LASSERTF(LU_PAGE_SIZE == 4096, "found %lld\n", - (long long)LU_PAGE_SIZE); - - /* Checks for struct lustre_handle */ - LASSERTF((int)sizeof(struct lustre_handle) == 8, "found %lld\n", - (long long)(int)sizeof(struct lustre_handle)); - LASSERTF((int)offsetof(struct lustre_handle, cookie) == 0, "found %lld\n", - (long long)(int)offsetof(struct lustre_handle, cookie)); - LASSERTF((int)sizeof(((struct lustre_handle *)0)->cookie) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lustre_handle *)0)->cookie)); - - /* Checks for struct lustre_msg_v2 */ - LASSERTF((int)sizeof(struct lustre_msg_v2) == 32, "found %lld\n", - (long long)(int)sizeof(struct lustre_msg_v2)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_bufcount) == 0, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_bufcount)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_bufcount)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_secflvr) == 4, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_secflvr)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_secflvr)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_magic) == 8, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_magic)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_magic)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_repsize) == 12, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); - 
LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_flags) == 20, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_flags)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_flags)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_2) == 24, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_2)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_2)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_3) == 28, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_3)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_3)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_buflens[0]) == 32, "found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_buflens[0])); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0]) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_buflens[0])); - LASSERTF(LUSTRE_MSG_MAGIC_V2 == 0x0BD00BD3, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2); - LASSERTF(LUSTRE_MSG_MAGIC_V2_SWABBED == 0xD30BD00B, "found 0x%.8x\n", - LUSTRE_MSG_MAGIC_V2_SWABBED); - - /* Checks for struct ptlrpc_body */ - LASSERTF((int)sizeof(struct ptlrpc_body_v3) == 184, "found %lld\n", - (long long)(int)sizeof(struct ptlrpc_body_v3)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == 0, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_handle)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == 8, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_type)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == 12, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_version)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == 16, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_opc)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == 20, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_status)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == 24, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_xid)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, 
pb_tag) == 32, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_tag)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == 2, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == 34, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding0)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == 2, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == 36, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding1)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == 40, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_last_committed)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == 48, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_transno)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == 56, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_flags)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == 60, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_op_flags)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == 64, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == 68, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_timeout)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == 72, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_service_time)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == 76, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_limit)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == 80, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_slv)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == 8, "found %lld\n", - (long 
long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv)); - BUILD_BUG_ON(PTLRPC_NUM_VERSIONS != 4); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == 88, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_pre_versions)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == 32, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == 120, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_mbits)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == 128, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_0)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == 136, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_1)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == 144, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_padding64_2)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2)); - BUILD_BUG_ON(LUSTRE_JOBID_SIZE != 32); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_jobid) == 152, "found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body_v3, pb_jobid)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid) == 32, "found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_jobid)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_handle) == (int)offsetof(struct ptlrpc_body_v2, pb_handle), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_handle), (int)offsetof(struct ptlrpc_body_v2, pb_handle)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_handle), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_handle)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_type) == (int)offsetof(struct ptlrpc_body_v2, pb_type), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_type), (int)offsetof(struct ptlrpc_body_v2, pb_type)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_type), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_type)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_version) == (int)offsetof(struct ptlrpc_body_v2, pb_version), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_version), (int)offsetof(struct ptlrpc_body_v2, pb_version)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_version), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_version)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_opc) == (int)offsetof(struct ptlrpc_body_v2, pb_opc), "%d != %d\n", - 
(int)offsetof(struct ptlrpc_body_v3, pb_opc), (int)offsetof(struct ptlrpc_body_v2, pb_opc)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_opc), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_opc)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_status) == (int)offsetof(struct ptlrpc_body_v2, pb_status), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_status), (int)offsetof(struct ptlrpc_body_v2, pb_status)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_status), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_status)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_xid) == (int)offsetof(struct ptlrpc_body_v2, pb_last_xid), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_last_xid), (int)offsetof(struct ptlrpc_body_v2, pb_last_xid)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_xid), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_xid)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_tag) == (int)offsetof(struct ptlrpc_body_v2, pb_tag), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_tag), (int)offsetof(struct ptlrpc_body_v2, pb_tag)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_tag), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_tag)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding0), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_padding0), (int)offsetof(struct ptlrpc_body_v2, pb_padding0)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding0)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding1), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_padding1), (int)offsetof(struct ptlrpc_body_v2, pb_padding1)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding1)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_last_committed) == (int)offsetof(struct ptlrpc_body_v2, pb_last_committed), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_last_committed), (int)offsetof(struct ptlrpc_body_v2, pb_last_committed)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_last_committed), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_last_committed)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_transno) == (int)offsetof(struct ptlrpc_body_v2, pb_transno), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_transno), (int)offsetof(struct ptlrpc_body_v2, pb_transno)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno) == 
(int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_transno), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_transno)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_flags), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_flags), (int)offsetof(struct ptlrpc_body_v2, pb_flags)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_flags)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_op_flags) == (int)offsetof(struct ptlrpc_body_v2, pb_op_flags), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_op_flags), (int)offsetof(struct ptlrpc_body_v2, pb_op_flags)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_op_flags), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_op_flags)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt) == (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_conn_cnt), (int)offsetof(struct ptlrpc_body_v2, pb_conn_cnt)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_conn_cnt), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_conn_cnt)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_timeout) == (int)offsetof(struct ptlrpc_body_v2, pb_timeout), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_timeout), (int)offsetof(struct ptlrpc_body_v2, pb_timeout)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_timeout), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_timeout)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_service_time) == (int)offsetof(struct ptlrpc_body_v2, pb_service_time), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_service_time), (int)offsetof(struct ptlrpc_body_v2, pb_service_time)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_service_time), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_service_time)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_limit) == (int)offsetof(struct ptlrpc_body_v2, pb_limit), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_limit), (int)offsetof(struct ptlrpc_body_v2, pb_limit)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_limit), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_limit)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_slv) == (int)offsetof(struct ptlrpc_body_v2, pb_slv), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_slv), (int)offsetof(struct ptlrpc_body_v2, pb_slv)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_slv), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_slv)); - 
LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_pre_versions) == (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_pre_versions), (int)offsetof(struct ptlrpc_body_v2, pb_pre_versions)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_pre_versions), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_pre_versions)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_mbits) == (int)offsetof(struct ptlrpc_body_v2, pb_mbits), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_mbits), (int)offsetof(struct ptlrpc_body_v2, pb_mbits)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_mbits), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_mbits)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_0) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_padding64_0), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_0)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_0), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_0)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_1) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_padding64_1), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_1)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_1), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_1)); - LASSERTF((int)offsetof(struct ptlrpc_body_v3, pb_padding64_2) == (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2), "%d != %d\n", - (int)offsetof(struct ptlrpc_body_v3, pb_padding64_2), (int)offsetof(struct ptlrpc_body_v2, pb_padding64_2)); - LASSERTF((int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2) == (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2), "%d != %d\n", - (int)sizeof(((struct ptlrpc_body_v3 *)0)->pb_padding64_2), (int)sizeof(((struct ptlrpc_body_v2 *)0)->pb_padding64_2)); - LASSERTF(MSG_PTLRPC_BODY_OFF == 0, "found %lld\n", - (long long)MSG_PTLRPC_BODY_OFF); - LASSERTF(REQ_REC_OFF == 1, "found %lld\n", - (long long)REQ_REC_OFF); - LASSERTF(REPLY_REC_OFF == 1, "found %lld\n", - (long long)REPLY_REC_OFF); - LASSERTF(DLM_LOCKREQ_OFF == 1, "found %lld\n", - (long long)DLM_LOCKREQ_OFF); - LASSERTF(DLM_REQ_REC_OFF == 2, "found %lld\n", - (long long)DLM_REQ_REC_OFF); - LASSERTF(DLM_INTENT_IT_OFF == 2, "found %lld\n", - (long long)DLM_INTENT_IT_OFF); - LASSERTF(DLM_INTENT_REC_OFF == 3, "found %lld\n", - (long long)DLM_INTENT_REC_OFF); - LASSERTF(DLM_LOCKREPLY_OFF == 1, "found %lld\n", - (long long)DLM_LOCKREPLY_OFF); - LASSERTF(DLM_REPLY_REC_OFF == 2, "found %lld\n", - (long long)DLM_REPLY_REC_OFF); - LASSERTF(MSG_PTLRPC_HEADER_OFF == 31, "found %lld\n", - (long long)MSG_PTLRPC_HEADER_OFF); - LASSERTF(PTLRPC_MSG_VERSION == 0x00000003, "found 0x%.8x\n", - PTLRPC_MSG_VERSION); - LASSERTF(LUSTRE_VERSION_MASK == 0xffff0000, "found 0x%.8x\n", - LUSTRE_VERSION_MASK); - LASSERTF(LUSTRE_OBD_VERSION == 
0x00010000, "found 0x%.8x\n", - LUSTRE_OBD_VERSION); - LASSERTF(LUSTRE_MDS_VERSION == 0x00020000, "found 0x%.8x\n", - LUSTRE_MDS_VERSION); - LASSERTF(LUSTRE_OST_VERSION == 0x00030000, "found 0x%.8x\n", - LUSTRE_OST_VERSION); - LASSERTF(LUSTRE_DLM_VERSION == 0x00040000, "found 0x%.8x\n", - LUSTRE_DLM_VERSION); - LASSERTF(LUSTRE_LOG_VERSION == 0x00050000, "found 0x%.8x\n", - LUSTRE_LOG_VERSION); - LASSERTF(LUSTRE_MGS_VERSION == 0x00060000, "found 0x%.8x\n", - LUSTRE_MGS_VERSION); - LASSERTF(MSGHDR_AT_SUPPORT == 1, "found %lld\n", - (long long)MSGHDR_AT_SUPPORT); - LASSERTF(MSGHDR_CKSUM_INCOMPAT18 == 2, "found %lld\n", - (long long)MSGHDR_CKSUM_INCOMPAT18); - LASSERTF(MSG_OP_FLAG_MASK == 0xffff0000UL, "found 0x%.8xUL\n", - (unsigned int)MSG_OP_FLAG_MASK); - LASSERTF(MSG_OP_FLAG_SHIFT == 16, "found %lld\n", - (long long)MSG_OP_FLAG_SHIFT); - LASSERTF(MSG_GEN_FLAG_MASK == 0x0000ffffUL, "found 0x%.8xUL\n", - (unsigned int)MSG_GEN_FLAG_MASK); - LASSERTF(MSG_LAST_REPLAY == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)MSG_LAST_REPLAY); - LASSERTF(MSG_RESENT == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)MSG_RESENT); - LASSERTF(MSG_REPLAY == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned int)MSG_REPLAY); - LASSERTF(MSG_DELAY_REPLAY == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned int)MSG_DELAY_REPLAY); - LASSERTF(MSG_VERSION_REPLAY == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned int)MSG_VERSION_REPLAY); - LASSERTF(MSG_REQ_REPLAY_DONE == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned int)MSG_REQ_REPLAY_DONE); - LASSERTF(MSG_LOCK_REPLAY_DONE == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned int)MSG_LOCK_REPLAY_DONE); - LASSERTF(MSG_CONNECT_RECOVERING == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_RECOVERING); - LASSERTF(MSG_CONNECT_RECONNECT == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_RECONNECT); - LASSERTF(MSG_CONNECT_REPLAYABLE == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_REPLAYABLE); - LASSERTF(MSG_CONNECT_LIBCLIENT == 0x00000010UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_LIBCLIENT); - LASSERTF(MSG_CONNECT_INITIAL == 0x00000020UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_INITIAL); - LASSERTF(MSG_CONNECT_ASYNC == 0x00000040UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_ASYNC); - LASSERTF(MSG_CONNECT_NEXT_VER == 0x00000080UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_NEXT_VER); - LASSERTF(MSG_CONNECT_TRANSNO == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned int)MSG_CONNECT_TRANSNO); - - /* Checks for struct obd_connect_data */ - LASSERTF((int)sizeof(struct obd_connect_data) == 192, "found %lld\n", - (long long)(int)sizeof(struct obd_connect_data)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags) == 0, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_version) == 8, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_version)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_version)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant) == 12, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_grant)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant) == 4, 
"found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_index) == 16, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_index)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_index) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_index)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_brw_size) == 20, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_brw_size)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_brw_size)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_ibits_known) == 24, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_unused)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_transno)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_group) == 48, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_group)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_group) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_group)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_cksum_types) == 52, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_cksum_types)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_cksum_types)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_max_easize) == 56, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_max_easize)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize) == 4, "found %lld\n", - (long 
long)(int)sizeof(((struct obd_connect_data *)0)->ocd_max_easize)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_instance) == 60, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_instance)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_instance) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_instance)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxbytes) == 64, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_maxbytes)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxbytes)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_maxmodrpcs) == 72, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_maxmodrpcs)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs) == 2, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_maxmodrpcs)); - LASSERTF((int)offsetof(struct obd_connect_data, padding0) == 74, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding0)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding0) == 2, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding0)); - LASSERTF((int)offsetof(struct obd_connect_data, padding1) == 76, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding1)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding1)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_connect_flags2) == 80, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_connect_flags2)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_connect_flags2)); - LASSERTF((int)offsetof(struct obd_connect_data, padding3) == 88, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding3)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding3)); - LASSERTF((int)offsetof(struct obd_connect_data, padding4) == 96, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding4)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding4) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding4)); - LASSERTF((int)offsetof(struct obd_connect_data, padding5) == 104, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding5)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding5) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding5)); - LASSERTF((int)offsetof(struct obd_connect_data, padding6) == 112, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding6)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding6) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding6)); - LASSERTF((int)offsetof(struct obd_connect_data, padding7) == 120, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding7)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding7) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding7)); - LASSERTF((int)offsetof(struct obd_connect_data, 
padding8) == 128, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding8)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding8) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding8)); - LASSERTF((int)offsetof(struct obd_connect_data, padding9) == 136, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, padding9)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->padding9) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->padding9)); - LASSERTF((int)offsetof(struct obd_connect_data, paddingA) == 144, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, paddingA)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingA) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingA)); - LASSERTF((int)offsetof(struct obd_connect_data, paddingB) == 152, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, paddingB)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingB) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingB)); - LASSERTF((int)offsetof(struct obd_connect_data, paddingC) == 160, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, paddingC)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingC) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingC)); - LASSERTF((int)offsetof(struct obd_connect_data, paddingD) == 168, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, paddingD)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingD) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingD)); - LASSERTF((int)offsetof(struct obd_connect_data, paddingE) == 176, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, paddingE)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingE) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingE)); - LASSERTF((int)offsetof(struct obd_connect_data, paddingF) == 184, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, paddingF)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->paddingF) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->paddingF)); - LASSERTF(OBD_CONNECT_RDONLY == 0x1ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_RDONLY); - LASSERTF(OBD_CONNECT_INDEX == 0x2ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_INDEX); - LASSERTF(OBD_CONNECT_MDS == 0x4ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_MDS); - LASSERTF(OBD_CONNECT_GRANT == 0x8ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_GRANT); - LASSERTF(OBD_CONNECT_SRVLOCK == 0x10ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_SRVLOCK); - LASSERTF(OBD_CONNECT_VERSION == 0x20ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_VERSION); - LASSERTF(OBD_CONNECT_REQPORTAL == 0x40ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_REQPORTAL); - LASSERTF(OBD_CONNECT_ACL == 0x80ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_ACL); - LASSERTF(OBD_CONNECT_XATTR == 0x100ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_XATTR); - LASSERTF(OBD_CONNECT_LARGE_ACL == 0x200ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LARGE_ACL); - LASSERTF(OBD_CONNECT_TRUNCLOCK == 0x400ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_TRUNCLOCK); - LASSERTF(OBD_CONNECT_TRANSNO == 0x800ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_TRANSNO); - LASSERTF(OBD_CONNECT_IBITS == 0x1000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_IBITS); - 
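/*
 * Illustrative aside, not part of the removed file: every LASSERTF in
 * this generated block pins a wire-visible offset, size, or flag value
 * so that an accidental edit to a protocol struct fails loudly at
 * build/load time rather than silently breaking interoperability
 * between client and server. A minimal userspace analogue of the same
 * technique, using C11 _Static_assert and a hypothetical struct:
 */
#include <stddef.h>
#include <stdint.h>

struct demo_wire_hdr {			/* hypothetical wire format */
	uint64_t dw_connect_flags;	/* expected: offset 0, size 8 */
	uint32_t dw_version;		/* expected: offset 8, size 4 */
	uint32_t dw_padding;		/* expected: offset 12, size 4 */
};

/* Fail the build, not the protocol, if the layout drifts. */
_Static_assert(sizeof(struct demo_wire_hdr) == 16,
	       "demo_wire_hdr changed size");
_Static_assert(offsetof(struct demo_wire_hdr, dw_version) == 8,
	       "dw_version moved");
_Static_assert(sizeof(((struct demo_wire_hdr *)0)->dw_version) == 4,
	       "dw_version changed size");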
LASSERTF(OBD_CONNECT_JOIN == 0x2000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_JOIN); - LASSERTF(OBD_CONNECT_ATTRFID == 0x4000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_ATTRFID); - LASSERTF(OBD_CONNECT_NODEVOH == 0x8000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_NODEVOH); - LASSERTF(OBD_CONNECT_RMT_CLIENT == 0x10000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_RMT_CLIENT); - LASSERTF(OBD_CONNECT_RMT_CLIENT_FORCE == 0x20000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_RMT_CLIENT_FORCE); - LASSERTF(OBD_CONNECT_BRW_SIZE == 0x40000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_BRW_SIZE); - LASSERTF(OBD_CONNECT_QUOTA64 == 0x80000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_QUOTA64); - LASSERTF(OBD_CONNECT_MDS_CAPA == 0x100000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_MDS_CAPA); - LASSERTF(OBD_CONNECT_OSS_CAPA == 0x200000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_OSS_CAPA); - LASSERTF(OBD_CONNECT_CANCELSET == 0x400000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_CANCELSET); - LASSERTF(OBD_CONNECT_SOM == 0x800000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_SOM); - LASSERTF(OBD_CONNECT_AT == 0x1000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_AT); - LASSERTF(OBD_CONNECT_LRU_RESIZE == 0x2000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LRU_RESIZE); - LASSERTF(OBD_CONNECT_MDS_MDS == 0x4000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_MDS_MDS); - LASSERTF(OBD_CONNECT_REAL == 0x8000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_REAL); - LASSERTF(OBD_CONNECT_CHANGE_QS == 0x10000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_CHANGE_QS); - LASSERTF(OBD_CONNECT_CKSUM == 0x20000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_CKSUM); - LASSERTF(OBD_CONNECT_FID == 0x40000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_FID); - LASSERTF(OBD_CONNECT_VBR == 0x80000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_VBR); - LASSERTF(OBD_CONNECT_LOV_V3 == 0x100000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LOV_V3); - LASSERTF(OBD_CONNECT_GRANT_SHRINK == 0x200000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_GRANT_SHRINK); - LASSERTF(OBD_CONNECT_SKIP_ORPHAN == 0x400000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_SKIP_ORPHAN); - LASSERTF(OBD_CONNECT_MAX_EASIZE == 0x800000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_MAX_EASIZE); - LASSERTF(OBD_CONNECT_FULL20 == 0x1000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_FULL20); - LASSERTF(OBD_CONNECT_LAYOUTLOCK == 0x2000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LAYOUTLOCK); - LASSERTF(OBD_CONNECT_64BITHASH == 0x4000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_64BITHASH); - LASSERTF(OBD_CONNECT_MAXBYTES == 0x8000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_MAXBYTES); - LASSERTF(OBD_CONNECT_IMP_RECOV == 0x10000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_IMP_RECOV); - LASSERTF(OBD_CONNECT_JOBSTATS == 0x20000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_JOBSTATS); - LASSERTF(OBD_CONNECT_UMASK == 0x40000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_UMASK); - LASSERTF(OBD_CONNECT_EINPROGRESS == 0x80000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_EINPROGRESS); - LASSERTF(OBD_CONNECT_GRANT_PARAM == 0x100000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_GRANT_PARAM); - LASSERTF(OBD_CONNECT_FLOCK_OWNER == 0x200000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_FLOCK_OWNER); - LASSERTF(OBD_CONNECT_LVB_TYPE == 0x400000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LVB_TYPE); - LASSERTF(OBD_CONNECT_NANOSEC_TIME == 0x800000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_NANOSEC_TIME); - LASSERTF(OBD_CONNECT_LIGHTWEIGHT == 0x1000000000000ULL, "found 0x%.16llxULL\n", - 
OBD_CONNECT_LIGHTWEIGHT); - LASSERTF(OBD_CONNECT_SHORTIO == 0x2000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_SHORTIO); - LASSERTF(OBD_CONNECT_PINGLESS == 0x4000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_PINGLESS); - LASSERTF(OBD_CONNECT_FLOCK_DEAD == 0x8000000000000ULL, - "found 0x%.16llxULL\n", OBD_CONNECT_FLOCK_DEAD); - LASSERTF(OBD_CONNECT_OPEN_BY_FID == 0x20000000000000ULL, - "found 0x%.16llxULL\n", OBD_CONNECT_OPEN_BY_FID); - LASSERTF(OBD_CONNECT_LFSCK == 0x40000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LFSCK); - LASSERTF(OBD_CONNECT_UNLINK_CLOSE == 0x100000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_UNLINK_CLOSE); - LASSERTF(OBD_CONNECT_MULTIMODRPCS == 0x200000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_MULTIMODRPCS); - LASSERTF(OBD_CONNECT_DIR_STRIPE == 0x400000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_DIR_STRIPE); - LASSERTF(OBD_CONNECT_SUBTREE == 0x800000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_SUBTREE); - LASSERTF(OBD_CONNECT_LOCK_AHEAD == 0x1000000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_LOCK_AHEAD); - LASSERTF(OBD_CONNECT_OBDOPACK == 0x4000000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_OBDOPACK); - LASSERTF(OBD_CONNECT_FLAGS2 == 0x8000000000000000ULL, "found 0x%.16llxULL\n", - OBD_CONNECT_FLAGS2); - LASSERTF(OBD_CKSUM_CRC32 == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)OBD_CKSUM_CRC32); - LASSERTF(OBD_CKSUM_ADLER == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)OBD_CKSUM_ADLER); - LASSERTF(OBD_CKSUM_CRC32C == 0x00000004UL, "found 0x%.8xUL\n", - (unsigned int)OBD_CKSUM_CRC32C); - - /* Checks for struct obdo */ - LASSERTF((int)sizeof(struct obdo) == 208, "found %lld\n", - (long long)(int)sizeof(struct obdo)); - LASSERTF((int)offsetof(struct obdo, o_valid) == 0, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_valid)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_valid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_valid)); - LASSERTF((int)offsetof(struct obdo, o_oi) == 8, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_oi)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_oi) == 16, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_oi)); - LASSERTF((int)offsetof(struct obdo, o_parent_seq) == 24, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_parent_seq)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_seq) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_parent_seq)); - LASSERTF((int)offsetof(struct obdo, o_size) == 32, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_size)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_size) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_size)); - LASSERTF((int)offsetof(struct obdo, o_mtime) == 40, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_mtime)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_mtime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_mtime)); - LASSERTF((int)offsetof(struct obdo, o_atime) == 48, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_atime)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_atime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_atime)); - LASSERTF((int)offsetof(struct obdo, o_ctime) == 56, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_ctime)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_ctime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_ctime)); - LASSERTF((int)offsetof(struct obdo, o_blocks) == 
64, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_blocks)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_blocks)); - LASSERTF((int)offsetof(struct obdo, o_grant) == 72, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_grant)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_grant) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_grant)); - LASSERTF((int)offsetof(struct obdo, o_blksize) == 80, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_blksize)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_blksize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_blksize)); - LASSERTF((int)offsetof(struct obdo, o_mode) == 84, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_mode)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_mode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_mode)); - LASSERTF((int)offsetof(struct obdo, o_uid) == 88, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_uid)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_uid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_uid)); - LASSERTF((int)offsetof(struct obdo, o_gid) == 92, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_gid)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_gid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_gid)); - LASSERTF((int)offsetof(struct obdo, o_flags) == 96, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_flags)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_flags)); - LASSERTF((int)offsetof(struct obdo, o_nlink) == 100, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_nlink)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_nlink) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_nlink)); - LASSERTF((int)offsetof(struct obdo, o_parent_oid) == 104, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_parent_oid)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_oid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_parent_oid)); - LASSERTF((int)offsetof(struct obdo, o_misc) == 108, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_misc)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_misc) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_misc)); - LASSERTF((int)offsetof(struct obdo, o_ioepoch) == 112, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_ioepoch)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_ioepoch) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_ioepoch)); - LASSERTF((int)offsetof(struct obdo, o_stripe_idx) == 120, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_stripe_idx)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_stripe_idx) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_stripe_idx)); - LASSERTF((int)offsetof(struct obdo, o_parent_ver) == 124, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_parent_ver)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_parent_ver) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_parent_ver)); - LASSERTF((int)offsetof(struct obdo, o_handle) == 128, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_handle)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_handle)); - 
LASSERTF((int)offsetof(struct obdo, o_lcookie) == 136, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_lcookie)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_lcookie) == 32, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_lcookie)); - LASSERTF((int)offsetof(struct obdo, o_uid_h) == 168, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_uid_h)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_uid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_uid_h)); - LASSERTF((int)offsetof(struct obdo, o_gid_h) == 172, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_gid_h)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_gid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_gid_h)); - LASSERTF((int)offsetof(struct obdo, o_data_version) == 176, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_data_version)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_data_version) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_data_version)); - LASSERTF((int)offsetof(struct obdo, o_padding_4) == 184, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_padding_4)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_4) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_padding_4)); - LASSERTF((int)offsetof(struct obdo, o_padding_5) == 192, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_padding_5)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_5) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_padding_5)); - LASSERTF((int)offsetof(struct obdo, o_padding_6) == 200, "found %lld\n", - (long long)(int)offsetof(struct obdo, o_padding_6)); - LASSERTF((int)sizeof(((struct obdo *)0)->o_padding_6) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obdo *)0)->o_padding_6)); - LASSERTF(OBD_MD_FLID == (0x00000001ULL), "found 0x%.16llxULL\n", - OBD_MD_FLID); - LASSERTF(OBD_MD_FLATIME == (0x00000002ULL), "found 0x%.16llxULL\n", - OBD_MD_FLATIME); - LASSERTF(OBD_MD_FLMTIME == (0x00000004ULL), "found 0x%.16llxULL\n", - OBD_MD_FLMTIME); - LASSERTF(OBD_MD_FLCTIME == (0x00000008ULL), "found 0x%.16llxULL\n", - OBD_MD_FLCTIME); - LASSERTF(OBD_MD_FLSIZE == (0x00000010ULL), "found 0x%.16llxULL\n", - OBD_MD_FLSIZE); - LASSERTF(OBD_MD_FLBLOCKS == (0x00000020ULL), "found 0x%.16llxULL\n", - OBD_MD_FLBLOCKS); - LASSERTF(OBD_MD_FLBLKSZ == (0x00000040ULL), "found 0x%.16llxULL\n", - OBD_MD_FLBLKSZ); - LASSERTF(OBD_MD_FLMODE == (0x00000080ULL), "found 0x%.16llxULL\n", - OBD_MD_FLMODE); - LASSERTF(OBD_MD_FLTYPE == (0x00000100ULL), "found 0x%.16llxULL\n", - OBD_MD_FLTYPE); - LASSERTF(OBD_MD_FLUID == (0x00000200ULL), "found 0x%.16llxULL\n", - OBD_MD_FLUID); - LASSERTF(OBD_MD_FLGID == (0x00000400ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGID); - LASSERTF(OBD_MD_FLFLAGS == (0x00000800ULL), "found 0x%.16llxULL\n", - OBD_MD_FLFLAGS); - LASSERTF(OBD_MD_FLNLINK == (0x00002000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLNLINK); - LASSERTF(OBD_MD_FLGENER == (0x00004000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGENER); - LASSERTF(OBD_MD_FLRDEV == (0x00010000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLRDEV); - LASSERTF(OBD_MD_FLEASIZE == (0x00020000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLEASIZE); - LASSERTF(OBD_MD_LINKNAME == (0x00040000ULL), "found 0x%.16llxULL\n", - OBD_MD_LINKNAME); - LASSERTF(OBD_MD_FLHANDLE == (0x00080000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLHANDLE); - LASSERTF(OBD_MD_FLCKSUM == (0x00100000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLCKSUM); - LASSERTF(OBD_MD_FLQOS == 
(0x00200000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLQOS); - LASSERTF(OBD_MD_FLGROUP == (0x01000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGROUP); - LASSERTF(OBD_MD_FLFID == (0x02000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLFID); - LASSERTF(OBD_MD_FLEPOCH == (0x04000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLEPOCH); - LASSERTF(OBD_MD_FLGRANT == (0x08000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGRANT); - LASSERTF(OBD_MD_FLDIREA == (0x10000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLDIREA); - LASSERTF(OBD_MD_FLUSRQUOTA == (0x20000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLUSRQUOTA); - LASSERTF(OBD_MD_FLGRPQUOTA == (0x40000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGRPQUOTA); - LASSERTF(OBD_MD_FLMODEASIZE == (0x80000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLMODEASIZE); - LASSERTF(OBD_MD_MDS == (0x0000000100000000ULL), "found 0x%.16llxULL\n", - OBD_MD_MDS); - LASSERTF(OBD_MD_REINT == (0x0000000200000000ULL), "found 0x%.16llxULL\n", - OBD_MD_REINT); - LASSERTF(OBD_MD_MEA == (0x0000000400000000ULL), "found 0x%.16llxULL\n", - OBD_MD_MEA); - LASSERTF(OBD_MD_TSTATE == (0x0000000800000000ULL), - "found 0x%.16llxULL\n", OBD_MD_TSTATE); - LASSERTF(OBD_MD_FLXATTR == (0x0000001000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLXATTR); - LASSERTF(OBD_MD_FLXATTRLS == (0x0000002000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLXATTRLS); - LASSERTF(OBD_MD_FLXATTRRM == (0x0000004000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLXATTRRM); - LASSERTF(OBD_MD_FLACL == (0x0000008000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLACL); - LASSERTF(OBD_MD_FLMDSCAPA == (0x0000020000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLMDSCAPA); - LASSERTF(OBD_MD_FLOSSCAPA == (0x0000040000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLOSSCAPA); - LASSERTF(OBD_MD_FLCKSPLIT == (0x0000080000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLCKSPLIT); - LASSERTF(OBD_MD_FLCROSSREF == (0x0000100000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLCROSSREF); - LASSERTF(OBD_MD_FLGETATTRLOCK == (0x0000200000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLGETATTRLOCK); - LASSERTF(OBD_MD_FLDATAVERSION == (0x0010000000000000ULL), "found 0x%.16llxULL\n", - OBD_MD_FLDATAVERSION); - BUILD_BUG_ON(OBD_FL_INLINEDATA != 0x00000001); - BUILD_BUG_ON(OBD_FL_OBDMDEXISTS != 0x00000002); - BUILD_BUG_ON(OBD_FL_DELORPHAN != 0x00000004); - BUILD_BUG_ON(OBD_FL_NORPC != 0x00000008); - BUILD_BUG_ON(OBD_FL_IDONLY != 0x00000010); - BUILD_BUG_ON(OBD_FL_RECREATE_OBJS != 0x00000020); - BUILD_BUG_ON(OBD_FL_DEBUG_CHECK != 0x00000040); - BUILD_BUG_ON(OBD_FL_NO_USRQUOTA != 0x00000100); - BUILD_BUG_ON(OBD_FL_NO_GRPQUOTA != 0x00000200); - BUILD_BUG_ON(OBD_FL_CREATE_CROW != 0x00000400); - BUILD_BUG_ON(OBD_FL_SRVLOCK != 0x00000800); - BUILD_BUG_ON(OBD_FL_CKSUM_CRC32 != 0x00001000); - BUILD_BUG_ON(OBD_FL_CKSUM_ADLER != 0x00002000); - BUILD_BUG_ON(OBD_FL_CKSUM_CRC32C != 0x00004000); - BUILD_BUG_ON(OBD_FL_CKSUM_RSVD2 != 0x00008000); - BUILD_BUG_ON(OBD_FL_CKSUM_RSVD3 != 0x00010000); - BUILD_BUG_ON(OBD_FL_SHRINK_GRANT != 0x00020000); - BUILD_BUG_ON(OBD_FL_MMAP != 0x00040000); - BUILD_BUG_ON(OBD_FL_RECOV_RESEND != 0x00080000); - BUILD_BUG_ON(OBD_FL_NOSPC_BLK != 0x00100000); - BUILD_BUG_ON(OBD_FL_LOCAL_MASK != 0xf0000000); - - /* Checks for struct lov_ost_data_v1 */ - LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, "found %lld\n", - (long long)(int)sizeof(struct lov_ost_data_v1)); - LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_oi) == 0, "found %lld\n", - (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_oi)); - LASSERTF((int)sizeof(((struct lov_ost_data_v1 
*)0)->l_ost_oi) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_oi)); - LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, "found %lld\n", - (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); - LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); - LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, "found %lld\n", - (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); - LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); - - /* Checks for struct lov_mds_md_v1 */ - LASSERTF((int)sizeof(struct lov_mds_md_v1) == 32, "found %lld\n", - (long long)(int)sizeof(struct lov_mds_md_v1)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_magic)); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_magic)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_pattern) == 4, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_pattern)); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_pattern)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_oi) == 8, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_oi)); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_oi)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_size) == 24, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_size)); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_size)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_stripe_count) == 28, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_stripe_count)); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_stripe_count)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_layout_gen) == 30, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_layout_gen)); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_layout_gen)); - LASSERTF((int)offsetof(struct lov_mds_md_v1, lmm_objects[0]) == 32, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v1, lmm_objects[0])); - LASSERTF((int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0]) == 24, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v1 *)0)->lmm_objects[0])); - BUILD_BUG_ON(LOV_MAGIC_V1 != (0x0BD10000 | 0x0BD0)); - - /* Checks for struct lov_mds_md_v3 */ - LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, "found %lld\n", - (long long)(int)sizeof(struct lov_mds_md_v3)); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); - 
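/*
 * Sketch (an inference from the asserts here, not from the removed
 * file): the LOV_MAGIC_V1 check above and the LOV_MAGIC_V3 check below
 * compose each magic from a common 0x0BD0 stamp plus a version word,
 * giving 0x0bd10bd0 and 0x0bd30bd0. Since only the v3 layout carries
 * lmm_pool_name (per the v3 field checks below), a reader dispatching
 * on lmm_magic could branch like this:
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_LOV_MAGIC_V1 (0x0BD10000 | 0x0BD0)	/* 0x0bd10bd0 */
#define DEMO_LOV_MAGIC_V3 (0x0BD30000 | 0x0BD0)	/* 0x0bd30bd0 */

static int demo_lmm_has_pool(uint32_t lmm_magic)
{
	/* v3 layouts carry a pool name; v1 layouts do not. */
	return lmm_magic == DEMO_LOV_MAGIC_V3;
}

int main(void)
{
	printf("v1 has pool: %d\n", demo_lmm_has_pool(DEMO_LOV_MAGIC_V1));
	printf("v3 has pool: %d\n", demo_lmm_has_pool(DEMO_LOV_MAGIC_V3));
	return 0;
}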
LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_oi) == 8, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_oi)); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_oi)); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_layout_gen) == 30, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_layout_gen)); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen) == 2, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_layout_gen)); - BUILD_BUG_ON(LOV_MAXPOOLNAME != 15); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16]) == 48, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name[16])); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16]) == 1, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name[16])); - LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects[0]) == 48, "found %lld\n", - (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects[0])); - LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0]) == 24, "found %lld\n", - (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects[0])); - BUILD_BUG_ON(LOV_MAGIC_V3 != (0x0BD30000 | 0x0BD0)); - LASSERTF(LOV_PATTERN_RAID0 == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)LOV_PATTERN_RAID0); - LASSERTF(LOV_PATTERN_RAID1 == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)LOV_PATTERN_RAID1); - LASSERTF(LOV_PATTERN_FIRST == 0x00000100UL, "found 0x%.8xUL\n", - (unsigned int)LOV_PATTERN_FIRST); - LASSERTF(LOV_PATTERN_CMOBD == 0x00000200UL, "found 0x%.8xUL\n", - (unsigned int)LOV_PATTERN_CMOBD); - - /* Checks for struct lmv_mds_md_v1 */ - LASSERTF((int)sizeof(struct lmv_mds_md_v1) == 56, "found %lld\n", - (long long)(int)sizeof(struct lmv_mds_md_v1)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_magic)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_magic)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count) == 4, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_count)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_count)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index) == 8, "found %lld\n", - 
(long long)(int)offsetof(struct lmv_mds_md_v1, lmv_master_mdt_index)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_master_mdt_index)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_hash_type) == 12, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_hash_type)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_hash_type)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_layout_version) == 16, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_layout_version)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_layout_version)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding1) == 20, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding1)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding1)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding2) == 24, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding2)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding2)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_padding3) == 32, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_padding3)); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_padding3)); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[16]) == 56, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_pool_name[16])); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[16]) == 1, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_pool_name[16])); - LASSERTF((int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0]) == 56, "found %lld\n", - (long long)(int)offsetof(struct lmv_mds_md_v1, lmv_stripe_fids[0])); - LASSERTF((int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0]) == 16, "found %lld\n", - (long long)(int)sizeof(((struct lmv_mds_md_v1 *)0)->lmv_stripe_fids[0])); - BUILD_BUG_ON(LMV_MAGIC_V1 != 0x0CD20CD0); - BUILD_BUG_ON(LMV_MAGIC_STRIPE != 0x0CD40CD0); - BUILD_BUG_ON(LMV_HASH_TYPE_MASK != 0x0000ffff); - BUILD_BUG_ON(LMV_HASH_FLAG_MIGRATION != 0x80000000); - BUILD_BUG_ON(LMV_HASH_FLAG_DEAD != 0x40000000); - - /* Checks for struct obd_statfs */ - LASSERTF((int)sizeof(struct obd_statfs) == 144, "found %lld\n", - (long long)(int)sizeof(struct obd_statfs)); - LASSERTF((int)offsetof(struct obd_statfs, os_type) == 0, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_type)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_type) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_type)); - LASSERTF((int)offsetof(struct obd_statfs, os_blocks) == 8, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_blocks)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_blocks)); - LASSERTF((int)offsetof(struct obd_statfs, os_bfree) == 16, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, 
os_bfree)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bfree) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_bfree)); - LASSERTF((int)offsetof(struct obd_statfs, os_bavail) == 24, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_bavail)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bavail) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_bavail)); - LASSERTF((int)offsetof(struct obd_statfs, os_ffree) == 40, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_ffree)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_ffree) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_ffree)); - LASSERTF((int)offsetof(struct obd_statfs, os_fsid) == 48, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_fsid)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fsid) == 40, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_fsid)); - LASSERTF((int)offsetof(struct obd_statfs, os_bsize) == 88, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_bsize)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_bsize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_bsize)); - LASSERTF((int)offsetof(struct obd_statfs, os_namelen) == 92, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_namelen)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_namelen) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_namelen)); - LASSERTF((int)offsetof(struct obd_statfs, os_state) == 104, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_state)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_state) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_state)); - LASSERTF((int)offsetof(struct obd_statfs, os_fprecreated) == 108, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_fprecreated)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_fprecreated) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_fprecreated)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare2) == 112, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare2)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare2)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare3) == 116, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare3)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare3)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare4) == 120, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare4)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare4) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare4)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare5) == 124, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare5)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare5)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare6) == 128, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare6)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare6) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs 
*)0)->os_spare6)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare7) == 132, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare7)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare7) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare7)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare8) == 136, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare8)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare8) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare8)); - LASSERTF((int)offsetof(struct obd_statfs, os_spare9) == 140, "found %lld\n", - (long long)(int)offsetof(struct obd_statfs, os_spare9)); - LASSERTF((int)sizeof(((struct obd_statfs *)0)->os_spare9) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_statfs *)0)->os_spare9)); - - /* Checks for struct obd_ioobj */ - LASSERTF((int)sizeof(struct obd_ioobj) == 24, "found %lld\n", - (long long)(int)sizeof(struct obd_ioobj)); - LASSERTF((int)offsetof(struct obd_ioobj, ioo_oid) == 0, "found %lld\n", - (long long)(int)offsetof(struct obd_ioobj, ioo_oid)); - LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_oid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_oid)); - LASSERTF((int)offsetof(struct obd_ioobj, ioo_max_brw) == 16, "found %lld\n", - (long long)(int)offsetof(struct obd_ioobj, ioo_max_brw)); - LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_max_brw)); - LASSERTF((int)offsetof(struct obd_ioobj, ioo_bufcnt) == 20, "found %lld\n", - (long long)(int)offsetof(struct obd_ioobj, ioo_bufcnt)); - LASSERTF((int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_ioobj *)0)->ioo_bufcnt)); - LASSERTF(IOOBJ_MAX_BRW_BITS == 16, "found %lld\n", - (long long)IOOBJ_MAX_BRW_BITS); - - /* Checks for union lquota_id */ - LASSERTF((int)sizeof(union lquota_id) == 16, "found %lld\n", - (long long)(int)sizeof(union lquota_id)); - - /* Checks for struct obd_quotactl */ - LASSERTF((int)sizeof(struct obd_quotactl) == 112, "found %lld\n", - (long long)(int)sizeof(struct obd_quotactl)); - LASSERTF((int)offsetof(struct obd_quotactl, qc_cmd) == 0, "found %lld\n", - (long long)(int)offsetof(struct obd_quotactl, qc_cmd)); - LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_cmd) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_cmd)); - LASSERTF((int)offsetof(struct obd_quotactl, qc_type) == 4, "found %lld\n", - (long long)(int)offsetof(struct obd_quotactl, qc_type)); - LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_type) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_type)); - LASSERTF((int)offsetof(struct obd_quotactl, qc_id) == 8, "found %lld\n", - (long long)(int)offsetof(struct obd_quotactl, qc_id)); - LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_id) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_id)); - LASSERTF((int)offsetof(struct obd_quotactl, qc_stat) == 12, "found %lld\n", - (long long)(int)offsetof(struct obd_quotactl, qc_stat)); - LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_stat) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_stat)); - LASSERTF((int)offsetof(struct obd_quotactl, qc_dqinfo) == 16, "found %lld\n", - (long long)(int)offsetof(struct obd_quotactl, qc_dqinfo)); - LASSERTF((int)sizeof(((struct obd_quotactl 
*)0)->qc_dqinfo) == 24, "found %lld\n", - (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqinfo)); - LASSERTF((int)offsetof(struct obd_quotactl, qc_dqblk) == 40, "found %lld\n", - (long long)(int)offsetof(struct obd_quotactl, qc_dqblk)); - LASSERTF((int)sizeof(((struct obd_quotactl *)0)->qc_dqblk) == 72, "found %lld\n", - (long long)(int)sizeof(((struct obd_quotactl *)0)->qc_dqblk)); - - /* Checks for struct obd_dqinfo */ - LASSERTF((int)sizeof(struct obd_dqinfo) == 24, "found %lld\n", - (long long)(int)sizeof(struct obd_dqinfo)); - LASSERTF((int)offsetof(struct obd_dqinfo, dqi_bgrace) == 0, "found %lld\n", - (long long)(int)offsetof(struct obd_dqinfo, dqi_bgrace)); - LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_bgrace)); - LASSERTF((int)offsetof(struct obd_dqinfo, dqi_igrace) == 8, "found %lld\n", - (long long)(int)offsetof(struct obd_dqinfo, dqi_igrace)); - LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_igrace)); - LASSERTF((int)offsetof(struct obd_dqinfo, dqi_flags) == 16, "found %lld\n", - (long long)(int)offsetof(struct obd_dqinfo, dqi_flags)); - LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_flags)); - LASSERTF((int)offsetof(struct obd_dqinfo, dqi_valid) == 20, "found %lld\n", - (long long)(int)offsetof(struct obd_dqinfo, dqi_valid)); - LASSERTF((int)sizeof(((struct obd_dqinfo *)0)->dqi_valid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqinfo *)0)->dqi_valid)); - - /* Checks for struct obd_dqblk */ - LASSERTF((int)sizeof(struct obd_dqblk) == 72, "found %lld\n", - (long long)(int)sizeof(struct obd_dqblk)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_bhardlimit) == 0, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_bhardlimit)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bhardlimit)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_bsoftlimit) == 8, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_bsoftlimit)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_bsoftlimit)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_curspace) == 16, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_curspace)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curspace) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curspace)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_ihardlimit) == 24, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_ihardlimit)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_ihardlimit)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_isoftlimit) == 32, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_isoftlimit)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_isoftlimit)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_curinodes) == 40, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_curinodes)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes) == 8, "found %lld\n", - (long 
long)(int)sizeof(((struct obd_dqblk *)0)->dqb_curinodes)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_btime) == 48, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_btime)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_btime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_btime)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_itime) == 56, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_itime)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_itime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_itime)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_valid) == 64, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_valid)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_valid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_valid)); - LASSERTF((int)offsetof(struct obd_dqblk, dqb_padding) == 68, "found %lld\n", - (long long)(int)offsetof(struct obd_dqblk, dqb_padding)); - LASSERTF((int)sizeof(((struct obd_dqblk *)0)->dqb_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_dqblk *)0)->dqb_padding)); - LASSERTF(Q_QUOTACHECK == 0x800100, "found 0x%.8x\n", - Q_QUOTACHECK); - LASSERTF(Q_INITQUOTA == 0x800101, "found 0x%.8x\n", - Q_INITQUOTA); - LASSERTF(Q_GETOINFO == 0x800102, "found 0x%.8x\n", - Q_GETOINFO); - LASSERTF(Q_GETOQUOTA == 0x800103, "found 0x%.8x\n", - Q_GETOQUOTA); - LASSERTF(Q_FINVALIDATE == 0x800104, "found 0x%.8x\n", - Q_FINVALIDATE); - - /* Checks for struct niobuf_remote */ - LASSERTF((int)sizeof(struct niobuf_remote) == 16, "found %lld\n", - (long long)(int)sizeof(struct niobuf_remote)); - LASSERTF((int)offsetof(struct niobuf_remote, rnb_offset) == 0, "found %lld\n", - (long long)(int)offsetof(struct niobuf_remote, rnb_offset)); - LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_offset) == 8, "found %lld\n", - (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_offset)); - LASSERTF((int)offsetof(struct niobuf_remote, rnb_len) == 8, "found %lld\n", - (long long)(int)offsetof(struct niobuf_remote, rnb_len)); - LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_len) == 4, "found %lld\n", - (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_len)); - LASSERTF((int)offsetof(struct niobuf_remote, rnb_flags) == 12, "found %lld\n", - (long long)(int)offsetof(struct niobuf_remote, rnb_flags)); - LASSERTF((int)sizeof(((struct niobuf_remote *)0)->rnb_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct niobuf_remote *)0)->rnb_flags)); - LASSERTF(OBD_BRW_READ == 0x01, "found 0x%.8x\n", - OBD_BRW_READ); - LASSERTF(OBD_BRW_WRITE == 0x02, "found 0x%.8x\n", - OBD_BRW_WRITE); - LASSERTF(OBD_BRW_SYNC == 0x08, "found 0x%.8x\n", - OBD_BRW_SYNC); - LASSERTF(OBD_BRW_CHECK == 0x10, "found 0x%.8x\n", - OBD_BRW_CHECK); - LASSERTF(OBD_BRW_FROM_GRANT == 0x20, "found 0x%.8x\n", - OBD_BRW_FROM_GRANT); - LASSERTF(OBD_BRW_GRANTED == 0x40, "found 0x%.8x\n", - OBD_BRW_GRANTED); - LASSERTF(OBD_BRW_NOCACHE == 0x80, "found 0x%.8x\n", - OBD_BRW_NOCACHE); - LASSERTF(OBD_BRW_NOQUOTA == 0x100, "found 0x%.8x\n", - OBD_BRW_NOQUOTA); - LASSERTF(OBD_BRW_SRVLOCK == 0x200, "found 0x%.8x\n", - OBD_BRW_SRVLOCK); - LASSERTF(OBD_BRW_ASYNC == 0x400, "found 0x%.8x\n", - OBD_BRW_ASYNC); - LASSERTF(OBD_BRW_MEMALLOC == 0x800, "found 0x%.8x\n", - OBD_BRW_MEMALLOC); - LASSERTF(OBD_BRW_OVER_USRQUOTA == 0x1000, "found 0x%.8x\n", - OBD_BRW_OVER_USRQUOTA); - LASSERTF(OBD_BRW_OVER_GRPQUOTA == 0x2000, "found 0x%.8x\n", - OBD_BRW_OVER_GRPQUOTA); - 
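/*
 * Illustrative aside (not part of the removed file): the OBD_BRW_*
 * values checked here are single-bit flags, presumably OR-ed together
 * in fields such as the rnb_flags member verified above. A toy test of
 * that combination pattern, with constants copied from the asserts:
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_OBD_BRW_WRITE 0x02		/* matches the assert above */
#define DEMO_OBD_BRW_SYNC  0x08		/* matches the assert above */

int main(void)
{
	uint32_t flags = DEMO_OBD_BRW_WRITE | DEMO_OBD_BRW_SYNC;

	if ((flags & DEMO_OBD_BRW_WRITE) && (flags & DEMO_OBD_BRW_SYNC))
		printf("synchronous write, flags=0x%x\n", flags);
	return 0;
}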
LASSERTF(OBD_BRW_SOFT_SYNC == 0x4000, "found 0x%.8x\n", - OBD_BRW_SOFT_SYNC); - - /* Checks for struct ost_body */ - LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", - (long long)(int)sizeof(struct ost_body)); - LASSERTF((int)offsetof(struct ost_body, oa) == 0, "found %lld\n", - (long long)(int)offsetof(struct ost_body, oa)); - LASSERTF((int)sizeof(((struct ost_body *)0)->oa) == 208, "found %lld\n", - (long long)(int)sizeof(((struct ost_body *)0)->oa)); - - /* Checks for struct ll_fid */ - LASSERTF((int)sizeof(struct ll_fid) == 16, "found %lld\n", - (long long)(int)sizeof(struct ll_fid)); - LASSERTF((int)offsetof(struct ll_fid, id) == 0, "found %lld\n", - (long long)(int)offsetof(struct ll_fid, id)); - LASSERTF((int)sizeof(((struct ll_fid *)0)->id) == 8, "found %lld\n", - (long long)(int)sizeof(((struct ll_fid *)0)->id)); - LASSERTF((int)offsetof(struct ll_fid, generation) == 8, "found %lld\n", - (long long)(int)offsetof(struct ll_fid, generation)); - LASSERTF((int)sizeof(((struct ll_fid *)0)->generation) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ll_fid *)0)->generation)); - LASSERTF((int)offsetof(struct ll_fid, f_type) == 12, "found %lld\n", - (long long)(int)offsetof(struct ll_fid, f_type)); - LASSERTF((int)sizeof(((struct ll_fid *)0)->f_type) == 4, "found %lld\n", - (long long)(int)sizeof(((struct ll_fid *)0)->f_type)); - - /* Checks for struct mdt_body */ - LASSERTF((int)sizeof(struct mdt_body) == 216, "found %lld\n", - (long long)(int)sizeof(struct mdt_body)); - LASSERTF((int)offsetof(struct mdt_body, mbo_fid1) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_fid1)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid1) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid1)); - LASSERTF((int)offsetof(struct mdt_body, mbo_fid2) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_fid2)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fid2) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fid2)); - LASSERTF((int)offsetof(struct mdt_body, mbo_handle) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_handle)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_handle)); - LASSERTF((int)offsetof(struct mdt_body, mbo_valid) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_valid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_valid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_valid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_size) == 48, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_size)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_size) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_size)); - LASSERTF((int)offsetof(struct mdt_body, mbo_mtime) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_mtime)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mtime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mtime)); - LASSERTF((int)offsetof(struct mdt_body, mbo_atime) == 64, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_atime)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_atime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_atime)); - LASSERTF((int)offsetof(struct mdt_body, mbo_ctime) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, 
mbo_ctime)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_ctime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_ctime)); - LASSERTF((int)offsetof(struct mdt_body, mbo_blocks) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_blocks)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_blocks)); - LASSERTF((int)offsetof(struct mdt_body, mbo_t_state) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_t_state)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_t_state) == 8, - "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_t_state)); - LASSERTF((int)offsetof(struct mdt_body, mbo_fsuid) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_fsuid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsuid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_fsgid) == 108, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_fsgid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_fsgid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_capability) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_capability)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_capability) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_capability)); - LASSERTF((int)offsetof(struct mdt_body, mbo_mode) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_mode)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_mode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_mode)); - LASSERTF((int)offsetof(struct mdt_body, mbo_uid) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_uid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_gid) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_gid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_flags) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_flags)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_flags)); - LASSERTF((int)offsetof(struct mdt_body, mbo_rdev) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_rdev)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_rdev) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_rdev)); - LASSERTF((int)offsetof(struct mdt_body, mbo_nlink) == 136, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_nlink)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_nlink) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_nlink)); - LASSERTF((int)offsetof(struct mdt_body, mbo_unused2) == 140, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_unused2)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused2)); - LASSERTF((int)offsetof(struct mdt_body, mbo_suppgid) == 144, "found 
%lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_suppgid)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_suppgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_suppgid)); - LASSERTF((int)offsetof(struct mdt_body, mbo_eadatasize) == 148, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_eadatasize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_eadatasize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_eadatasize)); - LASSERTF((int)offsetof(struct mdt_body, mbo_aclsize) == 152, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_aclsize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_aclsize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_aclsize)); - LASSERTF((int)offsetof(struct mdt_body, mbo_max_mdsize) == 156, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_max_mdsize)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_max_mdsize)); - LASSERTF((int)offsetof(struct mdt_body, mbo_unused3) == 160, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_unused3)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_unused3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_unused3)); - LASSERTF((int)offsetof(struct mdt_body, mbo_uid_h) == 164, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_uid_h)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_uid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_uid_h)); - LASSERTF((int)offsetof(struct mdt_body, mbo_gid_h) == 168, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_gid_h)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_gid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_gid_h)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_5) == 172, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_5)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_5)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_6) == 176, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_6)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_6) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_6)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_7) == 184, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_7)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_7) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_7)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_8) == 192, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_8)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_8) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_8)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_9) == 200, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_9)); - LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_9) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_9)); - LASSERTF((int)offsetof(struct mdt_body, mbo_padding_10) == 208, "found %lld\n", - (long long)(int)offsetof(struct mdt_body, mbo_padding_10)); - 
LASSERTF((int)sizeof(((struct mdt_body *)0)->mbo_padding_10) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_body *)0)->mbo_padding_10)); - LASSERTF(MDS_FMODE_CLOSED == 000000000000UL, "found 0%.11oUL\n", - MDS_FMODE_CLOSED); - LASSERTF(MDS_FMODE_EXEC == 000000000004UL, "found 0%.11oUL\n", - MDS_FMODE_EXEC); - LASSERTF(MDS_OPEN_CREATED == 000000000010UL, "found 0%.11oUL\n", - MDS_OPEN_CREATED); - LASSERTF(MDS_OPEN_CROSS == 000000000020UL, "found 0%.11oUL\n", - MDS_OPEN_CROSS); - LASSERTF(MDS_OPEN_CREAT == 000000000100UL, "found 0%.11oUL\n", - MDS_OPEN_CREAT); - LASSERTF(MDS_OPEN_EXCL == 000000000200UL, "found 0%.11oUL\n", - MDS_OPEN_EXCL); - LASSERTF(MDS_OPEN_TRUNC == 000000001000UL, "found 0%.11oUL\n", - MDS_OPEN_TRUNC); - LASSERTF(MDS_OPEN_APPEND == 000000002000UL, "found 0%.11oUL\n", - MDS_OPEN_APPEND); - LASSERTF(MDS_OPEN_SYNC == 000000010000UL, "found 0%.11oUL\n", - MDS_OPEN_SYNC); - LASSERTF(MDS_OPEN_DIRECTORY == 000000200000UL, "found 0%.11oUL\n", - MDS_OPEN_DIRECTORY); - LASSERTF(MDS_OPEN_BY_FID == 000040000000UL, "found 0%.11oUL\n", - MDS_OPEN_BY_FID); - LASSERTF(MDS_OPEN_DELAY_CREATE == 000100000000UL, "found 0%.11oUL\n", - MDS_OPEN_DELAY_CREATE); - LASSERTF(MDS_OPEN_OWNEROVERRIDE == 000200000000UL, "found 0%.11oUL\n", - MDS_OPEN_OWNEROVERRIDE); - LASSERTF(MDS_OPEN_JOIN_FILE == 000400000000UL, "found 0%.11oUL\n", - MDS_OPEN_JOIN_FILE); - LASSERTF(MDS_OPEN_LOCK == 004000000000UL, "found 0%.11oUL\n", - MDS_OPEN_LOCK); - LASSERTF(MDS_OPEN_HAS_EA == 010000000000UL, "found 0%.11oUL\n", - MDS_OPEN_HAS_EA); - LASSERTF(MDS_OPEN_HAS_OBJS == 020000000000UL, "found 0%.11oUL\n", - MDS_OPEN_HAS_OBJS); - LASSERTF(MDS_OPEN_NORESTORE == 00000000000100000000000ULL, "found 0%.22lloULL\n", - (long long)MDS_OPEN_NORESTORE); - LASSERTF(MDS_OPEN_NEWSTRIPE == 00000000000200000000000ULL, "found 0%.22lloULL\n", - (long long)MDS_OPEN_NEWSTRIPE); - LASSERTF(MDS_OPEN_VOLATILE == 00000000000400000000000ULL, "found 0%.22lloULL\n", - (long long)MDS_OPEN_VOLATILE); - LASSERTF(LUSTRE_SYNC_FL == 0x00000008, "found 0x%.8x\n", - LUSTRE_SYNC_FL); - LASSERTF(LUSTRE_IMMUTABLE_FL == 0x00000010, "found 0x%.8x\n", - LUSTRE_IMMUTABLE_FL); - LASSERTF(LUSTRE_APPEND_FL == 0x00000020, "found 0x%.8x\n", - LUSTRE_APPEND_FL); - LASSERTF(LUSTRE_NODUMP_FL == 0x00000040, "found 0x%.8x\n", - LUSTRE_NODUMP_FL); - LASSERTF(LUSTRE_NOATIME_FL == 0x00000080, "found 0x%.8x\n", - LUSTRE_NOATIME_FL); - LASSERTF(LUSTRE_INDEX_FL == 0x00001000, "found 0x%.8x\n", - LUSTRE_INDEX_FL); - LASSERTF(LUSTRE_DIRSYNC_FL == 0x00010000, "found 0x%.8x\n", - LUSTRE_DIRSYNC_FL); - LASSERTF(LUSTRE_TOPDIR_FL == 0x00020000, "found 0x%.8x\n", - LUSTRE_TOPDIR_FL); - LASSERTF(LUSTRE_DIRECTIO_FL == 0x00100000, "found 0x%.8x\n", - LUSTRE_DIRECTIO_FL); - LASSERTF(LUSTRE_INLINE_DATA_FL == 0x10000000, "found 0x%.8x\n", - LUSTRE_INLINE_DATA_FL); - LASSERTF(MDS_INODELOCK_LOOKUP == 0x000001, "found 0x%.8x\n", - MDS_INODELOCK_LOOKUP); - LASSERTF(MDS_INODELOCK_UPDATE == 0x000002, "found 0x%.8x\n", - MDS_INODELOCK_UPDATE); - LASSERTF(MDS_INODELOCK_OPEN == 0x000004, "found 0x%.8x\n", - MDS_INODELOCK_OPEN); - LASSERTF(MDS_INODELOCK_LAYOUT == 0x000008, "found 0x%.8x\n", - MDS_INODELOCK_LAYOUT); - - /* Checks for struct mdt_ioepoch */ - LASSERTF((int)sizeof(struct mdt_ioepoch) == 24, "found %lld\n", - (long long)(int)sizeof(struct mdt_ioepoch)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_handle) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_handle)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_handle) == 8, "found %lld\n", - 
(long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_handle)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused1) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_unused1)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused1)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_unused2) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_unused2)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_unused2)); - LASSERTF((int)offsetof(struct mdt_ioepoch, mio_padding) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_ioepoch, mio_padding)); - LASSERTF((int)sizeof(((struct mdt_ioepoch *)0)->mio_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_ioepoch *)0)->mio_padding)); - - /* Checks for struct mdt_rec_setattr */ - LASSERTF((int)sizeof(struct mdt_rec_setattr) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_setattr)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_cap)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fsgid_h)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid) == 24, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_suppgid_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_suppgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_suppgid_h)); - 
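A note on the idiom every one of these deleted checks leans on: taking sizeof of ((struct mdt_rec_setattr *)0)->sa_suppgid_h never dereferences the null pointer, because sizeof, like offsetof, is an unevaluated context; the expression is legal C and is folded at compile time. A minimal stand-alone sketch of the pattern, using a hypothetical struct, with C11 static_assert standing in for the runtime LASSERTF:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct wire_rec {                  /* hypothetical on-the-wire record */
	uint32_t wr_opcode;        /* expected at offset 0 */
	uint32_t wr_cap;           /* expected at offset 4 */
	uint64_t wr_valid;         /* expected at offset 8 */
};

/* sizeof never evaluates its operand, so the null "object" is safe. */
static_assert(sizeof(((struct wire_rec *)0)->wr_valid) == 8,
	      "wr_valid changed size");
static_assert(offsetof(struct wire_rec, wr_valid) == 8, "wr_valid moved");
static_assert(sizeof(struct wire_rec) == 16, "wire_rec changed size");

The generated code uses runtime LASSERTF rather than compile-time assertions, presumably so that a mismatched build can report the offending value ("found %lld") instead of merely failing to compile.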
LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_1_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_1_h)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_fid) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_fid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_fid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_valid) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_valid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_valid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_uid) == 64, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_uid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_uid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_gid) == 68, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_gid)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_gid)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_size) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_size)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_size) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_size)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_blocks) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_blocks)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_blocks)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mtime) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_mtime)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mtime)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_atime) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_atime)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_atime)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_ctime) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_ctime)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_ctime)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_attr_flags) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_attr_flags)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags) == 4, "found %lld\n", - 
(long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_attr_flags)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_mode) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_mode)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_mode)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_bias) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_bias)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_bias)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_3) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_3)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_3)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_4) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_4)); - LASSERTF((int)offsetof(struct mdt_rec_setattr, sa_padding_5) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setattr, sa_padding_5)); - LASSERTF((int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setattr *)0)->sa_padding_5)); - - /* Checks for struct mdt_rec_create */ - LASSERTF((int)sizeof(struct mdt_rec_create) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_create)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_cap)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fsgid_h)); - 
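Worth pausing on why mdt_rec_setattr above and mdt_rec_create here are both pinned to exactly 136 bytes: every mdt_rec_* variant in this file carries the same credential prefix and is evidently meant to overlay the generic mdt_rec_reint layout checked further below, so the variants must agree on total size and on the offsets of the shared fields. A hedged sketch of that overlay invariant, with two hypothetical variants:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct rec_reint {                 /* generic view of the request body */
	uint32_t rr_opcode;
	uint32_t rr_cap;
	uint8_t  rr_rest[128];     /* 4 + 4 + 128 = 136 bytes */
};

struct rec_create {                /* one specific variant */
	uint32_t cr_opcode;
	uint32_t cr_cap;
	uint8_t  cr_rest[128];
};

static_assert(sizeof(struct rec_reint) == sizeof(struct rec_create),
	      "every variant must match the generic record size");
static_assert(offsetof(struct rec_reint, rr_opcode) ==
	      offsetof(struct rec_create, cr_opcode),
	      "shared fields must sit at identical offsets");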
LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1) == 24, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid1_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid1_h)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_suppgid2_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_suppgid2_h)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_suppgid2_h)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid1) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_fid1)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid1) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid1)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_fid2) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_fid2)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_fid2) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_old_handle) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_old_handle)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_old_handle)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_time) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_time)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_time) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_time)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_rdev) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_rdev)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_rdev) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_rdev)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_ioepoch) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_ioepoch)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_ioepoch)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_1) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_padding_1)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_1)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_mode) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_mode)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_mode) == 4, "found 
%lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_mode)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_bias) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_bias)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_bias) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_bias)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_l) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_flags_l)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_l)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_flags_h) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_flags_h)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_flags_h)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_umask) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_umask)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_umask) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_umask)); - LASSERTF((int)offsetof(struct mdt_rec_create, cr_padding_4) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_create, cr_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_create *)0)->cr_padding_4)); - - /* Checks for struct mdt_rec_link */ - LASSERTF((int)sizeof(struct mdt_rec_link) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_link)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_cap)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fsgid_h)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1) == 24, "found %lld\n", - 
(long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid1_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid1_h)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_suppgid2_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_suppgid2_h)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_suppgid2_h)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid1) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_fid1)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid1) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid1)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_fid2) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_fid2)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_fid2) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_time) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_time)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_time) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_time)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_1) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_1)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_1)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_2) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_2)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_2)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_3) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_3)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_3)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_4) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_4)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_bias) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_bias)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_bias) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_bias)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_5) == 116, "found 
%lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_5)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_5)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_6) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_6)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_6)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_7) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_7)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_7)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_8) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_8)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_8)); - LASSERTF((int)offsetof(struct mdt_rec_link, lk_padding_9) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_link, lk_padding_9)); - LASSERTF((int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_link *)0)->lk_padding_9)); - - /* Checks for struct mdt_rec_unlink */ - LASSERTF((int)sizeof(struct mdt_rec_unlink) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_unlink)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_cap)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fsgid_h)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1) == 24, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1)); - 
LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid1_h)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_suppgid2_h)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_suppgid2_h)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid1) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid1)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid1)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_fid2) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_fid2)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_time) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_time)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_time) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_time)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_2) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_2)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_2)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_3) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_3)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_3)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_4) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_4)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_5) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_5)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_5)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_bias) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_bias)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_bias)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, 
ul_mode) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_mode)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_mode)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_6) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_6)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_6)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_7) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_7)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_7)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_8) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_8)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_8)); - LASSERTF((int)offsetof(struct mdt_rec_unlink, ul_padding_9) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_unlink, ul_padding_9)); - LASSERTF((int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_unlink *)0)->ul_padding_9)); - - /* Checks for struct mdt_rec_rename */ - LASSERTF((int)sizeof(struct mdt_rec_rename) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_rename)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_cap)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fsgid_h)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1) == 24, "found %lld\n", - (long long)(int)offsetof(struct 
mdt_rec_rename, rn_suppgid1)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid1_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid1_h)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_suppgid2_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_suppgid2_h)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_suppgid2_h)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid1) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_fid1)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid1)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_fid2) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_fid2)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_fid2)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_time) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_time)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_time) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_time)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_1) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_1)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_1)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_2) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_2)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_2)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_3) == 96, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_3)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_3)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_4) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_4)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_4)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_bias) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_bias)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_bias) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_bias)); - 
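Note that the rn_padding_1 through rn_padding_4 slots above are asserted just as strictly as the live fields: reserved space is part of the wire contract, so repurposing a padding word has to surface as an explicit change to these checks rather than as silent ABI drift. A small sketch of the idea, on a hypothetical layout:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct padded_rec {
	uint64_t pr_time;          /* offset 0 */
	uint64_t pr_padding_1;     /* offset 8, reserved on the wire */
	uint32_t pr_bias;          /* offset 16 */
	uint32_t pr_mode;          /* offset 20 */
};

static_assert(offsetof(struct padded_rec, pr_padding_1) == 8,
	      "reserved slot moved");
static_assert(offsetof(struct padded_rec, pr_bias) == 16,
	      "fields after the reserved slot shifted");
static_assert(sizeof(struct padded_rec) == 24, "record grew or shrank");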
LASSERTF((int)offsetof(struct mdt_rec_rename, rn_mode) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_mode)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_mode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_mode)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_5) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_5)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_5)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_6) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_6)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_6)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_7) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_7)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_7)); - LASSERTF((int)offsetof(struct mdt_rec_rename, rn_padding_8) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_rename, rn_padding_8)); - LASSERTF((int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_rename *)0)->rn_padding_8)); - - /* Checks for struct mdt_rec_setxattr */ - LASSERTF((int)sizeof(struct mdt_rec_setxattr) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_setxattr)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_cap)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h) == 12, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsuid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsuid_h)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid) == 16, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h) == 20, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fsgid_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fsgid_h)); - 
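One last generated-code detail, visible in every check here: offsetof() and sizeof() yield size_t, whose width differs between 32- and 64-bit builds, so each value is first narrowed to int and then widened to long long, presumably so a single "%lld" format string can serve everywhere. A stand-alone illustration of the same cast discipline on a hypothetical struct:

#include <stdio.h>
#include <stddef.h>

struct demo {
	char d_flag;
	long d_value;              /* offset depends on long's alignment */
};

int main(void)
{
	/* size_t would want %zu; the double cast keeps "%lld" portable. */
	printf("found %lld\n",
	       (long long)(int)offsetof(struct demo, d_value));
	printf("found %lld\n", (long long)(int)sizeof(struct demo));
	return 0;
}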
LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1) == 24, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h) == 28, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid1_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid1_h)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2) == 32, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h) == 36, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_suppgid2_h)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_suppgid2_h)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_fid) == 40, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_fid)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_fid)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_1) == 56, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_1)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_1)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_2) == 64, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_2)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_2)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_3) == 68, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_3)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_3)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_valid) == 72, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_valid)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_valid)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_time) == 80, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_time)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_time)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_5) == 88, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_5)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_5)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_6) == 96, "found %lld\n", - (long 
long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_6)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_6)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_7) == 104, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_7)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7) == 8, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_7)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_size) == 112, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_size)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_size)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_flags) == 116, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_flags)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_flags)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_8) == 120, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_8)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_8)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_9) == 124, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_9)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_9)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_10) == 128, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_10)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_10)); - LASSERTF((int)offsetof(struct mdt_rec_setxattr, sx_padding_11) == 132, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_setxattr, sx_padding_11)); - LASSERTF((int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_setxattr *)0)->sx_padding_11)); - - /* Checks for struct mdt_rec_reint */ - LASSERTF((int)sizeof(struct mdt_rec_reint) == 136, "found %lld\n", - (long long)(int)sizeof(struct mdt_rec_reint)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_opcode) == 0, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_reint, rr_opcode)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_opcode)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_cap) == 4, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_reint, rr_cap)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_cap) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_cap)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid) == 8, "found %lld\n", - (long long)(int)offsetof(struct mdt_rec_reint, rr_fsuid)); - LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid)); - LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsuid_h) == 12, "found %lld\n", - (long 
long)(int)offsetof(struct mdt_rec_reint, rr_fsuid_h));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsuid_h));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fsgid_h) == 20, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fsgid_h));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fsgid_h));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid1_h) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid1_h));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid1_h));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_suppgid2_h) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_suppgid2_h));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_suppgid2_h));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid1) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid1));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid1));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_fid2) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_fid2));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_fid2));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mtime) == 72, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mtime));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mtime));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_atime) == 80, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_atime));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_atime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_atime));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_ctime) == 88, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_ctime));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_ctime));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_size) == 96, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_size));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_size) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_size));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_blocks) == 104, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_blocks));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_blocks));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_bias) == 112, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_bias));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_bias) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_bias));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_mode) == 116, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_mode));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_mode) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_mode));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags) == 120, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_flags_h) == 124, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_flags_h));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_flags_h));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_umask) == 128, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_umask));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_umask) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_umask));
-	LASSERTF((int)offsetof(struct mdt_rec_reint, rr_padding_4) == 132, "found %lld\n",
-		 (long long)(int)offsetof(struct mdt_rec_reint, rr_padding_4));
-	LASSERTF((int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mdt_rec_reint *)0)->rr_padding_4));
-
-	/* Checks for struct lmv_desc */
-	LASSERTF((int)sizeof(struct lmv_desc) == 88, "found %lld\n",
-		 (long long)(int)sizeof(struct lmv_desc));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_tgt_count) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_tgt_count));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_tgt_count));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_active_tgt_count) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_active_tgt_count));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_active_tgt_count));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_default_stripe_count) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_default_stripe_count));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_stripe_count));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_pattern) == 12, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_pattern));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_pattern) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_pattern));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_default_hash_size) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_default_hash_size));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_default_hash_size));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_1) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_padding_1));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_1) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_1));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_2) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_padding_2));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_2) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_2));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_qos_maxage) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_qos_maxage));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_qos_maxage));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_3) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_padding_3));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_3) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_3));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_padding_4) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_padding_4));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_padding_4) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_padding_4));
-	LASSERTF((int)offsetof(struct lmv_desc, ld_uuid) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct lmv_desc, ld_uuid));
-	LASSERTF((int)sizeof(((struct lmv_desc *)0)->ld_uuid) == 40, "found %lld\n",
-		 (long long)(int)sizeof(((struct lmv_desc *)0)->ld_uuid));
-
-	/* Checks for struct lov_desc */
-	LASSERTF((int)sizeof(struct lov_desc) == 88, "found %lld\n",
-		 (long long)(int)sizeof(struct lov_desc));
-	LASSERTF((int)offsetof(struct lov_desc, ld_tgt_count) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_tgt_count));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_tgt_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_tgt_count));
-	LASSERTF((int)offsetof(struct lov_desc, ld_active_tgt_count) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_active_tgt_count));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_active_tgt_count));
-	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_count) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_count));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_count));
-	LASSERTF((int)offsetof(struct lov_desc, ld_pattern) == 12, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_pattern));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_pattern) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_pattern));
-	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_size) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_size));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_size));
-	LASSERTF((int)offsetof(struct lov_desc, ld_default_stripe_offset) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_default_stripe_offset));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_default_stripe_offset));
-	LASSERTF((int)offsetof(struct lov_desc, ld_padding_0) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_padding_0));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_0) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_0));
-	LASSERTF((int)offsetof(struct lov_desc, ld_qos_maxage) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_qos_maxage));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_qos_maxage) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_qos_maxage));
-	LASSERTF((int)offsetof(struct lov_desc, ld_padding_1) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_padding_1));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_1) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_1));
-	LASSERTF((int)offsetof(struct lov_desc, ld_padding_2) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_padding_2));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_padding_2) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_padding_2));
-	LASSERTF((int)offsetof(struct lov_desc, ld_uuid) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct lov_desc, ld_uuid));
-	LASSERTF((int)sizeof(((struct lov_desc *)0)->ld_uuid) == 40, "found %lld\n",
-		 (long long)(int)sizeof(((struct lov_desc *)0)->ld_uuid));
-	BUILD_BUG_ON(LOV_DESC_MAGIC != 0xB0CCDE5C);
-
-	/* Checks for struct ldlm_res_id */
-	LASSERTF((int)sizeof(struct ldlm_res_id) == 32, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_res_id));
-	BUILD_BUG_ON(RES_NAME_SIZE != 4);
-	LASSERTF((int)offsetof(struct ldlm_res_id, name[4]) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_res_id, name[4]));
-	LASSERTF((int)sizeof(((struct ldlm_res_id *)0)->name[4]) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_res_id *)0)->name[4]));
-
-	/* Checks for struct ldlm_extent */
-	LASSERTF((int)sizeof(struct ldlm_extent) == 24, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_extent));
-	LASSERTF((int)offsetof(struct ldlm_extent, start) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_extent, start));
-	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->start) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_extent *)0)->start));
-	LASSERTF((int)offsetof(struct ldlm_extent, end) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_extent, end));
-	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->end) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_extent *)0)->end));
-	LASSERTF((int)offsetof(struct ldlm_extent, gid) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_extent, gid));
-	LASSERTF((int)sizeof(((struct ldlm_extent *)0)->gid) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_extent *)0)->gid));
-
-	/* Checks for struct ldlm_inodebits */
-	LASSERTF((int)sizeof(struct ldlm_inodebits) == 8, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_inodebits));
-	LASSERTF((int)offsetof(struct ldlm_inodebits, bits) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_inodebits, bits));
-	LASSERTF((int)sizeof(((struct ldlm_inodebits *)0)->bits) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_inodebits *)0)->bits));
-
-	/* Checks for struct ldlm_flock_wire */
-	LASSERTF((int)sizeof(struct ldlm_flock_wire) == 32, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_flock_wire));
-	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_start) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_start));
-	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_start));
-	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_end) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_end));
-	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_end));
-	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_owner) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_owner));
-	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_owner));
-	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_padding) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_padding));
-	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_padding));
-	LASSERTF((int)offsetof(struct ldlm_flock_wire, lfw_pid) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_flock_wire, lfw_pid));
-	LASSERTF((int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_flock_wire *)0)->lfw_pid));
-
-	/* Checks for struct ldlm_intent */
-	LASSERTF((int)sizeof(struct ldlm_intent) == 8, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_intent));
-	LASSERTF((int)offsetof(struct ldlm_intent, opc) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_intent, opc));
-	LASSERTF((int)sizeof(((struct ldlm_intent *)0)->opc) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_intent *)0)->opc));
-
-	/* Checks for struct ldlm_resource_desc */
-	LASSERTF((int)sizeof(struct ldlm_resource_desc) == 40, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_resource_desc));
-	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_type) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_type));
-	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_type) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_type));
-	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_padding) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_padding));
-	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_padding));
-	LASSERTF((int)offsetof(struct ldlm_resource_desc, lr_name) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_resource_desc, lr_name));
-	LASSERTF((int)sizeof(((struct ldlm_resource_desc *)0)->lr_name) == 32, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_resource_desc *)0)->lr_name));
-
-	/* Checks for struct ldlm_lock_desc */
-	LASSERTF((int)sizeof(struct ldlm_lock_desc) == 80, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_lock_desc));
-	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_resource) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_lock_desc, l_resource));
-	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_resource) == 40, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_resource));
-	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_req_mode) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_lock_desc, l_req_mode));
-	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_req_mode));
-	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_granted_mode) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_lock_desc, l_granted_mode));
-	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_granted_mode));
-	LASSERTF((int)offsetof(struct ldlm_lock_desc, l_policy_data) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_lock_desc, l_policy_data));
-	LASSERTF((int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data) == 32, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_lock_desc *)0)->l_policy_data));
-
-	/* Checks for struct ldlm_request */
-	LASSERTF((int)sizeof(struct ldlm_request) == 104, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_request));
-	LASSERTF((int)offsetof(struct ldlm_request, lock_flags) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_request, lock_flags));
-	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_flags));
-	LASSERTF((int)offsetof(struct ldlm_request, lock_count) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_request, lock_count));
-	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_count));
-	LASSERTF((int)offsetof(struct ldlm_request, lock_desc) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_request, lock_desc));
-	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_desc) == 80, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_desc));
-	LASSERTF((int)offsetof(struct ldlm_request, lock_handle) == 88, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_request, lock_handle));
-	LASSERTF((int)sizeof(((struct ldlm_request *)0)->lock_handle) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_request *)0)->lock_handle));
-
-	/* Checks for struct ldlm_reply */
-	LASSERTF((int)sizeof(struct ldlm_reply) == 112, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_reply));
-	LASSERTF((int)offsetof(struct ldlm_reply, lock_flags) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_reply, lock_flags));
-	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_flags));
-	LASSERTF((int)offsetof(struct ldlm_reply, lock_padding) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_reply, lock_padding));
-	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_padding));
-	LASSERTF((int)offsetof(struct ldlm_reply, lock_desc) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_reply, lock_desc));
-	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_desc) == 80, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_desc));
-	LASSERTF((int)offsetof(struct ldlm_reply, lock_handle) == 88, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_reply, lock_handle));
-	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_handle) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_handle));
-	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res1) == 96, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res1));
-	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res1));
-	LASSERTF((int)offsetof(struct ldlm_reply, lock_policy_res2) == 104, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_reply, lock_policy_res2));
-	LASSERTF((int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_reply *)0)->lock_policy_res2));
-
-	/* Checks for struct ost_lvb_v1 */
-	LASSERTF((int)sizeof(struct ost_lvb_v1) == 40, "found %lld\n",
-		 (long long)(int)sizeof(struct ost_lvb_v1));
-	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_size) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_size));
-	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_size));
-	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_mtime) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_mtime));
-	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_mtime));
-	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_atime) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_atime));
-	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_atime));
-	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_ctime) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_ctime));
-	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_ctime));
-	LASSERTF((int)offsetof(struct ost_lvb_v1, lvb_blocks) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb_v1, lvb_blocks));
-	LASSERTF((int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb_v1 *)0)->lvb_blocks));
-
-	/* Checks for struct ost_lvb */
-	LASSERTF((int)sizeof(struct ost_lvb) == 56, "found %lld\n",
-		 (long long)(int)sizeof(struct ost_lvb));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_size) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_size));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_size) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_size));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_atime));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_blocks) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_blocks));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_mtime_ns) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_mtime_ns));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_mtime_ns));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_atime_ns) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_atime_ns));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_atime_ns));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_ctime_ns) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_ctime_ns));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_ctime_ns));
-	LASSERTF((int)offsetof(struct ost_lvb, lvb_padding) == 52, "found %lld\n",
-		 (long long)(int)offsetof(struct ost_lvb, lvb_padding));
-	LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_padding));
-
-	/* Checks for struct lquota_lvb */
-	LASSERTF((int)sizeof(struct lquota_lvb) == 40, "found %lld\n",
-		 (long long)(int)sizeof(struct lquota_lvb));
-	LASSERTF((int)offsetof(struct lquota_lvb, lvb_flags) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct lquota_lvb, lvb_flags));
-	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_flags) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_flags));
-	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_may_rel) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_may_rel));
-	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_may_rel));
-	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_rel) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_rel));
-	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_rel));
-	LASSERTF((int)offsetof(struct lquota_lvb, lvb_id_qunit) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct lquota_lvb, lvb_id_qunit));
-	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_id_qunit));
-	LASSERTF((int)offsetof(struct lquota_lvb, lvb_pad1) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct lquota_lvb, lvb_pad1));
-	LASSERTF((int)sizeof(((struct lquota_lvb *)0)->lvb_pad1) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lquota_lvb *)0)->lvb_pad1));
-	LASSERTF(LQUOTA_FL_EDQUOT == 1, "found %lld\n",
-		 (long long)LQUOTA_FL_EDQUOT);
-
-	/* Checks for struct ldlm_gl_lquota_desc */
-	LASSERTF((int)sizeof(struct ldlm_gl_lquota_desc) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct ldlm_gl_lquota_desc));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_id) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_id));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_id));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_flags) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_flags));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_flags));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_ver) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_ver));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_ver));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_hardlimit));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_hardlimit));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_softlimit));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_softlimit));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_time) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_time));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_time));
-	LASSERTF((int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct ldlm_gl_lquota_desc, gl_pad2));
-	LASSERTF((int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct ldlm_gl_lquota_desc *)0)->gl_pad2));
-
-	/* Checks for struct mgs_send_param */
-	LASSERTF((int)sizeof(struct mgs_send_param) == 1024, "found %lld\n",
-		 (long long)(int)sizeof(struct mgs_send_param));
-	BUILD_BUG_ON(MGS_PARAM_MAXLEN != 1024);
-	LASSERTF((int)offsetof(struct mgs_send_param, mgs_param[1024]) == 1024, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_send_param, mgs_param[1024]));
-	LASSERTF((int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]) == 1, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_send_param *)0)->mgs_param[1024]));
-
-	/* Checks for struct cfg_marker */
-	LASSERTF((int)sizeof(struct cfg_marker) == 160, "found %lld\n",
-		 (long long)(int)sizeof(struct cfg_marker));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_step));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_flags));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_vers));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_padding) == 12, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_padding));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_padding));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
-	LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, "found %lld\n",
-		 (long long)(int)offsetof(struct cfg_marker, cm_comment));
-	LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, "found %lld\n",
-		 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
-
-	/* Checks for struct llog_logid */
-	LASSERTF((int)sizeof(struct llog_logid) == 20, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_logid));
-	LASSERTF((int)offsetof(struct llog_logid, lgl_oi) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid, lgl_oi));
-	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_oi) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_oi));
-	LASSERTF((int)offsetof(struct llog_logid, lgl_ogen) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid, lgl_ogen));
-	LASSERTF((int)sizeof(((struct llog_logid *)0)->lgl_ogen) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid *)0)->lgl_ogen));
-	BUILD_BUG_ON(OST_SZ_REC != 274730752);
-	BUILD_BUG_ON(MDS_UNLINK_REC != 274801668);
-	BUILD_BUG_ON(MDS_UNLINK64_REC != 275325956);
-	BUILD_BUG_ON(MDS_SETATTR64_REC != 275325953);
-	BUILD_BUG_ON(OBD_CFG_REC != 274857984);
-	BUILD_BUG_ON(LLOG_GEN_REC != 274989056);
-	BUILD_BUG_ON(CHANGELOG_REC != 275120128);
-	BUILD_BUG_ON(CHANGELOG_USER_REC != 275185664);
-	BUILD_BUG_ON(LLOG_HDR_MAGIC != 275010873);
-	BUILD_BUG_ON(LLOG_LOGID_MAGIC != 275010875);
-
-	/* Checks for struct llog_catid */
-	LASSERTF((int)sizeof(struct llog_catid) == 32, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_catid));
-	LASSERTF((int)offsetof(struct llog_catid, lci_logid) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_catid, lci_logid));
-	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_logid) == 20, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_logid));
-	LASSERTF((int)offsetof(struct llog_catid, lci_padding1) == 20, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_catid, lci_padding1));
-	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding1) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding1));
-	LASSERTF((int)offsetof(struct llog_catid, lci_padding2) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_catid, lci_padding2));
-	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding2) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding2));
-	LASSERTF((int)offsetof(struct llog_catid, lci_padding3) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_catid, lci_padding3));
-	LASSERTF((int)sizeof(((struct llog_catid *)0)->lci_padding3) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_catid *)0)->lci_padding3));
-
-	/* Checks for struct llog_rec_hdr */
-	LASSERTF((int)sizeof(struct llog_rec_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_rec_hdr));
-	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_len) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_len));
-	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_len) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_len));
-	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_index) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_index));
-	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_index) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_index));
-	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_type) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_type));
-	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_type) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_type));
-	LASSERTF((int)offsetof(struct llog_rec_hdr, lrh_id) == 12, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_rec_hdr, lrh_id));
-	LASSERTF((int)sizeof(((struct llog_rec_hdr *)0)->lrh_id) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_rec_hdr *)0)->lrh_id));
-
-	/* Checks for struct llog_rec_tail */
-	LASSERTF((int)sizeof(struct llog_rec_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_rec_tail));
-	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_len) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_rec_tail, lrt_len));
-	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_len) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_len));
-	LASSERTF((int)offsetof(struct llog_rec_tail, lrt_index) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_rec_tail, lrt_index));
-	LASSERTF((int)sizeof(((struct llog_rec_tail *)0)->lrt_index) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_rec_tail *)0)->lrt_index));
-
-	/* Checks for struct llog_logid_rec */
-	LASSERTF((int)sizeof(struct llog_logid_rec) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_logid_rec));
-	LASSERTF((int)offsetof(struct llog_logid_rec, lid_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid_rec, lid_hdr));
-	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_hdr));
-	LASSERTF((int)offsetof(struct llog_logid_rec, lid_id) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid_rec, lid_id));
-	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_id) == 20, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_id));
-	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding1) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding1));
-	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding1) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding1));
-	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding2) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding2));
-	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding2) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding2));
-	LASSERTF((int)offsetof(struct llog_logid_rec, lid_padding3) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid_rec, lid_padding3));
-	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_padding3) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_padding3));
-	LASSERTF((int)offsetof(struct llog_logid_rec, lid_tail) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_logid_rec, lid_tail));
-	LASSERTF((int)sizeof(((struct llog_logid_rec *)0)->lid_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_logid_rec *)0)->lid_tail));
-
-	/* Checks for struct llog_unlink_rec */
-	LASSERTF((int)sizeof(struct llog_unlink_rec) == 40, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_unlink_rec));
-	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink_rec, lur_hdr));
-	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_hdr));
-	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oid) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oid));
-	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oid) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oid));
-	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_oseq) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink_rec, lur_oseq));
-	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_oseq));
-	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_count) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink_rec, lur_count));
-	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_count));
-	LASSERTF((int)offsetof(struct llog_unlink_rec, lur_tail) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink_rec, lur_tail));
-	LASSERTF((int)sizeof(((struct llog_unlink_rec *)0)->lur_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink_rec *)0)->lur_tail));
-	/* Checks for struct llog_unlink64_rec */
-	LASSERTF((int)sizeof(struct llog_unlink64_rec) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_unlink64_rec));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_hdr));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_hdr));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_fid) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_fid));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_fid));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_count) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_count));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_count));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_tail) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_tail));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_tail));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding1) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding1));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding1));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding2) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding2));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding2));
-	LASSERTF((int)offsetof(struct llog_unlink64_rec, lur_padding3) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_unlink64_rec, lur_padding3));
-	LASSERTF((int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_unlink64_rec *)0)->lur_padding3));
-
-	/* Checks for struct llog_setattr64_rec */
-	LASSERTF((int)sizeof(struct llog_setattr64_rec) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_setattr64_rec));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oi) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oi));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oi));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_valid) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_valid));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_valid));
-	LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
-	LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
-
-	/* Checks for struct llog_size_change_rec */
-	LASSERTF((int)sizeof(struct llog_size_change_rec) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_size_change_rec));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_hdr));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_hdr));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_fid) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_fid));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_fid));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_ioepoch) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_ioepoch));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_ioepoch));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding1) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding1));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding1));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding2) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding2));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding2));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_padding3) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_padding3));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_padding3));
-	LASSERTF((int)offsetof(struct llog_size_change_rec, lsc_tail) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_size_change_rec, lsc_tail));
-	LASSERTF((int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_size_change_rec *)0)->lsc_tail));
-
-	/* Checks for struct changelog_rec */
-	LASSERTF((int)sizeof(struct changelog_rec) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct changelog_rec));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_namelen) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_namelen));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_namelen) == 2, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_namelen));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_flags) == 2, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_flags));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_flags) == 2, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_flags));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_type) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_type));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_type) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_type));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_index) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_index));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_index) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_index));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_prev) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_prev));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_prev) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_prev));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_time) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_time));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_time) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_time));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_tfid) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_tfid));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_tfid) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_tfid));
-	LASSERTF((int)offsetof(struct changelog_rec, cr_pfid) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_rec, cr_pfid));
-	LASSERTF((int)sizeof(((struct changelog_rec *)0)->cr_pfid) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_rec *)0)->cr_pfid));
-
-	/* Checks for struct changelog_setinfo */
-	LASSERTF((int)sizeof(struct changelog_setinfo) == 12, "found %lld\n",
-		 (long long)(int)sizeof(struct changelog_setinfo));
-	LASSERTF((int)offsetof(struct changelog_setinfo, cs_recno) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_setinfo, cs_recno));
-	LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_recno) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_recno));
-	LASSERTF((int)offsetof(struct changelog_setinfo, cs_id) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct changelog_setinfo, cs_id));
-	LASSERTF((int)sizeof(((struct changelog_setinfo *)0)->cs_id) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct changelog_setinfo *)0)->cs_id));
-
-	/* Checks for struct llog_changelog_rec */
-	LASSERTF((int)sizeof(struct llog_changelog_rec) == 88, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_changelog_rec));
-	LASSERTF((int)offsetof(struct llog_changelog_rec, cr_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_rec, cr_hdr));
-	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_hdr));
-	LASSERTF((int)offsetof(struct llog_changelog_rec, cr) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_rec, cr));
-	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr) == 64, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr));
-	LASSERTF((int)offsetof(struct llog_changelog_rec, cr_do_not_use) == 80, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_rec, cr_do_not_use));
-	LASSERTF((int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_rec *)0)->cr_do_not_use));
-
-	/* Checks for struct llog_changelog_user_rec */
-	LASSERTF((int)sizeof(struct llog_changelog_user_rec) == 40, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_changelog_user_rec));
-	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_hdr));
-	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_hdr));
-	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_id) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_id));
-	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_id));
-	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_padding) == 20, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_padding));
-	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_padding));
-	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_endrec) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_endrec));
-	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_endrec));
-	LASSERTF((int)offsetof(struct llog_changelog_user_rec, cur_tail) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_changelog_user_rec, cur_tail));
-	LASSERTF((int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_changelog_user_rec *)0)->cur_tail));
-
-	/* Checks for struct llog_gen */
-	LASSERTF((int)sizeof(struct llog_gen) == 16, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_gen));
-	LASSERTF((int)offsetof(struct llog_gen, mnt_cnt) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_gen, mnt_cnt));
-	LASSERTF((int)sizeof(((struct llog_gen *)0)->mnt_cnt) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_gen *)0)->mnt_cnt));
-	LASSERTF((int)offsetof(struct llog_gen, conn_cnt) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_gen, conn_cnt));
-	LASSERTF((int)sizeof(((struct llog_gen *)0)->conn_cnt) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_gen *)0)->conn_cnt));
-
-	/* Checks for struct llog_gen_rec */
-	LASSERTF((int)sizeof(struct llog_gen_rec) == 64, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_gen_rec));
-	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_gen_rec, lgr_hdr));
-	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_hdr));
-	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_gen) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_gen_rec, lgr_gen));
-	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_gen) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_gen));
-	LASSERTF((int)offsetof(struct llog_gen_rec, lgr_tail) == 56, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_gen_rec, lgr_tail));
-	LASSERTF((int)sizeof(((struct llog_gen_rec *)0)->lgr_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_gen_rec *)0)->lgr_tail));
-
-	/* Checks for struct llog_log_hdr */
-	LASSERTF((int)sizeof(struct llog_log_hdr) == 8192, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_log_hdr));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_hdr) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_hdr));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_hdr) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_hdr));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_timestamp) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_timestamp));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_timestamp));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_count) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_count));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_count));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap_offset) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap_offset));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap_offset));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_size) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_size));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_size) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_size));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_flags) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_flags));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_flags));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_cat_idx) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_cat_idx));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_cat_idx));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_tgtuuid) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_tgtuuid));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid) == 40, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tgtuuid));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_reserved) == 84, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_reserved));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_reserved) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_reserved));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_bitmap) == 88, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_bitmap));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap) == 8096, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_bitmap));
-	LASSERTF((int)offsetof(struct llog_log_hdr, llh_tail) == 8184, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_log_hdr, llh_tail));
-	LASSERTF((int)sizeof(((struct llog_log_hdr *)0)->llh_tail) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_log_hdr *)0)->llh_tail));
-
-	/* Checks for struct llog_cookie */
-	LASSERTF((int)sizeof(struct llog_cookie) == 32, "found %lld\n",
-		 (long long)(int)sizeof(struct llog_cookie));
-	LASSERTF((int)offsetof(struct llog_cookie, lgc_lgl) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_cookie, lgc_lgl));
-	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_lgl) == 20, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_lgl));
-	LASSERTF((int)offsetof(struct llog_cookie, lgc_subsys) == 20, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_cookie, lgc_subsys));
-	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_subsys) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_subsys));
-	LASSERTF((int)offsetof(struct llog_cookie, lgc_index) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_cookie, lgc_index));
-	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_index) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_index));
-	LASSERTF((int)offsetof(struct llog_cookie, lgc_padding) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct llog_cookie, lgc_padding));
-	LASSERTF((int)sizeof(((struct llog_cookie *)0)->lgc_padding) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llog_cookie *)0)->lgc_padding));
-
-	/* Checks for struct llogd_body */
-	LASSERTF((int)sizeof(struct llogd_body) == 48, "found %lld\n",
-		 (long long)(int)sizeof(struct llogd_body));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_logid) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_logid));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_logid) == 20, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_logid));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_ctxt_idx) == 20, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_ctxt_idx));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_ctxt_idx));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_llh_flags) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_llh_flags));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_llh_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_llh_flags));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_index) == 28, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_index));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_index) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_index));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_saved_index) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_saved_index));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_saved_index) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_saved_index));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_len) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_len));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_len) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_len));
-	LASSERTF((int)offsetof(struct llogd_body, lgd_cur_offset) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_body, lgd_cur_offset));
-	LASSERTF((int)sizeof(((struct llogd_body *)0)->lgd_cur_offset) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_body *)0)->lgd_cur_offset));
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_CREATE != 501);
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_NEXT_BLOCK != 502);
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_READ_HEADER != 503);
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_WRITE_REC != 504);
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_CLOSE != 505);
-	BUILD_BUG_ON(LLOG_ORIGIN_CONNECT != 506);
-	BUILD_BUG_ON(LLOG_CATINFO != 507);
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_PREV_BLOCK != 508);
-	BUILD_BUG_ON(LLOG_ORIGIN_HANDLE_DESTROY != 509);
-	BUILD_BUG_ON(LLOG_FIRST_OPC != 501);
-	BUILD_BUG_ON(LLOG_LAST_OPC != 510);
-	BUILD_BUG_ON(LLOG_CONFIG_ORIG_CTXT != 0);
-	BUILD_BUG_ON(LLOG_CONFIG_REPL_CTXT != 1);
-	BUILD_BUG_ON(LLOG_MDS_OST_ORIG_CTXT != 2);
-	BUILD_BUG_ON(LLOG_MDS_OST_REPL_CTXT != 3);
-	BUILD_BUG_ON(LLOG_SIZE_ORIG_CTXT != 4);
-	BUILD_BUG_ON(LLOG_SIZE_REPL_CTXT != 5);
-	BUILD_BUG_ON(LLOG_TEST_ORIG_CTXT != 8);
-	BUILD_BUG_ON(LLOG_TEST_REPL_CTXT != 9);
-	BUILD_BUG_ON(LLOG_CHANGELOG_ORIG_CTXT != 12);
-	BUILD_BUG_ON(LLOG_CHANGELOG_REPL_CTXT != 13);
-	BUILD_BUG_ON(LLOG_CHANGELOG_USER_ORIG_CTXT != 14);
-	BUILD_BUG_ON(LLOG_AGENT_ORIG_CTXT != 15);
-	BUILD_BUG_ON(LLOG_MAX_CTXTS != 16);
-
-	/* Checks for struct llogd_conn_body */
-	LASSERTF((int)sizeof(struct llogd_conn_body) == 40, "found %lld\n",
-		 (long long)(int)sizeof(struct llogd_conn_body));
-	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_gen) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_gen));
-	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_gen));
-	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_logid) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_logid));
-	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid) == 20, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_logid));
-	LASSERTF((int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx) == 36, "found %lld\n",
-		 (long long)(int)offsetof(struct llogd_conn_body, lgdc_ctxt_idx));
-	LASSERTF((int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct llogd_conn_body *)0)->lgdc_ctxt_idx));
-
-	/* Checks for struct fiemap_info_key */
-	LASSERTF((int)sizeof(struct ll_fiemap_info_key) == 248, "found %lld\n",
-		 (long long)(int)sizeof(struct ll_fiemap_info_key));
-	LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_name[8]) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_name[8]));
-	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8]) == 1, "found %lld\n",
-		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_name[8]));
-	LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_oa) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_oa));
-	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa) == 208, "found %lld\n",
-		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_oa));
-	LASSERTF((int)offsetof(struct ll_fiemap_info_key, lfik_fiemap) == 216, "found %lld\n",
-		 (long long)(int)offsetof(struct ll_fiemap_info_key, lfik_fiemap));
-	LASSERTF((int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap) == 32, "found %lld\n",
-		 (long long)(int)sizeof(((struct ll_fiemap_info_key *)0)->lfik_fiemap));
-
-	/* Checks for struct mgs_target_info */
-	LASSERTF((int)sizeof(struct mgs_target_info) == 4544, "found %lld\n",
-		 (long long)(int)sizeof(struct mgs_target_info));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_lustre_ver) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_lustre_ver));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_lustre_ver));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_stripe_index) == 4, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_stripe_index));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_stripe_index));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_config_ver) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_config_ver));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_config_ver) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_config_ver));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_flags) == 12, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_flags));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_flags));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_nid_count) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_nid_count));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nid_count) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nid_count));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_instance) == 20, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_instance));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_instance) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_instance));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_fsname) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_fsname));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_fsname) == 64, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_fsname));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_svname) == 88, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_svname));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_svname) == 64, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_svname));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_uuid) == 152, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_uuid));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_uuid) == 40, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_uuid));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_nids) == 192, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_nids));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_nids) == 256, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_nids));
-	LASSERTF((int)offsetof(struct mgs_target_info, mti_params) == 448, "found %lld\n",
-		 (long long)(int)offsetof(struct mgs_target_info, mti_params));
-	LASSERTF((int)sizeof(((struct mgs_target_info *)0)->mti_params) == 4096, "found %lld\n",
-		 (long long)(int)sizeof(((struct mgs_target_info *)0)->mti_params));
-
-	/* Checks for struct lustre_capa */
-	LASSERTF((int)sizeof(struct lustre_capa) == 120, "found %lld\n",
-		 (long long)(int)sizeof(struct lustre_capa));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_fid) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_fid));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_fid) == 16, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_fid));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_opc) == 16, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_opc));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_opc) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_opc));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_uid) == 24, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_uid));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_uid) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_uid));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_gid) == 32, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_gid));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_gid) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_gid));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_flags) == 40, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_flags));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_flags) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_flags));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_keyid) == 44, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_keyid));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_keyid) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_keyid));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_timeout) == 48, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_timeout));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_timeout) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_timeout));
-	LASSERTF((int)offsetof(struct lustre_capa, lc_expiry) == 52, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_expiry));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_expiry) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_expiry));
-	BUILD_BUG_ON(CAPA_HMAC_MAX_LEN != 64);
-	LASSERTF((int)offsetof(struct lustre_capa, lc_hmac[64]) == 120, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa, lc_hmac[64]));
-	LASSERTF((int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]) == 1, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa *)0)->lc_hmac[64]));
-
-	/* Checks for struct lustre_capa_key */
-	LASSERTF((int)sizeof(struct lustre_capa_key) == 72, "found %lld\n",
-		 (long long)(int)sizeof(struct lustre_capa_key));
-	LASSERTF((int)offsetof(struct lustre_capa_key, lk_seq) == 0, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa_key, lk_seq));
-	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_seq) == 8, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_seq));
-	LASSERTF((int)offsetof(struct lustre_capa_key, lk_keyid) == 8, "found %lld\n",
-		 (long long)(int)offsetof(struct lustre_capa_key, lk_keyid));
-	LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_keyid) == 4, "found %lld\n",
-		 (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_keyid));
-	LASSERTF((int)offsetof(struct lustre_capa_key, lk_padding) == 12, "found %lld\n",
- (long long)(int)offsetof(struct lustre_capa_key, lk_padding)); - LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_padding)); - BUILD_BUG_ON(CAPA_HMAC_KEY_MAX_LEN != 56); - LASSERTF((int)offsetof(struct lustre_capa_key, lk_key[56]) == 72, "found %lld\n", - (long long)(int)offsetof(struct lustre_capa_key, lk_key[56])); - LASSERTF((int)sizeof(((struct lustre_capa_key *)0)->lk_key[56]) == 1, "found %lld\n", - (long long)(int)sizeof(((struct lustre_capa_key *)0)->lk_key[56])); - - /* Checks for struct getinfo_fid2path */ - LASSERTF((int)sizeof(struct getinfo_fid2path) == 32, "found %lld\n", - (long long)(int)sizeof(struct getinfo_fid2path)); - LASSERTF((int)offsetof(struct getinfo_fid2path, gf_fid) == 0, "found %lld\n", - (long long)(int)offsetof(struct getinfo_fid2path, gf_fid)); - LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_fid)); - LASSERTF((int)offsetof(struct getinfo_fid2path, gf_recno) == 16, "found %lld\n", - (long long)(int)offsetof(struct getinfo_fid2path, gf_recno)); - LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_recno) == 8, "found %lld\n", - (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_recno)); - LASSERTF((int)offsetof(struct getinfo_fid2path, gf_linkno) == 24, "found %lld\n", - (long long)(int)offsetof(struct getinfo_fid2path, gf_linkno)); - LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno) == 4, "found %lld\n", - (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_linkno)); - LASSERTF((int)offsetof(struct getinfo_fid2path, gf_pathlen) == 28, "found %lld\n", - (long long)(int)offsetof(struct getinfo_fid2path, gf_pathlen)); - LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen) == 4, "found %lld\n", - (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_pathlen)); - LASSERTF((int)offsetof(struct getinfo_fid2path, gf_path[0]) == 32, "found %lld\n", - (long long)(int)offsetof(struct getinfo_fid2path, gf_path[0])); - LASSERTF((int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0]) == 1, "found %lld\n", - (long long)(int)sizeof(((struct getinfo_fid2path *)0)->gf_path[0])); - - /* Checks for struct fiemap */ - LASSERTF((int)sizeof(struct fiemap) == 32, "found %lld\n", - (long long)(int)sizeof(struct fiemap)); - LASSERTF((int)offsetof(struct fiemap, fm_start) == 0, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_start)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_start)); - LASSERTF((int)offsetof(struct fiemap, fm_length) == 8, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_length)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_length) == 8, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_length)); - LASSERTF((int)offsetof(struct fiemap, fm_flags) == 16, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_flags)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_flags)); - LASSERTF((int)offsetof(struct fiemap, fm_mapped_extents) == 20, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_mapped_extents)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_mapped_extents) == 4, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_mapped_extents)); - LASSERTF((int)offsetof(struct fiemap, fm_extent_count) 
== 24, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_extent_count)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extent_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_extent_count)); - LASSERTF((int)offsetof(struct fiemap, fm_reserved) == 28, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_reserved)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_reserved) == 4, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_reserved)); - LASSERTF((int)offsetof(struct fiemap, fm_extents) == 32, "found %lld\n", - (long long)(int)offsetof(struct fiemap, fm_extents)); - LASSERTF((int)sizeof(((struct fiemap *)0)->fm_extents) == 0, "found %lld\n", - (long long)(int)sizeof(((struct fiemap *)0)->fm_extents)); - BUILD_BUG_ON(FIEMAP_FLAG_SYNC != 0x00000001); - BUILD_BUG_ON(FIEMAP_FLAG_XATTR != 0x00000002); - BUILD_BUG_ON(FIEMAP_FLAG_DEVICE_ORDER != 0x40000000); - - /* Checks for struct fiemap_extent */ - LASSERTF((int)sizeof(struct fiemap_extent) == 56, "found %lld\n", - (long long)(int)sizeof(struct fiemap_extent)); - LASSERTF((int)offsetof(struct fiemap_extent, fe_logical) == 0, "found %lld\n", - (long long)(int)offsetof(struct fiemap_extent, fe_logical)); - LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_logical) == 8, "found %lld\n", - (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_logical)); - LASSERTF((int)offsetof(struct fiemap_extent, fe_physical) == 8, "found %lld\n", - (long long)(int)offsetof(struct fiemap_extent, fe_physical)); - LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_physical) == 8, "found %lld\n", - (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_physical)); - LASSERTF((int)offsetof(struct fiemap_extent, fe_length) == 16, "found %lld\n", - (long long)(int)offsetof(struct fiemap_extent, fe_length)); - LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_length) == 8, "found %lld\n", - (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_length)); - LASSERTF((int)offsetof(struct fiemap_extent, fe_flags) == 40, "found %lld\n", - (long long)(int)offsetof(struct fiemap_extent, fe_flags)); - LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_flags)); - LASSERTF((int)offsetof(struct fiemap_extent, fe_reserved[0]) == 44, "found %lld\n", - (long long)(int)offsetof(struct fiemap_extent, fe_reserved[0])); - LASSERTF((int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0]) == 4, "found %lld\n", - (long long)(int)sizeof(((struct fiemap_extent *)0)->fe_reserved[0])); - BUILD_BUG_ON(FIEMAP_EXTENT_LAST != 0x00000001); - BUILD_BUG_ON(FIEMAP_EXTENT_UNKNOWN != 0x00000002); - BUILD_BUG_ON(FIEMAP_EXTENT_DELALLOC != 0x00000004); - BUILD_BUG_ON(FIEMAP_EXTENT_ENCODED != 0x00000008); - BUILD_BUG_ON(FIEMAP_EXTENT_DATA_ENCRYPTED != 0x00000080); - BUILD_BUG_ON(FIEMAP_EXTENT_NOT_ALIGNED != 0x00000100); - BUILD_BUG_ON(FIEMAP_EXTENT_DATA_INLINE != 0x00000200); - BUILD_BUG_ON(FIEMAP_EXTENT_DATA_TAIL != 0x00000400); - BUILD_BUG_ON(FIEMAP_EXTENT_UNWRITTEN != 0x00000800); - BUILD_BUG_ON(FIEMAP_EXTENT_MERGED != 0x00001000); - BUILD_BUG_ON(FIEMAP_EXTENT_NO_DIRECT != 0x40000000); - BUILD_BUG_ON(FIEMAP_EXTENT_NET != 0x80000000); - - /* Checks for type posix_acl_xattr_entry */ - LASSERTF((int)sizeof(struct posix_acl_xattr_entry) == 8, "found %lld\n", - (long long)(int)sizeof(struct posix_acl_xattr_entry)); - LASSERTF((int)offsetof(struct posix_acl_xattr_entry, e_tag) == 0, "found %lld\n", - (long long)(int)offsetof(struct 
posix_acl_xattr_entry, e_tag)); - LASSERTF((int)sizeof(((struct posix_acl_xattr_entry *)0)->e_tag) == 2, "found %lld\n", - (long long)(int)sizeof(((struct posix_acl_xattr_entry *)0)->e_tag)); - LASSERTF((int)offsetof(struct posix_acl_xattr_entry, e_perm) == 2, "found %lld\n", - (long long)(int)offsetof(struct posix_acl_xattr_entry, e_perm)); - LASSERTF((int)sizeof(((struct posix_acl_xattr_entry *)0)->e_perm) == 2, "found %lld\n", - (long long)(int)sizeof(((struct posix_acl_xattr_entry *)0)->e_perm)); - LASSERTF((int)offsetof(struct posix_acl_xattr_entry, e_id) == 4, "found %lld\n", - (long long)(int)offsetof(struct posix_acl_xattr_entry, e_id)); - LASSERTF((int)sizeof(((struct posix_acl_xattr_entry *)0)->e_id) == 4, "found %lld\n", - (long long)(int)sizeof(((struct posix_acl_xattr_entry *)0)->e_id)); - - /* Checks for type posix_acl_xattr_header */ - LASSERTF((int)sizeof(struct posix_acl_xattr_header) == 4, "found %lld\n", - (long long)(int)sizeof(struct posix_acl_xattr_header)); - LASSERTF((int)offsetof(struct posix_acl_xattr_header, a_version) == 0, "found %lld\n", - (long long)(int)offsetof(struct posix_acl_xattr_header, a_version)); - LASSERTF((int)sizeof(((struct posix_acl_xattr_header *)0)->a_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct posix_acl_xattr_header *)0)->a_version)); - - /* Checks for struct link_ea_header */ - LASSERTF((int)sizeof(struct link_ea_header) == 24, "found %lld\n", - (long long)(int)sizeof(struct link_ea_header)); - LASSERTF((int)offsetof(struct link_ea_header, leh_magic) == 0, "found %lld\n", - (long long)(int)offsetof(struct link_ea_header, leh_magic)); - LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_magic) == 4, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_header *)0)->leh_magic)); - LASSERTF((int)offsetof(struct link_ea_header, leh_reccount) == 4, "found %lld\n", - (long long)(int)offsetof(struct link_ea_header, leh_reccount)); - LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_reccount) == 4, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_header *)0)->leh_reccount)); - LASSERTF((int)offsetof(struct link_ea_header, leh_len) == 8, "found %lld\n", - (long long)(int)offsetof(struct link_ea_header, leh_len)); - LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_len) == 8, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_header *)0)->leh_len)); - LASSERTF((int)offsetof(struct link_ea_header, leh_overflow_time) == 16, "found %lld\n", - (long long)(int)offsetof(struct link_ea_header, leh_overflow_time)); - LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_overflow_time) == 4, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_header *)0)->leh_overflow_time)); - LASSERTF((int)offsetof(struct link_ea_header, leh_padding) == 20, "found %lld\n", - (long long)(int)offsetof(struct link_ea_header, leh_padding)); - LASSERTF((int)sizeof(((struct link_ea_header *)0)->leh_padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_header *)0)->leh_padding)); - BUILD_BUG_ON(LINK_EA_MAGIC != 0x11EAF1DFUL); - - /* Checks for struct link_ea_entry */ - LASSERTF((int)sizeof(struct link_ea_entry) == 18, "found %lld\n", - (long long)(int)sizeof(struct link_ea_entry)); - LASSERTF((int)offsetof(struct link_ea_entry, lee_reclen) == 0, "found %lld\n", - (long long)(int)offsetof(struct link_ea_entry, lee_reclen)); - LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_reclen) == 2, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_reclen)); - 
LASSERTF((int)offsetof(struct link_ea_entry, lee_parent_fid) == 2, "found %lld\n", - (long long)(int)offsetof(struct link_ea_entry, lee_parent_fid)); - LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_parent_fid)); - LASSERTF((int)offsetof(struct link_ea_entry, lee_name) == 18, "found %lld\n", - (long long)(int)offsetof(struct link_ea_entry, lee_name)); - LASSERTF((int)sizeof(((struct link_ea_entry *)0)->lee_name) == 0, "found %lld\n", - (long long)(int)sizeof(((struct link_ea_entry *)0)->lee_name)); - - /* Checks for struct layout_intent */ - LASSERTF((int)sizeof(struct layout_intent) == 24, "found %lld\n", - (long long)(int)sizeof(struct layout_intent)); - LASSERTF((int)offsetof(struct layout_intent, li_opc) == 0, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_opc)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_opc) == 4, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_opc)); - LASSERTF((int)offsetof(struct layout_intent, li_flags) == 4, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_flags)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_flags) == 4, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_flags)); - LASSERTF((int)offsetof(struct layout_intent, li_start) == 8, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_start)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_start) == 8, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_start)); - LASSERTF((int)offsetof(struct layout_intent, li_end) == 16, "found %lld\n", - (long long)(int)offsetof(struct layout_intent, li_end)); - LASSERTF((int)sizeof(((struct layout_intent *)0)->li_end) == 8, "found %lld\n", - (long long)(int)sizeof(((struct layout_intent *)0)->li_end)); - LASSERTF(LAYOUT_INTENT_ACCESS == 0, "found %lld\n", - (long long)LAYOUT_INTENT_ACCESS); - LASSERTF(LAYOUT_INTENT_READ == 1, "found %lld\n", - (long long)LAYOUT_INTENT_READ); - LASSERTF(LAYOUT_INTENT_WRITE == 2, "found %lld\n", - (long long)LAYOUT_INTENT_WRITE); - LASSERTF(LAYOUT_INTENT_GLIMPSE == 3, "found %lld\n", - (long long)LAYOUT_INTENT_GLIMPSE); - LASSERTF(LAYOUT_INTENT_TRUNC == 4, "found %lld\n", - (long long)LAYOUT_INTENT_TRUNC); - LASSERTF(LAYOUT_INTENT_RELEASE == 5, "found %lld\n", - (long long)LAYOUT_INTENT_RELEASE); - LASSERTF(LAYOUT_INTENT_RESTORE == 6, "found %lld\n", - (long long)LAYOUT_INTENT_RESTORE); - - /* Checks for struct hsm_action_item */ - LASSERTF((int)sizeof(struct hsm_action_item) == 72, "found %lld\n", - (long long)(int)sizeof(struct hsm_action_item)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_len) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_len)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_len) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_len)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_action) == 4, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_action)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_action) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_action)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_fid) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_fid)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_fid) == 16, "found %lld\n", - (long 
long)(int)sizeof(((struct hsm_action_item *)0)->hai_fid)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_dfid) == 24, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_dfid)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_dfid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_dfid)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_extent) == 40, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_extent)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_extent) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_extent)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_cookie) == 56, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_cookie)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_cookie) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_cookie)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_gid) == 64, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_gid)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_gid) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_gid)); - LASSERTF((int)offsetof(struct hsm_action_item, hai_data) == 72, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_item, hai_data)); - LASSERTF((int)sizeof(((struct hsm_action_item *)0)->hai_data) == 0, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_item *)0)->hai_data)); - - /* Checks for struct hsm_action_list */ - LASSERTF((int)sizeof(struct hsm_action_list) == 32, "found %lld\n", - (long long)(int)sizeof(struct hsm_action_list)); - LASSERTF((int)offsetof(struct hsm_action_list, hal_version) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, hal_version)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_version) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_version)); - LASSERTF((int)offsetof(struct hsm_action_list, hal_count) == 4, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, hal_count)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_count) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_count)); - LASSERTF((int)offsetof(struct hsm_action_list, hal_compound_id) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, hal_compound_id)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_compound_id) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_compound_id)); - LASSERTF((int)offsetof(struct hsm_action_list, hal_flags) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, hal_flags)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_flags) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_flags)); - LASSERTF((int)offsetof(struct hsm_action_list, hal_archive_id) == 24, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, hal_archive_id)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_archive_id) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_archive_id)); - LASSERTF((int)offsetof(struct hsm_action_list, padding1) == 28, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, padding1)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->padding1) == 4, "found %lld\n", - (long 
long)(int)sizeof(((struct hsm_action_list *)0)->padding1)); - LASSERTF((int)offsetof(struct hsm_action_list, hal_fsname) == 32, "found %lld\n", - (long long)(int)offsetof(struct hsm_action_list, hal_fsname)); - LASSERTF((int)sizeof(((struct hsm_action_list *)0)->hal_fsname) == 0, "found %lld\n", - (long long)(int)sizeof(((struct hsm_action_list *)0)->hal_fsname)); - - /* Checks for struct hsm_progress */ - LASSERTF((int)sizeof(struct hsm_progress) == 48, "found %lld\n", - (long long)(int)sizeof(struct hsm_progress)); - LASSERTF((int)offsetof(struct hsm_progress, hp_fid) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress, hp_fid)); - LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress *)0)->hp_fid)); - LASSERTF((int)offsetof(struct hsm_progress, hp_cookie) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress, hp_cookie)); - LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_cookie) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress *)0)->hp_cookie)); - LASSERTF((int)offsetof(struct hsm_progress, hp_extent) == 24, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress, hp_extent)); - LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_extent) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress *)0)->hp_extent)); - LASSERTF((int)offsetof(struct hsm_progress, hp_flags) == 40, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress, hp_flags)); - LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_flags) == 2, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress *)0)->hp_flags)); - LASSERTF((int)offsetof(struct hsm_progress, hp_errval) == 42, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress, hp_errval)); - LASSERTF((int)sizeof(((struct hsm_progress *)0)->hp_errval) == 2, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress *)0)->hp_errval)); - LASSERTF((int)offsetof(struct hsm_progress, padding) == 44, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress, padding)); - LASSERTF((int)sizeof(((struct hsm_progress *)0)->padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress *)0)->padding)); - LASSERTF(HP_FLAG_COMPLETED == 0x01, "found 0x%.8x\n", - HP_FLAG_COMPLETED); - LASSERTF(HP_FLAG_RETRY == 0x02, "found 0x%.8x\n", - HP_FLAG_RETRY); - - LASSERTF((int)offsetof(struct hsm_copy, hc_data_version) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_copy, hc_data_version)); - LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_data_version) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_copy *)0)->hc_data_version)); - LASSERTF((int)offsetof(struct hsm_copy, hc_flags) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_copy, hc_flags)); - LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_flags) == 2, "found %lld\n", - (long long)(int)sizeof(((struct hsm_copy *)0)->hc_flags)); - LASSERTF((int)offsetof(struct hsm_copy, hc_errval) == 10, "found %lld\n", - (long long)(int)offsetof(struct hsm_copy, hc_errval)); - LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_errval) == 2, "found %lld\n", - (long long)(int)sizeof(((struct hsm_copy *)0)->hc_errval)); - LASSERTF((int)offsetof(struct hsm_copy, padding) == 12, "found %lld\n", - (long long)(int)offsetof(struct hsm_copy, padding)); - LASSERTF((int)sizeof(((struct hsm_copy *)0)->padding) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_copy *)0)->padding)); - LASSERTF((int)offsetof(struct hsm_copy, 
hc_hai) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_copy, hc_hai)); - LASSERTF((int)sizeof(((struct hsm_copy *)0)->hc_hai) == 72, "found %lld\n", - (long long)(int)sizeof(((struct hsm_copy *)0)->hc_hai)); - - /* Checks for struct hsm_progress_kernel */ - LASSERTF((int)sizeof(struct hsm_progress_kernel) == 64, "found %lld\n", - (long long)(int)sizeof(struct hsm_progress_kernel)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_fid) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_fid)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_fid)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_cookie) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_cookie)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_cookie)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_extent) == 24, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_extent)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_extent)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_flags) == 40, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_flags)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags) == 2, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_flags)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_errval) == 42, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_errval)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval) == 2, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_errval)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding1) == 44, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding1)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding1)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_data_version) == 48, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_data_version)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_data_version)); - LASSERTF((int)offsetof(struct hsm_progress_kernel, hpk_padding2) == 56, "found %lld\n", - (long long)(int)offsetof(struct hsm_progress_kernel, hpk_padding2)); - LASSERTF((int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_progress_kernel *)0)->hpk_padding2)); - - /* Checks for struct hsm_user_item */ - LASSERTF((int)sizeof(struct hsm_user_item) == 32, "found %lld\n", - (long long)(int)sizeof(struct hsm_user_item)); - LASSERTF((int)offsetof(struct hsm_user_item, hui_fid) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_item, hui_fid)); - LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_fid) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_fid)); - LASSERTF((int)offsetof(struct hsm_user_item, hui_extent) == 16, "found %lld\n", - (long 
long)(int)offsetof(struct hsm_user_item, hui_extent)); - LASSERTF((int)sizeof(((struct hsm_user_item *)0)->hui_extent) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_item *)0)->hui_extent)); - - /* Checks for struct hsm_user_state */ - LASSERTF((int)sizeof(struct hsm_user_state) == 32, "found %lld\n", - (long long)(int)sizeof(struct hsm_user_state)); - LASSERTF((int)offsetof(struct hsm_user_state, hus_states) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_state, hus_states)); - LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_states) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_states)); - LASSERTF((int)offsetof(struct hsm_user_state, hus_archive_id) == 4, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_state, hus_archive_id)); - LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_archive_id) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_archive_id)); - LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_state) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_state)); - LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_state)); - LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_action) == 12, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_action)); - LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_action)); - LASSERTF((int)offsetof(struct hsm_user_state, hus_in_progress_location) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_state, hus_in_progress_location)); - LASSERTF((int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_state *)0)->hus_in_progress_location)); - - /* Checks for struct hsm_state_set */ - LASSERTF((int)sizeof(struct hsm_state_set) == 24, "found %lld\n", - (long long)(int)sizeof(struct hsm_state_set)); - LASSERTF((int)offsetof(struct hsm_state_set, hss_valid) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_state_set, hss_valid)); - LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_valid) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_valid)); - LASSERTF((int)offsetof(struct hsm_state_set, hss_archive_id) == 4, "found %lld\n", - (long long)(int)offsetof(struct hsm_state_set, hss_archive_id)); - LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_archive_id) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_archive_id)); - LASSERTF((int)offsetof(struct hsm_state_set, hss_setmask) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_state_set, hss_setmask)); - LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_setmask) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_setmask)); - LASSERTF((int)offsetof(struct hsm_state_set, hss_clearmask) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_state_set, hss_clearmask)); - LASSERTF((int)sizeof(((struct hsm_state_set *)0)->hss_clearmask) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_state_set *)0)->hss_clearmask)); - - /* Checks for struct hsm_current_action */ - LASSERTF((int)sizeof(struct hsm_current_action) == 24, "found %lld\n", - 
(long long)(int)sizeof(struct hsm_current_action)); - LASSERTF((int)offsetof(struct hsm_current_action, hca_state) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_current_action, hca_state)); - LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_state) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_state)); - LASSERTF((int)offsetof(struct hsm_current_action, hca_action) == 4, "found %lld\n", - (long long)(int)offsetof(struct hsm_current_action, hca_action)); - LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_action) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_action)); - LASSERTF((int)offsetof(struct hsm_current_action, hca_location) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_current_action, hca_location)); - LASSERTF((int)sizeof(((struct hsm_current_action *)0)->hca_location) == 16, "found %lld\n", - (long long)(int)sizeof(((struct hsm_current_action *)0)->hca_location)); - - /* Checks for struct hsm_request */ - LASSERTF((int)sizeof(struct hsm_request) == 24, "found %lld\n", - (long long)(int)sizeof(struct hsm_request)); - LASSERTF((int)offsetof(struct hsm_request, hr_action) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_request, hr_action)); - LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_action) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_request *)0)->hr_action)); - LASSERTF((int)offsetof(struct hsm_request, hr_archive_id) == 4, "found %lld\n", - (long long)(int)offsetof(struct hsm_request, hr_archive_id)); - LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_archive_id) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_request *)0)->hr_archive_id)); - LASSERTF((int)offsetof(struct hsm_request, hr_flags) == 8, "found %lld\n", - (long long)(int)offsetof(struct hsm_request, hr_flags)); - LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_flags) == 8, "found %lld\n", - (long long)(int)sizeof(((struct hsm_request *)0)->hr_flags)); - LASSERTF((int)offsetof(struct hsm_request, hr_itemcount) == 16, "found %lld\n", - (long long)(int)offsetof(struct hsm_request, hr_itemcount)); - LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_itemcount) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_request *)0)->hr_itemcount)); - LASSERTF((int)offsetof(struct hsm_request, hr_data_len) == 20, "found %lld\n", - (long long)(int)offsetof(struct hsm_request, hr_data_len)); - LASSERTF((int)sizeof(((struct hsm_request *)0)->hr_data_len) == 4, "found %lld\n", - (long long)(int)sizeof(((struct hsm_request *)0)->hr_data_len)); - LASSERTF(HSM_FORCE_ACTION == 0x00000001UL, "found 0x%.8xUL\n", - (unsigned int)HSM_FORCE_ACTION); - LASSERTF(HSM_GHOST_COPY == 0x00000002UL, "found 0x%.8xUL\n", - (unsigned int)HSM_GHOST_COPY); - - /* Checks for struct hsm_user_request */ - LASSERTF((int)sizeof(struct hsm_user_request) == 24, "found %lld\n", - (long long)(int)sizeof(struct hsm_user_request)); - LASSERTF((int)offsetof(struct hsm_user_request, hur_request) == 0, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_request, hur_request)); - LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_request) == 24, "found %lld\n", - (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_request)); - LASSERTF((int)offsetof(struct hsm_user_request, hur_user_item) == 24, "found %lld\n", - (long long)(int)offsetof(struct hsm_user_request, hur_user_item)); - LASSERTF((int)sizeof(((struct hsm_user_request *)0)->hur_user_item) == 0, "found 
%lld\n", - (long long)(int)sizeof(((struct hsm_user_request *)0)->hur_user_item)); - - /* Checks for struct hsm_user_import */ - LASSERTF(sizeof(struct hsm_user_import) == 48, "found %lld\n", - (long long)sizeof(struct hsm_user_import)); - LASSERTF(offsetof(struct hsm_user_import, hui_size) == 0, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_size)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_size) == 8, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_size)); - LASSERTF(offsetof(struct hsm_user_import, hui_uid) == 32, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_uid)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_uid) == 4, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_uid)); - LASSERTF(offsetof(struct hsm_user_import, hui_gid) == 36, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_gid)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_gid) == 4, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_gid)); - LASSERTF(offsetof(struct hsm_user_import, hui_mode) == 40, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_mode)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mode) == 4, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_mode)); - LASSERTF(offsetof(struct hsm_user_import, hui_atime) == 8, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_atime)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime) == 8, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_atime)); - LASSERTF(offsetof(struct hsm_user_import, hui_atime_ns) == 24, - "found %lld\n", - (long long)(int)offsetof(struct hsm_user_import, hui_atime_ns)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_atime_ns) == 4, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_atime_ns)); - LASSERTF(offsetof(struct hsm_user_import, hui_mtime) == 16, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_mtime)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime) == 8, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime)); - LASSERTF(offsetof(struct hsm_user_import, hui_mtime_ns) == 28, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_mtime_ns)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_mtime_ns) == 4, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_mtime_ns)); - LASSERTF(offsetof(struct hsm_user_import, hui_archive_id) == 44, - "found %lld\n", - (long long)offsetof(struct hsm_user_import, hui_archive_id)); - LASSERTF(sizeof(((struct hsm_user_import *)0)->hui_archive_id) == 4, - "found %lld\n", - (long long)sizeof(((struct hsm_user_import *)0)->hui_archive_id)); -} diff --git a/drivers/staging/lustre/sysfs-fs-lustre b/drivers/staging/lustre/sysfs-fs-lustre deleted file mode 100644 index 8691c6543a9c0f76d9926793e8af726fa957cede..0000000000000000000000000000000000000000 --- a/drivers/staging/lustre/sysfs-fs-lustre +++ /dev/null @@ -1,654 +0,0 @@ -What: /sys/fs/lustre/version -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows current running lustre version. - -What: /sys/fs/lustre/pinger -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows if the lustre module has pinger support. - "on" means yes and "off" means no. 
- -What: /sys/fs/lustre/health_check -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows whether the current system state is believed to be "healthy", - "NOT HEALTHY", or "LBUG" if lustre has experienced - an internal assertion failure - -What: /sys/fs/lustre/jobid_name -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Currently running job "name" for this node to be transferred - to Lustre servers for purposes of QoS and statistics gathering. - Writing into this file will change the name, reading outputs - the currently set value. - -What: /sys/fs/lustre/jobid_var -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Control file for lustre "jobstats" functionality, write a new - value from the list below to change the mode: - disable - disable job name reporting to the servers (default) - procname_uid - form the job name as the currently running - command name and pid with a dot in between - e.g. dd.1253 - nodelocal - use jobid_name value from above. - -What: /sys/fs/lustre/timeout -Date: June 2015 -Contact: "Oleg Drokin" -Description: - Controls "lustre timeout" variable, also known as obd_timeout - in some older manuals. In the past obd_timeout was of paramount - importance as the timeout value used everywhere, from which - other timeouts were derived. These days it's much less - important as network timeouts are mostly determined by - AT (adaptive timeouts). - Unit: seconds, default: 100 - -What: /sys/fs/lustre/max_dirty_mb -Date: June 2015 -Contact: "Oleg Drokin" -Description: - Controls the total amount of dirty cache (in megabytes) allowed - across all mounted lustre filesystems. - Since writeout of dirty pages in Lustre is somewhat expensive, - allowing too many dirty pages might lead to - performance degradation as the kernel desperately tries to - find pages to free or write out. - Default 1/2 RAM. Min value 4, max value 9/10 of RAM. - -What: /sys/fs/lustre/debug_peer_on_timeout -Date: June 2015 -Contact: "Oleg Drokin" -Description: - Controls whether lnet debug information should be printed when - an RPC timeout occurs. - 0 disabled (default) - 1 enabled - -What: /sys/fs/lustre/dump_on_timeout -Date: June 2015 -Contact: "Oleg Drokin" -Description: - Controls whether the Lustre debug log should be dumped when an RPC - timeout occurs. This is useful if your debug buffer typically - rolls over by the time you notice RPC timeouts. - -What: /sys/fs/lustre/dump_on_eviction -Date: June 2015 -Contact: "Oleg Drokin" -Description: - Controls whether the Lustre debug log should be dumped when this - client is evicted from one of the servers. - This is useful if your debug buffer typically rolls over - by the time you notice the eviction event. - -What: /sys/fs/lustre/at_min -Date: July 2015 -Contact: "Oleg Drokin" -Description: - Controls minimum adaptive timeout in seconds. If you encounter - a case where clients time out due to server-reported processing - time being too short, you might consider increasing this value. - One common cause of this is an underlying network with - unpredictably long delays. - Default: 0 - -What: /sys/fs/lustre/at_max -Date: July 2015 -Contact: "Oleg Drokin" -Description: - Controls maximum adaptive timeout in seconds. If the at_max timeout - is reached for an RPC, the RPC will time out. - Some genuinely slow network hardware might warrant increasing - this value. - Setting this value to 0 disables Adaptive Timeouts - functionality, and the old-style obd_timeout value is then used. 
- Default: 600 - -What: /sys/fs/lustre/at_extra -Date: July 2015 -Contact: "Oleg Drokin" -Description: - Controls how much extra time (in seconds) to request for - unfinished requests still in processing. Normally a server-side parameter, it - is also used on the client for responses to various LDLM ASTs - that are handled with a special server thread on the client. - This is a way for the servers to ask the clients not to time - out requests that have already reached the current servicing - time estimate, and to give them some more time. - Default: 30 - -What: /sys/fs/lustre/at_early_margin -Date: July 2015 -Contact: "Oleg Drokin" -Description: - Controls when to send the early reply for requests that are - about to time out, as an offset to the estimated service time in - seconds. - Default: 5 - -What: /sys/fs/lustre/at_history -Date: July 2015 -Contact: "Oleg Drokin" -Description: - Controls how many seconds to remember the slowest events - encountered by the adaptive timeouts code. - Default: 600 - -What: /sys/fs/lustre/llite/-/blocksize -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Biggest blocksize on the object storage servers for this filesystem. - -What: /sys/fs/lustre/llite/-/kbytestotal -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows total number of kilobytes of space on this filesystem - -What: /sys/fs/lustre/llite/-/kbytesfree -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows total number of free kilobytes of space on this filesystem - -What: /sys/fs/lustre/llite/-/kbytesavail -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows total number of free kilobytes of space on this filesystem - actually available for use (taking into account per-client - grants and filesystem reservations). - -What: /sys/fs/lustre/llite/-/filestotal -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows total number of inodes on the filesystem. - -What: /sys/fs/lustre/llite/-/filesfree -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows estimated number of free inodes on the filesystem - -What: /sys/fs/lustre/llite/-/client_type -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows whether this filesystem considers this client to be - compute cluster-local or remote. Remote clients have - additional uid/gid converting logic applied. - -What: /sys/fs/lustre/llite/-/fstype -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows the filesystem type. - -What: /sys/fs/lustre/llite/-/uuid -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows this filesystem's superblock uuid - -What: /sys/fs/lustre/llite/-/max_read_ahead_mb -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Sets maximum number of megabytes in system memory to be - given to the read-ahead cache. - -What: /sys/fs/lustre/llite/-/max_read_ahead_per_file_mb -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Sets maximum number of megabytes to read ahead for a single file - -What: /sys/fs/lustre/llite/-/max_read_ahead_whole_mb -Date: May 2015 -Contact: "Oleg Drokin" -Description: - For small reads, how many megabytes to actually request from - the server as initial read-ahead. - -What: /sys/fs/lustre/llite/-/checksum_pages -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Enables or disables per-page checksums at the llite layer, before - the pages are actually given to the lower layers for network transfer - -What: /sys/fs/lustre/llite/-/stats_track_pid -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Limits Lustre vfs operations gathering to just a single pid. 
- 0 to track everything. - -What: /sys/fs/lustre/llite/-/stats_track_ppid -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Limits Lustre vfs operations gathering to just a single ppid. - 0 to track everything. - -What: /sys/fs/lustre/llite/-/stats_track_gid -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Limits Lustre vfs operations gathering to just a single gid. - 0 to track everything. - -What: /sys/fs/lustre/llite/-/statahead_max -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls maximum number of statahead requests to send when a - sequential readdir+stat pattern is detected. - -What: /sys/fs/lustre/llite/-/statahead_agl -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls whether AGL (async glimpse ahead - obtain object information - from OSTs in parallel with MDS during statahead) should be - enabled or disabled. - 0 to disable, 1 to enable. - -What: /sys/fs/lustre/llite/-/lazystatfs -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls statfs(2) behaviour in the face of down servers. - If 0, always wait for all servers to come online, - if 1, ignore inactive servers. - -What: /sys/fs/lustre/llite/-/max_easize -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows the maximum number of bytes that file striping data could - occupy in the current storage configuration. - -What: /sys/fs/lustre/llite/-/default_easize -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows the maximum file striping data size observed by this - filesystem client instance. - -What: /sys/fs/lustre/llite/-/xattr_cache -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls the client-side cache for extended attributes. - 1 to enable, 0 to disable. - -What: /sys/fs/lustre/llite/-/unstable_stats -Date: Apr 2016 -Contact: "Oleg Drokin" -Description: - Shows number of pages that were sent and acknowledged by the - server but were not yet committed and therefore still - pinned in client memory even though no longer dirty. - -What: /sys/fs/lustre/ldlm/cancel_unused_locks_before_replay -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls whether the client should replay unused locks during recovery. - If a client tends to have a lot of unused locks in the LRU, - recovery times might become prolonged. - 1 - just locally cancel unused locks (default) - 0 - replay unused locks. - -What: /sys/fs/lustre/ldlm/namespaces//resource_count -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Displays number of lock resources (objects on which individual - locks are taken) currently allocated in this namespace. - -What: /sys/fs/lustre/ldlm/namespaces//lock_count -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Displays the number of locks allocated in this namespace. - -What: /sys/fs/lustre/ldlm/namespaces//lru_size -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls and displays LRU size limit for unused locks for this - namespace. - 0 - LRU size is unlimited, controlled by server resources - positive number - number of locks to allow in the lock LRU list - -What: /sys/fs/lustre/ldlm/namespaces//lock_unused_count -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Displays the number of locks currently sitting in the LRU list - of this namespace - -What: /sys/fs/lustre/ldlm/namespaces//lru_max_age -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Maximum number of milliseconds a lock can sit in the LRU list - before the client voluntarily cancels it as unused. 
- -What: /sys/fs/lustre/ldlm/namespaces//early_lock_cancel -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls the "early lock cancellation" feature on this namespace - if supported by the server. - When enabled, tries to preemptively cancel locks that would be - cancelled by various operations and bundle the cancellation - requests in the same RPC as the main operation, which results - in significant speedups due to reduced lock-pingpong RPCs. - 0 - disabled - 1 - enabled (default) - -What: /sys/fs/lustre/ldlm/namespaces//pool/granted -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Displays number of granted locks in this namespace - -What: /sys/fs/lustre/ldlm/namespaces//pool/grant_rate -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Number of granted locks in this namespace during the last - time interval - -What: /sys/fs/lustre/ldlm/namespaces//pool/cancel_rate -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Number of lock cancellations in this namespace during the - last time interval - -What: /sys/fs/lustre/ldlm/namespaces//pool/grant_speed -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Calculated speed of lock granting (grant_rate - cancel_rate) - in this namespace - -What: /sys/fs/lustre/ldlm/namespaces//pool/grant_plan -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Estimated number of locks to be granted in the next time - interval in this namespace - -What: /sys/fs/lustre/ldlm/namespaces//pool/limit -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls the number of allowed locks in this pool. - When lru_size is 0, this is the actual limit. - -What: /sys/fs/lustre/ldlm/namespaces//pool/lock_volume_factor -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Multiplier for all lock volume calculations above. - Default is 1. Increase to make the client clean its lock LRU - list for this namespace more aggressively. - -What: /sys/fs/lustre/ldlm/namespaces//pool/server_lock_volume -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Calculated server lock volume. - -What: /sys/fs/lustre/ldlm/namespaces//pool/recalc_period -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls the length of time (in seconds) between recalculations - of the above values. - -What: /sys/fs/lustre/ldlm/services/ldlm_cbd/threads_min -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls minimum number of ldlm callback threads to start. - -What: /sys/fs/lustre/ldlm/services/ldlm_cbd/threads_max -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls maximum number of ldlm callback threads to start. - -What: /sys/fs/lustre/ldlm/services/ldlm_cbd/threads_started -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows actual number of ldlm callback threads running. - -What: /sys/fs/lustre/ldlm/services/ldlm_cbd/high_priority_ratio -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls what percentage of ldlm callback threads is dedicated - to "high priority" incoming requests. 
- -What: /sys/fs/lustre/{obdtype}/{connection_name}/blocksize -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Blocksize on the backend filesystem for the service behind this obd - device (or biggest blocksize for compound devices like lov - and lmv) - -What: /sys/fs/lustre/{obdtype}/{connection_name}/kbytestotal -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Total number of kilobytes of space on the backend filesystem - for the service behind this obd (or total amount for compound - devices like lov and lmv) - -What: /sys/fs/lustre/{obdtype}/{connection_name}/kbytesfree -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Number of free kilobytes on the backend filesystem for the service - behind this obd (or total amount for compound devices - like lov and lmv) - -What: /sys/fs/lustre/{obdtype}/{connection_name}/kbytesavail -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Number of kilobytes of free space on the backend filesystem - for the service behind this obd (or total amount for compound - devices like lov and lmv) that is actually available for use - (taking into account per-client and filesystem reservations). - -What: /sys/fs/lustre/{obdtype}/{connection_name}/filestotal -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Number of inodes on the backend filesystem for the service behind this - obd. - -What: /sys/fs/lustre/{obdtype}/{connection_name}/filesfree -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Number of free inodes on the backend filesystem for the service - behind this obd. - -What: /sys/fs/lustre/mdc/{connection_name}/max_pages_per_rpc -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Maximum number of readdir pages to fit into a single readdir - RPC. - -What: /sys/fs/lustre/{mdc,osc}/{connection_name}/max_rpcs_in_flight -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Maximum number of parallel RPCs on the wire to allow on - this connection. Increasing this number would help on higher - latency links, but has a chance of overloading the server - if you have too many clients configured like this. - Default: 8 - -What: /sys/fs/lustre/osc/{connection_name}/max_pages_per_rpc -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Maximum number of pages to fit into a single RPC. - Typically bigger RPCs allow for better performance. - Default: however many pages form 1M of data (256 pages - on platforms with 4K pages) - -What: /sys/fs/lustre/osc/{connection_name}/active -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls accessibility of this connection. If set to 0, - all accesses fail immediately. - -What: /sys/fs/lustre/osc/{connection_name}/checksums -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls whether to checksum bulk RPC data over the wire - to this target. - 1: enable (default) ; 0: disable - -What: /sys/fs/lustre/osc/{connection_name}/contention_seconds -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Controls how long to consider a file contended once - indicated as such by the server. - When a file is considered contended, all operations switch to - synchronous lockless mode to avoid cache and lock pingpong. - -What: /sys/fs/lustre/osc/{connection_name}/cur_dirty_bytes -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Displays how many dirty bytes are presently in the cache for this - target. - -What: /sys/fs/lustre/osc/{connection_name}/cur_grant_bytes -Date: May 2015 -Contact: "Oleg Drokin" -Description: - Shows how many bytes we have as a "dirty cache" grant from the - server.
        Writing a value smaller than the one shown releases some
        grant back to the server.
        The dirty cache grant is how Lustre ensures that successful
        cached writes on the client are not discarded by the server
        later on due to lack of space.

What: /sys/fs/lustre/osc/{connection_name}/cur_lost_grant_bytes
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Shows how many granted bytes were released back to the server
        due to lack of write activity on this client.

What: /sys/fs/lustre/osc/{connection_name}/grant_shrink_interval
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Number of seconds without write activity to this target
        before the client starts releasing dirty grant back to the
        server.

What: /sys/fs/lustre/osc/{connection_name}/destroys_in_flight
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Number of DESTROY RPCs currently in flight to this target.

What: /sys/fs/lustre/osc/{connection_name}/lockless_truncate
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Controls whether lockless truncate RPCs are allowed to this
        target.
        Lockless truncate makes the server perform the locking,
        which is beneficial if the truncate is not immediately
        followed by a write.
        1: enable ; 0: disable (default)

What: /sys/fs/lustre/osc/{connection_name}/max_dirty_mb
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Controls how much dirty data this client can accumulate for
        this target. This is orthogonal to the dirty grant and is a
        hard limit even if the server would allow a bigger dirty
        cache.
        While allowing a higher dirty cache is beneficial for write
        performance, flushing the write cache takes longer, and as
        such the node might be more prone to OOMs.
        Setting this value too low might prevent the client from
        sending enough parallel WRITE RPCs.
        Default: 32

What: /sys/fs/lustre/osc/{connection_name}/resend_count
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Controls how many times to try resending RPCs to this target
        that failed with a "recoverable" status, such as EAGAIN or
        ENOMEM.

What: /sys/fs/lustre/lov/{connection_name}/numobd
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Number of OSC targets managed by this LOV instance.

What: /sys/fs/lustre/lov/{connection_name}/activeobd
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Number of OSC targets managed by this LOV instance that are
        actually active.

What: /sys/fs/lustre/lmv/{connection_name}/numobd
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Number of MDC targets managed by this LMV instance.

What: /sys/fs/lustre/lmv/{connection_name}/activeobd
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Number of MDC targets managed by this LMV instance that are
        actually active.

What: /sys/fs/lustre/lmv/{connection_name}/placement
Date: May 2015
Contact: "Oleg Drokin"
Description:
        Determines the inode placement policy when there are
        multiple metadata servers:
        CHAR - based on a hash of the file name used at creation
        time (default)
        NID - based on a hash of the creating client's network ID.
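To illustrate the grant mechanism described above, here is a minimal C sketch
that reads cur_grant_bytes for one osc connection and writes back half of
that value, which, per the description above, should release the difference
to the server. The connection name is taken from the command line because
real names are generated at mount time; this is a sketch under those
assumptions (mounted Lustre client, root privileges), not a supported
administration tool.

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
        char path[256];
        long grant;
        FILE *f;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <connection_name>\n", argv[0]);
                return EXIT_FAILURE;
        }
        /* Hypothetical example name: something like "testfs-OST0000-osc-..." */
        snprintf(path, sizeof(path),
                 "/sys/fs/lustre/osc/%s/cur_grant_bytes", argv[1]);

        f = fopen(path, "r");
        if (!f) {
                perror(path);
                return EXIT_FAILURE;
        }
        if (fscanf(f, "%ld", &grant) != 1) {
                fprintf(stderr, "could not parse %s\n", path);
                fclose(f);
                return EXIT_FAILURE;
        }
        fclose(f);

        /*
         * Writing a value smaller than the current grant releases the
         * difference back to the server; here we hand back half.
         */
        f = fopen(path, "w");
        if (!f) {
                perror(path);
                return EXIT_FAILURE;
        }
        fprintf(f, "%ld\n", grant / 2);
        fclose(f);

        printf("released %ld of %ld grant bytes\n", grant - grant / 2, grant);
        return 0;
}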
diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c index ffe8179f5d41b38e43c475037c5ad0ab49c3a00d..073fe7537f6c00db183cb28aabe40856b3b28690 100644 --- a/scripts/selinux/mdp/mdp.c +++ b/scripts/selinux/mdp/mdp.c @@ -124,7 +124,6 @@ int main(int argc, char *argv[]) fprintf(fout, "fs_use_xattr reiserfs user_u:base_r:base_t;\n"); fprintf(fout, "fs_use_xattr jffs2 user_u:base_r:base_t;\n"); fprintf(fout, "fs_use_xattr gfs2 user_u:base_r:base_t;\n"); - fprintf(fout, "fs_use_xattr lustre user_u:base_r:base_t;\n"); fprintf(fout, "fs_use_task eventpollfs user_u:base_r:base_t;\n"); fprintf(fout, "fs_use_task pipefs user_u:base_r:base_t;\n");