/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BACKING_DEV_DEFS_H
#define __LINUX_BACKING_DEV_DEFS_H

#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kref.h>

struct page;
struct device;
struct dentry;

/*
 * Bits in bdi_writeback.state -- always manipulated with atomic bitops.
 */
enum wb_state {
	WB_registered,		/* bdi_register() was done */
	WB_writeback_running,	/* Writeback is in progress */
	WB_has_dirty_io,	/* Dirty inodes on ->b_{dirty|io|more_io} */
	WB_start_all,		/* nr_pages == 0 (all) work pending */
};

/*
 * Bits in bdi_writeback_congested.state, one per I/O direction.
 */
enum wb_congested_state {
	WB_async_congested,	/* The async (write) queue is getting full */
	WB_sync_congested,	/* The sync queue is getting full */
};

/* Device congestion callback: returns non-zero if congested for @bits. */
typedef int (congested_fn)(void *, int);

/* Per-wb writeback statistics, kept in percpu counters (see stat[] below). */
enum wb_stat_item {
	WB_RECLAIMABLE,
	WB_WRITEBACK,
	WB_DIRTIED,
	WB_WRITTEN,
	NR_WB_STAT_ITEMS
};

/* percpu_counter batch size, scaled with the number of possible CPUs */
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
 * Why some writeback work was initiated.
 */
enum wb_reason {
	WB_REASON_BACKGROUND,
	WB_REASON_VMSCAN,
	WB_REASON_SYNC,
	WB_REASON_PERIODIC,
	WB_REASON_LAPTOP_TIMER,
	WB_REASON_FREE_MORE_MEM,
	WB_REASON_FS_FREE_SPACE,
	/*
	 * There is no bdi forker thread any more and works are done
	 * by emergency worker; however, this value is visible to
	 * userland and we keep exposing exactly the same information,
	 * so it retains the mismatched name.
	 */
	WB_REASON_FORKER_THREAD,

	WB_REASON_MAX,
};

69 70 71 72 73 74 75
/*
 * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
 * wb's can operate mostly independently but should share the congested
 * state.  To facilitate such sharing, the congested state is tracked using
 * the following struct which is created on demand, indexed by blkcg ID on
 * its bdi, and refcounted.
 */
76 77
struct bdi_writeback_congested {
	unsigned long state;		/* WB_[a]sync_congested flags */
78
	atomic_t refcnt;		/* nr of attached wb's and blkg */
79 80

#ifdef CONFIG_CGROUP_WRITEBACK
J
Jan Kara 已提交
81 82 83
	struct backing_dev_info *__bdi;	/* the associated bdi, set to NULL
					 * on bdi unregistration. For memcg-wb
					 * internal use only! */
84 85 86
	int blkcg_id;			/* ID of the associated blkcg */
	struct rb_node rb_node;		/* on bdi->cgwb_congestion_tree */
#endif
87 88
};

89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
/*
 * Each wb (bdi_writeback) can perform writeback operations, is measured
 * and throttled, independently.  Without cgroup writeback, each bdi
 * (bdi_writeback) is served by its embedded bdi->wb.
 *
 * On the default hierarchy, blkcg implicitly enables memcg.  This allows
 * using memcg's page ownership for attributing writeback IOs, and every
 * memcg - blkcg combination can be served by its own wb by assigning a
 * dedicated wb to each memcg, which enables isolation across different
 * cgroups and propagation of IO back pressure down from the IO layer upto
 * the tasks which are generating the dirty pages to be written back.
 *
 * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
 * refcounted with the number of inodes attached to it, and pins the memcg
 * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
 * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
 * is tested for blkcg after lookup and removed from index on mismatch so
 * that a new wb for the combination can be created.
 */
108 109 110 111 112 113 114 115 116 117 118 119 120 121
struct bdi_writeback {
	struct backing_dev_info *bdi;	/* our parent bdi */

	unsigned long state;		/* Always use atomic bitops on this */
	unsigned long last_old_flush;	/* last old data flush */

	struct list_head b_dirty;	/* dirty inodes */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
	struct list_head b_dirty_time;	/* time stamps are dirty */
	spinlock_t list_lock;		/* protects the b_* lists */

	struct percpu_counter stat[NR_WB_STAT_ITEMS];

122 123
	struct bdi_writeback_congested *congested;

124 125 126 127
	unsigned long bw_time_stamp;	/* last time write bw is updated */
	unsigned long dirtied_stamp;
	unsigned long written_stamp;	/* pages written at bw_time_stamp */
	unsigned long write_bandwidth;	/* the estimated write bandwidth */
128
	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
129 130 131 132 133 134 135 136 137 138 139 140

	/*
	 * The base dirty throttle rate, re-calculated on every 200ms.
	 * All the bdi tasks' dirty rate will be curbed under it.
	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
	 * in small steps and is much more smooth/stable than the latter.
	 */
	unsigned long dirty_ratelimit;
	unsigned long balanced_dirty_ratelimit;

	struct fprop_local_percpu completions;
	int dirty_exceeded;
141
	enum wb_reason start_all_reason;
142 143 144 145

	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
	struct list_head work_list;
	struct delayed_work dwork;	/* work item used for writeback */
146

147 148
	unsigned long dirty_sleep;	/* last wait */

149 150
	struct list_head bdi_node;	/* anchored at bdi->wb_list */

151 152
#ifdef CONFIG_CGROUP_WRITEBACK
	struct percpu_ref refcnt;	/* used only for !root wb's */
T
Tejun Heo 已提交
153
	struct fprop_local_percpu memcg_completions;
154 155 156 157 158 159 160 161 162 163
	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */

	union {
		struct work_struct release_work;
		struct rcu_head rcu;
	};
#endif
164 165 166 167
};

struct backing_dev_info {
	struct list_head bdi_list;
168
	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
169
	unsigned long io_pages;	/* max allowed IO size */
170 171 172
	congested_fn *congested_fn; /* Function pointer if device is md/dm */
	void *congested_data;	/* Pointer to aux data for congested func */

173
	const char *name;
174

175
	struct kref refcnt;	/* Reference counter for the structure */
176
	unsigned int capabilities; /* Device capabilities */
177 178 179
	unsigned int min_ratio;
	unsigned int max_ratio, max_prop_frac;

180 181 182 183 184
	/*
	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
	 * any dirty wbs, which is depended upon by bdi_has_dirty().
	 */
	atomic_long_t tot_write_bandwidth;
185

186
	struct bdi_writeback wb;  /* the root writeback info for this bdi */
187
	struct list_head wb_list; /* list of all wbs */
188 189 190
#ifdef CONFIG_CGROUP_WRITEBACK
	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
	struct rb_root cgwb_congested_tree; /* their congested states */
J
Jan Kara 已提交
191
	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
192 193
#else
	struct bdi_writeback_congested *wb_congested;
194
#endif
195 196
	wait_queue_head_t wb_waitq;

197
	struct device *dev;
198
	struct device *owner;
199 200 201 202 203 204 205 206 207 208 209 210 211 212

	struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
	struct dentry *debug_stats;
#endif
};

/*
 * Congestion direction indices, passed as @sync to the
 * {set,clear}_{wb,bdi}_congested() helpers below.
 */
enum {
	BLK_RW_ASYNC	= 0,
	BLK_RW_SYNC	= 1,
};

void clear_wb_congested(struct bdi_writeback_congested *congested, int sync);
void set_wb_congested(struct bdi_writeback_congested *congested, int sync);

/**
 * clear_bdi_congested - clear a congestion bit on a bdi's root wb
 * @bdi: backing_dev_info of interest
 * @sync: BLK_RW_SYNC or BLK_RW_ASYNC
 */
static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	clear_wb_congested(bdi->wb.congested, sync);
}

/**
 * set_bdi_congested - set a congestion bit on a bdi's root wb
 * @bdi: backing_dev_info of interest
 * @sync: BLK_RW_SYNC or BLK_RW_ASYNC
 */
static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
	set_wb_congested(bdi->wb.congested, sync);
}
/*
 * State carried between a matched wb lock/unlock helper pair: whether the
 * lock was actually taken (@locked) and, presumably, the saved IRQ @flags
 * -- confirm against the users of this struct.
 */
struct wb_lock_cookie {
	bool locked;
	unsigned long flags;
};

#ifdef CONFIG_CGROUP_WRITEBACK

/**
 * wb_tryget - try to increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * Returns %true on success.  The embedded root wb is not refcounted
 * (its refcnt is "used only for !root wb's") and always succeeds.
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
	if (wb == &wb->bdi->wb)
		return true;
	return percpu_ref_tryget(&wb->refcnt);
}

/**
 * wb_get - increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * No-op for the embedded root wb, which is not refcounted.
 */
static inline void wb_get(struct bdi_writeback *wb)
{
	if (wb == &wb->bdi->wb)
		return;
	percpu_ref_get(&wb->refcnt);
}

/**
 * wb_put - decrement a wb's refcount
 * @wb: bdi_writeback to put
 *
 * No-op for the embedded root wb, which is not refcounted.
 */
static inline void wb_put(struct bdi_writeback *wb)
{
	if (wb == &wb->bdi->wb)
		return;
	percpu_ref_put(&wb->refcnt);
}

/**
 * wb_dying - is a wb dying?
 * @wb: bdi_writeback of interest
 *
 * Returns whether @wb is unlinked and being drained.
 */
static inline bool wb_dying(struct bdi_writeback *wb)
{
	struct percpu_ref *ref = &wb->refcnt;

	return percpu_ref_is_dying(ref);
}

#else	/* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK: only the embedded root wb exists and it is not
 * refcounted, so getting a reference always succeeds. */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
	return true;
}

/* No-op without cgroup writeback: the root wb is embedded in its bdi
 * (see struct backing_dev_info) and is not refcounted. */
static inline void wb_get(struct bdi_writeback *wb)
{
}

/* No-op without cgroup writeback: nothing to release for the embedded
 * root wb. */
static inline void wb_put(struct bdi_writeback *wb)
{
}

/* Without cgroup writeback the embedded root wb is never drained
 * separately from its bdi, so it is never "dying". */
static inline bool wb_dying(struct bdi_writeback *wb)
{
	return false;
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

#endif	/* __LINUX_BACKING_DEV_DEFS_H */