' > $@
-
-###
-# Rules to create an aux XML and .db, and use them to re-process the DocBook XML
-# to fill internal hyperlinks
-#
-# For each book %.xml the recipe:
-#   1. extracts the quoted id of every line starting with "<refentry id=" into
-#      the side file $<.db (not a make target of its own; it is listed in
-#      clean-files as %.xml.db);
-#   2. runs $(KERNELDOCXMLREF) with that .db over the book and writes the
-#      result to $@.
-# $(quiet) selects which of the three gen_aux_xml variants supplies the
-# progress message; only the quiet_ variant prints anything.
-gen_aux_xml = :
-quiet_gen_aux_xml = echo ' XMLREF $@'
-silent_gen_aux_xml = :
-%.aux.xml: %.xml
-	@$($(quiet)gen_aux_xml)
-	@rm -rf $@
-	@(cat $< | egrep "^<refentry id=" | egrep -o "\".*\"" | cut -f 2 -d \" > $<.db)
-	@$(KERNELDOCXMLREF) -db $<.db $< > $@
-# Keep the generated .aux.xml even when make treats it as an intermediate file.
-.PRECIOUS: %.aux.xml
-
-# Build the HTML version of a book from its cross-referenced .aux.xml using
-# the db2html command (cmd_db2html, defined elsewhere in this Makefile).
-# Any previous output ($@ and the same path without the .html suffix) is
-# removed first; afterwards the files named by PNG-<book>, if any, are copied
-# into that suffix-less path.
-%.html: %.aux.xml
-	@command -v xmlto > /dev/null 2>&1 || \
-	 (echo "*** You need to install xmlto ***"; \
-	 exit 1)
-	@rm -rf $@ $(patsubst %.html,%,$@)
-	$(call cmd,db2html)
-	@if [ -n "$(PNG-$(basename $(notdir $@)))" ]; then \
-	 cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
-
-# Man pages: xmlto is only run for books that contain the string "refentry";
-# its output goes to $(obj)/man/<book>/.
-quiet_cmd_db2man = MAN $@
-cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi
-# %.9 is only a timestamp: the recipe creates the per-book man directory,
-# runs db2man, then touches $@ so the rule is considered up to date even
-# when the book produced no man pages.
-%.9 : %.xml
-	@command -v xmlto > /dev/null 2>&1 || \
-	 (echo "*** You need to install xmlto ***"; \
-	 exit 1)
-	$(Q)mkdir -p $(obj)/man/$(*F)
-	$(call cmd,db2man)
-	@touch $@
-
-###
-# Rules to generate postscripts and PNG images from .fig format files
-
-# Convert a .fig drawing to EPS with fig2dev; the recipe fails early with an
-# installation hint when fig2dev is not on PATH.
-quiet_cmd_fig2eps = FIG2EPS $@
-cmd_fig2eps = fig2dev -Leps $< $@
-
-%.eps: %.fig
-	@command -v fig2dev > /dev/null 2>&1 || \
-	 (echo "*** You need to install transfig ***"; \
-	 exit 1)
-	$(call cmd,fig2eps)
-
-# Convert a .fig drawing to PNG with fig2dev; the recipe fails early with an
-# installation hint when fig2dev is not on PATH.
-quiet_cmd_fig2png = FIG2PNG $@
-cmd_fig2png = fig2dev -Lpng $< $@
-
-%.png: %.fig
-	@command -v fig2dev > /dev/null 2>&1 || \
-	 (echo "*** You need to install transfig ***"; \
-	 exit 1)
-	$(call cmd,fig2png)
-
-###
-# Rule to convert a .c file to inline XML documentation
-#
-# The C source is tab-expanded (8 columns), the XML metacharacters &, < and >
-# are replaced by their entities, and the result is wrapped in a
-# <programlisting> element. "&" is escaped first so that the "&" of the
-# entities generated for < and > is not escaped a second time.
-# $(quiet) selects which gen_xml variant supplies the progress message.
-gen_xml = :
-quiet_gen_xml = echo ' GEN $@'
-silent_gen_xml = :
-%.xml: %.c
-	@$($(quiet)gen_xml)
-	@( \
-	 echo "<programlisting>"; \
-	 expand --tabs=8 < $< | \
-	 sed -e "s/&/\\&amp;/g" \
-	     -e "s/</\\&lt;/g" \
-	     -e "s/>/\\&gt;/g"; \
-	 echo "</programlisting>") > $@
-
-endif # DOCBOOKS=""
-endif # SPHINDIR=...
-
-###
-# Help targets as used by the top-level makefile
-#
-# dochelp prints the DocBook documentation targets, the default
-# INSTALL_MAN_PATH, and how to restrict or disable the set of books through
-# DOCBOOKS. One echo per output line.
-dochelp:
-	@echo ' Linux kernel internal documentation in different formats (DocBook):'
-	@echo ' htmldocs - HTML'
-	@echo ' pdfdocs - PDF'
-	@echo ' psdocs - Postscript'
-	@echo ' xmldocs - XML DocBook'
-	@echo ' mandocs - man pages'
-	@echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'
-	@echo ' (default: $(INSTALL_MAN_PATH))'
-	@echo ''
-	@echo ' cleandocs - clean all generated DocBook files'
-	@echo
-	@echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
-	@echo ' valid values for DOCBOOKS are: $(DOCBOOKS)'
-	@echo
-	@echo " make DOCBOOKS=\"\" [target] Don't generate docs from Docbook"
-	@echo ' This is useful to generate only the ReST docs (Sphinx)'
-
-
-###
-# Temporary files left by various tools
-#
-# clean-files holds every book in $(DOCBOOKS) plus, per book, the by-products
-# obtained by replacing its .xml suffix with each entry of clean-suffixes,
-# the hidden .<book>.xml.cmd file, and whatever $(index) names.
-# (The former "%.xml -> %.xml" entry only repeated $(DOCBOOKS) and is gone.)
-clean-suffixes := dvi aux tex log out ps pdf html 9 aux.xml xml.db
-clean-files := $(DOCBOOKS) \
-  $(foreach sfx,$(clean-suffixes),$(patsubst %.xml,%.$(sfx),$(DOCBOOKS))) \
-  $(patsubst %.xml,.%.xml.cmd,$(DOCBOOKS)) \
-  $(index)
-
-# clean-dirs holds the per-book directories (book name without .xml) and man.
-clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
-
-# Remove the generated documentation: plain files with rm -f, directories
-# with rm -rf. Both lists are passed through objectify (defined elsewhere)
-# before being handed to rm.
-cleandocs:
- $(Q)rm -f $(call objectify, $(clean-files))
- $(Q)rm -rf $(call objectify, $(clean-dirs))
-
-# Declare the contents of the .PHONY variable as phony. We keep that
-# information in a variable so we can use it in if_changed and friends.
-# NOTE(review): PHONY is populated outside this part of the file; confirm
-# that dochelp and cleandocs are included in it.
-
-.PHONY: $(PHONY)
diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
deleted file mode 100644
index 6006b6358c867f0d9ef1dd64dcbe471cff35063e..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/filesystems.tmpl
+++ /dev/null
@@ -1,381 +0,0 @@
-
-
-
-
-
- Linux Filesystems API
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- The Linux VFS
- The Filesystem types
-!Iinclude/linux/fs.h
-
- The Directory Cache
-!Efs/dcache.c
-!Iinclude/linux/dcache.h
-
- Inode Handling
-!Efs/inode.c
-!Efs/bad_inode.c
-
- Registration and Superblocks
-!Efs/super.c
-
- File Locks
-!Efs/locks.c
-!Ifs/locks.c
-
- Other Functions
-!Efs/mpage.c
-!Efs/namei.c
-!Efs/buffer.c
-!Eblock/bio.c
-!Efs/seq_file.c
-!Efs/filesystems.c
-!Efs/fs-writeback.c
-!Efs/block_dev.c
-
-
-
-
- The proc filesystem
-
- sysctl interface
-!Ekernel/sysctl.c
-
-
- proc filesystem interface
-!Ifs/proc/base.c
-
-
-
-
- Events based on file descriptors
-!Efs/eventfd.c
-
-
-
- The Filesystem for Exporting Kernel Objects
-!Efs/sysfs/file.c
-!Efs/sysfs/symlink.c
-
-
-
- The debugfs filesystem
-
- debugfs interface
-!Efs/debugfs/inode.c
-!Efs/debugfs/file.c
-
-
-
-
-
- The Linux Journalling API
-
-
-
- Roger
- Gammans
-
-
- rgammans@computer-surgery.co.uk
-
-
-
-
-
-
-
- Stephen
- Tweedie
-
-
- sct@redhat.com
-
-
-
-
-
-
- 2002
- Roger Gammans
-
-
-
- The Linux Journalling API
-
-
- Overview
-
- Details
-
-The journalling layer is easy to use. You need to
-first of all create a journal_t data structure. There are
-two calls to do this dependent on how you decide to allocate the physical
-media on which the journal resides. The jbd2_journal_init_inode() call
-is for journals stored in filesystem inodes, or the jbd2_journal_init_dev()
-call can be used for journal stored on a raw device (in a continuous range
-of blocks). A journal_t is a typedef for a struct pointer, so when
-you are finally finished make sure you call jbd2_journal_destroy() on it
-to free up any used kernel memory.
-
-
-
-Once you have got your journal_t object you need to 'mount' or load the journal
-file. The journalling layer expects that the space for the journal was already
-allocated and initialized properly by the userspace tools. When loading the
-journal you must call jbd2_journal_load() to process journal contents. If the
-client file system detects that the journal contents do not need to be processed
-(or even need not have valid contents), it may call jbd2_journal_wipe() to
-clear the journal contents before calling jbd2_journal_load().
-
-
-
-Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if
-it detects any outstanding transactions in the journal and similarly
-jbd2_journal_load() will call jbd2_journal_recover() if necessary. I would
-advise reading ext4_load_journal() in fs/ext4/super.c for examples on this
-stage.
-
-
-
-Now you can go ahead and start modifying the underlying
-filesystem. Almost.
-
-
-
-
-You still need to actually journal your filesystem changes, this
-is done by wrapping them into transactions. Additionally you
-also need to wrap the modification of each of the buffers
-with calls to the journal layer, so it knows what the modifications
-you are actually making are. To do this use jbd2_journal_start() which
-returns a transaction handle.
-
-
-
-jbd2_journal_start()
-and its counterpart jbd2_journal_stop(), which indicates the end of a
-transaction are nestable calls, so you can reenter a transaction if necessary,
-but remember you must call jbd2_journal_stop() the same number of times as
-jbd2_journal_start() before the transaction is completed (or more accurately
-leaves the update phase). Ext4/VFS makes use of this feature to simplify
-handling of inode dirtying, quota support, etc.
-
-
-
-Inside each transaction you need to wrap the modifications to the
-individual buffers (blocks). Before you start to modify a buffer you
-need to call jbd2_journal_get_{create,write,undo}_access() as appropriate,
-this allows the journalling layer to copy the unmodified data if it
-needs to. After all the buffer may be part of a previously uncommitted
-transaction.
-At this point you are at last ready to modify a buffer, and once
-you have done so you need to call jbd2_journal_dirty_{meta,}data().
-Or if you've asked for access to a buffer you now know is no longer
-required to be pushed back on the device you can call jbd2_journal_forget()
-in much the same way as you might have used bforget() in the past.
-
-
-
-A jbd2_journal_flush() may be called at any time to commit and checkpoint
-all your transactions.
-
-
-
-Then at umount time, in your put_super() you can then call jbd2_journal_destroy()
-to clean up your in-core journal object.
-
-
-
-Unfortunately there are a couple of ways the journal layer can cause a deadlock.
-The first thing to note is that each task can only have
-a single outstanding transaction at any one time, remember nothing
-commits until the outermost jbd2_journal_stop(). This means
-you must complete the transaction at the end of each file/inode/address
-etc. operation you perform, so that the journalling system isn't re-entered
-on another journal. This matters because transactions can't be nested/batched
-across differing journals, and another filesystem other than
-yours (say ext4) may be modified in a later syscall.
-
-
-
-The second case to bear in mind is that jbd2_journal_start() can
-block if there isn't enough space in the journal for your transaction
-(based on the passed nblocks param) - when it blocks it merely(!) needs to
-wait for transactions to complete and be committed from other tasks,
-so essentially we are waiting for jbd2_journal_stop(). So to avoid
-deadlocks you must treat jbd2_journal_start/stop() as if they
-were semaphores and include them in your semaphore ordering rules to prevent
-deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to
-jbd2_journal_start() so you can deadlock here just as easily as on
-jbd2_journal_start().
-
-
-
-Try to reserve the right number of blocks the first time. ;-). This will
-be the maximum number of blocks you are going to touch in this transaction.
-I advise having a look at at least ext4_jbd.h to see the basis on which
-ext4 makes these decisions.
-
-
-
-Another wriggle to watch out for is your on-disk block allocation strategy.
-Why? Because, if you do a delete, you need to ensure you haven't reused any
-of the freed blocks until the transaction freeing these blocks commits. If you
-reused these blocks and crash happens, there is no way to restore the contents
-of the reallocated blocks at the end of the last fully committed transaction.
-
-One simple way of doing this is to mark blocks as free in internal in-memory
-block allocation structures only after the transaction freeing them commits.
-Ext4 uses journal commit callback for this purpose.
-
-
-
-With journal commit callbacks you can ask the journalling layer to call a
-callback function when the transaction is finally committed to disk, so that
-you can do some of your own management. You ask the journalling layer for
-calling the callback by simply setting journal->j_commit_callback function
-pointer and that function is called after each transaction commit. You can also
-use transaction->t_private_list for attaching entries to a transaction that
-need processing when the transaction commits.
-
-
-
-JBD2 also provides a way to block all transaction updates via
-jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a
-clean and stable fs for a moment. E.g.
-
-
-
-
- jbd2_journal_lock_updates() //stop new stuff happening..
- jbd2_journal_flush() // checkpoint everything.
- ..do stuff on stable fs
- jbd2_journal_unlock_updates() // carry on with filesystem use.
-
-
-
-The opportunities for abuse and DOS attacks with this should be obvious,
-if you allow unprivileged userspace to trigger codepaths containing these
-calls.
-
-
-
-
-
- Summary
-
-Using the journal is a matter of wrapping the different context changes,
-being each mount, each modification (transaction) and each changed buffer
-to tell the journalling layer about them.
-
-
-
-
-
-
-
- Data Types
-
- The journalling layer uses typedefs to 'hide' the concrete definitions
- of the structures used. As a client of the JBD2 layer you can
- just rely on using the pointer as a magic cookie of some sort.
-
- Obviously the hiding is not enforced as this is 'C'.
-
- Structures
-!Iinclude/linux/jbd2.h
-
-
-
-
- Functions
-
- The functions here are split into two groups: those that
- affect a journal as a whole, and those which are used to
- manage transactions.
-
- Journal Level
-!Efs/jbd2/journal.c
-!Ifs/jbd2/recovery.c
-
- Transaction Level
-!Efs/jbd2/transaction.c
-
-
-
- See also
-
-
-
- Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie
-
-
-
-
-
-
- Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie
-
-
-
-
-
-
-
-
- splice API
-
- splice is a method for moving blocks of data around inside the
- kernel, without continually transferring them between the kernel
- and user space.
-
-!Ffs/splice.c
-
-
-
- pipes API
-
- Pipe interfaces are all for in-kernel (builtin image) use.
- They are not exported for use by modules.
-
-!Iinclude/linux/pipe_fs_i.h
-!Ffs/pipe.c
-
-
-
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
deleted file mode 100644
index da5c087462b1dfd9374a1727dd7ee2a365b313a1..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ /dev/null
@@ -1,1312 +0,0 @@
-
-
-
-
-
- Unreliable Guide To Hacking The Linux Kernel
-
-
-
- Rusty
- Russell
-
-
- rusty@rustcorp.com.au
-
-
-
-
-
-
- 2005
- Rusty Russell
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
- This is the first release of this document as part of the kernel tarball.
-
-
-
-
-
-
-
- Introduction
-
- Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux
- Kernel Hacking. This document describes the common routines and
- general requirements for kernel code: its goal is to serve as a
- primer for Linux kernel development for experienced C
- programmers. I avoid implementation details: that's what the
- code is for, and I ignore whole tracts of useful routines.
-
-
- Before you read this, please understand that I never wanted to
- write this document, being grossly under-qualified, but I always
- wanted to read it, and this was the only way. I hope it will
- grow into a compendium of best practice, common starting points
- and random information.
-
-
-
-
- The Players
-
-
- At any time each of the CPUs in a system can be:
-
-
-
-
-
- not associated with any process, serving a hardware interrupt;
-
-
-
-
-
- not associated with any process, serving a softirq or tasklet;
-
-
-
-
-
- running in kernel space, associated with a process (user context);
-
-
-
-
-
- running a process in user space.
-
-
-
-
-
- There is an ordering between these. The bottom two can preempt
- each other, but above that is a strict hierarchy: each can only be
- preempted by the ones above it. For example, while a softirq is
- running on a CPU, no other softirq will preempt it, but a hardware
- interrupt can. However, any other CPUs in the system execute
- independently.
-
-
-
- We'll see a number of ways that the user context can block
- interrupts, to become truly non-preemptable.
-
-
-
- User Context
-
-
- User context is when you are coming in from a system call or other
- trap: like userspace, you can be preempted by more important tasks
- and by interrupts. You can sleep, by calling
- schedule().
-
-
-
-
- You are always in user context on module load and unload,
- and on operations on the block device layer.
-
-
-
-
- In user context, the current pointer (indicating
- the task we are currently executing) is valid, and
- in_interrupt()
- (include/linux/interrupt.h) is false
- .
-
-
-
-
- Beware that if you have preemption or softirqs disabled
- (see below), in_interrupt() will return a
- false positive.
-
-
-
-
-
- Hardware Interrupts (Hard IRQs)
-
-
- Timer ticks, network cards and
- keyboard are examples of real
- hardware which produce interrupts at any time. The kernel runs
- interrupt handlers, which service the hardware. The kernel
- guarantees that this handler is never re-entered: if the same
- interrupt arrives, it is queued (or dropped). Because it
- disables interrupts, this handler has to be fast: frequently it
- simply acknowledges the interrupt, marks a 'software interrupt'
- for execution and exits.
-
-
-
- You can tell you are in a hardware interrupt, because
- in_irq() returns true.
-
-
-
- Beware that this will return a false positive if interrupts are disabled
- (see below).
-
-
-
-
-
- Software Interrupt Context: Softirqs and Tasklets
-
-
- Whenever a system call is about to return to userspace, or a
- hardware interrupt handler exits, any 'software interrupts'
- which are marked pending (usually by hardware interrupts) are
- run (kernel/softirq.c).
-
-
-
- Much of the real interrupt handling work is done here. Early in
- the transition to SMP, there were only 'bottom
- halves' (BHs), which didn't take advantage of multiple CPUs. Shortly
- after we switched from wind-up computers made of match-sticks and snot,
- we abandoned this limitation and switched to 'softirqs'.
-
-
-
- include/linux/interrupt.h lists the
- different softirqs. A very important softirq is the
- timer softirq (include/linux/timer.h): you can
- register to have it call functions for you in a given length of
- time.
-
-
-
- Softirqs are often a pain to deal with, since the same softirq
- will run simultaneously on more than one CPU. For this reason,
- tasklets (include/linux/interrupt.h) are more
- often used: they are dynamically-registrable (meaning you can have
- as many as you want), and they also guarantee that any tasklet
- will only run on one CPU at any time, although different tasklets
- can run simultaneously.
-
-
-
- The name 'tasklet' is misleading: they have nothing to do with 'tasks',
- and probably more to do with some bad vodka Alexey Kuznetsov had at the
- time.
-
-
-
-
- You can tell you are in a softirq (or tasklet)
- using the in_softirq() macro
- (include/linux/interrupt.h).
-
-
-
- Beware that this will return a false positive if a bh lock (see below)
- is held.
-
-
-
-
-
-
- Some Basic Rules
-
-
-
- No memory protection
-
-
- If you corrupt memory, whether in user context or
- interrupt context, the whole machine will crash. Are you
- sure you can't do what you want in userspace?
-
-
-
-
-
- No floating point or MMX
-
-
- The FPU context is not saved; even in user
- context the FPU state probably won't
- correspond with the current process: you would mess with some
- user process' FPU state. If you really want
- to do this, you would have to explicitly save/restore the full
- FPU state (and avoid context switches). It
- is generally a bad idea; use fixed point arithmetic first.
-
-
-
-
-
- A rigid stack limit
-
-
- Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's
- about 14K on most 64-bit archs, and often shared with interrupts
- so you can't use it all. Avoid deep recursion and huge local
- arrays on the stack (allocate them dynamically instead).
-
-
-
-
-
- The Linux kernel is portable
-
-
- Let's keep it that way. Your code should be 64-bit clean,
- and endian-independent. You should also minimize CPU
- specific stuff, e.g. inline assembly should be cleanly
- encapsulated and minimized to ease porting. Generally it
- should be restricted to the architecture-dependent part of
- the kernel tree.
-
-
-
-
-
-
-
- ioctls: Not writing a new system call
-
-
- A system call generally looks like this
-
-
-
-asmlinkage long sys_mycall(int arg)
-{
- return 0;
-}
-
-
-
- First, in most cases you don't want to create a new system call.
- You create a character device and implement an appropriate ioctl
- for it. This is much more flexible than system calls, doesn't have
- to be entered in every architecture's
- include/asm/unistd.h and
- arch/kernel/entry.S file, and is much more
- likely to be accepted by Linus.
-
-
-
- If all your routine does is read or write some parameter, consider
- implementing a sysfs interface instead.
-
-
-
- Inside the ioctl you're in user context to a process. When an
- error occurs you return a negated errno (see
- include/linux/errno.h),
- otherwise you return 0.
-
-
-
- After you slept you should check if a signal occurred: the
- Unix/Linux way of handling signals is to temporarily exit the
- system call with the -ERESTARTSYS error. The
- system call entry code will switch back to user context, process
- the signal handler and then your system call will be restarted
- (unless the user disabled that). So you should be prepared to
- process the restart, e.g. if you're in the middle of manipulating
- some data structure.
-
-
-
-if (signal_pending(current))
- return -ERESTARTSYS;
-
-
-
- If you're doing longer computations: first think userspace. If you
- really want to do it in kernel you should
- regularly check if you need to give up the CPU (remember there is
- cooperative multitasking per CPU). Idiom:
-
-
-
-cond_resched(); /* Will sleep */
-
-
-
- A short note on interface design: the UNIX system call motto is
- "Provide mechanism not policy".
-
-
-
-
- Recipes for Deadlock
-
-
- You cannot call any routines which may sleep, unless:
-
-
-
-
- You are in user context.
-
-
-
-
-
- You do not own any spinlocks.
-
-
-
-
-
- You have interrupts enabled (actually, Andi Kleen says
- that the scheduling code will enable them for you, but
- that's probably not what you wanted).
-
-
-
-
-
- Note that some functions may sleep implicitly: common ones are
- the user space access functions (*_user) and memory allocation
- functions without GFP_ATOMIC.
-
-
-
- You should always compile your kernel
- CONFIG_DEBUG_ATOMIC_SLEEP on, and it will warn
- you if you break these rules. If you do break
- the rules, you will eventually lock up your box.
-
-
-
- Really.
-
-
-
-
- Common Routines
-
-
-
- printk()
- include/linux/kernel.h
-
-
-
- printk() feeds kernel messages to the
- console, dmesg, and the syslog daemon. It is useful for debugging
- and reporting errors, and can be used inside interrupt context,
- but use with caution: a machine which has its console flooded with
- printk messages is unusable. It uses a format string mostly
- compatible with ANSI C printf, and C string concatenation to give
- it a first "priority" argument:
-
-
-
-printk(KERN_INFO "i = %u\n", i);
-
-
-
- See include/linux/kernel.h
- for other KERN_ values; these are interpreted by syslog as the
- level. Special case: for printing an IP address use
-
-
-
-__be32 ipaddress;
-printk(KERN_INFO "my ip: %pI4\n", &ipaddress);
-
-
-
- printk() internally uses a 1K buffer and does
- not catch overruns. Make sure that will be enough.
-
-
-
-
- You will know when you are a real kernel hacker
- when you start typoing printf as printk in your user programs :)
-
-
-
-
-
-
-
- Another sidenote: the original Unix Version 6 sources had a
- comment on top of its printf function: "Printf should not be
- used for chit-chat". You should follow that advice.
-
-
-
-
-
-
- copy_[to/from]_user()
- /
- get_user()
- /
- put_user()
- include/linux/uaccess.h
-
-
-
- [SLEEPS]
-
-
-
- put_user() and get_user()
- are used to get and put single values (such as an int, char, or
- long) from and to userspace. A pointer into userspace should
- never be simply dereferenced: data should be copied using these
- routines. Both return -EFAULT or 0.
-
-
- copy_to_user() and
- copy_from_user() are more general: they copy
- an arbitrary amount of data to and from userspace.
-
-
- Unlike put_user() and
- get_user(), they return the amount of
- uncopied data (ie. 0 still means
- success).
-
-
- [Yes, this moronic interface makes me cringe. The flamewar comes up every year or so. --RR.]
-
-
- The functions may sleep implicitly. This should never be called
- outside user context (it makes no sense), with interrupts
- disabled, or a spinlock held.
-
-
-
-
- kmalloc()/kfree()
- include/linux/slab.h
-
-
- [MAY SLEEP: SEE BELOW]
-
-
-
- These routines are used to dynamically request pointer-aligned
- chunks of memory, like malloc and free do in userspace, but
- kmalloc() takes an extra flag word.
- Important values:
-
-
-
-
-
-
- GFP_KERNEL
-
-
-
-
- May sleep and swap to free memory. Only allowed in user
- context, but is the most reliable way to allocate memory.
-
-
-
-
-
-
-
- GFP_ATOMIC
-
-
-
-
- Don't sleep. Less reliable than GFP_KERNEL,
- but may be called from interrupt context. You should
- really have a good out-of-memory
- error-handling strategy.
-
-
-
-
-
-
-
- GFP_DMA
-
-
-
-
- Allocate ISA DMA lower than 16MB. If you don't know what that
- is you don't need it. Very unreliable.
-
-
-
-
-
-
- If you see a sleeping function called from invalid
- context warning message, then maybe you called a
- sleeping allocation function from interrupt context without
- GFP_ATOMIC. You should really fix that.
- Run, don't walk.
-
-
-
- If you are allocating at least PAGE_SIZE
- (include/asm/page.h) bytes,
- consider using __get_free_pages()
-
- (include/linux/mm.h). It
- takes an order argument (0 for page sized, 1 for double page, 2
- for four pages etc.) and the same memory priority flag word as
- above.
-
-
-
- If you are allocating more than a page worth of bytes you can use
- vmalloc(). It'll allocate virtual memory in
- the kernel map. This block is not contiguous in physical memory,
- but the MMU makes it look like it is for you
- (so it'll only look contiguous to the CPUs, not to external device
- drivers). If you really need large physically contiguous memory
- for some weird device, you have a problem: it is poorly supported
- in Linux because after some time memory fragmentation in a running
- kernel makes it hard. The best way is to allocate the block early
- in the boot process via the alloc_bootmem()
- routine.
-
-
-
- Before inventing your own cache of often-used objects consider
- using a slab cache in
- include/linux/slab.h
-
-
-
-
- current
- include/asm/current.h
-
-
- This global variable (really a macro) contains a pointer to
- the current task structure, so is only valid in user context.
- For example, when a process makes a system call, this will
- point to the task structure of the calling process. It is
- not NULL in interrupt context.
-
-
-
-
- mdelay()/udelay()
- include/asm/delay.h
- include/linux/delay.h
-
-
-
- The udelay() and ndelay() functions can be used for small pauses.
- Do not use large values with them as you risk
- overflow - the helper function mdelay() is useful
- here, or consider msleep().
-
-
-
-
- cpu_to_be32()/be32_to_cpu()/cpu_to_le32()/le32_to_cpu()
- include/asm/byteorder.h
-
-
-
- The cpu_to_be32() family (where the "32" can
- be replaced by 64 or 16, and the "be" can be replaced by "le") are
- the general way to do endian conversions in the kernel: they
- return the converted value. All variations supply the reverse as
- well: be32_to_cpu(), etc.
-
-
-
- There are two major variations of these functions: the pointer
- variation, such as cpu_to_be32p(), which take
- a pointer to the given type, and return the converted value. The
- other variation is the "in-situ" family, such as
- cpu_to_be32s(), which convert value referred
- to by the pointer, and return void.
-
-
-
-
- local_irq_save()/local_irq_restore()
- include/linux/irqflags.h
-
-
-
- These routines disable hard interrupts on the local CPU, and
- restore them. They are reentrant; saving the previous state in
- their one unsigned long flags argument. If you
- know that interrupts are enabled, you can simply use
- local_irq_disable() and
- local_irq_enable().
-
-
-
-
- local_bh_disable()/local_bh_enable()
- include/linux/interrupt.h
-
-
- These routines disable soft interrupts on the local CPU, and
- restore them. They are reentrant; if soft interrupts were
- disabled before, they will still be disabled after this pair
- of functions has been called. They prevent softirqs and tasklets
- from running on the current CPU.
-
-
-
-
- smp_processor_id()
- include/asm/smp.h
-
-
- get_cpu() disables preemption (so you won't
- suddenly get moved to another CPU) and returns the current
- processor number, between 0 and NR_CPUS. Note
- that the CPU numbers are not necessarily continuous. You return
- it again with put_cpu() when you are done.
-
-
- If you know you cannot be preempted by another task (ie. you are
- in interrupt context, or have preemption disabled) you can use
- smp_processor_id().
-
-
-
-
- __init/__exit/__initdata
- include/linux/init.h
-
-
- After boot, the kernel frees up a special section; functions
- marked with __init and data structures marked with
- __initdata are dropped after boot is complete: similarly
- modules discard this memory after initialization. __exit
- is used to declare a function which is only required on exit: the
- function will be dropped if this file is not compiled as a module.
- See the header file for use. Note that it makes no sense for a function
- marked with __init to be exported to modules with
- EXPORT_SYMBOL() - this will break.
-
-
-
-
-
- __initcall()/module_init()
- include/linux/init.h
-
- Many parts of the kernel are well served as a module
- (dynamically-loadable parts of the kernel). Using the
- module_init() and
- module_exit() macros it is easy to write code
- without #ifdefs which can operate both as a module or built into
- the kernel.
-
-
-
- The module_init() macro defines which
- function is to be called at module insertion time (if the file is
- compiled as a module), or at boot time: if the file is not
- compiled as a module the module_init() macro
- becomes equivalent to __initcall(), which
- through linker magic ensures that the function is called on boot.
-
-
-
- The function can return a negative error number to cause
- module loading to fail (unfortunately, this has no effect if
- the module is compiled into the kernel). This function is
- called in user context with interrupts enabled, so it can sleep.
-
-
-
-
- module_exit()
- include/linux/init.h
-
-
- This macro defines the function to be called at module removal
- time (or never, in the case of the file compiled into the
- kernel). It will only be called if the module usage count has
- reached zero. This function can also sleep, but cannot fail:
- everything must be cleaned up by the time it returns.
-
-
-
- Note that this macro is optional: if it is not present, your
- module will not be removable (except for 'rmmod -f').
-
-
-
-
- try_module_get()/module_put()
- include/linux/module.h
-
-
- These manipulate the module usage count, to protect against
- removal (a module also can't be removed if another module uses one
- of its exported symbols: see below). Before calling into module
- code, you should call try_module_get() on
- that module: if it fails, then the module is being removed and you
- should act as if it wasn't there. Otherwise, you can safely enter
- the module, and call module_put() when you're
- finished.
-
-
-
- Most registerable structures have an
- owner field, such as in the
- file_operations structure. Set this field
- to the macro THIS_MODULE.
-
-
-
-
-
-
-
- Wait Queues
- include/linux/wait.h
-
-
- [SLEEPS]
-
-
-
- A wait queue is used to wait for someone to wake you up when a
- certain condition is true. They must be used carefully to ensure
- there is no race condition. You declare a
- wait_queue_head_t, and then processes which want to
- wait for that condition declare a wait_queue_t
- referring to themselves, and place that in the queue.
-
-
-
- Declaring
-
-
- You declare a wait_queue_head_t using the
- DECLARE_WAIT_QUEUE_HEAD() macro, or using the
- init_waitqueue_head() routine in your
- initialization code.
-
-
-
-
- Queuing
-
-
- Placing yourself in the waitqueue is fairly complex, because you
- must put yourself in the queue before checking the condition.
- There is a macro to do this:
- wait_event_interruptible()
-
- include/linux/wait.h The
- first argument is the wait queue head, and the second is an
- expression which is evaluated; the macro returns
- 0 when this expression is true, or
- -ERESTARTSYS if a signal is received.
- The wait_event() version ignores signals.
-
-
-
-
-
- Waking Up Queued Tasks
-
-
- Call wake_up()
-
- include/linux/wait.h,
- which will wake up every process in the queue. The exception is
- if one has TASK_EXCLUSIVE set, in which case
- the remainder of the queue will not be woken. There are other variants
- of this basic function available in the same header.
-
-
-
-
-
- Atomic Operations
-
-
- Certain operations are guaranteed atomic on all platforms. The
- first class of operations work on atomic_t
-
- include/asm/atomic.h; this
- contains a signed integer (at least 32 bits long), and you must use
- these functions to manipulate or read atomic_t variables.
- atomic_read() and
- atomic_set() get and set the counter,
- atomic_add(),
- atomic_sub(),
- atomic_inc(),
- atomic_dec(), and
- atomic_dec_and_test() (returns
- true if it was decremented to zero).
-
-
-
- Yes. It returns true (i.e. != 0) if the
- atomic variable is zero.
-
-
-
- Note that these functions are slower than normal arithmetic, and
- so should not be used unnecessarily.
-
-
-
- The second class of atomic operations is atomic bit operations on an
- unsigned long, defined in
-
- include/linux/bitops.h. These
- operations generally take a pointer to the bit pattern, and a bit
- number: 0 is the least significant bit.
- set_bit(), clear_bit()
- and change_bit() set, clear, and flip the
- given bit. test_and_set_bit(),
- test_and_clear_bit() and
- test_and_change_bit() do the same thing,
- except return true if the bit was previously set; these are
- particularly useful for atomically setting flags.
-
-
-
- It is possible to call these operations with bit indices greater
- than BITS_PER_LONG. The resulting behavior is strange on big-endian
- platforms though so it is a good idea not to do this.
-
-
-
-
- Symbols
-
-
- Within the kernel proper, the normal linking rules apply
- (ie. unless a symbol is declared to be file scope with the
- static keyword, it can be used anywhere in the
- kernel). However, for modules, a special exported symbol table is
- kept which limits the entry points to the kernel proper. Modules
- can also export symbols.
-
-
-
- EXPORT_SYMBOL()
- include/linux/export.h
-
-
- This is the classic method of exporting a symbol: dynamically
- loaded modules will be able to use the symbol as normal.
-
-
-
-
- EXPORT_SYMBOL_GPL()
- include/linux/export.h
-
-
- Similar to EXPORT_SYMBOL() except that the
- symbols exported by EXPORT_SYMBOL_GPL() can
- only be seen by modules with a
- MODULE_LICENSE() that specifies a GPL
- compatible license. It implies that the function is considered
- an internal implementation issue, and not really an interface.
- Some maintainers and developers may however
- require EXPORT_SYMBOL_GPL() when adding any new APIs or functionality.
-
-
-
-
-
- Routines and Conventions
-
-
- Double-linked lists
- include/linux/list.h
-
-
- There used to be three sets of linked-list routines in the kernel
- headers, but this one is the winner. If you don't have some
- particular pressing need for a single list, it's a good choice.
-
-
-
- In particular, list_for_each_entry is useful.
-
-
-
-
- Return Conventions
-
-
- For code called in user context, it's very common to defy C
- convention, and return 0 for success,
- and a negative error number
- (eg. -EFAULT) for failure. This can be
- unintuitive at first, but it's fairly widespread in the kernel.
-
-
-
- Using ERR_PTR()
-
- include/linux/err.h to
- encode a negative error number into a pointer, and
- IS_ERR() and PTR_ERR()
- to get it back out again: avoids a separate pointer parameter for
- the error number. Icky, but in a good way.
-
-
-
-
- Breaking Compilation
-
-
- Linus and the other developers sometimes change function or
- structure names in development kernels; this is not done just to
- keep everyone on their toes: it reflects a fundamental change
- (eg. can no longer be called with interrupts on, or does extra
- checks, or doesn't do checks which were caught before). Usually
- this is accompanied by a fairly complete note to the linux-kernel
- mailing list; search the archive. Simply doing a global replace
- on the file usually makes things worse.
-
-
-
-
- Initializing structure members
-
-
- The preferred method of initializing structures is to use
- designated initialisers, as defined by ISO C99, eg:
-
-
-static struct block_device_operations opt_fops = {
- .open = opt_open,
- .release = opt_release,
- .ioctl = opt_ioctl,
- .check_media_change = opt_media_change,
-};
-
-
- This makes it easy to grep for, and makes it clear which
- structure fields are set. You should do this because it looks
- cool.
-
-
-
-
- GNU Extensions
-
-
- GNU Extensions are explicitly allowed in the Linux kernel.
- Note that some of the more complex ones are not very well
- supported, due to lack of general use, but the following are
- considered standard (see the GCC info page section "C
- Extensions" for more details - Yes, really the info page, the
- man page is only a short summary of the stuff in info).
-
-
-
-
- Inline functions
-
-
-
-
- Statement expressions (ie. the ({ and }) constructs).
-
-
-
-
- Declaring attributes of a function / variable / type
- (__attribute__)
-
-
-
-
- typeof
-
-
-
-
- Zero length arrays
-
-
-
-
- Macro varargs
-
-
-
-
- Arithmetic on void pointers
-
-
-
-
- Non-Constant initializers
-
-
-
-
- Assembler Instructions (not outside arch/ and include/asm/)
-
-
-
-
- Function names as strings (__func__).
-
-
-
-
- __builtin_constant_p()
-
-
-
-
-
- Be wary when using long long in the kernel, the code gcc generates for
- it is horrible and worse: division and multiplication do not work
- on i386 because the GCC runtime functions for it are missing from
- the kernel environment.
-
-
-
-
-
-
- C++
-
-
- Using C++ in the kernel is usually a bad idea, because the
- kernel does not provide the necessary runtime environment
- and the include files are not tested for it. It is still
- possible, but not recommended. If you really want to do
- this, forget about exceptions at least.
-
-
-
-
- #if
-
-
- It is generally considered cleaner to use macros in header files
- (or at the top of .c files) to abstract away functions rather than
- using `#if' pre-processor statements throughout the source code.
-
-
-
-
-
- Putting Your Stuff in the Kernel
-
-
- In order to get your stuff into shape for official inclusion, or
- even to make a neat patch, there's administrative work to be
- done:
-
-
-
-
- Figure out whose pond you've been pissing in. Look at the top of
- the source files, inside the MAINTAINERS
- file, and last of all in the CREDITS file.
- You should coordinate with this person to make sure you're not
- duplicating effort, or trying something that's already been
- rejected.
-
-
-
- Make sure you put your name and EMail address at the top of
- any files you create or mangle significantly. This is the
- first place people will look when they find a bug, or when
- they want to make a change.
-
-
-
-
-
- Usually you want a configuration option for your kernel hack.
- Edit Kconfig in the appropriate directory.
- The Config language is simple to use by cut and paste, and there's
- complete documentation in
- Documentation/kbuild/kconfig-language.txt.
-
-
-
- In your description of the option, make sure you address both the
- expert user and the user who knows nothing about your feature. Mention
- incompatibilities and issues here. Definitely
- end your description with if in doubt, say N
- (or, occasionally, `Y'); this is for people who have no
- idea what you are talking about.
-
-
-
-
-
- Edit the Makefile: the CONFIG variables are
- exported here so you can usually just add a "obj-$(CONFIG_xxx) +=
- xxx.o" line. The syntax is documented in
- Documentation/kbuild/makefiles.txt.
-
-
-
-
-
- Put yourself in CREDITS if you've done
- something noteworthy, usually beyond a single file (your name
- should be at the top of the source files anyway).
- MAINTAINERS means you want to be consulted
- when changes are made to a subsystem, and hear about bugs; it
- implies a more-than-passing commitment to some part of the code.
-
-
-
-
-
- Finally, don't forget to read Documentation/process/submitting-patches.rst
- and possibly Documentation/process/submitting-drivers.rst.
-
-
-
-
-
-
- Kernel Cantrips
-
-
- Some favorites from browsing the source. Feel free to add to this
- list.
-
-
-
- arch/x86/include/asm/delay.h:
-
-
-#define ndelay(n) (__builtin_constant_p(n) ? \
- ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
- __ndelay(n))
-
-
-
- include/linux/fs.h:
-
-
-/*
- * Kernel pointers have redundant information, so we can use a
- * scheme where we can return either an error code or a dentry
- * pointer with the same return value.
- *
- * This should be a per-architecture thing, to allow different
- * error and pointer decisions.
- */
- #define ERR_PTR(err) ((void *)((long)(err)))
- #define PTR_ERR(ptr) ((long)(ptr))
- #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
-
-
-
- arch/x86/include/asm/uaccess_32.h:
-
-
-
-#define copy_to_user(to,from,n) \
- (__builtin_constant_p(n) ? \
- __constant_copy_to_user((to),(from),(n)) : \
- __generic_copy_to_user((to),(from),(n)))
-
-
-
- arch/sparc/kernel/head.S:
-
-
-
-/*
- * Sun people can't spell worth damn. "compatability" indeed.
- * At least we *know* we can't spell, and use a spell-checker.
- */
-
-/* Uh, actually Linus it is I who cannot spell. Too much murky
- * Sparc assembly will do this to ya.
- */
-C_LABEL(cputypvar):
- .asciz "compatibility"
-
-/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */
- .align 4
-C_LABEL(cputypvar_sun4m):
- .asciz "compatible"
-
-
-
- arch/sparc/lib/checksum.S:
-
-
-
- /* Sun, you just can't beat me, you just can't. Stop trying,
- * give up. I'm serious, I am going to kick the living shit
- * out of you, game over, lights out.
- */
-
-
-
-
- Thanks
-
-
- Thanks to Andi Kleen for the idea, answering my questions, fixing
- my mistakes, filling content, etc. Philipp Rumpf for more spelling
- and clarity fixes, and some excellent non-obvious points. Werner
- Almesberger for giving me a great summary of
- disable_irq(), and Jes Sorensen and Andrea
- Arcangeli added caveats. Michael Elizabeth Chastain for checking
- and adding to the Configure section. Telsa Gwynne for teaching me DocBook.
-
-
-
-
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
deleted file mode 100644
index 7c9cc4846cb67144c9d5f864f018cfd8b3efb9b6..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ /dev/null
@@ -1,2151 +0,0 @@
-
-
-
-
-
- Unreliable Guide To Locking
-
-
-
- Rusty
- Russell
-
-
- rusty@rustcorp.com.au
-
-
-
-
-
-
- 2003
- Rusty Russell
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
- Introduction
-
- Welcome, to Rusty's Remarkably Unreliable Guide to Kernel
- Locking issues. This document describes the locking systems in
- the Linux Kernel in 2.6.
-
-
- With the wide availability of HyperThreading, and preemption in the Linux
- Kernel, everyone hacking on the kernel needs to know the
- fundamentals of concurrency and locking for
- SMP.
-
-
-
-
- The Problem With Concurrency
-
- (Skip this if you know what a Race Condition is).
-
-
- In a normal program, you can increment a counter like so:
-
-
- very_important_count++;
-
-
-
- This is what they would expect to happen:
-
-
-
-
-
- Race Conditions and Critical Regions
-
- This overlap, where the result depends on the
- relative timing of multiple tasks, is called a race condition.
- The piece of code containing the concurrency issue is called a
- critical region. And especially since Linux started running
- on SMP machines, they became one of the major issues in kernel
- design and implementation.
-
-
- Preemption can have the same effect, even if there is only one
- CPU: by preempting one task during the critical region, we have
- exactly the same race condition. In this case the thread which
- preempts might run the critical region itself.
-
-
- The solution is to recognize when these simultaneous accesses
- occur, and use locks to make sure that only one instance can
- enter the critical region at any time. There are many
- friendly primitives in the Linux kernel to help you do this.
- And then there are the unfriendly primitives, but I'll pretend
- they don't exist.
-
-
-
-
-
- Locking in the Linux Kernel
-
-
- If I could give you one piece of advice: never sleep with anyone
- crazier than yourself. But if I had to give you advice on
- locking: keep it simple.
-
-
-
- Be reluctant to introduce new locks.
-
-
-
- Strangely enough, this last one is the exact reverse of my advice when
- you have slept with someone crazier than yourself.
- And you should think about getting a big dog.
-
-
-
- Two Main Types of Kernel Locks: Spinlocks and Mutexes
-
-
- There are two main types of kernel locks. The fundamental type
- is the spinlock
- (include/asm/spinlock.h),
- which is a very simple single-holder lock: if you can't get the
- spinlock, you keep trying (spinning) until you can. Spinlocks are
- very small and fast, and can be used anywhere.
-
-
- The second type is a mutex
- (include/linux/mutex.h): it
- is like a spinlock, but you may block holding a mutex.
- If you can't lock a mutex, your task will suspend itself, and be woken
- up when the mutex is released. This means the CPU can do something
- else while you are waiting. There are many cases when you simply
- can't sleep (see ), and so have to
- use a spinlock instead.
-
-
- Neither type of lock is recursive: see
- .
-
-
-
-
- Locks and Uniprocessor Kernels
-
-
- For kernels compiled without CONFIG_SMP, and
- without CONFIG_PREEMPT spinlocks do not exist at
- all. This is an excellent design decision: when no-one else can
- run at the same time, there is no reason to have a lock.
-
-
-
- If the kernel is compiled without CONFIG_SMP,
- but CONFIG_PREEMPT is set, then spinlocks
- simply disable preemption, which is sufficient to prevent any
- races. For most purposes, we can think of preemption as
- equivalent to SMP, and not worry about it separately.
-
-
-
- You should always test your locking code with CONFIG_SMP
- and CONFIG_PREEMPT enabled, even if you don't have an SMP test box, because it
- will still catch some kinds of locking bugs.
-
-
-
- Mutexes still exist, because they are required for
- synchronization between user
- contexts, as we will see below.
-
-
-
-
- Locking Only In User Context
-
-
- If you have a data structure which is only ever accessed from
- user context, then you can use a simple mutex
- (include/linux/mutex.h) to protect it. This
- is the most trivial case: you initialize the mutex. Then you can
- call mutex_lock_interruptible() to grab the mutex,
- and mutex_unlock() to release it. There is also a
- mutex_lock(), which should be avoided, because it
- will not return if a signal is received.
-
-
-
- Example: net/netfilter/nf_sockopt.c allows
- registration of new setsockopt() and
- getsockopt() calls, with
- nf_register_sockopt(). Registration and
- de-registration are only done on module load and unload (and boot
- time, where there is no concurrency), and the list of registrations
- is only consulted for an unknown setsockopt()
- or getsockopt() system call. The
- nf_sockopt_mutex is perfect to protect this,
- especially since the setsockopt and getsockopt calls may well
- sleep.
-
-
-
-
- Locking Between User Context and Softirqs
-
-
- If a softirq shares
- data with user context, you have two problems. Firstly, the current
- user context can be interrupted by a softirq, and secondly, the
- critical region could be entered from another CPU. This is where
- spin_lock_bh()
- (include/linux/spinlock.h) is
- used. It disables softirqs on that CPU, then grabs the lock.
- spin_unlock_bh() does the reverse. (The
- '_bh' suffix is a historical reference to "Bottom Halves", the
- old name for software interrupts. It should really be
- called spin_lock_softirq() in a perfect world).
-
-
-
- Note that you can also use spin_lock_irq()
- or spin_lock_irqsave() here, which stop
- hardware interrupts as well: see the "Hard IRQ Context" section below.
-
-
-
- This works perfectly for UP
- as well: the spin lock vanishes, and this macro
- simply becomes local_bh_disable()
- (include/linux/interrupt.h), which
- protects you from the softirq being run.
-
-
-
-
- Locking Between User Context and Tasklets
-
-
- This is exactly the same as above, because tasklets are actually run
- from a softirq.
-
-
-
-
- Locking Between User Context and Timers
-
-
- This, too, is exactly the same as above, because timers are actually run from
- a softirq. From a locking point of view, tasklets and timers
- are identical.
-
-
-
-
- Locking Between Tasklets/Timers
-
-
- Sometimes a tasklet or timer might want to share data with
- another tasklet or timer.
-
-
-
- The Same Tasklet/Timer
-
- Since a tasklet is never run on two CPUs at once, you don't
- need to worry about your tasklet being reentrant (running
- twice at once), even on SMP.
-
-
-
-
- Different Tasklets/Timers
-
- If another tasklet/timer wants
- to share data with your tasklet or timer, you will both need to use
- spin_lock() and
- spin_unlock() calls.
- spin_lock_bh() is
- unnecessary here, as you are already in a tasklet, and
- none will be run on the same CPU.
-
-
-
-
-
- Locking Between Softirqs
-
-
- Often a softirq might
- want to share data with itself or a tasklet/timer.
-
-
-
- The Same Softirq
-
-
- The same softirq can run on the other CPUs: you can use a
- per-CPU array (see ) for better
- performance. If you're going so far as to use a softirq,
- you probably care about scalable performance enough
- to justify the extra complexity.
-
-
-
- You'll need to use spin_lock() and
- spin_unlock() for shared data.
-
-
-
-
- Different Softirqs
-
-
- You'll need to use spin_lock() and
- spin_unlock() for shared data, whether it
- be a timer, tasklet, different softirq or the same or another
- softirq: any of them could be running on a different CPU.
-
-
-
-
-
-
- Hard IRQ Context
-
-
- Hardware interrupts usually communicate with a
- tasklet or softirq. Frequently this involves putting work in a
- queue, which the softirq will take out.
-
-
-
- Locking Between Hard IRQ and Softirqs/Tasklets
-
-
- If a hardware irq handler shares data with a softirq, you have
- two concerns. Firstly, the softirq processing can be
- interrupted by a hardware interrupt, and secondly, the
- critical region could be entered by a hardware interrupt on
- another CPU. This is where spin_lock_irq() is
- used. It is defined to disable interrupts on that cpu, then grab
- the lock. spin_unlock_irq() does the reverse.
-
-
-
- The irq handler does not need to use
- spin_lock_irq(), because the softirq cannot
- run while the irq handler is running: it can use
- spin_lock(), which is slightly faster. The
- only exception would be if a different hardware irq handler uses
- the same lock: spin_lock_irq() will stop
- that from interrupting us.
-
-
-
- This works perfectly for UP as well: the spin lock vanishes,
- and this macro simply becomes local_irq_disable()
- (include/asm/smp.h), which
- protects you from the softirq/tasklet/BH being run.
-
-
-
- spin_lock_irqsave()
- (include/linux/spinlock.h) is a variant
- which saves whether interrupts were on or off in a flags word,
- which is passed to spin_unlock_irqrestore(). This
- means that the same code can be used inside a hard irq handler (where
- interrupts are already off) and in softirqs (where the irq
- disabling is required).
-
-
-
- Note that softirqs (and hence tasklets and timers) are run on
- return from hardware interrupts, so
- spin_lock_irq() also stops these. In that
- sense, spin_lock_irqsave() is the most
- general and powerful locking function.
-
-
-
-
- Locking Between Two Hard IRQ Handlers
-
- It is rare to have to share data between two IRQ handlers, but
- if you do, spin_lock_irqsave() should be
- used: it is architecture-specific whether all interrupts are
- disabled inside irq handlers themselves.
-
-
-
-
-
-
- Cheat Sheet For Locking
-
- Pete Zaitcev gives the following summary:
-
-
-
-
- If you are in a process context (any syscall) and want to
- lock other processes out, use a mutex. You can take a mutex
- and sleep (copy_from_user*( or
- kmalloc(x,GFP_KERNEL)).
-
-
-
-
- Otherwise (== data can be touched in an interrupt), use
- spin_lock_irqsave() and
- spin_unlock_irqrestore().
-
-
-
-
- Avoid holding spinlock for more than 5 lines of code and
- across any function call (except accessors like
- readb).
-
-
-
-
-
- Table of Minimum Requirements
-
- The following table lists the minimum
- locking requirements between various contexts. In some cases,
- the same context can only be running on one CPU at a time, so
- no locking is required for that context (eg. a particular
- thread can only run on one CPU at a time, but if it needs
- to share data with another thread, locking is required).
-
-
- Remember the advice above: you can always use
- spin_lock_irqsave(), which is a superset
- of all other spinlock primitives.
-
-
-
-Table of Locking Requirements
-
-
-
-
-
-IRQ Handler A
-IRQ Handler B
-Softirq A
-Softirq B
-Tasklet A
-Tasklet B
-Timer A
-Timer B
-User Context A
-User Context B
-
-
-
-IRQ Handler A
-None
-
-
-
-IRQ Handler B
-SLIS
-None
-
-
-
-Softirq A
-SLI
-SLI
-SL
-
-
-
-Softirq B
-SLI
-SLI
-SL
-SL
-
-
-
-Tasklet A
-SLI
-SLI
-SL
-SL
-None
-
-
-
-Tasklet B
-SLI
-SLI
-SL
-SL
-SL
-None
-
-
-
-Timer A
-SLI
-SLI
-SL
-SL
-SL
-SL
-None
-
-
-
-Timer B
-SLI
-SLI
-SL
-SL
-SL
-SL
-SL
-None
-
-
-
-User Context A
-SLI
-SLI
-SLBH
-SLBH
-SLBH
-SLBH
-SLBH
-SLBH
-None
-
-
-
-User Context B
-SLI
-SLI
-SLBH
-SLBH
-SLBH
-SLBH
-SLBH
-SLBH
-MLI
-None
-
-
-
-
-
-
-
-
-
-
- The trylock Functions
-
- There are functions that try to acquire a lock only once and immediately
- return a value telling about success or failure to acquire the lock.
- They can be used if you need no access to the data protected with the lock
- when some other thread is holding the lock. You should acquire the lock
- later if you then need access to the data protected with the lock.
-
-
-
- spin_trylock() does not spin but returns non-zero if
- it acquires the spinlock on the first try or 0 if not. This function can
- be used in all contexts like spin_lock: you must have
- disabled the contexts that might interrupt you and acquire the spin lock.
-
-
-
- mutex_trylock() does not suspend your task
- but returns non-zero if it could lock the mutex on the first try
- or 0 if not. This function cannot be safely used in hardware or software
- interrupt contexts despite not sleeping.
-
-
-
-
- Common Examples
-
-Let's step through a simple example: a cache of number to name
-mappings. The cache keeps a count of how often each of the objects is
-used, and when it gets full, throws out the least used one.
-
-
-
-
- All In User Context
-
-For our first example, we assume that all operations are in user
-context (ie. from system calls), so we can sleep. This means we can
-use a mutex to protect the cache and all the objects within
-it. Here's the code:
-
-
-
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <asm/errno.h>
-
-struct object
-{
- struct list_head list;
- int id;
- char name[32];
- int popularity;
-};
-
-/* Protects the cache, cache_num, and the objects within it */
-static DEFINE_MUTEX(cache_lock);
-static LIST_HEAD(cache);
-static unsigned int cache_num = 0;
-#define MAX_CACHE_SIZE 10
-
-/* Must be holding cache_lock */
-static struct object *__cache_find(int id)
-{
- struct object *i;
-
- list_for_each_entry(i, &cache, list)
- if (i->id == id) {
- i->popularity++;
- return i;
- }
- return NULL;
-}
-
-/* Must be holding cache_lock */
-static void __cache_delete(struct object *obj)
-{
- BUG_ON(!obj);
- list_del(&obj->list);
- kfree(obj);
- cache_num--;
-}
-
-/* Must be holding cache_lock */
-static void __cache_add(struct object *obj)
-{
- list_add(&obj->list, &cache);
- if (++cache_num > MAX_CACHE_SIZE) {
- struct object *i, *outcast = NULL;
- list_for_each_entry(i, &cache, list) {
- if (!outcast || i->popularity < outcast->popularity)
- outcast = i;
- }
- __cache_delete(outcast);
- }
-}
-
-int cache_add(int id, const char *name)
-{
- struct object *obj;
-
- if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
- return -ENOMEM;
-
- strlcpy(obj->name, name, sizeof(obj->name));
- obj->id = id;
- obj->popularity = 0;
-
- mutex_lock(&cache_lock);
- __cache_add(obj);
- mutex_unlock(&cache_lock);
- return 0;
-}
-
-void cache_delete(int id)
-{
- mutex_lock(&cache_lock);
- __cache_delete(__cache_find(id));
- mutex_unlock(&cache_lock);
-}
-
-int cache_find(int id, char *name)
-{
- struct object *obj;
- int ret = -ENOENT;
-
- mutex_lock(&cache_lock);
- obj = __cache_find(id);
- if (obj) {
- ret = 0;
- strcpy(name, obj->name);
- }
- mutex_unlock(&cache_lock);
- return ret;
-}
-
-
-
-Note that we always make sure we have the cache_lock when we add,
-delete, or look up the cache: both the cache infrastructure itself and
-the contents of the objects are protected by the lock. In this case
-it's easy, since we copy the data for the user, and never let them
-access the objects directly.
-
-
-There is a slight (and common) optimization here: in
-cache_add we set up the fields of the object
-before grabbing the lock. This is safe, as no-one else can access it
-until we put it in cache.
-
-
-
-
- Accessing From Interrupt Context
-
-Now consider the case where cache_find can be
-called from interrupt context: either a hardware interrupt or a
-softirq. An example would be a timer which deletes objects from the
-cache.
-
-
-The change is shown below, in standard patch format: the
-- are lines which are taken away, and the
-+ are lines which are added.
-
-
---- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
-+++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
-@@ -12,7 +12,7 @@
- int popularity;
- };
-
--static DEFINE_MUTEX(cache_lock);
-+static DEFINE_SPINLOCK(cache_lock);
- static LIST_HEAD(cache);
- static unsigned int cache_num = 0;
- #define MAX_CACHE_SIZE 10
-@@ -55,6 +55,7 @@
- int cache_add(int id, const char *name)
- {
- struct object *obj;
-+ unsigned long flags;
-
- if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
- return -ENOMEM;
-@@ -63,30 +64,33 @@
- obj->id = id;
- obj->popularity = 0;
-
-- mutex_lock(&cache_lock);
-+ spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
-- mutex_unlock(&cache_lock);
-+ spin_unlock_irqrestore(&cache_lock, flags);
- return 0;
- }
-
- void cache_delete(int id)
- {
-- mutex_lock(&cache_lock);
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&cache_lock, flags);
- __cache_delete(__cache_find(id));
-- mutex_unlock(&cache_lock);
-+ spin_unlock_irqrestore(&cache_lock, flags);
- }
-
- int cache_find(int id, char *name)
- {
- struct object *obj;
- int ret = -ENOENT;
-+ unsigned long flags;
-
-- mutex_lock(&cache_lock);
-+ spin_lock_irqsave(&cache_lock, flags);
- obj = __cache_find(id);
- if (obj) {
- ret = 0;
- strcpy(name, obj->name);
- }
-- mutex_unlock(&cache_lock);
-+ spin_unlock_irqrestore(&cache_lock, flags);
- return ret;
- }
-
-
-
-Note that the spin_lock_irqsave will turn off
-interrupts if they are on, otherwise does nothing (if we are already
-in an interrupt handler), hence these functions are safe to call from
-any context.
-
-
-Unfortunately, cache_add calls
-kmalloc with the GFP_KERNEL
-flag, which is only legal in user context. I have assumed that
-cache_add is still only called in user context,
-otherwise this should become a parameter to
-cache_add.
-
-
-
- Exposing Objects Outside This File
-
-If our objects contained more information, it might not be sufficient
-to copy the information in and out: other parts of the code might want
-to keep pointers to these objects, for example, rather than looking up
-the id every time. This produces two problems.
-
-
-The first problem is that we use the cache_lock to
-protect objects: we'd need to make this non-static so the rest of the
-code can use it. This makes locking trickier, as it is no longer all
-in one place.
-
-
-The second problem is the lifetime problem: if another structure keeps
-a pointer to an object, it presumably expects that pointer to remain
-valid. Unfortunately, this is only guaranteed while you hold the
-lock, otherwise someone might call cache_delete
-and even worse, add another object, re-using the same address.
-
-
-As there is only one lock, you can't hold it forever: no-one else would
-get any work done.
-
-
-The solution to this problem is to use a reference count: everyone who
-has a pointer to the object increases it when they first get the
-object, and drops the reference count when they're finished with it.
-Whoever drops it to zero knows it is unused, and can actually delete it.
-
-
-Here is the code:
-
-
-
---- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
-+++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
-@@ -7,6 +7,7 @@
- struct object
- {
- struct list_head list;
-+ unsigned int refcnt;
- int id;
- char name[32];
- int popularity;
-@@ -17,6 +18,35 @@
- static unsigned int cache_num = 0;
- #define MAX_CACHE_SIZE 10
-
-+static void __object_put(struct object *obj)
-+{
-+ if (--obj->refcnt == 0)
-+ kfree(obj);
-+}
-+
-+static void __object_get(struct object *obj)
-+{
-+ obj->refcnt++;
-+}
-+
-+void object_put(struct object *obj)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&cache_lock, flags);
-+ __object_put(obj);
-+ spin_unlock_irqrestore(&cache_lock, flags);
-+}
-+
-+void object_get(struct object *obj)
-+{
-+ unsigned long flags;
-+
-+ spin_lock_irqsave(&cache_lock, flags);
-+ __object_get(obj);
-+ spin_unlock_irqrestore(&cache_lock, flags);
-+}
-+
- /* Must be holding cache_lock */
- static struct object *__cache_find(int id)
- {
-@@ -35,6 +65,7 @@
- {
- BUG_ON(!obj);
- list_del(&obj->list);
-+ __object_put(obj);
- cache_num--;
- }
-
-@@ -63,6 +94,7 @@
- strlcpy(obj->name, name, sizeof(obj->name));
- obj->id = id;
- obj->popularity = 0;
-+ obj->refcnt = 1; /* The cache holds a reference */
-
- spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
-@@ -79,18 +111,15 @@
- spin_unlock_irqrestore(&cache_lock, flags);
- }
-
--int cache_find(int id, char *name)
-+struct object *cache_find(int id)
- {
- struct object *obj;
-- int ret = -ENOENT;
- unsigned long flags;
-
- spin_lock_irqsave(&cache_lock, flags);
- obj = __cache_find(id);
-- if (obj) {
-- ret = 0;
-- strcpy(name, obj->name);
-- }
-+ if (obj)
-+ __object_get(obj);
- spin_unlock_irqrestore(&cache_lock, flags);
-- return ret;
-+ return obj;
- }
-
-
-
-We encapsulate the reference counting in the standard 'get' and 'put'
-functions. Now we can return the object itself from
-cache_find which has the advantage that the user
-can now sleep holding the object (eg. to
-copy_to_user the name to userspace).
-
-
-The other point to note is that I said a reference should be held for
-every pointer to the object: thus the reference count is 1 when first
-inserted into the cache. In some versions the framework does not hold
-a reference count, but they are more complicated.
-
-
-
- Using Atomic Operations For The Reference Count
-
-In practice, atomic_t would usually be used for
-refcnt. There are a number of atomic
-operations defined in
-
-include/asm/atomic.h: these are
-guaranteed to be seen atomically from all CPUs in the system, so no
-lock is required. In this case, it is simpler than using spinlocks,
-although for anything non-trivial using spinlocks is clearer. The
-atomic_inc and
-atomic_dec_and_test are used instead of the
-standard increment and decrement operators, and the lock is no longer
-used to protect the reference count itself.
-
-
-
---- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100
-+++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100
-@@ -7,7 +7,7 @@
- struct object
- {
- struct list_head list;
-- unsigned int refcnt;
-+ atomic_t refcnt;
- int id;
- char name[32];
- int popularity;
-@@ -18,33 +18,15 @@
- static unsigned int cache_num = 0;
- #define MAX_CACHE_SIZE 10
-
--static void __object_put(struct object *obj)
--{
-- if (--obj->refcnt == 0)
-- kfree(obj);
--}
--
--static void __object_get(struct object *obj)
--{
-- obj->refcnt++;
--}
--
- void object_put(struct object *obj)
- {
-- unsigned long flags;
--
-- spin_lock_irqsave(&cache_lock, flags);
-- __object_put(obj);
-- spin_unlock_irqrestore(&cache_lock, flags);
-+ if (atomic_dec_and_test(&obj->refcnt))
-+ kfree(obj);
- }
-
- void object_get(struct object *obj)
- {
-- unsigned long flags;
--
-- spin_lock_irqsave(&cache_lock, flags);
-- __object_get(obj);
-- spin_unlock_irqrestore(&cache_lock, flags);
-+ atomic_inc(&obj->refcnt);
- }
-
- /* Must be holding cache_lock */
-@@ -65,7 +47,7 @@
- {
- BUG_ON(!obj);
- list_del(&obj->list);
-- __object_put(obj);
-+ object_put(obj);
- cache_num--;
- }
-
-@@ -94,7 +76,7 @@
- strlcpy(obj->name, name, sizeof(obj->name));
- obj->id = id;
- obj->popularity = 0;
-- obj->refcnt = 1; /* The cache holds a reference */
-+ atomic_set(&obj->refcnt, 1); /* The cache holds a reference */
-
- spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
-@@ -119,7 +101,7 @@
- spin_lock_irqsave(&cache_lock, flags);
- obj = __cache_find(id);
- if (obj)
-- __object_get(obj);
-+ object_get(obj);
- spin_unlock_irqrestore(&cache_lock, flags);
- return obj;
- }
-
-
-
-
-
- Protecting The Objects Themselves
-
-In these examples, we assumed that the objects (except the reference
-counts) never changed once they are created. If we wanted to allow
-the name to change, there are three possibilities:
-
-
-
-
-You can make cache_lock non-static, and tell people
-to grab that lock before changing the name in any object.
-
-
-
-
-You can provide a cache_obj_rename which grabs
-this lock and changes the name for the caller, and tell everyone to
-use that function.
-
-
-
-
-You can make the cache_lock protect only the cache
-itself, and use another lock to protect the name.
-
-
-
-
-
-Theoretically, you can make the locks as fine-grained as one lock for
-every field, for every object. In practice, the most common variants
-are:
-
-
-
-
-One lock which protects the infrastructure (the cache
-list in this example) and all the objects. This is what we have done
-so far.
-
-
-
-
-One lock which protects the infrastructure (including the list
-pointers inside the objects), and one lock inside the object which
-protects the rest of that object.
-
-
-
-
-Multiple locks to protect the infrastructure (eg. one lock per hash
-chain), possibly with a separate per-object lock.
-
-
-
-
-
-Here is the "lock-per-object" implementation:
-
-
---- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100
-+++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
-@@ -6,11 +6,17 @@
-
- struct object
- {
-+ /* These two protected by cache_lock. */
- struct list_head list;
-+ int popularity;
-+
- atomic_t refcnt;
-+
-+ /* Doesn't change once created. */
- int id;
-+
-+ spinlock_t lock; /* Protects the name */
- char name[32];
-- int popularity;
- };
-
- static DEFINE_SPINLOCK(cache_lock);
-@@ -77,6 +84,7 @@
- obj->id = id;
- obj->popularity = 0;
- atomic_set(&obj->refcnt, 1); /* The cache holds a reference */
-+ spin_lock_init(&obj->lock);
-
- spin_lock_irqsave(&cache_lock, flags);
- __cache_add(obj);
-
-
-
-Note that I decided that the popularity
-count should be protected by the cache_lock rather
-than the per-object lock: this is because it (like the
-struct list_head inside the object) is
-logically part of the infrastructure. This way, I don't need to grab
-the lock of every object in __cache_add when
-seeking the least popular.
-
-
-
-I also decided that the id member is
-unchangeable, so I don't need to grab each object lock in
-__cache_find() to examine the
-id: the object lock is only used by a
-caller who wants to read or write the name
-field.
-
-
-
-Note also that I added a comment describing what data was protected by
-which locks. This is extremely important, as it describes the runtime
-behavior of the code, and can be hard to gain from just reading. And
-as Alan Cox says, Lock data, not code.
-
-
-
-
-
- Common Problems
-
- Deadlock: Simple and Advanced
-
-
- There is a coding bug where a piece of code tries to grab a
- spinlock twice: it will spin forever, waiting for the lock to
- be released (spinlocks, rwlocks and mutexes are not
- recursive in Linux). This is trivial to diagnose: not a
- stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
- problem.
-
-
-
- For a slightly more complex case, imagine you have a region
- shared by a softirq and user context. If you use a
- spin_lock() call to protect it, it is
- possible that the user context will be interrupted by the softirq
- while it holds the lock, and the softirq will then spin
- forever trying to get the same lock.
-
-
-
- Both of these are called deadlock, and as shown above, it can
- occur even with a single CPU (although not on UP compiles,
- since spinlocks vanish on kernel compiles with
- CONFIG_SMP=n. You'll still get data corruption
- in the second example).
-
-
-
- This complete lockup is easy to diagnose: on SMP boxes the
- watchdog timer or compiling with DEBUG_SPINLOCK set
- (include/linux/spinlock.h) will show this up
- immediately when it happens.
-
-
-
- A more complex problem is the so-called 'deadly embrace',
- involving two or more locks. Say you have a hash table: each
- entry in the table is a spinlock, and a chain of hashed
- objects. Inside a softirq handler, you sometimes want to
- alter an object from one place in the hash to another: you
- grab the spinlock of the old hash chain and the spinlock of
- the new hash chain, and delete the object from the old one,
- and insert it in the new one.
-
-
-
- There are two problems here. First, if your code ever
- tries to move the object to the same chain, it will deadlock
- with itself as it tries to lock it twice. Secondly, if the
- same softirq on another CPU is trying to move another object
- in the reverse direction, the following could happen:
-
-
-
- Consequences
-
-
-
-
-
- CPU 1
- CPU 2
-
-
-
-
-
- Grab lock A -> OK
- Grab lock B -> OK
-
-
- Grab lock B -> spin
- Grab lock A -> spin
-
-
-
-
-
-
- The two CPUs will spin forever, waiting for the other to give up
- their lock. It will look, smell, and feel like a crash.
-
-
-
-
- Preventing Deadlock
-
-
- Textbooks will tell you that if you always lock in the same
- order, you will never get this kind of deadlock. Practice
- will tell you that this approach doesn't scale: when I
- create a new lock, I don't understand enough of the kernel
- to figure out where in the 5000 lock hierarchy it will fit.
-
-
-
- The best locks are encapsulated: they never get exposed in
- headers, and are never held around calls to non-trivial
- functions outside the same file. You can read through this
- code and see that it will never deadlock, because it never
- tries to grab another lock while it has that one. People
- using your code don't even need to know you are using a
- lock.
-
-
-
- A classic problem here is when you provide callbacks or
- hooks: if you call these with the lock held, you risk simple
- deadlock, or a deadly embrace (who knows what the callback
- will do?). Remember, the other programmers are out to get
- you, so don't do this.
-
-
-
- Overzealous Prevention Of Deadlocks
-
-
- Deadlocks are problematic, but not as bad as data
- corruption. Code which grabs a read lock, searches a list,
- fails to find what it wants, drops the read lock, grabs a
- write lock and inserts the object has a race condition.
-
-
-
- If you don't see why, please stay the fuck away from my code.
-
-
-
-
-
- Racing Timers: A Kernel Pastime
-
-
- Timers can produce their own special problems with races.
- Consider a collection of objects (list, hash, etc) where each
- object has a timer which is due to destroy it.
-
-
-
- If you want to destroy the entire collection (say on module
- removal), you might do the following:
-
-
-
- /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE
- HUNGARIAN NOTATION */
- spin_lock_bh(&list_lock);
-
- while (list) {
- struct foo *next = list->next;
- del_timer(&list->timer);
- kfree(list);
- list = next;
- }
-
- spin_unlock_bh(&list_lock);
-
-
-
- Sooner or later, this will crash on SMP, because a timer can
- have just gone off before the spin_lock_bh(),
- and it will only get the lock after we
- spin_unlock_bh(), and then try to free
- the element (which has already been freed!).
-
-
-
- This can be avoided by checking the result of
- del_timer(): if it returns
- 1, the timer has been deleted.
- If 0, it means (in this
- case) that it is currently running, so we can do:
-
-
-
- retry:
- spin_lock_bh(&list_lock);
-
- while (list) {
- struct foo *next = list->next;
- if (!del_timer(&list->timer)) {
- /* Give timer a chance to delete this */
- spin_unlock_bh(&list_lock);
- goto retry;
- }
- kfree(list);
- list = next;
- }
-
- spin_unlock_bh(&list_lock);
-
-
-
- Another common problem is deleting timers which restart
- themselves (by calling add_timer() at the end
- of their timer function). Because this is a fairly common case
- which is prone to races, you should use del_timer_sync()
- (include/linux/timer.h)
- to handle this case. It returns the number of times the timer
- had to be deleted before we finally stopped it from adding itself back
- in.
-
-
-
-
-
-
- Locking Speed
-
-
-There are three main things to worry about when considering speed of
-some code which does locking. First is concurrency: how many things
-are going to be waiting while someone else is holding a lock. Second
-is the time taken to actually acquire and release an uncontended lock.
-Third is using fewer, or smarter locks. I'm assuming that the lock is
-used fairly often: otherwise, you wouldn't be concerned about
-efficiency.
-
-
-Concurrency depends on how long the lock is usually held: you should
-hold the lock for as long as needed, but no longer. In the cache
-example, we always create the object without the lock held, and then
-grab the lock only when we are ready to insert it in the list.
-
-
-Acquisition times depend on how much damage the lock operations do to
-the pipeline (pipeline stalls) and how likely it is that this CPU was
-the last one to grab the lock (ie. is the lock cache-hot for this
-CPU): on a machine with more CPUs, this likelihood drops fast.
-Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns,
-an atomic increment takes about 58ns, a lock which is cache-hot on
-this CPU takes 160ns, and a cacheline transfer from another CPU takes
-an additional 170 to 360ns. (These figures from Paul McKenney's
- Linux
-Journal RCU article).
-
-
-These two aims conflict: holding a lock for a short time might be done
-by splitting locks into parts (such as in our final per-object-lock
-example), but this increases the number of lock acquisitions, and the
-results are often slower than having a single lock. This is another
-reason to advocate locking simplicity.
-
-
-The third concern is addressed below: there are some methods to reduce
-the amount of locking which needs to be done.
-
-
-
- Read/Write Lock Variants
-
-
- Both spinlocks and mutexes have read/write variants:
- rwlock_t and struct rw_semaphore.
- These divide users into two classes: the readers and the writers. If
- you are only reading the data, you can get a read lock, but to write to
- the data you need the write lock. Many people can hold a read lock,
- but a writer must be sole holder.
-
-
-
- If your code divides neatly along reader/writer lines (as our
- cache code does), and the lock is held by readers for
- significant lengths of time, using these locks can help. They
- are slightly slower than the normal locks though, so in practice
- rwlock_t is not usually worthwhile.
-
-
-
-
- Avoiding Locks: Read Copy Update
-
-
- There is a special method of read/write locking called Read Copy
- Update. Using RCU, the readers can avoid taking a lock
- altogether: as we expect our cache to be read more often than
- updated (otherwise the cache is a waste of time), it is a
- candidate for this optimization.
-
-
-
- How do we get rid of read locks? Getting rid of read locks
- means that writers may be changing the list underneath the
- readers. That is actually quite simple: we can read a linked
- list while an element is being added if the writer adds the
- element very carefully. For example, adding
- new to a single linked list called
- list:
-
-
-
- new->next = list->next;
- wmb();
- list->next = new;
-
-
-
- The wmb() is a write memory barrier. It
- ensures that the first operation (setting the new element's
- next pointer) is complete and will be seen by
- all CPUs, before the second operation is (putting the new
- element into the list). This is important, since modern
- compilers and modern CPUs can both reorder instructions unless
- told otherwise: we want a reader to either not see the new
- element at all, or see the new element with the
- next pointer correctly pointing at the rest of
- the list.
-
-
- Fortunately, there is a function to do this for standard
- struct list_head lists:
- list_add_rcu()
- (include/linux/list.h).
-
-
- Removing an element from the list is even simpler: we replace
- the pointer to the old element with a pointer to its successor,
- and readers will either see it, or skip over it.
-
-
- list->next = old->next;
-
-
- There is list_del_rcu()
- (include/linux/list.h) which does this (the
- normal version poisons the old object, which we don't want).
-
-
- The reader must also be careful: some CPUs can look through the
- next pointer to start reading the contents of
- the next element early, but don't realize that the pre-fetched
- contents are wrong when the next pointer changes
- underneath them. Once again, there is a
- list_for_each_entry_rcu()
- (include/linux/list.h) to help you. Of
- course, writers can just use
- list_for_each_entry(), since there cannot
- be two simultaneous writers.
-
-
- Our final dilemma is this: when can we actually destroy the
- removed element? Remember, a reader might be stepping through
- this element in the list right now: if we free this element and
- the next pointer changes, the reader will jump
- off into garbage and crash. We need to wait until we know that
- all the readers who were traversing the list when we deleted the
- element are finished. We use call_rcu() to
- register a callback which will actually destroy the object once
- all pre-existing readers are finished. Alternatively,
- synchronize_rcu() may be used to block until
- all pre-existing readers are finished.
-
-
- But how does Read Copy Update know when the readers are
- finished? The method is this: firstly, the readers always
- traverse the list inside
- rcu_read_lock()/rcu_read_unlock()
- pairs: these simply disable preemption so the reader won't go to
- sleep while reading the list.
-
-
- RCU then waits until every other CPU has slept at least once:
- since readers cannot sleep, we know that any readers which were
- traversing the list during the deletion are finished, and the
- callback is triggered. The real Read Copy Update code is a
- little more optimized than this, but this is the fundamental
- idea.
-
-
-
---- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
-+++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100
-@@ -1,15 +1,18 @@
- #include <linux/list.h>
- #include <linux/slab.h>
- #include <linux/string.h>
-+#include <linux/rcupdate.h>
- #include <linux/mutex.h>
- #include <asm/errno.h>
-
- struct object
- {
-- /* These two protected by cache_lock. */
-+ /* This is protected by RCU */
- struct list_head list;
- int popularity;
-
-+ struct rcu_head rcu;
-+
- atomic_t refcnt;
-
- /* Doesn't change once created. */
-@@ -40,7 +43,7 @@
- {
- struct object *i;
-
-- list_for_each_entry(i, &cache, list) {
-+ list_for_each_entry_rcu(i, &cache, list) {
- if (i->id == id) {
- i->popularity++;
- return i;
-@@ -49,19 +52,25 @@
- return NULL;
- }
-
-+/* Final discard done once we know no readers are looking. */
-+static void cache_delete_rcu(void *arg)
-+{
-+ object_put(arg);
-+}
-+
- /* Must be holding cache_lock */
- static void __cache_delete(struct object *obj)
- {
- BUG_ON(!obj);
-- list_del(&obj->list);
-- object_put(obj);
-+ list_del_rcu(&obj->list);
- cache_num--;
-+ call_rcu(&obj->rcu, cache_delete_rcu);
- }
-
- /* Must be holding cache_lock */
- static void __cache_add(struct object *obj)
- {
-- list_add(&obj->list, &cache);
-+ list_add_rcu(&obj->list, &cache);
- if (++cache_num > MAX_CACHE_SIZE) {
- struct object *i, *outcast = NULL;
- list_for_each_entry(i, &cache, list) {
-@@ -104,12 +114,11 @@
- struct object *cache_find(int id)
- {
- struct object *obj;
-- unsigned long flags;
-
-- spin_lock_irqsave(&cache_lock, flags);
-+ rcu_read_lock();
- obj = __cache_find(id);
- if (obj)
- object_get(obj);
-- spin_unlock_irqrestore(&cache_lock, flags);
-+ rcu_read_unlock();
- return obj;
- }
-
-
-
-Note that the reader will alter the
-popularity member in
-__cache_find(), and now it doesn't hold a lock.
-One solution would be to make it an atomic_t, but for
-this usage, we don't really care about races: an approximate result is
-good enough, so I didn't change it.
-
-
-
-The result is that cache_find() requires no
-synchronization with any other functions, so is almost as fast on SMP
-as it would be on UP.
-
-
-
-There is a further optimization possible here: remember our original
-cache code, where there were no reference counts and the caller simply
-held the lock whenever using the object? This is still possible: if
-you hold the lock, no one can delete the object, so you don't need to
-get and put the reference count.
-
-
-
-Now, because the 'read lock' in RCU is simply disabling preemption, a
-caller which always has preemption disabled between calling
-cache_find() and
-object_put() does not need to actually get and
-put the reference count: we could expose
-__cache_find() by making it non-static, and
-such callers could simply call that.
-
-
-The benefit here is that the reference count is not written to: the
-object is not altered in any way, which is much faster on SMP
-machines due to caching.
-
-
-
-
- Per-CPU Data
-
-
- Another technique for avoiding locking which is used fairly
- widely is to duplicate information for each CPU. For example,
- if you wanted to keep a count of a common condition, you could
- use a spin lock and a single counter. Nice and simple.
-
-
-
- If that was too slow (it's usually not, but if you've got a
- really big machine to test on and can show that it is), you
- could instead use a counter for each CPU, then none of them need
- an exclusive lock. See DEFINE_PER_CPU(),
- get_cpu_var() and
- put_cpu_var()
- (include/linux/percpu.h).
-
-
-
- Of particular use for simple per-cpu counters is the
- local_t type, and the
- cpu_local_inc() and related functions,
- which are more efficient than simple code on some architectures
- (include/asm/local.h).
-
-
-
- Note that there is no simple, reliable way of getting an exact
- value of such a counter, without introducing more locks. This
- is not a problem for some uses.
-
-
-
-
- Data Which Is Mostly Used By An IRQ Handler
-
-
- If data is always accessed from within the same IRQ handler, you
- don't need a lock at all: the kernel already guarantees that the
- irq handler will not run simultaneously on multiple CPUs.
-
-
- Manfred Spraul points out that you can still do this, even if
- the data is very occasionally accessed in user context or
- softirqs/tasklets. The irq handler doesn't use a lock, and
- all other accesses are done as so:
-
-
-
- spin_lock(&lock);
- disable_irq(irq);
- ...
- enable_irq(irq);
- spin_unlock(&lock);
-
-
- The disable_irq() prevents the irq handler
- from running (and waits for it to finish if it's currently
- running on other CPUs). The spinlock prevents any other
- accesses happening at the same time. Naturally, this is slower
- than just a spin_lock_irq() call, so it
- only makes sense if this type of access happens extremely
- rarely.
-
-
-
-
-
- What Functions Are Safe To Call From Interrupts?
-
-
- Many functions in the kernel sleep (ie. call schedule())
- directly or indirectly: you can never call them while holding a
- spinlock, or with preemption disabled. This also means you need
- to be in user context: calling them from an interrupt is illegal.
-
-
-
- Some Functions Which Sleep
-
-
- The most common ones are listed below, but you usually have to
- read the code to find out if other calls are safe. If everyone
- else who calls it can sleep, you probably need to be able to
- sleep, too. In particular, registration and deregistration
- functions usually expect to be called from user context, and can
- sleep.
-
-
-
-
-
- Accesses to
- userspace:
-
-
-
-
- copy_from_user()
-
-
-
-
- copy_to_user()
-
-
-
-
- get_user()
-
-
-
-
- put_user()
-
-
-
-
-
-
-
- kmalloc(GFP_KERNEL)
-
-
-
-
-
- mutex_lock_interruptible() and
- mutex_lock()
-
-
- There is a mutex_trylock() which does not
- sleep. Still, it must not be used inside interrupt context since
- its implementation is not safe for that.
- mutex_unlock() will also never sleep.
- It cannot be used in interrupt context either since a mutex
- must be released by the same task that acquired it.
-
-
-
-
-
-
- Some Functions Which Don't Sleep
-
-
- Some functions are safe to call from any context, or holding
- almost any lock.
-
-
-
-
-
- printk()
-
-
-
-
- kfree()
-
-
-
-
- add_timer() and del_timer()
-
-
-
-
-
-
-
- Mutex API reference
-!Iinclude/linux/mutex.h
-!Ekernel/locking/mutex.c
-
-
-
- Futex API reference
-!Ikernel/futex.c
-
-
-
- Further reading
-
-
-
-
- Documentation/locking/spinlocks.txt:
- Linus Torvalds' spinlocking tutorial in the kernel sources.
-
-
-
-
-
- Unix Systems for Modern Architectures: Symmetric
- Multiprocessing and Caching for Kernel Programmers:
-
-
-
- Curt Schimmel's very good introduction to kernel level
- locking (not written for Linux, but nearly everything
- applies). The book is expensive, but really worth every
- penny to understand SMP locking. [ISBN: 0201633388]
-
-
-
-
-
-
- Thanks
-
-
- Thanks to Telsa Gwynne for DocBooking, neatening and adding
- style.
-
-
-
- Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul
- Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim
- Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney,
- John Ashby for proofreading, correcting, flaming, commenting.
-
-
-
- Thanks to the cabal for having no influence on this document.
-
-
-
-
- Glossary
-
-
- preemption
-
-
- Prior to 2.5, or when CONFIG_PREEMPT is
- unset, processes in user context inside the kernel would not
- preempt each other (ie. you had that CPU until you gave it up,
- except for interrupts). With the addition of
- CONFIG_PREEMPT in 2.5.4, this changed: when
- in user context, higher priority tasks can "cut in": spinlocks
- were changed to disable preemption, even on UP.
-
-
-
-
-
- bh
-
-
- Bottom Half: for historical reasons, functions with
- '_bh' in them often now refer to any software interrupt, e.g.
- spin_lock_bh() blocks any software interrupt
- on the current CPU. Bottom halves are deprecated, and will
- eventually be replaced by tasklets. Only one bottom half will be
- running at any time.
-
-
-
-
-
- Hardware Interrupt / Hardware IRQ
-
-
- Hardware interrupt request. in_irq() returns
- true in a hardware interrupt handler.
-
-
-
-
-
- Interrupt Context
-
-
- Not user context: processing a hardware irq or software irq.
- Indicated by the in_interrupt() macro
- returning true.
-
-
-
-
-
- SMP
-
-
- Symmetric Multi-Processor: kernels compiled for multiple-CPU
- machines. (CONFIG_SMP=y).
-
-
-
-
-
- Software Interrupt / softirq
-
-
- Software interrupt handler. in_irq() returns
- false; in_softirq()
- returns true. Tasklets and softirqs
- both fall into the category of 'software interrupts'.
-
-
- Strictly speaking a softirq is one of up to 32 enumerated software
- interrupts which can run on multiple CPUs at once.
- Sometimes used to refer to tasklets as
- well (ie. all software interrupts).
-
-
-
-
-
- tasklet
-
-
- A dynamically-registrable software interrupt,
- which is guaranteed to only run on one CPU at a time.
-
-
-
-
-
- timer
-
-
- A dynamically-registrable software interrupt, which is run at
- (or close to) a given time. When running, it is just like a
- tasklet (in fact, they are called from the TIMER_SOFTIRQ).
-
-
-
-
-
- UP
-
-
- Uni-Processor: Non-SMP. (CONFIG_SMP=n).
-
-
-
-
-
- User Context
-
-
- The kernel executing on behalf of a particular process (ie. a
- system call or trap) or kernel thread. You can tell which
- process with the current macro. Not to
- be confused with userspace. Can be interrupted by software or
- hardware interrupts.
-
-
-
-
-
- Userspace
-
-
- A process executing its own code outside the kernel.
-
-
-
-
-
-
-
diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl
deleted file mode 100644
index 856ac20bf36782f0aab56612abfd4cba2a69b861..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/kgdb.tmpl
+++ /dev/null
@@ -1,918 +0,0 @@
-
-
-
-
-
- Using kgdb, kdb and the kernel debugger internals
-
-
-
- Jason
- Wessel
-
-
- jason.wessel@windriver.com
-
-
-
-
-
- 2008,2010
- Wind River Systems, Inc.
-
-
- 2004-2005
- MontaVista Software, Inc.
-
-
- 2004
- Amit S. Kale
-
-
-
-
- This file is licensed under the terms of the GNU General Public License
- version 2. This program is licensed "as is" without any warranty of any
- kind, whether express or implied.
-
-
-
-
-
-
-
- Introduction
-
- The kernel has two different debugger front ends (kdb and kgdb)
- which interface to the debug core. It is possible to use either
- of the debugger front ends and dynamically transition between them
- if you configure the kernel properly at compile and runtime.
-
-
- Kdb is a simplistic shell-style interface which you can use on a
- system console with a keyboard or serial console. You can use it
- to inspect memory, registers, process lists, dmesg, and even set
- breakpoints to stop in a certain location. Kdb is not a source
- level debugger, although you can set breakpoints and execute some
- basic kernel run control. Kdb is mainly aimed at doing some
- analysis to aid in development or diagnosing kernel problems. You
- can access some symbols by name in kernel built-ins or in kernel
- modules if the code was built
- with CONFIG_KALLSYMS.
-
-
- Kgdb is intended to be used as a source level debugger for the
- Linux kernel. It is used along with gdb to debug a Linux kernel.
- The expectation is that gdb can be used to "break in" to the
- kernel to inspect memory, variables and look through call stack
- information similar to the way an application developer would use
- gdb to debug an application. It is possible to place breakpoints
- in kernel code and perform some limited execution stepping.
-
-
- Two machines are required for using kgdb. One of these machines is
- a development machine and the other is the target machine. The
- kernel to be debugged runs on the target machine. The development
- machine runs an instance of gdb against the vmlinux file which
- contains the symbols (not a boot image such as bzImage, zImage,
- uImage...). In gdb the developer specifies the connection
- parameters and connects to kgdb. The type of connection a
- developer makes with gdb depends on the availability of kgdb I/O
- modules compiled as built-ins or loadable kernel modules in the test
- machine's kernel.
-
-
-
- Compiling a kernel
-
-
- In order to enable compilation of kdb, you must first enable kgdb.
- The kgdb test compile options are described in the kgdb test suite chapter.
-
-
-
- Kernel config options for kgdb
-
- To enable CONFIG_KGDB you should look under
- "Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger".
-
-
- While it is not a hard requirement that you have symbols in your
- vmlinux file, gdb tends not to be very useful without the symbolic
- data, so you will want to turn
- on CONFIG_DEBUG_INFO which is called "Compile the
- kernel with debug info" in the config menu.
-
-
- It is advised, but not required, that you turn on the
- CONFIG_FRAME_POINTER kernel option which is called "Compile the
- kernel with frame pointers" in the config menu. This option
- inserts code into the compiled executable which saves the frame
- information in registers or on the stack at different points which
- allows a debugger such as gdb to more accurately construct
- stack back traces while debugging the kernel.
-
-
- If the architecture that you are using supports the kernel option
- CONFIG_STRICT_KERNEL_RWX, you should consider turning it off. This
- option will prevent the use of software breakpoints because it
- marks certain regions of the kernel's memory space as read-only.
- If kgdb supports it for the architecture you are using, you can
- use hardware breakpoints if you desire to run with the
- CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off
- this option.
-
-
- Next you should choose one or more I/O drivers to interconnect
- debugging host and debugged target. Early boot debugging requires
- a KGDB I/O driver that supports early debugging and the driver
- must be built into the kernel directly. Kgdb I/O driver
- configuration takes place via kernel or module parameters which
- you can learn more about in the section that describes the
- parameter "kgdboc".
-
- Here is an example set of .config symbols to enable or
- disable for kgdb:
-
- # CONFIG_STRICT_KERNEL_RWX is not set
- CONFIG_FRAME_POINTER=y
- CONFIG_KGDB=y
- CONFIG_KGDB_SERIAL_CONSOLE=y
-
-
-
-
- Kernel config options for kdb
- Kdb is quite a bit more complex than the simple gdbstub
- sitting on top of the kernel's debug core. Kdb must implement a
- shell, and also adds some helper functions in other parts of the
- kernel, responsible for printing out interesting data such as what
- you would see if you ran "lsmod", or "ps". In order to build kdb
- into the kernel you follow the same steps as you would for kgdb.
-
- The main config option for kdb
- is CONFIG_KGDB_KDB which is called "KGDB_KDB:
- include kdb frontend for kgdb" in the config menu. In theory you
- would have already also selected an I/O driver such as the
- CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a
- serial port, when you were configuring kgdb.
-
- If you want to use a PS/2-style keyboard with kdb, you would
- select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as
- input device" in the config menu. The CONFIG_KDB_KEYBOARD option
- is not used for anything in the gdb interface to kgdb. The
- CONFIG_KDB_KEYBOARD option only works with kdb.
-
- Here is an example set of .config symbols to enable/disable kdb:
-
- # CONFIG_STRICT_KERNEL_RWX is not set
- CONFIG_FRAME_POINTER=y
- CONFIG_KGDB=y
- CONFIG_KGDB_SERIAL_CONSOLE=y
- CONFIG_KGDB_KDB=y
- CONFIG_KDB_KEYBOARD=y
-
-
-
-
-
- Kernel Debugger Boot Arguments
- This section describes the various runtime kernel
- parameters that affect the configuration of the kernel debugger.
- The following chapter covers using kdb and kgdb as well as
- providing some examples of the configuration parameters.
-
- Kernel parameter: kgdboc
- The kgdboc driver was originally an abbreviation meant to
- stand for "kgdb over console". Today it is the primary mechanism
- to configure how to communicate from gdb to kgdb as well as the
- devices you want to use to interact with the kdb shell.
-
- For kgdb/gdb, kgdboc is designed to work with a single serial
- port. It is intended to cover the circumstance where you want to
- use a serial console as your primary console as well as using it to
- perform kernel debugging. It is also possible to use kgdb on a
- serial port which is not designated as a system console. Kgdboc
- may be configured as a kernel built-in or a kernel loadable module.
- You can only make use of kgdbwait and early
- debugging if you build kgdboc into the kernel as a built-in.
-
- Optionally you can elect to activate kms (Kernel Mode
- Setting) integration. When you use kms with kgdboc and you have a
- video driver that has atomic mode setting hooks, it is possible to
- enter the debugger on the graphics console. When the kernel
- execution is resumed, the previous graphics mode will be restored.
- This integration can serve as a useful tool to aid in diagnosing
- crashes or doing analysis of memory with kdb while allowing the
- full graphics console applications to run.
-
-
- kgdboc arguments
- Usage: kgdboc=[kms][[,]kbd][[,]serial_device][,baud]
- The order listed above must be observed if you use any of the
- optional configurations together.
-
- Abbreviations:
-
- kms = Kernel Mode Setting
- kbd = Keyboard
-
-
- You can configure kgdboc to use the keyboard, and/or a serial
- device depending on if you are using kdb and/or kgdb, in one of the
- following scenarios. The order listed above must be observed if
- you use any of the optional configurations together. Using kms +
- only gdb is generally not a useful combination.
-
- Using loadable module or built-in
-
-
- As a kernel built-in:
- Use the kernel boot argument: kgdboc=<tty-device>,[baud]
-
- As a kernel loadable module:
- Use the command: modprobe kgdboc kgdboc=<tty-device>,[baud]
- Here are two examples of how you might format the kgdboc
- string. The first is for an x86 target using the first serial port.
- The second example is for the ARM Versatile AB using the second
- serial port.
-
- kgdboc=ttyS0,115200
- kgdboc=ttyAMA1,115200
-
-
-
-
-
-
- Configure kgdboc at runtime with sysfs
- At run time you can enable or disable kgdboc by echoing a
- parameter into sysfs. Here are two examples:
-
- Enable kgdboc on ttyS0
- echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc
- Disable kgdboc
- echo "" > /sys/module/kgdboc/parameters/kgdboc
-
- NOTE: You do not need to specify the baud if you are
- configuring the console on tty which is already configured or
- open.
-
-
- More examples
- You can configure kgdboc to use the keyboard, and/or a serial device
- depending on if you are using kdb and/or kgdb, in one of the
- following scenarios.
-
- kdb and kgdb over only a serial port
- kgdboc=<serial_device>[,baud]
- Example: kgdboc=ttyS0,115200
-
- kdb and kgdb with keyboard and a serial port
- kgdboc=kbd,<serial_device>[,baud]
- Example: kgdboc=kbd,ttyS0,115200
-
- kdb with a keyboard
- kgdboc=kbd
-
- kdb with kernel mode setting
- kgdboc=kms,kbd
-
- kdb with kernel mode setting and kgdb over a serial port
- kgdboc=kms,kbd,ttyS0,115200
-
-
-
- NOTE: Kgdboc does not support interrupting the target via the
- gdb remote protocol. You must manually send a sysrq-g unless you
- have a proxy that splits console output to a terminal program.
- A console proxy has a separate TCP port for the debugger and a separate
- TCP port for the "human" console. The proxy can take care of sending
- the sysrq-g for you.
-
- When using kgdboc with no debugger proxy, you can end up
- connecting the debugger at one of two entry points. If an
- exception occurs after you have loaded kgdboc, a message should
- print on the console stating it is waiting for the debugger. In
- this case you disconnect your terminal program and then connect the
- debugger in its place. If you want to interrupt the target system
- and forcibly enter a debug session you have to issue a Sysrq
- sequence and then type the letter g. Then
- you disconnect the terminal session and connect gdb. Your options
- if you don't like this are to hack gdb to send the sysrq-g for you
- as well as on the initial connect, or to use a debugger proxy that
- allows an unmodified gdb to do the debugging.
-
-
-
-
-
- Kernel parameter: kgdbwait
-
- The Kernel command line option kgdbwait makes
- kgdb wait for a debugger connection during booting of a kernel. You
- can only use this option if you compiled a kgdb I/O driver into the
- kernel and you specified the I/O driver configuration as a kernel
- command line option. The kgdbwait parameter should always follow the
- configuration parameter for the kgdb I/O driver in the kernel
- command line else the I/O driver will not be configured prior to
- asking the kernel to use it to wait.
-
-
- The kernel will stop and wait as early as the I/O driver and
- architecture allows when you use this option. If you build the
- kgdb I/O driver as a loadable kernel module kgdbwait will not do
- anything.
-
-
-
- Kernel parameter: kgdbcon
- The kgdbcon feature allows you to see printk() messages
- inside gdb while gdb is connected to the kernel. Kdb does not make
- use of the kgdbcon feature.
-
- Kgdb supports using the gdb serial protocol to send console
- messages to the debugger when the debugger is connected and running.
- There are two ways to activate this feature.
-
- Activate with the kernel command line option:
- kgdbcon
-
- Use sysfs before configuring an I/O driver
-
- echo 1 > /sys/module/kgdb/parameters/kgdb_use_con
-
-
- NOTE: If you do this after you configure the kgdb I/O driver, the
- setting will not take effect until the next point the I/O is
- reconfigured.
-
-
-
-
- IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an
- active system console. An example of incorrect usage is console=ttyS0,115200 kgdboc=ttyS0 kgdbcon
-
- It is possible to use this option with kgdboc on a tty that is not a system console.
-
-
-
- Run time parameter: kgdbreboot
- The kgdbreboot feature allows you to change how the debugger
- deals with the reboot notification. You have 3 choices for the
- behavior. The default behavior is always set to 0.
-
- echo -1 > /sys/module/debug_core/parameters/kgdbreboot
- Ignore the reboot notification entirely.
-
- echo 0 > /sys/module/debug_core/parameters/kgdbreboot
- Send the detach message to any attached debugger client.
-
- echo 1 > /sys/module/debug_core/parameters/kgdbreboot
- Enter the debugger on reboot notify.
-
-
-
-
-
- Using kdb
-
-
-
- Quick start for kdb on a serial port
- This is a quick example of how to use kdb.
-
- Configure kgdboc at boot using kernel parameters:
-
- console=ttyS0,115200 kgdboc=ttyS0,115200
-
- OR
- Configure kgdboc after the kernel has booted; assuming you are using a serial port console:
-
- echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc
-
-
-
- Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.
-
- When logged in as root or with a super user session you can run:
- echo g > /proc/sysrq-trigger
- Example using minicom 2.2
- Press: Control-a
- Press: f
- Press: g
-
- When you have telneted to a terminal server that supports sending a remote break
- Press: Control-]
- Type in:send break
- Press: Enter
- Press: g
-
-
-
- From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.
- Some useful commands in kdb include:
-
- lsmod -- Shows where kernel modules are loaded
- ps -- Displays only the active processes
- ps A -- Shows all the processes
- summary -- Shows kernel version info and memory usage
- bt -- Get a backtrace of the current process using dump_stack()
- dmesg -- View the kernel syslog buffer
- go -- Continue the system
-
-
-
-
- When you are done using kdb you need to consider rebooting the
- system or using the "go" command to resume normal kernel
- execution. If you have paused the kernel for a lengthy period of
- time, applications that rely on timely networking or anything to do
- with real wall clock time could be adversely affected, so you
- should take this into consideration when using the kernel
- debugger.
-
-
-
-
- Quick start for kdb using a keyboard connected console
- This is a quick example of how to use kdb with a keyboard.
-
- Configure kgdboc at boot using kernel parameters:
-
- kgdboc=kbd
-
- OR
- Configure kgdboc after the kernel has booted:
-
- echo kbd > /sys/module/kgdboc/parameters/kgdboc
-
-
-
- Enter the kernel debugger manually or by waiting for an oops or fault. There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.
-
- When logged in as root or with a super user session you can run:
- echo g > /proc/sysrq-trigger
- Example using a laptop keyboard
- Press and hold down: Alt
- Press and hold down: Fn
- Press and release the key with the label: SysRq
- Release: Fn
- Press and release: g
- Release: Alt
-
- Example using a PS/2 101-key keyboard
- Press and hold down: Alt
- Press and release the key with the label: SysRq
- Press and release: g
- Release: Alt
-
-
-
-
- Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.
-
-
-
-
-
- Using kgdb / gdb
- In order to use kgdb you must activate it by passing
- configuration information to one of the kgdb I/O drivers. If you
- do not pass any configuration information kgdb will not do anything
- at all. Kgdb will only actively hook up to the kernel trap hooks
- if a kgdb I/O driver is loaded and configured. If you unconfigure
- a kgdb I/O driver, kgdb will unregister all the kernel hook points.
-
- All kgdb I/O drivers can be reconfigured at run time, if
- CONFIG_SYSFS and CONFIG_MODULES
- are enabled, by echo'ing a new config string to
- /sys/module/<driver>/parameters/<option>.
- The driver can be unconfigured by passing an empty string. You cannot
- change the configuration while the debugger is attached. Make sure
- to detach the debugger with the detach command
- prior to trying to unconfigure a kgdb I/O driver.
-
-
- Connecting with gdb to a serial port
-
- Configure kgdboc
- Configure kgdboc at boot using kernel parameters:
-
- kgdboc=ttyS0,115200
-
- OR
- Configure kgdboc after the kernel has booted:
-
- echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc
-
-
-
- Stop kernel execution (break into the debugger)
- In order to connect to gdb via kgdboc, the kernel must
- first be stopped. There are several ways to stop the kernel which
- include using kgdbwait as a boot argument, via a sysrq-g, or running
- the kernel until it takes an exception where it waits for the
- debugger to attach.
-
- When logged in as root or with a super user session you can run:
- echo g > /proc/sysrq-trigger
- Example using minicom 2.2
- Press: Control-a
- Press: f
- Press: g
-
- When you have telneted to a terminal server that supports sending a remote break
- Press: Control-]
- Type in:send break
- Press: Enter
- Press: g
-
-
-
-
-
- Connect from gdb
-
- Example (using a directly connected port):
-
-
- % gdb ./vmlinux
- (gdb) set remotebaud 115200
- (gdb) target remote /dev/ttyS0
-
-
- Example (kgdb to a terminal server on TCP port 2012):
-
-
- % gdb ./vmlinux
- (gdb) target remote 192.168.2.2:2012
-
-
- Once connected, you can debug a kernel the way you would debug an
- application program.
-
-
- If you are having problems connecting or something is going
- seriously wrong while debugging, it will most often be the case
- that you want to enable gdb to be verbose about its target
- communications. You do this prior to issuing the target
- remote command by typing in: set debug remote 1
-
-
-
- Remember if you continue in gdb, and need to "break in" again,
- you need to issue another sysrq-g. It is easy to create a simple
- entry point by putting a breakpoint at sys_sync
- and then you can run "sync" from a shell or script to break into the
- debugger.
-
-
-
- kgdb and kdb interoperability
- It is possible to transition between kdb and kgdb dynamically.
- The debug core will remember which you used the last time and
- automatically start in the same mode.
-
- Switching between kdb and kgdb
-
- Switching from kgdb to kdb
-
- There are two ways to switch from kgdb to kdb: you can use gdb to
- issue a maintenance packet, or you can blindly type the command $3#33.
- Whenever the kernel debugger stops in kgdb mode it will print the
- message KGDB or $3#33 for KDB. It is important
- to note that you have to type the sequence correctly in one pass.
- You cannot type a backspace or delete because kgdb will interpret
- that as part of the debug stream.
-
- Change from kgdb to kdb by blindly typing:
- $3#33
- Change from kgdb to kdb with gdb
- maintenance packet 3
- NOTE: Now you must kill gdb. Typically you press control-z and
- issue the command: kill -9 %
-
-
-
-
- Change from kdb to kgdb
- There are two ways you can change from kdb to kgdb. You can
- manually enter kgdb mode by issuing the kgdb command from the kdb
- shell prompt, or you can connect gdb while the kdb shell prompt is
- active. The kdb shell looks for the typical first commands that gdb
- would issue with the gdb remote protocol and if it sees one of those
- commands it automatically changes into kgdb mode.
-
- From kdb issue the command:
- kgdb
- Now disconnect your terminal program and connect gdb in its place
- At the kdb prompt, disconnect the terminal program and connect gdb in its place.
-
-
-
-
- Running kdb commands from gdb
- It is possible to run a limited set of kdb commands from gdb,
- using the gdb monitor command. You don't want to execute any of the
- run control or breakpoint operations, because it can disrupt the
- state of the kernel debugger. You should be using gdb for
- breakpoints and run control operations if you have gdb connected.
- The more useful commands to run are things like lsmod, dmesg, ps or
- possibly some of the memory information commands. To see all the kdb
- commands you can run monitor help.
- Example:
-
-(gdb) monitor ps
-1 idle process (state I) and
-27 sleeping system daemon (state M) processes suppressed,
-use 'ps A' to see all.
-Task Addr Pid Parent [*] cpu State Thread Command
-
-0xc78291d0 1 0 0 0 S 0xc7829404 init
-0xc7954150 942 1 0 0 S 0xc7954384 dropbear
-0xc78789c0 944 1 0 0 S 0xc7878bf4 sh
-(gdb)
-
-
-
-
-
- kgdb Test Suite
-
- When kgdb is enabled in the kernel config you can also elect to
- enable the config parameter KGDB_TESTS. Turning this on will
- enable a special kgdb I/O module which is designed to test the
- kgdb internal functions.
-
-
- The kgdb tests are mainly intended for developers to test the kgdb
- internals as well as a tool for developing a new kgdb architecture
- specific implementation. These tests are not really for end users
- of the Linux kernel. The primary source of documentation would be
- to look in the drivers/misc/kgdbts.c file.
-
-
- The kgdb test suite can also be configured at compile time to run
- the core set of tests by setting the kernel config parameter
- KGDB_TESTS_ON_BOOT. This particular option is aimed at automated
- regression testing and does not require modifying the kernel boot
- config arguments. If this is turned on, the kgdb test suite can
- be disabled by specifying "kgdbts=" as a kernel boot argument.
-
-
-
- Kernel Debugger Internals
-
- Architecture Specifics
-
- The kernel debugger is organized into a number of components:
-
- The debug core
-
- The debug core is found in kernel/debug/debug_core.c. It contains:
-
- A generic OS exception handler which includes
- sync'ing the processors into a stopped state on a multi-CPU
- system.
- The API to talk to the kgdb I/O drivers
- The API to make calls to the arch-specific kgdb implementation
- The logic to perform safe memory reads and writes to memory while using the debugger
- A full implementation for software breakpoints unless overridden by the arch
- The API to invoke either the kdb or kgdb frontend to the debug core.
- The structures and callback API for atomic kernel mode setting.
- NOTE: kgdboc is where the kms callbacks are invoked.
-
-
-
- kgdb arch-specific implementation
-
- This implementation is generally found in arch/*/kernel/kgdb.c.
- As an example, arch/x86/kernel/kgdb.c contains the specifics to
- implement HW breakpoint as well as the initialization to
- dynamically register and unregister for the trap handlers on
- this architecture. The arch-specific portion implements:
-
- contains an arch-specific trap catcher which
- invokes kgdb_handle_exception() to start kgdb about doing its
- work
- translation to and from gdb specific packet format to pt_regs
- Registration and unregistration of architecture specific trap hooks
- Any special exception handling and cleanup
- NMI exception handling and cleanup
- (optional) HW breakpoints
-
-
-
- gdbstub frontend (aka kgdb)
- The gdbstub is located in kernel/debug/gdbstub.c. It contains:
-
- All the logic to implement the gdb serial protocol
-
-
- kdb frontend
- The kdb debugger shell is broken down into a number of
- components. The kdb core is located in kernel/debug/kdb. There
- are a number of helper functions in some of the other kernel
- components to make it possible for kdb to examine and report
- information about the kernel without taking locks that could
- cause a kernel deadlock. The kdb core implements the following functionality.
-
- A simple shell
- The kdb core command set
- A registration API to register additional kdb shell commands.
-
- A good example of a self-contained kdb module
- is the "ftdump" command for dumping the ftrace buffer. See:
- kernel/trace/trace_kdb.c
- For an example of how to dynamically register
- a new kdb command you can build the kdb_hello.ko kernel module
- from samples/kdb/kdb_hello.c. To build this example you can
- set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel
- config. Later run "modprobe kdb_hello" and the next time you
- enter the kdb shell, you can run the "hello"
- command.
-
- The implementation for kdb_printf() which
- emits messages directly to I/O drivers, bypassing the kernel
- log.
- SW / HW breakpoint management for the kdb shell
-
-
- kgdb I/O driver
-
- Each kgdb I/O driver has to provide an implementation for the following:
-
- configuration via built-in or module
- dynamic configuration and kgdb hook registration calls
- read and write character interface
- A cleanup handler for unconfiguring from the kgdb core
- (optional) Early debug methodology
-
- Any given kgdb I/O driver has to operate very closely with the
- hardware and must do it in such a way that does not enable
- interrupts or change other parts of the system context without
- completely restoring them. The kgdb core will repeatedly "poll"
- a kgdb I/O driver for characters when it needs input. The I/O
- driver is expected to return immediately if there is no data
- available. Doing so allows for the future possibility to touch
- watchdog hardware in such a way as to have a target system not
- reset when these are enabled.
-
-
-
-
-
- If you are intent on adding kgdb architecture specific support
- for a new architecture, the architecture should define
- HAVE_ARCH_KGDB in the architecture specific
- Kconfig file. This will enable kgdb for the architecture, and
- at that point you must create an architecture specific kgdb
- implementation.
-
-
- There are a few flags which must be set on every architecture in
- their <asm/kgdb.h> file. These are:
-
-
-
- NUMREGBYTES: The size in bytes of all of the registers, so
- that we can ensure they will all fit into a packet.
-
-
-
-
- BUFMAX: The size in bytes of the buffer GDB will read into.
- This must be larger than NUMREGBYTES.
-
-
-
-
- CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call
- flush_cache_range or flush_icache_range. On some architectures,
- these functions may not be safe to call on SMP since we keep other
- CPUs in a holding pattern.
-
-
-
-
-
- There are also the following functions for the common backend,
- found in kernel/kgdb.c, that must be supplied by the
- architecture-specific backend unless marked as (optional), in
- which case a default function may be used if the architecture
- does not need to provide a specific implementation.
-
-!Iinclude/linux/kgdb.h
-
-
- kgdboc internals
-
- kgdboc and uarts
-
- The kgdboc driver is actually a very thin driver that relies on the
- underlying low level to the hardware driver having "polling hooks"
- to which the tty driver is attached. In the initial
- implementation of kgdboc the serial_core was changed to expose a
- low level UART hook for doing polled mode reading and writing of a
- single character while in an atomic context. When kgdb makes an I/O
- request to the debugger, kgdboc invokes a callback in the serial
- core which in turn uses the callback in the UART driver.
-
- When using kgdboc with a UART, the UART driver must implement two callbacks in the struct uart_ops. Example from drivers/8250.c:
-#ifdef CONFIG_CONSOLE_POLL
- .poll_get_char = serial8250_get_poll_char,
- .poll_put_char = serial8250_put_poll_char,
-#endif
-
- Any implementation specifics around creating a polling driver use the
- #ifdef CONFIG_CONSOLE_POLL, as shown above.
- Keep in mind that polling hooks have to be implemented in such a way
- that they can be called from an atomic context and have to restore
- the state of the UART chip on return such that the system can return
- to normal when the debugger detaches. You need to be very careful
- with any kind of lock you consider, because failing here is most likely
- going to mean pressing the reset button.
-
-
-
- kgdboc and keyboards
- The kgdboc driver contains logic to configure communications
- with an attached keyboard. The keyboard infrastructure is only
- compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the
- kernel configuration.
- The core polled keyboard driver for PS/2 type keyboards
- is in drivers/char/kdb_keyboard.c. This driver is hooked into the
- debug core when kgdboc populates the callback in the array
- called kdb_poll_funcs[]. The
- kdb_get_kbd_char() is the top-level function which polls hardware
- for single character input.
-
-
-
- kgdboc and kms
- The kgdboc driver contains logic to request the graphics
- display to switch to a text context when you are using
- "kgdboc=kms,kbd", provided that you have a video driver which has a
- frame buffer console and atomic kernel mode setting support.
-
- Every time the kernel
- debugger is entered it calls kgdboc_pre_exp_handler() which in turn
- calls con_debug_enter() in the virtual console layer. On resuming kernel
- execution, the kernel debugger calls kgdboc_post_exp_handler() which
- in turn calls con_debug_leave().
- Any video driver that wants to be compatible with the kernel
- debugger and the atomic kms callbacks must implement the
- mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations.
- For the fb_debug_enter and fb_debug_leave the option exists to use
- the generic drm fb helper functions or implement something custom for
- the hardware. The following example shows the initialization of the
- .mode_set_base_atomic operation in
- drivers/gpu/drm/i915/intel_display.c:
-
-
-static const struct drm_crtc_helper_funcs intel_helper_funcs = {
-[...]
- .mode_set_base_atomic = intel_pipe_set_base_atomic,
-[...]
-};
-
-
-
- Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in
- drivers/gpu/drm/i915/intel_fb.c:
-
-
-static struct fb_ops intelfb_ops = {
-[...]
- .fb_debug_enter = drm_fb_helper_debug_enter,
- .fb_debug_leave = drm_fb_helper_debug_leave,
-[...]
-};
-
-
-
-
-
-
-
- Credits
-
- The following people have contributed to this document:
-
- Amit Kaleamitkale@linsyssoft.com
- Tom Rinitrini@kernel.crashing.org
-
- In March 2008 this document was completely rewritten by:
-
- Jason Wesseljason.wessel@windriver.com
-
- In Jan 2010 this document was updated to include kdb.
-
- Jason Wesseljason.wessel@windriver.com
-
-
-
-
-
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
deleted file mode 100644
index 0320910b866db2132a733b7b70d22a5d54c34107..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/libata.tmpl
+++ /dev/null
@@ -1,1625 +0,0 @@
-
-
-
-
-
- libATA Developer's Guide
-
-
-
- Jeff
- Garzik
-
-
-
-
- 2003-2006
- Jeff Garzik
-
-
-
-
- The contents of this file are subject to the Open
- Software License version 1.1 that can be found at
- http://fedoraproject.org/wiki/Licensing:OSL1.1
- and is included herein by reference.
-
-
-
- Alternatively, the contents of this file may be used under the terms
- of the GNU General Public License version 2 (the "GPL") as distributed
- in the kernel source COPYING file, in which case the provisions of
- the GPL are applicable instead of the above. If you wish to allow
- the use of your version of this file only under the terms of the
- GPL and not to allow others to use your version of this file under
- the OSL, indicate your decision by deleting the provisions above and
- replace them with the notice and other provisions required by the GPL.
- If you do not delete the provisions above, a recipient may use your
- version of this file under either the OSL or the GPL.
-
-
-
-
-
-
-
-
- Introduction
-
- libATA is a library used inside the Linux kernel to support ATA host
- controllers and devices. libATA provides an ATA driver API, class
- transports for ATA and ATAPI devices, and SCSI<->ATA translation
- for ATA devices according to the T10 SAT specification.
-
-
- This Guide documents the libATA driver API, library functions, library
- internals, and a couple sample ATA low-level drivers.
-
-
-
-
- libata Driver API
-
- struct ata_port_operations is defined for every low-level libata
- hardware driver, and it controls how the low-level driver
- interfaces with the ATA and SCSI layers.
-
-
- FIS-based drivers will hook into the system with ->qc_prep() and
- ->qc_issue() high-level hooks. Hardware which behaves in a manner
- similar to PCI IDE hardware may utilize several generic helpers,
- defining at a bare minimum the bus I/O addresses of the ATA shadow
- register blocks.
-
-
- struct ata_port_operations
-
- Disable ATA port
-
-void (*port_disable) (struct ata_port *);
-
-
-
- Called from ata_bus_probe() error path, as well as when
- unregistering from the SCSI module (rmmod, hot unplug).
- This function should do whatever needs to be done to take the
- port out of use. In most cases, ata_port_disable() can be used
- as this hook.
-
-
- Called from ata_bus_probe() on a failed probe.
- Called from ata_scsi_release().
-
-
-
-
- Post-IDENTIFY device configuration
-
-void (*dev_config) (struct ata_port *, struct ata_device *);
-
-
-
- Called after IDENTIFY [PACKET] DEVICE is issued to each device
- found. Typically used to apply device-specific fixups prior to
- issue of SET FEATURES - XFER MODE, and prior to operation.
-
-
- This entry may be specified as NULL in ata_port_operations.
-
-
-
-
- Set PIO/DMA mode
-
-void (*set_piomode) (struct ata_port *, struct ata_device *);
-void (*set_dmamode) (struct ata_port *, struct ata_device *);
-void (*post_set_mode) (struct ata_port *);
-unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int);
-
-
-
- Hooks called prior to the issue of SET FEATURES - XFER MODE
- command. The optional ->mode_filter() hook is called when libata
- has built a mask of the possible modes. This is passed to the
- ->mode_filter() function which should return a mask of valid modes
- after filtering those unsuitable due to hardware limits. It is not
- valid to use this interface to add modes.
-
-
- dev->pio_mode and dev->dma_mode are guaranteed to be valid when
- ->set_piomode() and when ->set_dmamode() is called. The timings for
- any other drive sharing the cable will also be valid at this point.
- That is the library records the decisions for the modes of each
- drive on a channel before it attempts to set any of them.
-
-
- ->post_set_mode() is
- called unconditionally, after the SET FEATURES - XFER MODE
- command completes successfully.
-
-
-
- ->set_piomode() is always called (if present), but
- ->set_dmamode() is only called if DMA is possible.
-
-
-
-
- Taskfile read/write
-
-void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
-void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
-
-
-
- ->tf_load() is called to load the given taskfile into hardware
- registers / DMA buffers. ->tf_read() is called to read the
- hardware registers / DMA buffers, to obtain the current set of
- taskfile register values.
- Most drivers for taskfile-based hardware (PIO or MMIO) use
- ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
-
-
-
-
- PIO data read/write
-
-void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
-
-
-
-All bmdma-style drivers must implement this hook. This is the low-level
-operation that actually copies the data bytes during a PIO data
-transfer.
-Typically the driver will choose one of ata_sff_data_xfer_noirq(),
-ata_sff_data_xfer(), or ata_sff_data_xfer32().
-
-
-
-
- ATA command execute
-
-void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
-
-
-
- causes an ATA command, previously loaded with
- ->tf_load(), to be initiated in hardware.
- Most drivers for taskfile-based hardware use ata_sff_exec_command()
- for this hook.
-
-
-
-
- Per-cmd ATAPI DMA capabilities filter
-
-int (*check_atapi_dma) (struct ata_queued_cmd *qc);
-
-
-
-Allow low-level driver to filter ATA PACKET commands, returning a status
-indicating whether or not it is OK to use DMA for the supplied PACKET
-command.
-
-
- This hook may be specified as NULL, in which case libata will
- assume that atapi dma can be supported.
-
-
-
-
- Read specific ATA shadow registers
-
-u8 (*sff_check_status)(struct ata_port *ap);
-u8 (*sff_check_altstatus)(struct ata_port *ap);
-
-
-
- Reads the Status/AltStatus ATA shadow register from
- hardware. On some hardware, reading the Status register has
- the side effect of clearing the interrupt condition.
- Most drivers for taskfile-based hardware use
- ata_sff_check_status() for this hook.
-
-
-
-
- Write specific ATA shadow register
-
-void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
-
-
-
- Write the device control ATA shadow register to the hardware.
- Most drivers don't need to define this.
-
-
-
-
- Select ATA device on bus
-
-void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
-
-
-
- Issues the low-level hardware command(s) that causes one of N
- hardware devices to be considered 'selected' (active and
- available for use) on the ATA bus. This generally has no
- meaning on FIS-based devices.
-
-
- Most drivers for taskfile-based hardware use
- ata_sff_dev_select() for this hook.
-
-
-
-
- Private tuning method
-
-void (*set_mode) (struct ata_port *ap);
-
-
-
- By default libata performs drive and controller tuning in
- accordance with the ATA timing rules and also applies blacklists
- and cable limits. Some controllers need special handling and have
- custom tuning rules, typically raid controllers that use ATA
- commands but do not actually do drive timing.
-
-
-
-
- This hook should not be used to replace the standard controller
- tuning logic when a controller has quirks. Replacing the default
- tuning logic in that case would bypass handling for drive and
- bridge quirks that may be important to data reliability. If a
- controller needs to filter the mode selection it should use the
- mode_filter hook instead.
-
-
-
-
-
- Control PCI IDE BMDMA engine
-
-void (*bmdma_setup) (struct ata_queued_cmd *qc);
-void (*bmdma_start) (struct ata_queued_cmd *qc);
-void (*bmdma_stop) (struct ata_port *ap);
-u8 (*bmdma_status) (struct ata_port *ap);
-
-
-
-When setting up an IDE BMDMA transaction, these hooks arm
-(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop)
-the hardware's DMA engine. ->bmdma_status is used to read the standard
-PCI IDE DMA Status register.
-
-
-
-These hooks are typically either no-ops, or simply not implemented, in
-FIS-based drivers.
-
-
-Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup()
-hook. ata_bmdma_setup() will write the pointer to the PRD table to
-the IDE PRD Table Address register, enable DMA in the DMA Command
-register, and call exec_command() to begin the transfer.
-
-
-Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start()
-hook. ata_bmdma_start() will write the ATA_DMA_START flag to the DMA
-Command register.
-
-
-Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop()
-hook. ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA
-command register.
-
-
-Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook.
-
-
-
-
- High-level taskfile hooks
-
-void (*qc_prep) (struct ata_queued_cmd *qc);
-int (*qc_issue) (struct ata_queued_cmd *qc);
-
-
-
- Higher-level hooks, these two hooks can potentially supersede
- several of the above taskfile/DMA engine hooks. ->qc_prep is
- called after the buffers have been DMA-mapped, and is typically
- used to populate the hardware's DMA scatter-gather table.
- Most drivers use the standard ata_qc_prep() helper function, but
- more advanced drivers roll their own.
-
-
- ->qc_issue is used to make a command active, once the hardware
- and S/G tables have been prepared. IDE BMDMA drivers use the
- helper function ata_qc_issue_prot() for taskfile protocol-based
- dispatch. More advanced drivers implement their own ->qc_issue.
-
-
- ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and
- ->bmdma_start() as necessary to initiate a transfer.
-
-
-
-
- Exception and probe handling (EH)
-
-void (*eng_timeout) (struct ata_port *ap);
-void (*phy_reset) (struct ata_port *ap);
-
-
-
-Deprecated. Use ->error_handler() instead.
-
-
-
-void (*freeze) (struct ata_port *ap);
-void (*thaw) (struct ata_port *ap);
-
-
-
-ata_port_freeze() is called when HSM violations or some other
-condition disrupts normal operation of the port. A frozen port
-is not allowed to perform any operation until the port is
-thawed, which usually follows a successful reset.
-
-
-
-The optional ->freeze() callback can be used for freezing the port
-hardware-wise (e.g. mask interrupt and stop DMA engine). If a
-port cannot be frozen hardware-wise, the interrupt handler
-must ack and clear interrupts unconditionally while the port
-is frozen.
-
-
-The optional ->thaw() callback is called to perform the opposite of ->freeze():
-prepare the port for normal operation once again. Unmask interrupts,
-start DMA engine, etc.
-
-
-
-void (*error_handler) (struct ata_port *ap);
-
-
-
-->error_handler() is a driver's hook into probe, hotplug, and recovery
-and other exceptional conditions. The primary responsibility of an
-implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set
-of EH hooks as arguments:
-
-
-
-'prereset' hook (may be NULL) is called during an EH reset, before any other actions
-are taken.
-
-
-
-'postreset' hook (may be NULL) is called after the EH reset is performed. Based on
-existing conditions, severity of the problem, and hardware capabilities,
-
-
-
-Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be
-called to perform the low-level EH reset.
-
-
-
-void (*post_internal_cmd) (struct ata_queued_cmd *qc);
-
-
-
-Perform any hardware-specific actions necessary to finish processing
-after executing a probe-time or EH-time command via ata_exec_internal().
-
-
-
-
- Hardware interrupt handling
-
-irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
-void (*irq_clear) (struct ata_port *);
-
-
-
- ->irq_handler is the interrupt handling routine registered with
- the system, by libata. ->irq_clear is called during probe just
- before the interrupt handler is registered, to be sure hardware
- is quiet.
-
-
- The second argument, dev_instance, should be cast to a pointer
- to struct ata_host_set.
-
-
- Most legacy IDE drivers use ata_sff_interrupt() for the
- irq_handler hook, which scans all ports in the host_set,
- determines which queued command was active (if any), and calls
- ata_sff_host_intr(ap,qc).
-
-
- Most legacy IDE drivers use ata_sff_irq_clear() for the
- irq_clear() hook, which simply clears the interrupt and error
- flags in the DMA status register.
-
-
-
-
- SATA phy read/write
-
-int (*scr_read) (struct ata_port *ap, unsigned int sc_reg,
- u32 *val);
-int (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
- u32 val);
-
-
-
- Read and write standard SATA phy registers. Currently only used
- if ->phy_reset hook called the sata_phy_reset() helper function.
- sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE.
-
-
-
-
- Init and shutdown
-
-int (*port_start) (struct ata_port *ap);
-void (*port_stop) (struct ata_port *ap);
-void (*host_stop) (struct ata_host_set *host_set);
-
-
-
- ->port_start() is called just after the data structures for each
- port are initialized. Typically this is used to alloc per-port
- DMA buffers / tables / rings, enable DMA engines, and similar
- tasks. Some drivers also use this entry point as a chance to
- allocate driver-private memory for ap->private_data.
-
-
- Many drivers use ata_port_start() as this hook or call
- it from their own port_start() hooks. ata_port_start()
- allocates space for a legacy IDE PRD table and returns.
-
-
- ->port_stop() is called after ->host_stop(). Its sole function
- is to release DMA/memory resources, now that they are no longer
- actively being used. Many drivers also free driver-private
- data from port at this time.
-
-
- ->host_stop() is called after all ->port_stop() calls
-have completed. The hook must finalize hardware shutdown, release DMA
-and other resources, etc.
- This hook may be specified as NULL, in which case it is not called.
-
-
-
-
-
-
-
-
- Error handling
-
-
- This chapter describes how errors are handled under libata.
- Readers are advised to read SCSI EH
- (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first.
-
-
- Origins of commands
-
- In libata, a command is represented with struct ata_queued_cmd
- or qc. qc's are preallocated during port initialization and
- repetitively used for command executions. Currently only one
- qc is allocated per port but yet-to-be-merged NCQ branch
- allocates one for each tag and maps each qc to NCQ tag 1-to-1.
-
-
- libata commands can originate from two sources - libata itself
- and SCSI midlayer. libata internal commands are used for
- initialization and error handling. All normal blk requests
- and commands for SCSI emulation are passed as SCSI commands
- through queuecommand callback of SCSI host template.
-
-
-
- How commands are issued
-
-
-
- Internal commands
-
-
- First, qc is allocated and initialized using
- ata_qc_new_init(). Although ata_qc_new_init() doesn't
- implement any wait or retry mechanism when qc is not
- available, internal commands are currently issued only during
- initialization and error recovery, so no other command is
- active and allocation is guaranteed to succeed.
-
-
- Once allocated qc's taskfile is initialized for the command to
- be executed. qc currently has two mechanisms to notify
- completion. One is via qc->complete_fn() callback and the
- other is completion qc->waiting. qc->complete_fn() callback
- is the asynchronous path used by normal SCSI translated
- commands and qc->waiting is the synchronous (issuer sleeps in
- process context) path used by internal commands.
-
-
- Once initialization is complete, host_set lock is acquired
- and the qc is issued.
-
-
-
-
- SCSI commands
-
-
- All libata drivers use ata_scsi_queuecmd() as
- hostt->queuecommand callback. scmds can either be simulated
- or translated. No qc is involved in processing a simulated
- scmd. The result is computed right away and the scmd is
- completed.
-
-
- For a translated scmd, ata_qc_new_init() is invoked to
- allocate a qc and the scmd is translated into the qc. SCSI
- midlayer's completion notification function pointer is stored
- into qc->scsidone.
-
-
- qc->complete_fn() callback is used for completion
- notification. ATA commands use ata_scsi_qc_complete() while
- ATAPI commands use atapi_qc_complete(). Both functions end up
- calling qc->scsidone to notify upper layer when the qc is
- finished. After translation is completed, the qc is issued
- with ata_qc_issue().
-
-
- Note that SCSI midlayer invokes hostt->queuecommand while
- holding host_set lock, so all above occur while holding
- host_set lock.
-
-
-
-
-
-
-
- How commands are processed
-
- Depending on which protocol and which controller are used,
- commands are processed differently. For the purpose of
- discussion, a controller which uses taskfile interface and all
- standard callbacks is assumed.
-
-
- Currently 6 ATA command protocols are used. They can be
- sorted into the following four categories according to how
- they are processed.
-
-
-
- ATA NO DATA or DMA
-
-
- ATA_PROT_NODATA and ATA_PROT_DMA fall into this category.
- These types of commands don't require any software
- intervention once issued. Device will raise interrupt on
- completion.
-
-
-
-
- ATA PIO
-
-
- ATA_PROT_PIO is in this category. libata currently
- implements PIO with polling. ATA_NIEN bit is set to turn
- off interrupt and pio_task on ata_wq performs polling and
- IO.
-
-
-
-
- ATAPI NODATA or DMA
-
-
- ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this
- category. packet_task is used to poll BSY bit after
- issuing PACKET command. Once BSY is turned off by the
- device, packet_task transfers CDB and hands off processing
- to interrupt handler.
-
-
-
-
- ATAPI PIO
-
-
- ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set
- and, as in ATAPI NODATA or DMA, packet_task submits cdb.
- However, after submitting cdb, further processing (data
- transfer) is handed off to pio_task.
-
-
-
-
-
-
- How commands are completed
-
- Once issued, all qc's are either completed with
- ata_qc_complete() or time out. For commands which are handled
- by interrupts, ata_host_intr() invokes ata_qc_complete(), and,
- for PIO tasks, pio_task invokes ata_qc_complete(). In error
- cases, packet_task may also complete commands.
-
-
- ata_qc_complete() does the following.
-
-
-
-
-
-
- DMA memory is unmapped.
-
-
-
-
-
- ATA_QCFLAG_ACTIVE is cleared from qc->flags.
-
-
-
-
-
- qc->complete_fn() callback is invoked. If the return value of
- the callback is not zero, completion is short-circuited and
- ata_qc_complete() returns.
-
-
-
-
-
- __ata_qc_complete() is called, which does
-
-
-
-
- qc->flags is cleared to zero.
-
-
-
-
-
- ap->active_tag and qc->tag are poisoned.
-
-
-
-
-
- qc->waiting is cleared & completed (in that order).
-
-
-
-
-
- qc is deallocated by clearing appropriate bit in ap->qactive.
-
-
-
-
-
-
-
-
-
-
- So, it basically notifies upper layer and deallocates qc. One
- exception is short-circuit path in #3 which is used by
- atapi_qc_complete().
-
-
- For all non-ATAPI commands, whether it fails or not, almost
- the same code path is taken and very little error handling
- takes place. A qc is completed with success status if it
- succeeded, with failed status otherwise.
-
-
- However, failed ATAPI commands require more handling as
- REQUEST SENSE is needed to acquire sense data. If an ATAPI
- command fails, ata_qc_complete() is invoked with error status,
- which in turn invokes atapi_qc_complete() via
- qc->complete_fn() callback.
-
-
- This makes atapi_qc_complete() set scmd->result to
- SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As
- the sense data is empty but scmd->result is CHECK CONDITION,
- SCSI midlayer will invoke EH for the scmd, and returning 1
- makes ata_qc_complete() return without deallocating the qc.
- This leads us to ata_scsi_error() with partially completed qc.
-
-
-
-
- ata_scsi_error()
-
- ata_scsi_error() is the current transportt->eh_strategy_handler()
- for libata. As discussed above, this will be entered in two
- cases - timeout and ATAPI error completion. This function
- calls low level libata driver's eng_timeout() callback, the
- standard callback for which is ata_eng_timeout(). It checks
- if a qc is active and calls ata_qc_timeout() on the qc if so.
- Actual error handling occurs in ata_qc_timeout().
-
-
- If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and
- completes the qc. Note that as we're currently in EH, we
- cannot call scsi_done. As described in SCSI EH doc, a
- recovered scmd should be either retried with
- scsi_queue_insert() or finished with scsi_finish_command().
- Here, we override qc->scsidone with scsi_finish_command() and
- call ata_qc_complete().
-
-
- If EH is invoked due to a failed ATAPI qc, the qc here is
- completed but not deallocated. The purpose of this
- half-completion is to use the qc as place holder to make EH
- code reach this place. This is a bit hackish, but it works.
-
-
- Once control reaches here, the qc is deallocated by invoking
- __ata_qc_complete() explicitly. Then, internal qc for REQUEST
- SENSE is issued. Once sense data is acquired, scmd is
- finished by directly invoking scsi_finish_command() on the
- scmd. Note that as we already have completed and deallocated
- the qc which was associated with the scmd, we don't need
- to/cannot call ata_qc_complete() again.
-
-
-
-
- Problems with the current EH
-
-
-
-
-
- Error representation is too crude. Currently any and all
- error conditions are represented with ATA STATUS and ERROR
- registers. Errors which aren't ATA device errors are treated
- as ATA device errors by setting ATA_ERR bit. Better error
- descriptor which can properly represent ATA and other
- errors/exceptions is needed.
-
-
-
-
-
- When handling timeouts, no action is taken to make device
- forget about the timed out command and ready for new commands.
-
-
-
-
-
- EH handling via ata_scsi_error() is not properly protected
- from usual command processing. On EH entrance, the device is
- not in quiescent state. Timed out commands may succeed or
- fail any time. pio_task and atapi_task may still be running.
-
-
-
-
-
- Too weak error recovery. Devices / controllers causing HSM
- mismatch errors and other errors quite often require reset to
- return to known state. Also, advanced error handling is
- necessary to support features like NCQ and hotplug.
-
-
-
-
-
- ATA errors are directly handled in the interrupt handler and
- PIO errors in pio_task. This is problematic for advanced
- error handling for the following reasons.
-
-
- First, advanced error handling often requires context and
- internal qc execution.
-
-
- Second, even a simple failure (say, CRC error) needs
- information gathering and could trigger complex error handling
- (say, resetting & reconfiguring). Having multiple code
- paths to gather information, enter EH and trigger actions
- makes life painful.
-
-
- Third, scattered EH code makes implementing low level drivers
- difficult. Low level drivers override libata callbacks. If
- EH is scattered over several places, each affected callback
- should perform its part of error handling. This can be error
- prone and painful.
-
-
-
-
-
-
-
-
- libata Library
-!Edrivers/ata/libata-core.c
-
-
-
- libata Core Internals
-!Idrivers/ata/libata-core.c
-
-
-
- libata SCSI translation/emulation
-!Edrivers/ata/libata-scsi.c
-!Idrivers/ata/libata-scsi.c
-
-
-
- ATA errors and exceptions
-
-
- This chapter tries to identify what error/exception conditions exist
- for ATA/ATAPI devices and describe how they should be handled in
- implementation-neutral way.
-
-
-
- The term 'error' is used to describe conditions where either an
- explicit error condition is reported from device or a command has
- timed out.
-
-
-
- The term 'exception' is either used to describe exceptional
- conditions which are not errors (say, power or hotplug events), or
- to describe both errors and non-error exceptional conditions. Where
- explicit distinction between error and exception is necessary, the
- term 'non-error exception' is used.
-
-
-
- Exception categories
-
- Exceptions are described primarily with respect to legacy
- taskfile + bus master IDE interface. If a controller provides
- other better mechanism for error reporting, mapping those into
- categories described below shouldn't be difficult.
-
-
-
- In the following sections, two recovery actions - reset and
- reconfiguring transport - are mentioned. These are described
- further in "EH recovery actions".
-
-
-
- HSM violation
-
- This error is indicated when STATUS value doesn't match HSM
- requirement during issuing or execution of any ATA/ATAPI command.
-
-
-
- Examples
-
-
-
- ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying
- to issue a command.
-
-
-
-
-
- !BSY && !DRQ during PIO data transfer.
-
-
-
-
-
- DRQ on command completion.
-
-
-
-
-
- !BSY && ERR after CDB transfer starts but before the
- last byte of CDB is transferred. ATA/ATAPI standard states
- that "The device shall not terminate the PACKET command
- with an error before the last byte of the command packet has
- been written" in the error outputs description of PACKET
- command and the state diagram doesn't include such
- transitions.
-
-
-
-
-
-
- In these cases, HSM is violated and not much information
- regarding the error can be acquired from STATUS or ERROR
- register. IOW, this error can be anything - driver bug,
- faulty device, controller and/or cable.
-
-
-
- As HSM is violated, reset is necessary to restore known state.
- Reconfiguring transport for lower speed might be helpful too
- as transmission errors sometimes cause this kind of errors.
-
-
-
-
- ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)
-
-
- These are errors detected and reported by ATA/ATAPI devices
- indicating device problems. For this type of errors, STATUS
- and ERROR register values are valid and describe error
- condition. Note that some of ATA bus errors are detected by
- ATA/ATAPI devices and reported using the same mechanism as
- device errors. Those cases are described later in this
- section.
-
-
-
- For ATA commands, this type of errors are indicated by !BSY
- && ERR during command execution and on completion.
-
-
- For ATAPI commands,
-
-
-
-
-
- !BSY && ERR && ABRT right after issuing PACKET
- indicates that PACKET command is not supported and falls in
- this category.
-
-
-
-
-
- !BSY && ERR(==CHK) && !ABRT after the last
- byte of CDB is transferred indicates CHECK CONDITION and
- doesn't fall in this category.
-
-
-
-
-
- !BSY && ERR(==CHK) && ABRT after the last byte
- of CDB is transferred *probably* indicates CHECK CONDITION and
- doesn't fall in this category.
-
-
-
-
-
-
- Of errors detected as above, the following are not ATA/ATAPI
- device errors but ATA bus errors and should be handled
- according to "ATA bus error".
-
-
-
-
-
- CRC error during data transfer
-
-
- This is indicated by ICRC bit in the ERROR register and
- means that corruption occurred during data transfer. Up to
- ATA/ATAPI-7, the standard specifies that this bit is only
- applicable to UDMA transfers but ATA/ATAPI-8 draft revision
- 1f says that the bit may be applicable to multiword DMA and
- PIO.
-
-
-
-
-
- ABRT error during data transfer or on completion
-
-
- Up to ATA/ATAPI-7, the standard specifies that ABRT could be
- set on ICRC errors and on cases where a device is not able
- to complete a command. Combined with the fact that MWDMA
- and PIO transfer errors aren't allowed to use ICRC bit up to
- ATA/ATAPI-7, it seems to imply that ABRT bit alone could
- indicate transfer errors.
-
-
- However, ATA/ATAPI-8 draft revision 1f removes the part
- that ICRC errors can turn on ABRT. So, this is kind of
- gray area. Some heuristics are needed here.
-
-
-
-
-
-
-
- ATA/ATAPI device errors can be further categorized as follows.
-
-
-
-
-
- Media errors
-
-
- This is indicated by UNC bit in the ERROR register. ATA
- devices report UNC error only after a certain number of
- retries cannot recover the data, so there's nothing much
- else to do other than notifying upper layer.
-
-
- READ and WRITE commands report CHS or LBA of the first
- failed sector but ATA/ATAPI standard specifies that the
- amount of transferred data on error completion is
- indeterminate, so we cannot assume that sectors preceding
- the failed sector have been transferred and thus cannot
- complete those sectors successfully as SCSI does.
-
-
-
-
-
- Media changed / media change requested error
-
-
- <<TODO: fill here>>
-
-
-
-
- Address error
-
-
- This is indicated by IDNF bit in the ERROR register.
- Report to upper layer.
-
-
-
-
- Other errors
-
-
- This can be invalid command or parameter indicated by ABRT
- ERROR bit or some other error condition. Note that ABRT
- bit can indicate a lot of things including ICRC and Address
- errors. Heuristics needed.
-
-
-
-
-
-
-
- Depending on commands, not all STATUS/ERROR bits are
- applicable. These non-applicable bits are marked with
- "na" in the output descriptions but up to ATA/ATAPI-7
- no definition of "na" can be found. However,
- ATA/ATAPI-8 draft revision 1f describes "N/A" as
- follows.
-
-
-
-
- 3.2.3.3a N/A
-
-
- A keyword the indicates a field has no defined value in
- this standard and should not be checked by the host or
- device. N/A fields should be cleared to zero.
-
-
-
-
-
-
-
- So, it seems reasonable to assume that "na" bits are
- cleared to zero by devices and thus need no explicit masking.
-
-
-
-
-
- ATAPI device CHECK CONDITION
-
-
- ATAPI device CHECK CONDITION error is indicated by set CHK bit
- (ERR bit) in the STATUS register after the last byte of CDB is
- transferred for a PACKET command. For this kind of errors,
- sense data should be acquired to gather information regarding
- the errors. REQUEST SENSE packet command should be used to
- acquire sense data.
-
-
-
- Once sense data is acquired, this type of errors can be
- handled similarly to other SCSI errors. Note that sense data
- may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR
- && ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such
- cases, the error should be considered as an ATA bus error and
- handled according to "ATA bus error".
-
-
-
-
-
- ATA device error (NCQ)
-
-
- NCQ command error is indicated by cleared BSY and set ERR bit
- during NCQ command phase (one or more NCQ commands
- outstanding). Although STATUS and ERROR registers will
- contain valid values describing the error, READ LOG EXT is
- required to clear the error condition, determine which command
- has failed and acquire more information.
-
-
-
- READ LOG EXT Log Page 10h reports which tag has failed and
- taskfile register values describing the error. With this
- information the failed command can be handled as a normal ATA
- command error as in "ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)" and all
- other in-flight commands must be retried. Note that this
- retry should not be counted - it's likely that commands
- retried this way would have completed normally if it were not
- for the failed command.
-
-
-
- Note that ATA bus errors can be reported as ATA device NCQ
- errors. This should be handled as described in "ATA bus error".
-
-
-
- If READ LOG EXT Log Page 10h fails or reports NQ, we're
- thoroughly screwed. This condition should be treated
- according to "HSM violation".
-
-
-
-
-
- ATA bus error
-
-
- ATA bus error means that data corruption occurred during
- transmission over ATA bus (SATA or PATA). This type of errors
- can be indicated by
-
-
-
-
-
-
- ICRC or ABRT error as described in "ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)".
-
-
-
-
-
- Controller-specific error completion with error information
- indicating transmission error.
-
-
-
-
-
- On some controllers, command timeout. In this case, there may
- be a mechanism to determine that the timeout is due to
- transmission error.
-
-
-
-
-
- Unknown/random errors, timeouts and all sorts of weirdities.
-
-
-
-
-
-
- As described above, transmission errors can cause wide variety
- of symptoms ranging from device ICRC error to random device
- lockup, and, for many cases, there is no way to tell if an
- error condition is due to transmission error or not;
- therefore, it's necessary to employ some kind of heuristic
- when dealing with errors and timeouts. For example,
- encountering repetitive ABRT errors for known supported
- command is likely to indicate ATA bus error.
-
-
-
- Once it's determined that ATA bus errors have possibly
- occurred, lowering ATA bus transmission speed is one of the
- actions which may alleviate the problem. See the "Reconfigure transport" section for more information.
-
-
-
-
-
- PCI bus error
-
-
- Data corruption or other failures during transmission over PCI
- (or other system bus). For standard BMDMA, this is indicated
- by Error bit in the BMDMA Status register. This type of
- errors must be logged as it indicates something is very wrong
- with the system. Resetting host controller is recommended.
-
-
-
-
-
- Late completion
-
-
- This occurs when timeout occurs and the timeout handler finds
- out that the timed out command has completed successfully or
- with error. This is usually caused by lost interrupts. This
- type of errors must be logged. Resetting host controller is
- recommended.
-
-
-
-
-
- Unknown error (timeout)
-
-
- This is when timeout occurs and the command is still
- processing or the host and device are in unknown state. When
- this occurs, HSM could be in any valid or invalid state. To
- bring the device to known state and make it forget about the
- timed out command, resetting is necessary. The timed out
- command may be retried.
-
-
-
- Timeouts can also be caused by transmission errors. Refer to
- the "ATA bus error" section for more details.
-
-
-
-
-
- Hotplug and power management exceptions
-
-
- <<TODO: fill here>>
-
-
-
-
-
-
-
- EH recovery actions
-
-
- This section discusses several important recovery actions.
-
-
-
- Clearing error condition
-
-
- Many controllers require their error registers to be cleared by
- error handler. Different controllers may have different
- requirements.
-
-
-
- For SATA, it's strongly recommended to clear at least SError
- register during error handling.
-
-
-
-
- Reset
-
-
- During EH, resetting is necessary in the following cases.
-
-
-
-
-
-
- HSM is in unknown or invalid state
-
-
-
-
-
- HBA is in unknown or invalid state
-
-
-
-
-
- EH needs to make HBA/device forget about in-flight commands
-
-
-
-
-
- HBA/device behaves weirdly
-
-
-
-
-
-
- Resetting during EH might be a good idea regardless of error
- condition to improve EH robustness. Whether to reset both or
- either one of HBA and device depends on situation but the
- following scheme is recommended.
-
-
-
-
-
-
- When it's known that HBA is in ready state but ATA/ATAPI
- device is in unknown state, reset only device.
-
-
-
-
-
- If HBA is in unknown state, reset both HBA and device.
-
-
-
-
-
-
- HBA resetting is implementation specific. For a controller
- complying to taskfile/BMDMA PCI IDE, stopping active DMA
- transaction may be sufficient iff BMDMA state is the only HBA
- context. But even mostly taskfile/BMDMA PCI IDE complying
- controllers may have implementation specific requirements and
- mechanism to reset themselves. This must be addressed by
- specific drivers.
-
-
-
- OTOH, ATA/ATAPI standard describes in detail ways to reset
- ATA/ATAPI devices.
-
-
-
-
- PATA hardware reset
-
-
- This is hardware initiated device reset signalled with
- asserted PATA RESET- signal. There is no standard way to
- initiate hardware reset from software although some
- hardware provides registers that allow driver to directly
- tweak the RESET- signal.
-
-
-
-
- Software reset
-
-
- This is achieved by turning CONTROL SRST bit on for at
- least 5us. Both PATA and SATA support it but, in case of
- SATA, this may require controller-specific support as the
- second Register FIS to clear SRST should be transmitted
- while BSY bit is still set. Note that on PATA, this resets
- both master and slave devices on a channel.
-
-
-
-
- EXECUTE DEVICE DIAGNOSTIC command
-
-
- Although ATA/ATAPI standard doesn't describe exactly, EDD
- implies some level of resetting, possibly similar level
- with software reset. Host-side EDD protocol can be handled
- with normal command processing and most SATA controllers
- should be able to handle EDD's just like other commands.
- As in software reset, EDD affects both devices on a PATA
- bus.
-
-
- Although EDD does reset devices, this doesn't suit error
- handling as EDD cannot be issued while BSY is set and it's
- unclear how it will act when device is in unknown/weird
- state.
-
-
-
-
- ATAPI DEVICE RESET command
-
-
- This is very similar to software reset except that reset
- can be restricted to the selected device without affecting
- the other device sharing the cable.
-
-
-
-
- SATA phy reset
-
-
- This is the preferred way of resetting a SATA device. In
- effect, it's identical to PATA hardware reset. Note that
- this can be done with the standard SCR Control register.
- As such, it's usually easier to implement than software
- reset.
-
-
-
-
-
-
-
- One more thing to consider when resetting devices is that
- resetting clears certain configuration parameters and they
- need to be set to their previous or newly adjusted values
- after reset.
-
-
-
- The affected parameters are:
-
-
-
-
-
-
- CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used)
-
-
-
-
-
- Parameters set with SET FEATURES including transfer mode setting
-
-
-
-
-
- Block count set with SET MULTIPLE MODE
-
-
-
-
-
- Other parameters (SET MAX, MEDIA LOCK...)
-
-
-
-
-
-
- ATA/ATAPI standard specifies that some parameters must be
- maintained across hardware or software reset, but doesn't
- strictly specify all of them. Always reconfiguring needed
- parameters after reset is required for robustness. Note that
- this also applies when resuming from deep sleep (power-off).
-
-
-
- Also, ATA/ATAPI standard requires that IDENTIFY DEVICE /
- IDENTIFY PACKET DEVICE is issued after any configuration
- parameter is updated or a hardware reset and the result used
- for further operation. OS driver is required to implement
- revalidation mechanism to support this.
-
-
-
-
-
- Reconfigure transport
-
-
- For both PATA and SATA, a lot of corners are cut for cheap
- connectors, cables or controllers and it's quite common to see
- high transmission error rate. This can be mitigated by
- lowering transmission speed.
-
-
-
- The following is a possible scheme Jeff Garzik suggested.
-
-
-
-
- If more than $N (3?) transmission errors happen in 15 minutes,
-
-
-
-
- if SATA, decrease SATA PHY speed. if speed cannot be decreased,
-
-
-
-
- decrease UDMA xfer speed. if at UDMA0, switch to PIO4,
-
-
-
-
- decrease PIO xfer speed. if at PIO3, complain, but continue
-
-
-
-
-
-
-
-
-
-
-
-
- ata_piix Internals
-!Idrivers/ata/ata_piix.c
-
-
-
- sata_sil Internals
-!Idrivers/ata/sata_sil.c
-
-
-
- Thanks
-
- The bulk of the ATA knowledge comes thanks to long conversations with
- Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA
- and SCSI specifications.
-
-
- Thanks to Alan Cox for pointing out similarities
- between SATA and SCSI, and in general for motivation to hack on
- libata.
-
-
- libata's device detection
- method, ata_pio_devchk, and in general all the early probing was
- based on extensive study of Hale Landis's probe/reset code in his
- ATADRVR driver (www.ata-atapi.com).
-
-
-
-
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl
deleted file mode 100644
index 94f21361e0edaa2778cfa4e8b1e4b5ab2cc0d1cf..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/librs.tmpl
+++ /dev/null
@@ -1,289 +0,0 @@
-
-
-
-
-
- Reed-Solomon Library Programming Interface
-
-
-
- Thomas
- Gleixner
-
-
- tglx@linutronix.de
-
-
-
-
-
-
- 2004
- Thomas Gleixner
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Introduction
-
- The generic Reed-Solomon Library provides encoding, decoding
- and error correction functions.
-
-
- Reed-Solomon codes are used in communication and storage
- applications to ensure data integrity.
-
-
- This documentation is provided for developers who want to utilize
- the functions provided by the library.
-
-
-
-
- Known Bugs And Assumptions
-
- None.
-
-
-
-
- Usage
-
- This chapter provides examples of how to use the library.
-
-
- Initializing
-
- The init function init_rs returns a pointer to an
- rs decoder structure, which holds the necessary
- information for encoding, decoding and error correction
- with the given polynomial. It either uses an existing
- matching decoder or creates a new one. On creation all
- the lookup tables for fast en/decoding are created.
- The function may take a while, so make sure not to
- call it in critical code paths.
-
-
-/* the Reed Solomon control structure */
-static struct rs_control *rs_decoder;
-
-/* Symbolsize is 10 (bits)
- * Primitive polynomial is x^10+x^3+1
- * first consecutive root is 0
- * primitive element to generate roots = 1
- * generator polynomial degree (number of roots) = 6
- */
-rs_decoder = init_rs (10, 0x409, 0, 1, 6);
-
-
-
- Encoding
-
- The encoder calculates the Reed-Solomon code over
- the given data length and stores the result in
- the parity buffer. Note that the parity buffer must
- be initialized before calling the encoder.
-
-
- The expanded data can be inverted on the fly by
- providing a non-zero inversion mask. The expanded data is
- XOR'ed with the mask. This is used e.g. for FLASH
- ECC, where the all 0xFF is inverted to an all 0x00.
- The Reed-Solomon code for all 0x00 is all 0x00. The
- code is inverted before storing to FLASH so it is 0xFF
- too. This prevents reads from an erased FLASH from
- resulting in ECC errors.
-
-
- The databytes are expanded to the given symbol size
- on the fly. There is no support for encoding continuous
- bitstreams with a symbol size != 8 at the moment. If
- it is necessary, it should not be a big deal to implement
- such functionality.
-
-
-/* Parity buffer. Size = number of roots */
-uint16_t par[6];
-/* Initialize the parity buffer */
-memset(par, 0, sizeof(par));
-/* Encode 512 byte in data8. Store parity in buffer par */
-encode_rs8 (rs_decoder, data8, 512, par, 0);
-
-
-
- Decoding
-
- The decoder calculates the syndrome over
- the given data length and the received parity symbols
- and corrects errors in the data.
-
-
- If a syndrome is available from a hardware decoder
- then the syndrome calculation is skipped.
-
-
- The correction of the data buffer can be suppressed
- by providing a correction pattern buffer and an error
- location buffer to the decoder. The decoder stores the
- calculated error location and the correction bitmask
- in the given buffers. This is useful for hardware
- decoders which use a weird bit ordering scheme.
-
-
- The databytes are expanded to the given symbol size
- on the fly. There is no support for decoding continuous
- bitstreams with a symbolsize != 8 at the moment. If
- it is necessary, it should not be a big deal to implement
- such functionality.
-
-
-
-
- Decoding with syndrome calculation, direct data correction
-
-
-/* Parity buffer. Size = number of roots */
-uint16_t par[6];
-uint8_t data[512];
-int numerr;
-/* Receive data */
-.....
-/* Receive parity */
-.....
-/* Decode 512 byte in data8.*/
-numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
-
-
-
-
-
- Decoding with syndrome given by hardware decoder, direct data correction
-
-
-/* Parity buffer. Size = number of roots */
-uint16_t par[6], syn[6];
-uint8_t data[512];
-int numerr;
-/* Receive data */
-.....
-/* Receive parity */
-.....
-/* Get syndrome from hardware decoder */
-.....
-/* Decode 512 byte in data8.*/
-numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
-
-
-
-
-
- Decoding with syndrome given by hardware decoder, no direct data correction.
-
-
- Note: It's not necessary to give data and received parity to the decoder.
-
-
-/* Parity buffer. Size = number of roots */
-uint16_t par[6], syn[6], corr[8];
-uint8_t data[512];
-int numerr, errpos[8];
-/* Receive data */
-.....
-/* Receive parity */
-.....
-/* Get syndrome from hardware decoder */
-.....
-/* Decode 512 byte in data8.*/
-numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
-for (i = 0; i < numerr; i++) {
- do_error_correction_in_your_buffer(errpos[i], corr[i]);
-}
-
-
-
-
- Cleanup
-
- The function free_rs frees the allocated resources,
- if the caller is the last user of the decoder.
-
-
-/* Release resources */
-free_rs(rs_decoder);
-
-
-
-
-
-
- Structures
-
- This chapter contains the autogenerated documentation of the structures which are
- used in the Reed-Solomon Library and are relevant for a developer.
-
-!Iinclude/linux/rslib.h
-
-
-
- Public Functions Provided
-
- This chapter contains the autogenerated documentation of the Reed-Solomon functions
- which are exported.
-
-!Elib/reed_solomon/reed_solomon.c
-
-
-
- Credits
-
- The library code for encoding and decoding was written by Phil Karn.
-
-
- Copyright 2002, Phil Karn, KA9Q
- May be used under the terms of the GNU General Public License (GPL)
-
-
- The wrapper functions and interfaces are written by Thomas Gleixner.
-
-
- Many users have provided bugfixes, improvements and helping hands for testing.
- Thanks a lot.
-
-
- The following people have contributed to this document:
-
-
- Thomas Gleixnertglx@linutronix.de
-
-
-
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl
deleted file mode 100644
index fe7664ce96678077eb2e461c10d1ebbe4dc9e42e..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/lsm.tmpl
+++ /dev/null
@@ -1,265 +0,0 @@
-
-
-
-
-
- Linux Security Modules: General Security Hooks for Linux
-
-
- Stephen
- Smalley
-
- NAI Labs
- ssmalley@nai.com
-
-
-
- Timothy
- Fraser
-
- NAI Labs
- tfraser@nai.com
-
-
-
- Chris
- Vance
-
- NAI Labs
- cvance@nai.com
-
-
-
-
-
-Introduction
-
-
-In March 2001, the National Security Agency (NSA) gave a presentation
-about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
-Summit. SELinux is an implementation of flexible and fine-grained
-nondiscretionary access controls in the Linux kernel, originally
-implemented as its own particular kernel patch. Several other
-security projects (e.g. RSBAC, Medusa) have also developed flexible
-access control architectures for the Linux kernel, and various
-projects have developed particular access control models for Linux
-(e.g. LIDS, DTE, SubDomain). Each project has developed and
-maintained its own kernel patch to support its security needs.
-
-
-
-In response to the NSA presentation, Linus Torvalds made a set of
-remarks that described a security framework he would be willing to
-consider for inclusion in the mainstream Linux kernel. He described a
-general framework that would provide a set of security hooks to
-control operations on kernel objects and a set of opaque security
-fields in kernel data structures for maintaining security attributes.
-This framework could then be used by loadable kernel modules to
-implement any desired model of security. Linus also suggested the
-possibility of migrating the Linux capabilities code into such a
-module.
-
-
-
-The Linux Security Modules (LSM) project was started by WireX to
-develop such a framework. LSM is a joint development effort by
-several security projects, including Immunix, SELinux, SGI and Janus,
-and several individuals, including Greg Kroah-Hartman and James
-Morris, to develop a Linux kernel patch that implements this
-framework. The patch is currently tracking the 2.4 series and is
-targeted for integration into the 2.5 development series. This
-technical report provides an overview of the framework and the example
-capabilities security module provided by the LSM kernel patch.
-
-
-
-
-LSM Framework
-
-
-The LSM kernel patch provides a general kernel framework to support
-security modules. In particular, the LSM framework is primarily
-focused on supporting access control modules, although future
-development is likely to address other security needs such as
-auditing. By itself, the framework does not provide any additional
-security; it merely provides the infrastructure to support security
-modules. The LSM kernel patch also moves most of the capabilities
-logic into an optional security module, with the system defaulting
-to the traditional superuser logic. This capabilities module
-is discussed further in the "LSM Capabilities Module" section.
-
-
-
-The LSM kernel patch adds security fields to kernel data structures
-and inserts calls to hook functions at critical points in the kernel
-code to manage the security fields and to perform access control. It
-also adds functions for registering and unregistering security
-modules, and adds a general security system call
-to support new system calls for security-aware applications.
-
-
-
-The LSM security fields are simply void* pointers. For
-process and program execution security information, security fields
-were added to struct task_struct and
-struct linux_binprm. For filesystem security
-information, a security field was added to
-struct super_block. For pipe, file, and socket
-security information, security fields were added to
-struct inode and
-struct file. For packet and network device security
-information, security fields were added to
-struct sk_buff and
-struct net_device. For System V IPC security
-information, security fields were added to
-struct kern_ipc_perm and
-struct msg_msg; additionally, the definitions
-for struct msg_msg, struct
-msg_queue, and struct
-shmid_kernel were moved to header files
-(include/linux/msg.h and
-include/linux/shm.h as appropriate) to allow
-the security modules to use these definitions.
-
-
-
-Each LSM hook is a function pointer in a global table,
-security_ops. This table is a
-security_operations structure as defined by
-include/linux/security.h. Detailed documentation
-for each hook is included in this header file. At present, this
-structure consists of a collection of substructures that group related
-hooks based on the kernel object (e.g. task, inode, file, sk_buff,
-etc) as well as some top-level hook function pointers for system
-operations. This structure is likely to be flattened in the future
-for performance. The placement of the hook calls in the kernel code
-is described by the "called:" lines in the per-hook documentation in
-the header file. The hook calls can also be easily found in the
-kernel code by looking for the string "security_ops->".
-
-
-
-
-Linus mentioned per-process security hooks in his original remarks as a
-possible alternative to global security hooks. However, if LSM were
-to start from the perspective of per-process hooks, then the base
-framework would have to deal with how to handle operations that
-involve multiple processes (e.g. kill), since each process might have
-its own hook for controlling the operation. This would require a
-general mechanism for composing hooks in the base framework.
-Additionally, LSM would still need global hooks for operations that
-have no process context (e.g. network input operations).
-Consequently, LSM provides global security hooks, but a security
-module is free to implement per-process hooks (where that makes sense)
-by storing a security_ops table in each process' security field and
-then invoking these per-process hooks from the global hooks.
-The problem of composition is thus deferred to the module.
-
-
-
-The global security_ops table is initialized to a set of hook
-functions provided by a dummy security module that provides
-traditional superuser logic. A register_security
-function (in security/security.c) is provided to
-allow a security module to set security_ops to refer to its own hook
-functions, and an unregister_security function is
-provided to revert security_ops to the dummy module hooks. This
-mechanism is used to set the primary security module, which is
-responsible for making the final decision for each hook.
-
-
-
-LSM also provides a simple mechanism for stacking additional security
-modules with the primary security module. It defines
-register_security and
-unregister_security hooks in the
-security_operations structure and provides
-mod_reg_security and
-mod_unreg_security functions that invoke these
-hooks after performing some sanity checking. A security module can
-call these functions in order to stack with other modules. However,
-the actual details of how this stacking is handled are deferred to the
-module, which can implement these hooks in any way it wishes
-(including always returning an error if it does not wish to support
-stacking). In this manner, LSM again defers the problem of
-composition to the module.
-
-
-
-Although the LSM hooks are organized into substructures based on
-kernel object, all of the hooks can be viewed as falling into two
-major categories: hooks that are used to manage the security fields
-and hooks that are used to perform access control. Examples of the
-first category of hooks include the
-alloc_security and
-free_security hooks defined for each kernel data
-structure that has a security field. These hooks are used to allocate
-and free security structures for kernel objects. The first category
-of hooks also includes hooks that set information in the security
-field after allocation, such as the post_lookup
-hook in struct inode_security_ops. This hook
-is used to set security information for inodes after successful lookup
-operations. An example of the second category of hooks is the
-permission hook in
-struct inode_security_ops. This hook checks
-permission when accessing an inode.
-
-
-
-
-LSM Capabilities Module
-
-
-The LSM kernel patch moves most of the existing POSIX.1e capabilities
-logic into an optional security module stored in the file
-security/capability.c. This change allows
-users who do not want to use capabilities to omit this code entirely
-from their kernel, instead using the dummy module for traditional
-superuser logic or any other module that they desire. This change
-also allows the developers of the capabilities logic to maintain and
-enhance their code more freely, without needing to integrate patches
-back into the base kernel.
-
-
-
-In addition to moving the capabilities logic, the LSM kernel patch
-could move the capability-related fields from the kernel data
-structures into the new security fields managed by the security
-modules. However, at present, the LSM kernel patch leaves the
-capability fields in the kernel data structures. In his original
-remarks, Linus suggested that this might be preferable so that other
-security modules can be easily stacked with the capabilities module
-without needing to chain multiple security structures on the security field.
-It also avoids imposing extra overhead on the capabilities module
-to manage the security fields. However, the LSM framework could
-certainly support such a move if it is determined to be desirable,
-with only a few additional changes described below.
-
-
-
-At present, the capabilities logic for computing process capabilities
-on execve and set*uid,
-checking capabilities for a particular process, saving and checking
-capabilities for netlink messages, and handling the
-capget and capset system
-calls have been moved into the capabilities module. There are still a
-few locations in the base kernel where capability-related fields are
-directly examined or modified, but the current version of the LSM
-patch does allow a security module to completely replace the
-assignment and testing of capabilities. These few locations would
-need to be changed if the capability-related fields were moved into
-the security field. The following is a list of known locations that
-still perform such direct examination or modification of
-capability-related fields:
-
-fs/open.c:sys_access
-fs/lockd/host.c:nlm_bind_host
-fs/nfsd/auth.c:nfsd_setuser
-fs/proc/array.c:task_cap
-
-
-
-
-
-
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
deleted file mode 100644
index b442921bca540a7e8833be39446822dc48ca1628..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/mtdnand.tmpl
+++ /dev/null
@@ -1,1291 +0,0 @@
-
-
-
-
-
- MTD NAND Driver Programming Interface
-
-
-
- Thomas
- Gleixner
-
-
- tglx@linutronix.de
-
-
-
-
-
-
- 2004
- Thomas Gleixner
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Introduction
-
- The generic NAND driver supports almost all NAND and AG-AND based
- chips and connects them to the Memory Technology Devices (MTD)
- subsystem of the Linux Kernel.
-
-
- This documentation is provided for developers who want to implement
- board drivers or filesystem drivers suitable for NAND devices.
-
-
-
-
- Known Bugs And Assumptions
-
- None.
-
-
-
-
- Documentation hints
-
- The function and structure docs are autogenerated. Each function and
- struct member has a short description which is marked with an [XXX] identifier.
- The following chapters explain the meaning of those identifiers.
-
-
- Function identifiers [XXX]
-
- The functions are marked with [XXX] identifiers in the short
- comment. The identifiers explain the usage and scope of the
- functions. Following identifiers are used:
-
-
-
- [MTD Interface]
- These functions provide the interface to the MTD kernel API.
- They are not replaceable and provide functionality
- which is completely hardware independent.
-
-
- [NAND Interface]
- These functions are exported and provide the interface to the NAND kernel API.
-
-
- [GENERIC]
- Generic functions are not replaceable and provide functionality
- which is completely hardware independent.
-
-
- [DEFAULT]
- Default functions provide hardware related functionality which is suitable
- for most of the implementations. These functions can be replaced by the
- board driver if necessary. Those functions are called via pointers in the
- NAND chip description structure. The board driver can set the functions which
- should be replaced by board dependent functions before calling nand_scan().
- If the function pointer is NULL on entry to nand_scan() then the pointer
- is set to the default function which is suitable for the detected chip type.
-
-
-
-
- Struct member identifiers [XXX]
-
- The struct members are marked with [XXX] identifiers in the
- comment. The identifiers explain the usage and scope of the
- members. Following identifiers are used:
-
-
-
- [INTERN]
- These members are for NAND driver internal use only and must not be
- modified. Most of these values are calculated from the chip geometry
- information which is evaluated during nand_scan().
-
-
- [REPLACEABLE]
- Replaceable members hold hardware related functions which can be
- provided by the board driver. The board driver can set the functions which
- should be replaced by board dependent functions before calling nand_scan().
- If the function pointer is NULL on entry to nand_scan() then the pointer
- is set to the default function which is suitable for the detected chip type.
-
-
- [BOARDSPECIFIC]
- Board specific members hold hardware related information which must
- be provided by the board driver. The board driver must set the function
- pointers and datafields before calling nand_scan().
-
-
- [OPTIONAL]
- Optional members can hold information relevant for the board driver. The
- generic NAND driver code does not use this information.
-
-
-
-
-
-
- Basic board driver
-
- For most boards it will be sufficient to provide just the
- basic functions and fill out some really board dependent
- members in the nand chip description structure.
-
-
- Basic defines
-
- At least you have to provide a nand_chip structure
- and a storage for the ioremap'ed chip address.
- You can allocate the nand_chip structure using
- kmalloc or you can allocate it statically.
- The NAND chip structure embeds an mtd structure
- which will be registered to the MTD subsystem.
- You can extract a pointer to the mtd structure
- from a nand_chip pointer using the nand_to_mtd()
- helper.
-
-
- Kmalloc based example
-
-
-static struct mtd_info *board_mtd;
-static void __iomem *baseaddr;
-
-
- Static example
-
-
-static struct nand_chip board_chip;
-static void __iomem *baseaddr;
-
-
-
- Partition defines
-
- If you want to divide your device into partitions, then
- define a partitioning scheme suitable to your board.
-
-
-#define NUM_PARTITIONS 2
-static struct mtd_partition partition_info[] = {
- { .name = "Flash partition 1",
- .offset = 0,
- .size = 8 * 1024 * 1024 },
- { .name = "Flash partition 2",
- .offset = MTDPART_OFS_NEXT,
- .size = MTDPART_SIZ_FULL },
-};
-
-
-
- Hardware control function
-
- The hardware control function provides access to the
- control pins of the NAND chip(s).
- The access can be done by GPIO pins or by address lines.
- If you use address lines, make sure that the timing
- requirements are met.
-
-
- GPIO based example
-
-
-static void board_hwcontrol(struct mtd_info *mtd, int cmd)
-{
- switch(cmd){
- case NAND_CTL_SETCLE: /* Set CLE pin high */ break;
- case NAND_CTL_CLRCLE: /* Set CLE pin low */ break;
- case NAND_CTL_SETALE: /* Set ALE pin high */ break;
- case NAND_CTL_CLRALE: /* Set ALE pin low */ break;
- case NAND_CTL_SETNCE: /* Set nCE pin low */ break;
- case NAND_CTL_CLRNCE: /* Set nCE pin high */ break;
- }
-}
-
-
- Address lines based example. It's assumed that the
- nCE pin is driven by a chip select decoder.
-
-
-static void board_hwcontrol(struct mtd_info *mtd, int cmd)
-{
- struct nand_chip *this = mtd_to_nand(mtd);
- switch(cmd){
- case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break;
- case NAND_CTL_CLRCLE: this->IO_ADDR_W &= ~CLE_ADRR_BIT; break;
- case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break;
- case NAND_CTL_CLRALE: this->IO_ADDR_W &= ~ALE_ADRR_BIT; break;
- }
-}
-
-
-
- Device ready function
-
- If the hardware interface has the ready busy pin of the NAND chip connected to a
- GPIO or other accessible I/O pin, this function is used to read back the state of the
- pin. The function has no arguments and should return 0, if the device is busy (R/B pin
- is low) and 1, if the device is ready (R/B pin is high).
- If the hardware interface does not give access to the ready busy pin, then
- the function must not be defined and the function pointer this->dev_ready is set to NULL.
-
-
-
- Init function
-
- The init function allocates memory and sets up all the board
- specific parameters and function pointers. When everything
- is set up nand_scan() is called. This function tries to
- detect and identify the chip. If a chip is found all the
- internal data fields are initialized accordingly.
- The structure(s) have to be zeroed out first and then filled with the necessary
- information about the device.
-
-
-static int __init board_init (void)
-{
- struct nand_chip *this;
- int err = 0;
-
- /* Allocate memory for MTD device structure and private data */
- this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL);
- if (!this) {
- printk ("Unable to allocate NAND MTD device structure.\n");
- err = -ENOMEM;
- goto out;
- }
-
- board_mtd = nand_to_mtd(this);
-
- /* map physical address */
- baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
- if (!baseaddr) {
- printk("Ioremap to access NAND chip failed\n");
- err = -EIO;
- goto out_mtd;
- }
-
- /* Set address of NAND IO lines */
- this->IO_ADDR_R = baseaddr;
- this->IO_ADDR_W = baseaddr;
- /* Reference hardware control function */
- this->hwcontrol = board_hwcontrol;
- /* Set command delay time, see datasheet for correct value */
- this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY;
- /* Assign the device ready function, if available */
- this->dev_ready = board_dev_ready;
- this->eccmode = NAND_ECC_SOFT;
-
- /* Scan to find existence of the device */
- if (nand_scan (board_mtd, 1)) {
- err = -ENXIO;
- goto out_ior;
- }
-
- add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS);
- goto out;
-
-out_ior:
- iounmap(baseaddr);
-out_mtd:
- kfree (this);
-out:
- return err;
-}
-module_init(board_init);
-
-
-
- Exit function
-
- The exit function is only necessary if the driver is
- compiled as a module. It releases all resources which
- are held by the chip driver and unregisters the partitions
- in the MTD layer.
-
-
-#ifdef MODULE
-static void __exit board_cleanup (void)
-{
- /* Release resources, unregister device */
- nand_release (board_mtd);
-
- /* unmap physical address */
- iounmap(baseaddr);
-
- /* Free the MTD device structure */
- kfree (mtd_to_nand(board_mtd));
-}
-module_exit(board_cleanup);
-#endif
-
-
-
-
-
- Advanced board driver functions
-
- This chapter describes the advanced functionality of the NAND
- driver. For a list of functions which can be overridden by the board
- driver see the documentation of the nand_chip structure.
-
-
- Multiple chip control
-
- The nand driver can control chip arrays. Therefore the
- board driver must provide an own select_chip function. This
- function must (de)select the requested chip.
- The function pointer in the nand_chip structure must
- be set before calling nand_scan(). The maxchip parameter
- of nand_scan() defines the maximum number of chips to
- scan for. Make sure that the select_chip function can
- handle the requested number of chips.
-
-
- The nand driver concatenates the chips to one virtual
- chip and provides this virtual chip to the MTD layer.
-
-
- Note: The driver can only handle linear chip arrays
- of equally sized chips. There is no support for
- parallel arrays which extend the buswidth.
-
-
- GPIO based example
-
-
-static void board_select_chip (struct mtd_info *mtd, int chip)
-{
- /* Deselect all chips, set all nCE pins high */
- GPIO(BOARD_NAND_NCE) |= 0xff;
- if (chip >= 0)
- GPIO(BOARD_NAND_NCE) &= ~ (1 << chip);
-}
-
-
- Address lines based example.
- It's assumed that the nCE pins are connected to an
- address decoder.
-
-
-static void board_select_chip (struct mtd_info *mtd, int chip)
-{
- struct nand_chip *this = mtd_to_nand(mtd);
-
- /* Deselect all chips */
- this->IO_ADDR_R &= ~BOARD_NAND_ADDR_MASK;
- this->IO_ADDR_W &= ~BOARD_NAND_ADDR_MASK;
- switch (chip) {
- case 0:
- this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0;
- this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0;
- break;
- ....
- case n:
- this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn;
- this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn;
- break;
- }
-}
-
-
-
- Hardware ECC support
-
- Functions and constants
-
- The nand driver supports four different types of
- hardware ECC.
-
- NAND_ECC_HW3_256
- Hardware ECC generator providing 3 bytes ECC per
- 256 byte.
-
- NAND_ECC_HW3_512
- Hardware ECC generator providing 3 bytes ECC per
- 512 byte.
-
- NAND_ECC_HW6_512
- Hardware ECC generator providing 6 bytes ECC per
- 512 byte.
-
- NAND_ECC_HW8_512
- Hardware ECC generator providing 8 bytes ECC per
- 512 byte.
-
-
- If your hardware generator has a different functionality
- add it at the appropriate place in nand_base.c
-
-
- The board driver must provide following functions:
-
- enable_hwecc
- This function is called before reading / writing to
- the chip. Reset or initialize the hardware generator
- in this function. The function is called with an
- argument which lets you distinguish between read
- and write operations.
-
- calculate_ecc
- This function is called after read / write from / to
- the chip. Transfer the ECC from the hardware to
- the buffer. If the option NAND_HWECC_SYNDROME is set
- then the function is only called on write. See below.
-
- correct_data
- In case of an ECC error this function is called for
- error detection and correction. Return 1 respectively 2
- in case the error can be corrected. If the error is
- not correctable return -1. If your hardware generator
- matches the default algorithm of the nand_ecc software
- generator then use the correction function provided
- by nand_ecc instead of implementing duplicated code.
-
-
-
-
-
- Hardware ECC with syndrome calculation
-
- Many hardware ECC implementations provide Reed-Solomon
- codes and calculate an error syndrome on read. The syndrome
- must be converted to a standard Reed-Solomon syndrome
- before calling the error correction code in the generic
- Reed-Solomon library.
-
-
- The ECC bytes must be placed immediately after the data
- bytes in order to make the syndrome generator work. This
- is contrary to the usual layout used by software ECC. The
- separation of data and out of band area is no longer
- possible. The nand driver code handles this layout and
- the remaining free bytes in the oob area are managed by
- the autoplacement code. Provide a matching oob-layout
- in this case. See rts_from4.c and diskonchip.c for
- implementation reference. In those cases we must also
- use bad block tables on FLASH, because the ECC layout is
- interfering with the bad block marker positions.
- See bad block table support for details.
-
-
-
-
- Bad block table support
-
- Most NAND chips mark the bad blocks at a defined
- position in the spare area. Those blocks must
- not be erased under any circumstances as the bad
- block information would be lost.
- It is possible to check the bad block mark each
- time when the blocks are accessed by reading the
- spare area of the first page in the block. This
- is time consuming so a bad block table is used.
-
-
- The nand driver supports various types of bad block
- tables.
-
- Per device
- The bad block table contains all bad block information
- of the device which can consist of multiple chips.
-
- Per chip
- A bad block table is used per chip and contains the
- bad block information for this particular chip.
-
- Fixed offset
- The bad block table is located at a fixed offset
- in the chip (device). This applies to various
- DiskOnChip devices.
-
- Automatic placed
- The bad block table is automatically placed and
- detected either at the end or at the beginning
- of a chip (device)
-
- Mirrored tables
- The bad block table is mirrored on the chip (device) to
- allow updates of the bad block table without data loss.
-
-
-
-
- nand_scan() calls the function nand_default_bbt().
- nand_default_bbt() selects appropriate default
- bad block table descriptors depending on the chip information
- which was retrieved by nand_scan().
-
-
- The standard policy is scanning the device for bad
- blocks and building a RAM based bad block table which
- allows faster access than always checking the
- bad block information on the flash chip itself.
-
-
- Flash based tables
-
- It may be desired or necessary to keep a bad block table in FLASH.
- For AG-AND chips this is mandatory, as they have no factory marked
- bad blocks. They have factory marked good blocks. The marker pattern
- is erased when the block is erased to be reused. So in case of
- powerloss before writing the pattern back to the chip this block
- would be lost and added to the bad blocks. Therefore we scan the
- chip(s) when we detect them the first time for good blocks and
- store this information in a bad block table before erasing any
- of the blocks.
-
-
- The blocks in which the tables are stored are protected against
- accidental access by marking them bad in the memory bad block
- table. The bad block table management functions are allowed
- to circumvent this protection.
-
-
- The simplest way to activate the FLASH based bad block table support
- is to set the option NAND_BBT_USE_FLASH in the bbt_option field of
- the nand chip structure before calling nand_scan(). For AG-AND
- chips this is done by default.
- This activates the default FLASH based bad block table functionality
- of the NAND driver. The default bad block table options are
-
- Store bad block table per chip
- Use 2 bits per block
- Automatic placement at the end of the chip
- Use mirrored tables with version numbers
- Reserve 4 blocks at the end of the chip
-
-
-
-
- User defined tables
-
- User defined tables are created by filling out a
- nand_bbt_descr structure and storing the pointer in the
- nand_chip structure member bbt_td before calling nand_scan().
- If a mirror table is necessary a second structure must be
- created and a pointer to this structure must be stored
- in bbt_md inside the nand_chip structure. If the bbt_md
- member is set to NULL then only the main table is used
- and no scan for the mirrored table is performed.
-
-
- The most important field in the nand_bbt_descr structure
- is the options field. The options define most of the
- table properties. Use the predefined constants from
- nand.h to define the options.
-
- Number of bits per block
- The supported number of bits is 1, 2, 4, 8.
- Table per chip
- Setting the constant NAND_BBT_PERCHIP selects that
- a bad block table is managed for each chip in a chip array.
- If this option is not set then a per device bad block table
- is used.
- Table location is absolute
- Use the option constant NAND_BBT_ABSPAGE and
- define the absolute page number where the bad block
- table starts in the field pages. If you have selected bad block
- tables per chip and you have a multi chip array then the start page
- must be given for each chip in the chip array. Note: there is no scan
- for a table ident pattern performed, so the fields
- pattern, veroffs, offs, len can be left uninitialized
- Table location is automatically detected
- The table can either be located in the first or the last good
- blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place
- the bad block table at the end of the chip (device). The
- bad block tables are marked and identified by a pattern which
- is stored in the spare area of the first page in the block which
- holds the bad block table. Store a pointer to the pattern
- in the pattern field. Further the length of the pattern has to be
- stored in len and the offset in the spare area must be given
- in the offs member of the nand_bbt_descr structure. For mirrored
- bad block tables different patterns are mandatory.
- Table creation
- Set the option NAND_BBT_CREATE to enable the table creation
- if no table can be found during the scan. Usually this is done only
- once if a new chip is found.
- Table write support
- Set the option NAND_BBT_WRITE to enable the table write support.
- This allows the update of the bad block table(s) in case a block has
- to be marked bad due to wear. The MTD interface function block_markbad
- is calling the update function of the bad block table. If the write
- support is enabled then the table is updated on FLASH.
-
- Note: Write support should only be enabled for mirrored tables with
- version control.
-
- Table version control
- Set the option NAND_BBT_VERSION to enable the table version control.
- It's highly recommended to enable this for mirrored tables with write
- support. It makes sure that the risk of losing the bad block
- table information is reduced to the loss of the information about the
- one worn out block which should be marked bad. The version is stored in
- 4 consecutive bytes in the spare area of the device. The position of
- the version number is defined by the member veroffs in the bad block table
- descriptor.
- Save block contents on write
-
- In case that the block which holds the bad block table does contain
- other useful information, set the option NAND_BBT_SAVECONTENT. When
- the bad block table is written then the whole block is read, the bad
- block table is updated, the block is erased and everything is
- written back. If this option is not set only the bad block table
- is written and everything else in the block is ignored and erased.
-
- Number of reserved blocks
-
- For automatic placement some blocks must be reserved for
- bad block table storage. The number of reserved blocks is defined
- in the maxblocks member of the bad block table description structure.
- Reserving 4 blocks for mirrored tables should be a reasonable number.
- This also limits the number of blocks which are scanned for the bad
- block table ident pattern.
-
-
-
-
-
-
- Spare area (auto)placement
-
- The nand driver implements different possibilities for
- placement of filesystem data in the spare area,
-
- Placement defined by fs driver
- Automatic placement
-
- The default placement function is automatic placement. The
- nand driver has built in default placement schemes for the
- various chiptypes. If due to hardware ECC functionality the
- default placement does not fit then the board driver can
- provide its own placement scheme.
-
-
- File system drivers can provide their own placement scheme which
- is used instead of the default placement scheme.
-
-
- Placement schemes are defined by a nand_oobinfo structure
-
-struct nand_oobinfo {
- int useecc;
- int eccbytes;
- int eccpos[24];
- int oobfree[8][2];
-};
-
-
- useecc
- The useecc member controls the ecc and placement function. The header
- file include/mtd/mtd-abi.h contains constants to select ecc and
- placement. MTD_NANDECC_OFF switches off the ecc completely. This is
- not recommended and available for testing and diagnosis only.
- MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE
- selects automatic placement.
-
- eccbytes
- The eccbytes member defines the number of ecc bytes per page.
-
- eccpos
- The eccpos array holds the byte offsets in the spare area where
- the ecc codes are placed.
-
- oobfree
- The oobfree array defines the areas in the spare area which can be
- used for automatic placement. The information is given in the format
- {offset, size}. offset defines the start of the usable area, size the
- length in bytes. More than one area can be defined. The list is terminated
- by an {0, 0} entry.
-
-
-
-
- Placement defined by fs driver
-
- The calling function provides a pointer to a nand_oobinfo
- structure which defines the ecc placement. For writes the
- caller must provide a spare area buffer along with the
- data buffer. The spare area buffer size is (number of pages) *
- (size of spare area). For reads the buffer size is
- (number of pages) * ((size of spare area) + (number of ecc
- steps per page) * sizeof (int)). The driver stores the
- result of the ecc check for each tuple in the spare buffer.
- The storage sequence is
-
-
- <spare data page 0><ecc result 0>...<ecc result n>
-
-
- ...
-
-
- <spare data page n><ecc result 0>...<ecc result n>
-
-
- This is a legacy mode used by YAFFS1.
-
-
- If the spare area buffer is NULL then only the ECC placement is
- done according to the given scheme in the nand_oobinfo structure.
-
-
-
- Automatic placement
-
- Automatic placement uses the built in defaults to place the
- ecc bytes in the spare area. If filesystem data have to be stored /
- read into the spare area then the calling function must provide a
- buffer. The buffer size per page is determined by the oobfree array in
- the nand_oobinfo structure.
-
-
- If the spare area buffer is NULL then only the ECC placement is
- done according to the default builtin scheme.
-
-
-
-
- Spare area autoplacement default schemes
-
- 256 byte pagesize
-
-
-Offset
-Content
-Comment
-
-
-0x00
-ECC byte 0
-Error correction code byte 0
-
-
-0x01
-ECC byte 1
-Error correction code byte 1
-
-
-0x02
-ECC byte 2
-Error correction code byte 2
-
-
-0x03
-Autoplace 0
-
-
-
-0x04
-Autoplace 1
-
-
-
-0x05
-Bad block marker
-If any bit in this byte is zero, then this block is bad.
-This applies only to the first page in a block. In the remaining
-pages this byte is reserved
-
-
-0x06
-Autoplace 2
-
-
-
-0x07
-Autoplace 3
-
-
-
-
-
- 512 byte pagesize
-
-
-Offset
-Content
-Comment
-
-
-0x00
-ECC byte 0
-Error correction code byte 0 of the lower 256 Byte data in
-this page
-
-
-0x01
-ECC byte 1
-Error correction code byte 1 of the lower 256 Bytes of data
-in this page
-
-
-0x02
-ECC byte 2
-Error correction code byte 2 of the lower 256 Bytes of data
-in this page
-
-
-0x03
-ECC byte 3
-Error correction code byte 0 of the upper 256 Bytes of data
-in this page
-
-
-0x04
-reserved
-reserved
-
-
-0x05
-Bad block marker
-If any bit in this byte is zero, then this block is bad.
-This applies only to the first page in a block. In the remaining
-pages this byte is reserved
-
-
-0x06
-ECC byte 4
-Error correction code byte 1 of the upper 256 Bytes of data
-in this page
-
-
-0x07
-ECC byte 5
-Error correction code byte 2 of the upper 256 Bytes of data
-in this page
-
-
-0x08 - 0x0F
-Autoplace 0 - 7
-
-
-
-
-
- 2048 byte pagesize
-
-
-Offset
-Content
-Comment
-
-
-0x00
-Bad block marker
-If any bit in this byte is zero, then this block is bad.
-This applies only to the first page in a block. In the remaining
-pages this byte is reserved
-
-
-0x01
-Reserved
-Reserved
-
-
-0x02-0x27
-Autoplace 0 - 37
-
-
-
-0x28
-ECC byte 0
-Error correction code byte 0 of the first 256 Byte data in
-this page
-
-
-0x29
-ECC byte 1
-Error correction code byte 1 of the first 256 Bytes of data
-in this page
-
-
-0x2A
-ECC byte 2
-Error correction code byte 2 of the first 256 Bytes data in
-this page
-
-
-0x2B
-ECC byte 3
-Error correction code byte 0 of the second 256 Bytes of data
-in this page
-
-
-0x2C
-ECC byte 4
-Error correction code byte 1 of the second 256 Bytes of data
-in this page
-
-
-0x2D
-ECC byte 5
-Error correction code byte 2 of the second 256 Bytes of data
-in this page
-
-
-0x2E
-ECC byte 6
-Error correction code byte 0 of the third 256 Bytes of data
-in this page
-
-
-0x2F
-ECC byte 7
-Error correction code byte 1 of the third 256 Bytes of data
-in this page
-
-
-0x30
-ECC byte 8
-Error correction code byte 2 of the third 256 Bytes of data
-in this page
-
-
-0x31
-ECC byte 9
-Error correction code byte 0 of the fourth 256 Bytes of data
-in this page
-
-
-0x32
-ECC byte 10
-Error correction code byte 1 of the fourth 256 Bytes of data
-in this page
-
-
-0x33
-ECC byte 11
-Error correction code byte 2 of the fourth 256 Bytes of data
-in this page
-
-
-0x34
-ECC byte 12
-Error correction code byte 0 of the fifth 256 Bytes of data
-in this page
-
-
-0x35
-ECC byte 13
-Error correction code byte 1 of the fifth 256 Bytes of data
-in this page
-
-
-0x36
-ECC byte 14
-Error correction code byte 2 of the fifth 256 Bytes of data
-in this page
-
-
-0x37
-ECC byte 15
-Error correction code byte 0 of the sixth 256 Bytes of data
-in this page
-
-
-0x38
-ECC byte 16
-Error correction code byte 1 of the sixth 256 Bytes of data
-in this page
-
-
-0x39
-ECC byte 17
-Error correction code byte 2 of the sixth 256 Bytes of data
-in this page
-
-
-0x3A
-ECC byte 18
-Error correction code byte 0 of the seventh 256 Bytes of
-data in this page
-
-
-0x3B
-ECC byte 19
-Error correction code byte 1 of the seventh 256 Bytes of
-data in this page
-
-
-0x3C
-ECC byte 20
-Error correction code byte 2 of the seventh 256 Bytes of
-data in this page
-
-
-0x3D
-ECC byte 21
-Error correction code byte 0 of the eighth 256 Bytes of data
-in this page
-
-
-0x3E
-ECC byte 22
-Error correction code byte 1 of the eighth 256 Bytes of data
-in this page
-
-
-0x3F
-ECC byte 23
-Error correction code byte 2 of the eighth 256 Bytes of data
-in this page
-
-
-
-
-
-
-
- Filesystem support
-
- The NAND driver provides all necessary functions for a
- filesystem via the MTD interface.
-
-
- Filesystems must be aware of the NAND peculiarities and
- restrictions. One major restriction of NAND Flash is that you cannot
- write as often as you want to a page. The consecutive writes to a page,
- before erasing it again, are restricted to 1-3 writes, depending on the
- manufacturer's specifications. This applies similarly to the spare area.
-
-
- Therefore NAND aware filesystems must either write in page size chunks
- or hold a writebuffer to collect smaller writes until they sum up to
- pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
-
-
- The spare area usage to store filesystem data is controlled by
- the spare area placement functionality which is described in one
- of the earlier chapters.
-
-
-
- Tools
-
- The MTD project provides a couple of helpful tools to handle NAND Flash.
-
- flasherase, flasheraseall: Erase and format FLASH partitions
- nandwrite: write filesystem images to NAND FLASH
- nanddump: dump the contents of a NAND FLASH partition
-
-
-
- These tools are aware of the NAND restrictions. Please use those tools
- instead of complaining about errors which are caused by non NAND aware
- access methods.
-
-
-
-
- Constants
-
- This chapter describes the constants which might be relevant for a driver developer.
-
-
- Chip option constants
-
- Constants for chip id table
-
- These constants are defined in nand.h. They are ored together to describe
- the chip functionality.
-
-/* Buswidth is 16 bit */
-#define NAND_BUSWIDTH_16 0x00000002
-/* Device supports partial programming without padding */
-#define NAND_NO_PADDING 0x00000004
-/* Chip has cache program function */
-#define NAND_CACHEPRG 0x00000008
-/* Chip has copy back function */
-#define NAND_COPYBACK 0x00000010
-/* AND Chip which has 4 banks and a confusing page / block
- * assignment. See Renesas datasheet for further information */
-#define NAND_IS_AND 0x00000020
-/* Chip has an array of 4 pages which can be read without
- * additional ready /busy waits */
-#define NAND_4PAGE_ARRAY 0x00000040
-
-
-
-
- Constants for runtime options
-
- These constants are defined in nand.h. They are ored together to describe
- the functionality.
-
-/* The hw ecc generator provides a syndrome instead of an ecc value on read
- * This can only work if we have the ecc bytes directly behind the
- * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
-#define NAND_HWECC_SYNDROME 0x00020000
-
-
-
-
-
-
- ECC selection constants
-
- Use these constants to select the ECC algorithm.
-
-/* No ECC. Usage is not recommended ! */
-#define NAND_ECC_NONE 0
-/* Software ECC 3 byte ECC per 256 Byte data */
-#define NAND_ECC_SOFT 1
-/* Hardware ECC 3 byte ECC per 256 Byte data */
-#define NAND_ECC_HW3_256 2
-/* Hardware ECC 3 byte ECC per 512 Byte data */
-#define NAND_ECC_HW3_512 3
-/* Hardware ECC 6 byte ECC per 512 Byte data */
-#define NAND_ECC_HW6_512 4
-/* Hardware ECC 8 byte ECC per 512 Byte data */
-#define NAND_ECC_HW8_512 6
-
-
-
-
-
- Hardware control related constants
-
- These constants describe the requested hardware access function when
- the boardspecific hardware control function is called
-
-/* Select the chip by setting nCE to low */
-#define NAND_CTL_SETNCE 1
-/* Deselect the chip by setting nCE to high */
-#define NAND_CTL_CLRNCE 2
-/* Select the command latch by setting CLE to high */
-#define NAND_CTL_SETCLE 3
-/* Deselect the command latch by setting CLE to low */
-#define NAND_CTL_CLRCLE 4
-/* Select the address latch by setting ALE to high */
-#define NAND_CTL_SETALE 5
-/* Deselect the address latch by setting ALE to low */
-#define NAND_CTL_CLRALE 6
-/* Set write protection by setting WP to high. Not used! */
-#define NAND_CTL_SETWP 7
-/* Clear write protection by setting WP to low. Not used! */
-#define NAND_CTL_CLRWP 8
-
-
-
-
-
- Bad block table related constants
-
- These constants describe the options used for bad block
- table descriptors.
-
-/* Options for the bad block table descriptors */
-
-/* The number of bits used per block in the bbt on the device */
-#define NAND_BBT_NRBITS_MSK 0x0000000F
-#define NAND_BBT_1BIT 0x00000001
-#define NAND_BBT_2BIT 0x00000002
-#define NAND_BBT_4BIT 0x00000004
-#define NAND_BBT_8BIT 0x00000008
-/* The bad block table is in the last good block of the device */
-#define NAND_BBT_LASTBLOCK 0x00000010
-/* The bbt is at the given page, else we must scan for the bbt */
-#define NAND_BBT_ABSPAGE 0x00000020
-/* bbt is stored per chip on multichip devices */
-#define NAND_BBT_PERCHIP 0x00000080
-/* bbt has a version counter at offset veroffs */
-#define NAND_BBT_VERSION 0x00000100
-/* Create a bbt if none exists */
-#define NAND_BBT_CREATE 0x00000200
-/* Write bbt if necessary */
-#define NAND_BBT_WRITE 0x00001000
-/* Read and write back block contents when writing bbt */
-#define NAND_BBT_SAVECONTENT 0x00002000
-
-
-
-
-
-
-
- Structures
-
- This chapter contains the autogenerated documentation of the structures which are
- used in the NAND driver and might be relevant for a driver developer. Each
- struct member has a short description which is marked with an [XXX] identifier.
- See the chapter "Documentation hints" for an explanation.
-
-!Iinclude/linux/mtd/nand.h
-
-
-
- Public Functions Provided
-
- This chapter contains the autogenerated documentation of the NAND kernel API functions
- which are exported. Each function has a short description which is marked with an [XXX] identifier.
- See the chapter "Documentation hints" for an explanation.
-
-!Edrivers/mtd/nand/nand_base.c
-!Edrivers/mtd/nand/nand_bbt.c
-!Edrivers/mtd/nand/nand_ecc.c
-
-
-
- Internal Functions Provided
-
- This chapter contains the autogenerated documentation of the NAND driver internal functions.
- Each function has a short description which is marked with an [XXX] identifier.
- See the chapter "Documentation hints" for an explanation.
- The functions marked with [DEFAULT] might be relevant for a board driver developer.
-
-!Idrivers/mtd/nand/nand_base.c
-!Idrivers/mtd/nand/nand_bbt.c
-
-
-
-
- Credits
-
- The following people have contributed to the NAND driver:
-
- Steven J. Hillsjhill@realitydiluted.com
- David Woodhousedwmw2@infradead.org
- Thomas Gleixnertglx@linutronix.de
-
- A lot of users have provided bugfixes, improvements and helping hands for testing.
- Thanks a lot.
-
-
- The following people have contributed to this document:
-
- Thomas Gleixnertglx@linutronix.de
-
-
-
-
diff --git a/Documentation/DocBook/networking.tmpl b/Documentation/DocBook/networking.tmpl
deleted file mode 100644
index 29df25016c7c121b1965b7a79122a08b691c1ac4..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/networking.tmpl
+++ /dev/null
@@ -1,111 +0,0 @@
-
-
-
-
-
- Linux Networking and Network Devices APIs
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Linux Networking
- Networking Base Types
-!Iinclude/linux/net.h
-
- Socket Buffer Functions
-!Iinclude/linux/skbuff.h
-!Iinclude/net/sock.h
-!Enet/socket.c
-!Enet/core/skbuff.c
-!Enet/core/sock.c
-!Enet/core/datagram.c
-!Enet/core/stream.c
-
- Socket Filter
-!Enet/core/filter.c
-
- Generic Network Statistics
-!Iinclude/uapi/linux/gen_stats.h
-!Enet/core/gen_stats.c
-!Enet/core/gen_estimator.c
-
- SUN RPC subsystem
-
-!Enet/sunrpc/xdr.c
-!Enet/sunrpc/svc_xprt.c
-!Enet/sunrpc/xprt.c
-!Enet/sunrpc/sched.c
-!Enet/sunrpc/socklib.c
-!Enet/sunrpc/stats.c
-!Enet/sunrpc/rpc_pipe.c
-!Enet/sunrpc/rpcb_clnt.c
-!Enet/sunrpc/clnt.c
-
- WiMAX
-!Enet/wimax/op-msg.c
-!Enet/wimax/op-reset.c
-!Enet/wimax/op-rfkill.c
-!Enet/wimax/stack.c
-!Iinclude/net/wimax.h
-!Iinclude/uapi/linux/wimax.h
-
-
-
-
- Network device support
- Driver Support
-!Enet/core/dev.c
-!Enet/ethernet/eth.c
-!Enet/sched/sch_generic.c
-!Iinclude/linux/etherdevice.h
-!Iinclude/linux/netdevice.h
-
- PHY Support
-!Edrivers/net/phy/phy.c
-!Idrivers/net/phy/phy.c
-!Edrivers/net/phy/phy_device.c
-!Idrivers/net/phy/phy_device.c
-!Edrivers/net/phy/mdio_bus.c
-!Idrivers/net/phy/mdio_bus.c
-
-
-
-
-
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl
deleted file mode 100644
index ac3cca3399a1e4d4ee205bdbfa3435e8b02f74a7..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/rapidio.tmpl
+++ /dev/null
@@ -1,155 +0,0 @@
-
-
- ]>
-
-
-
- RapidIO Subsystem Guide
-
-
-
- Matt
- Porter
-
-
- mporter@kernel.crashing.org
- mporter@mvista.com
-
-
-
-
-
-
- 2005
- MontaVista Software, Inc.
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Introduction
-
- RapidIO is a high speed switched fabric interconnect with
- features aimed at the embedded market. RapidIO provides
- support for memory-mapped I/O as well as message-based
- transactions over the switched fabric network. RapidIO has
- a standardized discovery mechanism not unlike the PCI bus
- standard that allows simple detection of devices in a
- network.
-
-
- This documentation is provided for developers intending
- to support RapidIO on new architectures, write new drivers,
- or to understand the subsystem internals.
-
-
-
-
- Known Bugs and Limitations
-
-
- Bugs
- None. ;)
-
-
- Limitations
-
-
- Access/management of RapidIO memory regions is not supported
- Multiple host enumeration is not supported
-
-
-
-
-
-
- RapidIO driver interface
-
- Drivers are provided a set of calls in order
- to interface with the subsystem to gather info
- on devices, request/map memory region resources,
- and manage mailboxes/doorbells.
-
-
- Functions
-!Iinclude/linux/rio_drv.h
-!Edrivers/rapidio/rio-driver.c
-!Edrivers/rapidio/rio.c
-
-
-
-
- Internals
-
-
- This chapter contains the autogenerated documentation of the RapidIO
- subsystem.
-
-
- Structures
-!Iinclude/linux/rio.h
-
- Enumeration and Discovery
-!Idrivers/rapidio/rio-scan.c
-
- Driver functionality
-!Idrivers/rapidio/rio.c
-!Idrivers/rapidio/rio-access.c
-
- Device model support
-!Idrivers/rapidio/rio-driver.c
-
- PPC32 support
-!Iarch/powerpc/sysdev/fsl_rio.c
-
-
-
-
- Credits
-
- The following people have contributed to the RapidIO
- subsystem directly or indirectly:
-
- Matt Portermporter@kernel.crashing.org
- Randy Vinsonrvinson@mvista.com
- Dan Malekdan@embeddedalley.com
-
-
-
- The following people have contributed to this document:
-
- Matt Portermporter@kernel.crashing.org
-
-
-
-
diff --git a/Documentation/DocBook/s390-drivers.tmpl b/Documentation/DocBook/s390-drivers.tmpl
deleted file mode 100644
index 95bfc12e5439d572f02e924f887802fd273bcf96..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/s390-drivers.tmpl
+++ /dev/null
@@ -1,161 +0,0 @@
-
-
-
-
-
- Writing s390 channel device drivers
-
-
-
- Cornelia
- Huck
-
-
- cornelia.huck@de.ibm.com
-
-
-
-
-
-
- 2007
- IBM Corp.
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Introduction
-
- This document describes the interfaces available for device drivers that
- drive s390 based channel attached I/O devices. This includes interfaces for
- interaction with the hardware and interfaces for interacting with the
- common driver core. Those interfaces are provided by the s390 common I/O
- layer.
-
-
- The document assumes a familarity with the technical terms associated
- with the s390 channel I/O architecture. For a description of this
- architecture, please refer to the "z/Architecture: Principles of
- Operation", IBM publication no. SA22-7832.
-
-
- While most I/O devices on a s390 system are typically driven through the
- channel I/O mechanism described here, there are various other methods
- (like the diag interface). These are out of the scope of this document.
-
-
- Some additional information can also be found in the kernel source
- under Documentation/s390/driver-model.txt.
-
-
-
- The ccw bus
-
- The ccw bus typically contains the majority of devices available to
- a s390 system. Named after the channel command word (ccw), the basic
- command structure used to address its devices, the ccw bus contains
- so-called channel attached devices. They are addressed via I/O
- subchannels, visible on the css bus. A device driver for
- channel-attached devices, however, will never interact with the
- subchannel directly, but only via the I/O device on the ccw bus,
- the ccw device.
-
-
- I/O functions for channel-attached devices
-
- Some hardware structures have been translated into C structures for use
- by the common I/O layer and device drivers. For more information on
- the hardware structures represented here, please consult the Principles
- of Operation.
-
-!Iarch/s390/include/asm/cio.h
-
-
- ccw devices
-
- Devices that want to initiate channel I/O need to attach to the ccw bus.
- Interaction with the driver core is done via the common I/O layer, which
- provides the abstractions of ccw devices and ccw device drivers.
-
-
- The functions that initiate or terminate channel I/O all act upon a
- ccw device structure. Device drivers must not bypass those functions
- or strange side effects may happen.
-
-!Iarch/s390/include/asm/ccwdev.h
-!Edrivers/s390/cio/device.c
-!Edrivers/s390/cio/device_ops.c
-
-
- The channel-measurement facility
-
- The channel-measurement facility provides a means to collect
- measurement data which is made available by the channel subsystem
- for each channel attached device.
-
-!Iarch/s390/include/asm/cmb.h
-!Edrivers/s390/cio/cmf.c
-
-
-
-
- The ccwgroup bus
-
- The ccwgroup bus only contains artificial devices, created by the user.
- Many networking devices (e.g. qeth) are in fact composed of several
- ccw devices (like read, write and data channel for qeth). The
- ccwgroup bus provides a mechanism to create a meta-device which
- contains those ccw devices as slave devices and can be associated
- with the netdevice.
-
-
- ccw group devices
-!Iarch/s390/include/asm/ccwgroup.h
-!Edrivers/s390/cio/ccwgroup.c
-
-
-
-
- Generic interfaces
-
- Some interfaces are available to other drivers that do not necessarily
- have anything to do with the busses described above, but still are
- indirectly using basic infrastructure in the common I/O layer.
- One example is the support for adapter interrupts.
-
-!Edrivers/s390/cio/airq.c
-
-
-
diff --git a/Documentation/DocBook/scsi.tmpl b/Documentation/DocBook/scsi.tmpl
deleted file mode 100644
index 4b9b9b286cea6ca8ccb54baffb065793a1ae90e1..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/scsi.tmpl
+++ /dev/null
@@ -1,409 +0,0 @@
-
-
-
-
-
- SCSI Interfaces Guide
-
-
-
- James
- Bottomley
-
-
- James.Bottomley@hansenpartnership.com
-
-
-
-
-
- Rob
- Landley
-
-
- rob@landley.net
-
-
-
-
-
-
-
- 2007
- Linux Foundation
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Introduction
-
- Protocol vs bus
-
- Once upon a time, the Small Computer Systems Interface defined both
- a parallel I/O bus and a data protocol to connect a wide variety of
- peripherals (disk drives, tape drives, modems, printers, scanners,
- optical drives, test equipment, and medical devices) to a host
- computer.
-
-
- Although the old parallel (fast/wide/ultra) SCSI bus has largely
- fallen out of use, the SCSI command set is more widely used than ever
- to communicate with devices over a number of different busses.
-
-
- The SCSI protocol
- is a big-endian peer-to-peer packet based protocol. SCSI commands
- are 6, 10, 12, or 16 bytes long, often followed by an associated data
- payload.
-
-
- SCSI commands can be transported over just about any kind of bus, and
- are the default protocol for storage devices attached to USB, SATA,
- SAS, Fibre Channel, FireWire, and ATAPI devices. SCSI packets are
- also commonly exchanged over Infiniband,
- I20, TCP/IP
- (iSCSI), even
- Parallel
- ports.
-
-
-
- Design of the Linux SCSI subsystem
-
- The SCSI subsystem uses a three layer design, with upper, mid, and low
- layers. Every operation involving the SCSI subsystem (such as reading
- a sector from a disk) uses one driver at each of the 3 levels: one
- upper layer driver, one lower layer driver, and the SCSI midlayer.
-
-
- The SCSI upper layer provides the interface between userspace and the
- kernel, in the form of block and char device nodes for I/O and
- ioctl(). The SCSI lower layer contains drivers for specific hardware
- devices.
-
-
- In between is the SCSI mid-layer, analogous to a network routing
- layer such as the IPv4 stack. The SCSI mid-layer routes a packet
- based data protocol between the upper layer's /dev nodes and the
- corresponding devices in the lower layer. It manages command queues,
- provides error handling and power management functions, and responds
- to ioctl() requests.
-
-
-
-
-
- SCSI upper layer
-
- The upper layer supports the user-kernel interface by providing
- device nodes.
-
-
- sd (SCSI Disk)
- sd (sd_mod.o)
-
-
-
- sr (SCSI CD-ROM)
- sr (sr_mod.o)
-
-
- st (SCSI Tape)
- st (st.o)
-
-
- sg (SCSI Generic)
- sg (sg.o)
-
-
- ch (SCSI Media Changer)
- ch (ch.c)
-
-
-
-
- SCSI mid layer
-
-
- SCSI midlayer implementation
-
- include/scsi/scsi_device.h
-
-
-!Iinclude/scsi/scsi_device.h
-
-
-
- drivers/scsi/scsi.c
- Main file for the SCSI midlayer.
-!Edrivers/scsi/scsi.c
-
-
- drivers/scsi/scsicam.c
-
- SCSI
- Common Access Method support functions, for use with
- HDIO_GETGEO, etc.
-
-!Edrivers/scsi/scsicam.c
-
-
- drivers/scsi/scsi_error.c
- Common SCSI error/timeout handling routines.
-!Edrivers/scsi/scsi_error.c
-
-
- drivers/scsi/scsi_devinfo.c
-
- Manage scsi_dev_info_list, which tracks blacklisted and whitelisted
- devices.
-
-!Idrivers/scsi/scsi_devinfo.c
-
-
- drivers/scsi/scsi_ioctl.c
-
- Handle ioctl() calls for SCSI devices.
-
-!Edrivers/scsi/scsi_ioctl.c
-
-
- drivers/scsi/scsi_lib.c
-
- SCSI queuing library.
-
-!Edrivers/scsi/scsi_lib.c
-
-
- drivers/scsi/scsi_lib_dma.c
-
- SCSI library functions depending on DMA
- (map and unmap scatter-gather lists).
-
-!Edrivers/scsi/scsi_lib_dma.c
-
-
- drivers/scsi/scsi_module.c
-
- The file drivers/scsi/scsi_module.c contains legacy support for
- old-style host templates. It should never be used by any new driver.
-
-
-
- drivers/scsi/scsi_proc.c
-
- The functions in this file provide an interface between
- the PROC file system and the SCSI device drivers
- It is mainly used for debugging, statistics and to pass
- information directly to the lowlevel driver.
-
- I.E. plumbing to manage /proc/scsi/*
-
-!Idrivers/scsi/scsi_proc.c
-
-
- drivers/scsi/scsi_netlink.c
-
- Infrastructure to provide async events from transports to userspace
- via netlink, using a single NETLINK_SCSITRANSPORT protocol for all
- transports.
-
- See the
- original patch submission for more details.
-
-!Idrivers/scsi/scsi_netlink.c
-
-
- drivers/scsi/scsi_scan.c
-
- Scan a host to determine which (if any) devices are attached.
-
- The general scanning/probing algorithm is as follows, exceptions are
- made to it depending on device specific flags, compilation options,
- and global variable (boot or module load time) settings.
-
- A specific LUN is scanned via an INQUIRY command; if the LUN has a
- device attached, a scsi_device is allocated and setup for it.
-
- For every id of every channel on the given host, start by scanning
- LUN 0. Skip hosts that don't respond at all to a scan of LUN 0.
- Otherwise, if LUN 0 has a device attached, allocate and setup a
- scsi_device for it. If target is SCSI-3 or up, issue a REPORT LUN,
- and scan all of the LUNs returned by the REPORT LUN; else,
- sequentially scan LUNs up until some maximum is reached, or a LUN is
- seen that cannot have a device attached to it.
-
-!Idrivers/scsi/scsi_scan.c
-
-
- drivers/scsi/scsi_sysctl.c
-
- Set up the sysctl entry: "/dev/scsi/logging_level"
- (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level.
-
-
-
- drivers/scsi/scsi_sysfs.c
-
- SCSI sysfs interface routines.
-
-!Edrivers/scsi/scsi_sysfs.c
-
-
- drivers/scsi/hosts.c
-
- mid to lowlevel SCSI driver interface
-
-!Edrivers/scsi/hosts.c
-
-
- drivers/scsi/constants.c
-
- mid to lowlevel SCSI driver interface
-
-!Edrivers/scsi/constants.c
-
-
-
-
- Transport classes
-
- Transport classes are service libraries for drivers in the SCSI
- lower layer, which expose transport attributes in sysfs.
-
-
- Fibre Channel transport
-
- The file drivers/scsi/scsi_transport_fc.c defines transport attributes
- for Fibre Channel.
-
-!Edrivers/scsi/scsi_transport_fc.c
-
-
- iSCSI transport class
-
- The file drivers/scsi/scsi_transport_iscsi.c defines transport
- attributes for the iSCSI class, which sends SCSI packets over TCP/IP
- connections.
-
-!Edrivers/scsi/scsi_transport_iscsi.c
-
-
- Serial Attached SCSI (SAS) transport class
-
- The file drivers/scsi/scsi_transport_sas.c defines transport
- attributes for Serial Attached SCSI, a variant of SATA aimed at
- large high-end systems.
-
-
- The SAS transport class contains common code to deal with SAS HBAs,
- an aproximated representation of SAS topologies in the driver model,
- and various sysfs attributes to expose these topologies and management
- interfaces to userspace.
-
-
- In addition to the basic SCSI core objects this transport class
- introduces two additional intermediate objects: The SAS PHY
- as represented by struct sas_phy defines an "outgoing" PHY on
- a SAS HBA or Expander, and the SAS remote PHY represented by
- struct sas_rphy defines an "incoming" PHY on a SAS Expander or
- end device. Note that this is purely a software concept, the
- underlying hardware for a PHY and a remote PHY is the exactly
- the same.
-
-
- There is no concept of a SAS port in this code, users can see
- what PHYs form a wide port based on the port_identifier attribute,
- which is the same for all PHYs in a port.
-
-!Edrivers/scsi/scsi_transport_sas.c
-
-
- SATA transport class
-
- The SATA transport is handled by libata, which has its own book of
- documentation in this directory.
-
-
-
- Parallel SCSI (SPI) transport class
-
- The file drivers/scsi/scsi_transport_spi.c defines transport
- attributes for traditional (fast/wide/ultra) SCSI busses.
-
-!Edrivers/scsi/scsi_transport_spi.c
-
-
- SCSI RDMA (SRP) transport class
-
- The file drivers/scsi/scsi_transport_srp.c defines transport
- attributes for SCSI over Remote Direct Memory Access.
-
-!Edrivers/scsi/scsi_transport_srp.c
-
-
-
-
-
-
- SCSI lower layer
-
- Host Bus Adapter transport types
-
- Many modern device controllers use the SCSI command set as a protocol to
- communicate with their devices through many different types of physical
- connections.
-
-
- In SCSI language a bus capable of carrying SCSI commands is
- called a "transport", and a controller connecting to such a bus is
- called a "host bus adapter" (HBA).
-
-
- Debug transport
-
- The file drivers/scsi/scsi_debug.c simulates a host adapter with a
- variable number of disks (or disk like devices) attached, sharing a
- common amount of RAM. Does a lot of checking to make sure that we are
- not getting blocks mixed up, and panics the kernel if anything out of
- the ordinary is seen.
-
-
- To be more realistic, the simulated devices have the transport
- attributes of SAS disks.
-
-
- For documentation see
- http://sg.danny.cz/sg/sdebug26.html
-
-
-
-
- todo
- Parallel (fast/wide/ultra) SCSI, USB, SATA,
- SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband,
- I20, iSCSI, Parallel ports, netlink...
-
-
-
-
-
diff --git a/Documentation/DocBook/sh.tmpl b/Documentation/DocBook/sh.tmpl
deleted file mode 100644
index 4a38f604fa661fe3b2383746b67f59bc45ed619c..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/sh.tmpl
+++ /dev/null
@@ -1,105 +0,0 @@
-
-
-
-
-
- SuperH Interfaces Guide
-
-
-
- Paul
- Mundt
-
-
- lethal@linux-sh.org
-
-
-
-
-
-
- 2008-2010
- Paul Mundt
-
-
- 2008-2010
- Renesas Technology Corp.
-
-
- 2010
- Renesas Electronics Corp.
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2 as published by the Free Software Foundation.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Memory Management
-
- SH-4
-
- Store Queue API
-!Earch/sh/kernel/cpu/sh4/sq.c
-
-
-
- SH-5
-
- TLB Interfaces
-!Iarch/sh/mm/tlb-sh5.c
-!Iarch/sh/include/asm/tlb_64.h
-
-
-
-
- Machine Specific Interfaces
-
- mach-dreamcast
-!Iarch/sh/boards/mach-dreamcast/rtc.c
-
-
- mach-x3proto
-!Earch/sh/boards/mach-x3proto/ilsel.c
-
-
-
- Busses
-
- SuperHyway
-!Edrivers/sh/superhyway/superhyway.c
-
-
-
- Maple
-!Edrivers/sh/maple/maple.c
-
-
-
diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl
deleted file mode 100644
index 3bf4ecf3d760c8145479be580d6db8da17543447..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/stylesheet.xsl
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-1
-ansi
-80
-0
-
-1
-2
-1
-
diff --git a/Documentation/DocBook/w1.tmpl b/Documentation/DocBook/w1.tmpl
deleted file mode 100644
index b0228d4c81bb5ccca176720a9e0846acb8668b4f..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/w1.tmpl
+++ /dev/null
@@ -1,101 +0,0 @@
-
-
-
-
-
- W1: Dallas' 1-wire bus
-
-
-
- David
- Fries
-
-
- David@Fries.net
-
-
-
-
-
-
-
- 2013
-
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License version 2.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- W1 API internal to the kernel
-
-
- W1 API internal to the kernel
-
- drivers/w1/w1.h
- W1 core functions.
-!Idrivers/w1/w1.h
-
-
-
- drivers/w1/w1.c
- W1 core functions.
-!Idrivers/w1/w1.c
-
-
-
- drivers/w1/w1_family.h
- Allows registering device family operations.
-!Idrivers/w1/w1_family.h
-
-
-
- drivers/w1/w1_family.c
- Allows registering device family operations.
-!Edrivers/w1/w1_family.c
-
-
-
- drivers/w1/w1_int.c
- W1 internal initialization for master devices.
-!Edrivers/w1/w1_int.c
-
-
-
- drivers/w1/w1_netlink.h
- W1 external netlink API structures and commands.
-!Idrivers/w1/w1_netlink.h
-
-
-
- drivers/w1/w1_io.c
- W1 input/output.
-!Edrivers/w1/w1_io.c
-!Idrivers/w1/w1_io.c
-
-
-
-
-
-
-
-
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl
deleted file mode 100644
index 6f3883be877e2a00a5664e5539fec5e47089a902..0000000000000000000000000000000000000000
--- a/Documentation/DocBook/z8530book.tmpl
+++ /dev/null
@@ -1,371 +0,0 @@
-
-
-
-
-
- Z8530 Programming Guide
-
-
-
- Alan
- Cox
-
-
- alan@lxorguk.ukuu.org.uk
-
-
-
-
-
-
- 2000
- Alan Cox
-
-
-
-
- This documentation is free software; you can redistribute
- it and/or modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later
- version.
-
-
-
- This program is distributed in the hope that it will be
- useful, but WITHOUT ANY WARRANTY; without even the implied
- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- See the GNU General Public License for more details.
-
-
-
- You should have received a copy of the GNU General Public
- License along with this program; if not, write to the Free
- Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
- MA 02111-1307 USA
-
-
-
- For more details see the file COPYING in the source
- distribution of Linux.
-
-
-
-
-
-
-
- Introduction
-
- The Z85x30 family synchronous/asynchronous controller chips are
- used on a large number of cheap network interface cards. The
- kernel provides a core interface layer that is designed to make
- it easy to provide WAN services using this chip.
-
-
- The current driver only support synchronous operation. Merging the
- asynchronous driver support into this code to allow any Z85x30
- device to be used as both a tty interface and as a synchronous
- controller is a project for Linux post the 2.4 release
-
-
-
-
- Driver Modes
-
- The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
- in three different modes. Each mode can be applied to an individual
- channel on the chip (each chip has two channels).
-
-
- The PIO synchronous mode supports the most common Z8530 wiring. Here
- the chip is interface to the I/O and interrupt facilities of the
- host machine but not to the DMA subsystem. When running PIO the
- Z8530 has extremely tight timing requirements. Doing high speeds,
- even with a Z85230 will be tricky. Typically you should expect to
- achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
-
-
- The DMA mode supports the chip when it is configured to use dual DMA
- channels on an ISA bus. The better cards tend to support this mode
- of operation for a single channel. With DMA running the Z85230 tops
- out when it starts to hit ISA DMA constraints at about 512Kbits. It
- is worth noting here that many PC machines hang or crash when the
- chip is driven fast enough to hold the ISA bus solid.
-
-
- Transmit DMA mode uses a single DMA channel. The DMA channel is used
- for transmission as the transmit FIFO is smaller than the receive
- FIFO. it gives better performance than pure PIO mode but is nowhere
- near as ideal as pure DMA mode.
-
-
-
-
- Using the Z85230 driver
-
- The Z85230 driver provides the back end interface to your board. To
- configure a Z8530 interface you need to detect the board and to
- identify its ports and interrupt resources. It is also your problem
- to verify the resources are available.
-
-
- Having identified the chip you need to fill in a struct z8530_dev,
- which describes each chip. This object must exist until you finally
- shutdown the board. Firstly zero the active field. This ensures
- nothing goes off without you intending it. The irq field should
- be set to the interrupt number of the chip. (Each chip has a single
- interrupt source rather than each channel). You are responsible
- for allocating the interrupt line. The interrupt handler should be
- set to z8530_interrupt. The device id should
- be set to the z8530_dev structure pointer. Whether the interrupt can
- be shared or not is board dependent, and up to you to initialise.
-
-
- The structure holds two channel structures.
- Initialise chanA.ctrlio and chanA.dataio with the address of the
- control and data ports. You can or this with Z8530_PORT_SLEEP to
- indicate your interface needs the 5uS delay for chip settling done
- in software. The PORT_SLEEP option is architecture specific. Other
- flags may become available on future platforms, eg for MMIO.
- Initialise the chanA.irqs to &z8530_nop to start the chip up
- as disabled and discarding interrupt events. This ensures that
- stray interrupts will be mopped up and not hang the bus. Set
- chanA.dev to point to the device structure itself. The
- private and name field you may use as you wish. The private field
- is unused by the Z85230 layer. The name is used for error reporting
- and it may thus make sense to make it match the network name.
-
-
- Repeat the same operation with the B channel if your chip has
- both channels wired to something useful. This isn't always the
- case. If it is not wired then the I/O values do not matter, but
- you must initialise chanB.dev.
-
-
- If your board has DMA facilities then initialise the txdma and
- rxdma fields for the relevant channels. You must also allocate the
- ISA DMA channels and do any necessary board level initialisation
- to configure them. The low level driver will do the Z8530 and
- DMA controller programming but not board specific magic.
-
-
- Having initialised the device you can then call
- z8530_init. This will probe the chip and
- reset it into a known state. An identification sequence is then
- run to identify the chip type. If the checks fail to pass the
- function returns a non zero error code. Typically this indicates
- that the port given is not valid. After this call the
- type field of the z8530_dev structure is initialised to either
- Z8530, Z85C30 or Z85230 according to the chip found.
-
-
- Once you have called z8530_init you can also make use of the utility
- function z8530_describe. This provides a
- consistent reporting format for the Z8530 devices, and allows all
- the drivers to provide consistent reporting.
-
-
-
-
- Attaching Network Interfaces
-
- If you wish to use the network interface facilities of the driver,
- then you need to attach a network device to each channel that is
- present and in use. In addition to use the generic HDLC
- you need to follow some additional plumbing rules. They may seem
- complex but a look at the example hostess_sv11 driver should
- reassure you.
-
-
- The network device used for each channel should be pointed to by
- the netdevice field of each channel. The hdlc-> priv field of the
- network device points to your private data - you will need to be
- able to find your private data from this.
-
-
- The way most drivers approach this particular problem is to
- create a structure holding the Z8530 device definition and
- put that into the private field of the network device. The
- network device fields of the channels then point back to the
- network devices.
-
-
- If you wish to use the generic HDLC then you need to register
- the HDLC device.
-
-
- Before you register your network device you will also need to
- provide suitable handlers for most of the network device callbacks.
- See the network device documentation for more details on this.
-
-
-
-
- Configuring And Activating The Port
-
- The Z85230 driver provides helper functions and tables to load the
- port registers on the Z8530 chips. When programming the register
- settings for a channel be aware that the documentation recommends
- initialisation orders. Strange things happen when these are not
- followed.
-
-
- z8530_channel_load takes an array of
- pairs of initialisation values in an array of u8 type. The first
- value is the Z8530 register number. Add 16 to indicate the alternate
- register bank on the later chips. The array is terminated by a 255.
-
-
- The driver provides a pair of public tables. The
- z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
- also happens to cover most other end host configurations. The
- z8530_hdlc_kilostream_85230 table is the same configuration using
- the enhancements of the 85230 chip. The configuration loaded is
- standard NRZ encoded synchronous data with HDLC bitstuffing. All
- of the timing is taken from the other end of the link.
-
-
- When writing your own tables be aware that the driver internally
- tracks register values. It may need to reload values. You should
- therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
- configurations. Where the register settings depend on DMA selection
- the driver will update the bits itself when you open or close.
- Loading a new table with the interface open is not recommended.
-
-
- There are three standard configurations supported by the core
- code. In PIO mode the interface is programmed up to use
- interrupt driven PIO. This places high demands on the host processor
- to avoid latency. The driver is written to take account of latency
- issues but it cannot avoid latencies caused by other drivers,
- notably IDE in PIO mode. Because the drivers allocate buffers you
- must also prevent MTU changes while the port is open.
-
-
- Once the port is open it will call the rx_function of each channel
- whenever a completed packet arrived. This is invoked from
- interrupt context and passes you the channel and a network
- buffer (struct sk_buff) holding the data. The data includes
- the CRC bytes so most users will want to trim the last two
- bytes before processing the data. This function is very timing
- critical. When you wish to simply discard data the support
- code provides the function z8530_null_rx
- to discard the data.
-
-
- To active PIO mode sending and receiving the
- z8530_sync_open is called. This expects to be passed
- the network device and the channel. Typically this is called from
- your network device open callback. On a failure a non zero error
- status is returned. The z8530_sync_close
- function shuts down a PIO channel. This must be done before the
- channel is opened again and before the driver shuts down
- and unloads.
-
-
- The ideal mode of operation is dual channel DMA mode. Here the
- kernel driver will configure the board for DMA in both directions.
- The driver also handles ISA DMA issues such as controller
- programming and the memory range limit for you. This mode is
- activated by calling the z8530_sync_dma_open
- function. On failure a non zero error value is returned.
- Once this mode is activated it can be shut down by calling the
- z8530_sync_dma_close. You must call the close
- function matching the open mode you used.
-
-
- The final supported mode uses a single DMA channel to drive the
- transmit side. As the Z85C30 has a larger FIFO on the receive
- channel this tends to increase the maximum speed a little.
- This is activated by calling the z8530_sync_txdma_open
- . This returns a non zero error code on failure. The
- z8530_sync_txdma_close function closes down
- the Z8530 interface from this mode.
-
-
-
-
- Network Layer Functions
-
- The Z8530 layer provides functions to queue packets for
- transmission. The driver internally buffers the frame currently
- being transmitted and one further frame (in order to keep back
- to back transmission running). Any further buffering is up to
- the caller.
-
-
- The function z8530_queue_xmit takes a network
- buffer in sk_buff format and queues it for transmission. The
- caller must provide the entire packet with the exception of the
- bitstuffing and CRC. This is normally done by the caller via
- the generic HDLC interface layer. It returns 0 if the buffer has been
- queued and non zero values for queue full. If the function accepts
- the buffer it becomes property of the Z8530 layer and the caller
- should not free it.
-
-
- The function z8530_get_stats returns a pointer
- to an internally maintained per interface statistics block. This
- provides most of the interface code needed to implement the network
- layer get_stats callback.
-
-
-
-
- Porting The Z8530 Driver
-
- The Z8530 driver is written to be portable. In DMA mode it makes
- assumptions about the use of ISA DMA. These are probably warranted
- in most cases as the Z85230 in particular was designed to glue to PC
- type machines. The PIO mode makes no real assumptions.
-
-
- Should you need to retarget the Z8530 driver to another architecture
- the only code that should need changing are the port I/O functions.
- At the moment these assume PC I/O port accesses. This may not be
- appropriate for all platforms. Replacing
- z8530_read_port and z8530_write_port
- is intended to be all that is required to port this
- driver layer.
-
-
-
-
- Known Bugs And Assumptions
-
-
- Interrupt Locking
-
-
- The locking in the driver is done via the global cli/sti lock. This
- makes for relatively poor SMP performance. Switching this to use a
- per device spin lock would probably materially improve performance.
-
-
-
- Occasional Failures
-
-
- We have reports of occasional failures when run for very long
- periods of time and the driver starts to receive junk frames. At
- the moment the cause of this is not clear.
-
-
-
-
-
-
-
-
- Public Functions Provided
-!Edrivers/net/wan/z85230.c
-
-
-
- Internal Functions
-!Idrivers/net/wan/z85230.c
-
-
-
diff --git a/Documentation/Makefile b/Documentation/Makefile
index c2a469112c37bbc95d4a58e671eab5344b4aa428..a42320385df34eed57b2ece17e9f859ae1a92a2d 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -1 +1,126 @@
+# -*- makefile -*-
+# Makefile for Sphinx documentation
+#
+
subdir-y :=
+
+# You can set these variables from the command line.
+SPHINXBUILD = sphinx-build
+SPHINXOPTS =
+SPHINXDIRS = .
+_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
+SPHINX_CONF = conf.py
+PAPER =
+BUILDDIR = $(obj)/output
+PDFLATEX = xelatex
+LATEXOPTS = -interaction=batchmode
+
+# User-friendly check for sphinx-build
+HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi)
+
+ifeq ($(HAVE_SPHINX),0)
+
+.DEFAULT:
+ $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
+ @echo " SKIP Sphinx $@ target."
+
+else # HAVE_SPHINX
+
+# User-friendly check for pdflatex
+HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+KERNELDOC = $(srctree)/scripts/kernel-doc
+KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC)
+ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+# commands; the 'cmd' from scripts/Kbuild.include is not *loopable*
+loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit;
+
+# $2 sphinx builder e.g. "html"
+# $3 name of the build subfolder / e.g. "media", used as:
+# * dest folder relative to $(BUILDDIR) and
+# * cache folder relative to $(BUILDDIR)/.doctrees
+# $4 dest subfolder e.g. "man" for man pages at media/man
+# $5 reST source folder relative to $(srctree)/$(src),
+# e.g. "media" for the linux-tv book-set at ./Documentation/media
+
+quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
+ cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
+ PYTHONDONTWRITEBYTECODE=1 \
+ BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
+ $(SPHINXBUILD) \
+ -b $2 \
+ -c $(abspath $(srctree)/$(src)) \
+ -d $(abspath $(BUILDDIR)/.doctrees/$3) \
+ -D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
+ $(ALLSPHINXOPTS) \
+ $(abspath $(srctree)/$(src)/$5) \
+ $(abspath $(BUILDDIR)/$3/$4)
+
+htmldocs:
+ @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
+
+linkcheckdocs:
+ @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
+
+latexdocs:
+ @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
+
+ifeq ($(HAVE_PDFLATEX),0)
+
+pdfdocs:
+ $(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.)
+ @echo " SKIP Sphinx $@ target."
+
+else # HAVE_PDFLATEX
+
+pdfdocs: latexdocs
+ $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
+
+endif # HAVE_PDFLATEX
+
+epubdocs:
+ @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
+
+xmldocs:
+ @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
+
+endif # HAVE_SPHINX
+
+# The following targets are independent of HAVE_SPHINX, and the rules should
+# work or silently pass without Sphinx.
+
+# no-ops for the Sphinx toolchain
+sgmldocs:
+ @:
+psdocs:
+ @:
+mandocs:
+ @:
+installmandocs:
+ @:
+
+cleandocs:
+ $(Q)rm -rf $(BUILDDIR)
+ $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
+
+dochelp:
+ @echo ' Linux kernel internal documentation in different formats from ReST:'
+ @echo ' htmldocs - HTML'
+ @echo ' latexdocs - LaTeX'
+ @echo ' pdfdocs - PDF'
+ @echo ' epubdocs - EPUB'
+ @echo ' xmldocs - XML'
+ @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)'
+ @echo ' cleandocs - clean all generated files'
+ @echo
+ @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
+ @echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)'
+ @echo
+ @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
+ @echo ' configuration. This is e.g. useful to build with nit-picking config.'
diff --git a/Documentation/Makefile.sphinx b/Documentation/Makefile.sphinx
deleted file mode 100644
index bcf529f6cf9b258e4389377a4cf04796fb1594de..0000000000000000000000000000000000000000
--- a/Documentation/Makefile.sphinx
+++ /dev/null
@@ -1,130 +0,0 @@
-# -*- makefile -*-
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXBUILD = sphinx-build
-SPHINXOPTS =
-SPHINXDIRS = .
-_SPHINXDIRS = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
-SPHINX_CONF = conf.py
-PAPER =
-BUILDDIR = $(obj)/output
-PDFLATEX = xelatex
-LATEXOPTS = -interaction=batchmode
-
-# User-friendly check for sphinx-build
-HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi)
-
-ifeq ($(HAVE_SPHINX),0)
-
-.DEFAULT:
- $(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
- @echo " SKIP Sphinx $@ target."
-
-else ifneq ($(DOCBOOKS),)
-
-# Skip Sphinx build if the user explicitly requested DOCBOOKS.
-.DEFAULT:
- @echo " SKIP Sphinx $@ target (DOCBOOKS specified)."
-
-else # HAVE_SPHINX
-
-# User-friendly check for pdflatex
-HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
-
-# Internal variables.
-PAPEROPT_a4 = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-KERNELDOC = $(srctree)/scripts/kernel-doc
-KERNELDOC_CONF = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC)
-ALLSPHINXOPTS = $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-# commands; the 'cmd' from scripts/Kbuild.include is not *loopable*
-loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit;
-
-# $2 sphinx builder e.g. "html"
-# $3 name of the build subfolder / e.g. "media", used as:
-# * dest folder relative to $(BUILDDIR) and
-# * cache folder relative to $(BUILDDIR)/.doctrees
-# $4 dest subfolder e.g. "man" for man pages at media/man
-# $5 reST source folder relative to $(srctree)/$(src),
-# e.g. "media" for the linux-tv book-set at ./Documentation/media
-
-quiet_cmd_sphinx = SPHINX $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
- cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
- PYTHONDONTWRITEBYTECODE=1 \
- BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
- $(SPHINXBUILD) \
- -b $2 \
- -c $(abspath $(srctree)/$(src)) \
- -d $(abspath $(BUILDDIR)/.doctrees/$3) \
- -D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
- $(ALLSPHINXOPTS) \
- $(abspath $(srctree)/$(src)/$5) \
- $(abspath $(BUILDDIR)/$3/$4)
-
-htmldocs:
- @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
-
-linkcheckdocs:
- @$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
-
-latexdocs:
- @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
-
-ifeq ($(HAVE_PDFLATEX),0)
-
-pdfdocs:
- $(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.)
- @echo " SKIP Sphinx $@ target."
-
-else # HAVE_PDFLATEX
-
-pdfdocs: latexdocs
- $(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
-
-endif # HAVE_PDFLATEX
-
-epubdocs:
- @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
-
-xmldocs:
- @+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
-
-endif # HAVE_SPHINX
-
-# The following targets are independent of HAVE_SPHINX, and the rules should
-# work or silently pass without Sphinx.
-
-# no-ops for the Sphinx toolchain
-sgmldocs:
- @:
-psdocs:
- @:
-mandocs:
- @:
-installmandocs:
- @:
-
-cleandocs:
- $(Q)rm -rf $(BUILDDIR)
- $(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
-
-dochelp:
- @echo ' Linux kernel internal documentation in different formats (Sphinx):'
- @echo ' htmldocs - HTML'
- @echo ' latexdocs - LaTeX'
- @echo ' pdfdocs - PDF'
- @echo ' epubdocs - EPUB'
- @echo ' xmldocs - XML'
- @echo ' linkcheckdocs - check for broken external links (will connect to external hosts)'
- @echo ' cleandocs - clean all generated files'
- @echo
- @echo ' make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
- @echo ' valid values for SPHINXDIRS are: $(_SPHINXDIRS)'
- @echo
- @echo ' make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
- @echo ' configuration. This is e.g. useful to build with nit-picking config.'
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 1e37138027a3c85e758bdcb647abfdf2ed47f3c2..618e13d5e27632513791218707b94eb308395c07 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -186,7 +186,7 @@ must disable interrupts while the lock is held. If the device sends
a different interrupt, the driver will deadlock trying to recursively
acquire the spinlock. Such deadlocks can be avoided by using
spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
-and acquire the lock (see Documentation/DocBook/kernel-locking).
+and acquire the lock (see Documentation/kernel-hacking/locking.rst).
4.5 How to tell whether MSI/MSI-X is enabled on a device
diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst
index b96e80f79e853109abdb0d73a6a688453f9689cc..b5343c5aa224ce0ffbbddadc8d2a73e3c427f3b1 100644
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -55,12 +55,6 @@ Documentation
contains information about the problems, which may result by upgrading
your kernel.
- - The Documentation/DocBook/ subdirectory contains several guides for
- kernel developers and users. These guides can be rendered in a
- number of formats: PostScript (.ps), PDF, HTML, & man-pages, among others.
- After installation, ``make psdocs``, ``make pdfdocs``, ``make htmldocs``,
- or ``make mandocs`` will render the documentation in the requested format.
-
Installing the kernel source
----------------------------
diff --git a/Documentation/conf.py b/Documentation/conf.py
index bacf9d337c89a067c8a9187d4cfeba6fe6582fcd..77d47bb1df1d5b97455646168b1d258caf1f29d5 100644
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -281,6 +281,7 @@ latex_elements = {
\\definecolor{NoteColor}{RGB}{204,255,255}
\\definecolor{WarningColor}{RGB}{255,204,204}
\\definecolor{AttentionColor}{RGB}{255,255,204}
+ \\definecolor{ImportantColor}{RGB}{192,255,204}
\\definecolor{OtherColor}{RGB}{204,204,204}
\\newlength{\\mynoticelength}
\\makeatletter\\newenvironment{coloredbox}[1]{%
@@ -301,7 +302,12 @@ latex_elements = {
\\ifthenelse%
{\\equal{\\py@noticetype}{attention}}%
{\\colorbox{AttentionColor}{\\usebox{\\@tempboxa}}}%
- {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}%
+ {%
+ \\ifthenelse%
+ {\\equal{\\py@noticetype}{important}}%
+ {\\colorbox{ImportantColor}{\\usebox{\\@tempboxa}}}%
+ {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}%
+ }%
}%
}%
}\\makeatother
@@ -339,27 +345,42 @@ if major == 1 and minor > 3:
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
+# Sorted in alphabetical order
latex_documents = [
- ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
- 'The kernel development community', 'manual'),
('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation',
'The kernel development community', 'manual'),
('core-api/index', 'core-api.tex', 'The kernel core API manual',
'The kernel development community', 'manual'),
- ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
+ ('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual',
'The kernel development community', 'manual'),
- ('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
+ ('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel',
'The kernel development community', 'manual'),
- ('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation',
+ ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
'The kernel development community', 'manual'),
- ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
+ ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
+ 'The kernel development community', 'manual'),
+ ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API',
'The kernel development community', 'manual'),
('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide',
'The kernel development community', 'manual'),
+ ('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
+ 'The kernel development community', 'manual'),
+ ('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel',
+ 'The kernel development community', 'manual'),
('media/index', 'media.tex', 'Linux Media Subsystem Documentation',
'The kernel development community', 'manual'),
+ ('networking/index', 'networking.tex', 'Linux Networking Documentation',
+ 'The kernel development community', 'manual'),
+ ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
+ 'The kernel development community', 'manual'),
('security/index', 'security.tex', 'The kernel security subsystem manual',
'The kernel development community', 'manual'),
+ ('sh/index', 'sh.tex', 'SuperH architecture implementation manual',
+ 'The kernel development community', 'manual'),
+ ('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation',
+ 'The kernel development community', 'manual'),
+ ('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide',
+ 'The kernel development community', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 62abd36bfffbc1c1421c9212f40ac33d2f0fcf0c..0606be3a3111819de772c3ce13ba61c0915898e5 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -19,6 +19,7 @@ Core utilities
workqueue
genericirq
flexible-arrays
+ librs
Interfaces for kernel debugging
===============================
diff --git a/Documentation/core-api/librs.rst b/Documentation/core-api/librs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6010f5bc5bf91d3c9561dc866b9d900ae88ea438
--- /dev/null
+++ b/Documentation/core-api/librs.rst
@@ -0,0 +1,212 @@
+==========================================
+Reed-Solomon Library Programming Interface
+==========================================
+
+:Author: Thomas Gleixner
+
+Introduction
+============
+
+The generic Reed-Solomon Library provides encoding, decoding and error
+correction functions.
+
+Reed-Solomon codes are used in communication and storage applications to
+ensure data integrity.
+
+This documentation is provided for developers who want to utilize the
+functions provided by the library.
+
+Known Bugs And Assumptions
+==========================
+
+None.
+
+Usage
+=====
+
+This chapter provides examples of how to use the library.
+
+Initializing
+------------
+
+The init function init_rs returns a pointer to an rs decoder structure,
+which holds the necessary information for encoding, decoding and error
+correction with the given polynomial. It either uses an existing
+matching decoder or creates a new one. On creation all the lookup tables
+for fast en/decoding are created. The function may take a while, so make
+sure not to call it in critical code paths.
+
+::
+
+ /* the Reed Solomon control structure */
+ static struct rs_control *rs_decoder;
+
+ /* Symbolsize is 10 (bits)
+ * Primitive polynomial is x^10+x^3+1
+ * first consecutive root is 0
+ * primitive element to generate roots = 1
+ * generator polynomial degree (number of roots) = 6
+ */
+ rs_decoder = init_rs (10, 0x409, 0, 1, 6);
+
+
+Encoding
+--------
+
+The encoder calculates the Reed-Solomon code over the given data length
+and stores the result in the parity buffer. Note that the parity buffer
+must be initialized before calling the encoder.
+
+The expanded data can be inverted on the fly by providing a non-zero
+inversion mask. The expanded data is XOR'ed with the mask. This is used
+e.g. for FLASH ECC, where the all 0xFF is inverted to an all 0x00. The
+Reed-Solomon code for all 0x00 is all 0x00. The code is inverted before
+storing to FLASH so it is 0xFF too. This prevents reads from an
+erased FLASH from resulting in ECC errors.
+
+The databytes are expanded to the given symbol size on the fly. There is
+no support for encoding continuous bitstreams with a symbol size != 8 at
+the moment. If it is necessary it should not be a big deal to implement
+such functionality.
+
+::
+
+ /* Parity buffer. Size = number of roots */
+ uint16_t par[6];
+ /* Initialize the parity buffer */
+ memset(par, 0, sizeof(par));
+ /* Encode 512 byte in data8. Store parity in buffer par */
+ encode_rs8 (rs_decoder, data8, 512, par, 0);
+
+
+Decoding
+--------
+
+The decoder calculates the syndrome over the given data length and the
+received parity symbols and corrects errors in the data.
+
+If a syndrome is available from a hardware decoder then the syndrome
+calculation is skipped.
+
+The correction of the data buffer can be suppressed by providing a
+correction pattern buffer and an error location buffer to the decoder.
+The decoder stores the calculated error location and the correction
+bitmask in the given buffers. This is useful for hardware decoders which
+use a weird bit ordering scheme.
+
+The databytes are expanded to the given symbol size on the fly. There is
+no support for decoding continuous bitstreams with a symbolsize != 8 at
+the moment. If it is necessary it should not be a big deal to implement
+such functionality.
+
+Decoding with syndrome calculation, direct data correction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ /* Parity buffer. Size = number of roots */
+ uint16_t par[6];
+ uint8_t data8[512];
+ int numerr;
+ /* Receive data */
+ .....
+ /* Receive parity */
+ .....
+ /* Decode 512 byte in data8.*/
+ numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
+
+
+Decoding with syndrome given by hardware decoder, direct data correction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+::
+
+ /* Parity buffer. Size = number of roots */
+ uint16_t par[6], syn[6];
+ uint8_t data8[512];
+ int numerr;
+ /* Receive data */
+ .....
+ /* Receive parity */
+ .....
+ /* Get syndrome from hardware decoder */
+ .....
+ /* Decode 512 byte in data8.*/
+ numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
+
+
+Decoding with syndrome given by hardware decoder, no direct data correction.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Note: It's not necessary to give data and received parity to the
+decoder.
+
+::
+
+ /* Parity buffer. Size = number of roots */
+ uint16_t par[6], syn[6], corr[8];
+ uint8_t data[512];
+ int numerr, errpos[8];
+ /* Receive data */
+ .....
+ /* Receive parity */
+ .....
+ /* Get syndrome from hardware decoder */
+ .....
+ /* Decode 512 byte in data8.*/
+ numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
+ for (i = 0; i < numerr; i++) {
+ do_error_correction_in_your_buffer(errpos[i], corr[i]);
+ }
+
+
+Cleanup
+-------
+
+The function free_rs frees the allocated resources, if the caller is
+the last user of the decoder.
+
+::
+
+ /* Release resources */
+ free_rs(rs_decoder);
+
+
+Structures
+==========
+
+This chapter contains the autogenerated documentation of the structures
+which are used in the Reed-Solomon Library and are relevant for a
+developer.
+
+.. kernel-doc:: include/linux/rslib.h
+ :internal:
+
+Public Functions Provided
+=========================
+
+This chapter contains the autogenerated documentation of the
+Reed-Solomon functions which are exported.
+
+.. kernel-doc:: lib/reed_solomon/reed_solomon.c
+ :export:
+
+Credits
+=======
+
+The library code for encoding and decoding was written by Phil Karn.
+
+::
+
+ Copyright 2002, Phil Karn, KA9Q
+ May be used under the terms of the GNU General Public License (GPL)
+
+
+The wrapper functions and interfaces are written by Thomas Gleixner.
+
+Many users have provided bugfixes, improvements and helping hands for
+testing. Thanks a lot.
+
+The following people have contributed to this document:
+
+Thomas Gleixner\ tglx@linutronix.de
diff --git a/Documentation/crypto/conf.py b/Documentation/crypto/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..4335d251ddf339d72ace4a118f5204a7e5e59275
--- /dev/null
+++ b/Documentation/crypto/conf.py
@@ -0,0 +1,10 @@
+# -*- coding: utf-8; mode: python -*-
+
+project = 'Linux Kernel Crypto API'
+
+tags.add("subproject")
+
+latex_documents = [
+ ('index', 'crypto-api.tex', 'Linux Kernel Crypto API manual',
+ 'The kernel development community', 'manual'),
+]
diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst
index 07d881147ef3c1eec38ea0daf06dc336ec92d633..4ac991dbddb714219ae5231480784317dbd4aece 100644
--- a/Documentation/dev-tools/index.rst
+++ b/Documentation/dev-tools/index.rst
@@ -23,6 +23,7 @@ whole; patches welcome!
kmemleak
kmemcheck
gdb-kernel-debugging
+ kgdb
.. only:: subproject and html
diff --git a/Documentation/dev-tools/kgdb.rst b/Documentation/dev-tools/kgdb.rst
new file mode 100644
index 0000000000000000000000000000000000000000..75273203a35a25bcab3e4b90e4b368941fdc2b2a
--- /dev/null
+++ b/Documentation/dev-tools/kgdb.rst
@@ -0,0 +1,907 @@
+=================================================
+Using kgdb, kdb and the kernel debugger internals
+=================================================
+
+:Author: Jason Wessel
+
+Introduction
+============
+
+The kernel has two different debugger front ends (kdb and kgdb) which
+interface to the debug core. It is possible to use either of the
+debugger front ends and dynamically transition between them if you
+configure the kernel properly at compile and runtime.
+
+Kdb is a simplistic shell-style interface which you can use on a system
+console with a keyboard or serial console. You can use it to inspect
+memory, registers, process lists, dmesg, and even set breakpoints to
+stop in a certain location. Kdb is not a source level debugger, although
+you can set breakpoints and execute some basic kernel run control. Kdb
+is mainly aimed at doing some analysis to aid in development or
+diagnosing kernel problems. You can access some symbols by name in
+kernel built-ins or in kernel modules if the code was built with
+``CONFIG_KALLSYMS``.
+
+Kgdb is intended to be used as a source level debugger for the Linux
+kernel. It is used along with gdb to debug a Linux kernel. The
+expectation is that gdb can be used to "break in" to the kernel to
+inspect memory, variables and look through call stack information
+similar to the way an application developer would use gdb to debug an
+application. It is possible to place breakpoints in kernel code and
+perform some limited execution stepping.
+
+Two machines are required for using kgdb. One of these machines is a
+development machine and the other is the target machine. The kernel to
+be debugged runs on the target machine. The development machine runs an
+instance of gdb against the vmlinux file which contains the symbols (not
+a boot image such as bzImage, zImage, uImage...). In gdb the developer
+specifies the connection parameters and connects to kgdb. The type of
+connection a developer makes with gdb depends on the availability of
+kgdb I/O modules compiled as built-ins or loadable kernel modules in the
+test machine's kernel.
+
+Compiling a kernel
+==================
+
+- In order to enable compilation of kdb, you must first enable kgdb.
+
+- The kgdb test compile options are described in the kgdb test suite
+ chapter.
+
+Kernel config options for kgdb
+------------------------------
+
+To enable ``CONFIG_KGDB`` you should look under
+:menuselection:`Kernel hacking --> Kernel debugging` and select
+:menuselection:`KGDB: kernel debugger`.
+
+While it is not a hard requirement that you have symbols in your vmlinux
+file, gdb tends not to be very useful without the symbolic data, so you
+will want to turn on ``CONFIG_DEBUG_INFO`` which is called
+:menuselection:`Compile the kernel with debug info` in the config menu.
+
+It is advised, but not required, that you turn on the
+``CONFIG_FRAME_POINTER`` kernel option which is called :menuselection:`Compile
+the kernel with frame pointers` in the config menu. This option inserts code
+into the compiled executable which saves the frame information in
+registers or on the stack at different points which allows a debugger
+such as gdb to more accurately construct stack back traces while
+debugging the kernel.
+
+If the architecture that you are using supports the kernel option
+``CONFIG_STRICT_KERNEL_RWX``, you should consider turning it off. This
+option will prevent the use of software breakpoints because it marks
+certain regions of the kernel's memory space as read-only. If kgdb
+supports it for the architecture you are using, you can use hardware
+breakpoints if you desire to run with the ``CONFIG_STRICT_KERNEL_RWX``
+option turned on, else you need to turn off this option.
+
+Next you should choose one or more I/O drivers to interconnect debugging
+host and debugged target. Early boot debugging requires a KGDB I/O
+driver that supports early debugging and the driver must be built into
+the kernel directly. Kgdb I/O driver configuration takes place via
+kernel or module parameters which you can learn more about in the
+section that describes the parameter kgdboc.
+
+Here is an example set of ``.config`` symbols to enable or disable for kgdb::
+
+ # CONFIG_STRICT_KERNEL_RWX is not set
+ CONFIG_FRAME_POINTER=y
+ CONFIG_KGDB=y
+ CONFIG_KGDB_SERIAL_CONSOLE=y
+
+Kernel config options for kdb
+-----------------------------
+
+Kdb is quite a bit more complex than the simple gdbstub sitting on top
+of the kernel's debug core. Kdb must implement a shell, and also adds
+some helper functions in other parts of the kernel, responsible for
+printing out interesting data such as what you would see if you ran
+``lsmod``, or ``ps``. In order to build kdb into the kernel you follow the
+same steps as you would for kgdb.
+
+The main config option for kdb is ``CONFIG_KGDB_KDB`` which is called
+:menuselection:`KGDB_KDB: include kdb frontend for kgdb` in the config menu.
+In theory you would have already also selected an I/O driver such as the
+``CONFIG_KGDB_SERIAL_CONSOLE`` interface if you plan on using kdb on a
+serial port, when you were configuring kgdb.
+
+If you want to use a PS/2-style keyboard with kdb, you would select
+``CONFIG_KDB_KEYBOARD`` which is called :menuselection:`KGDB_KDB: keyboard as
+input device` in the config menu. The ``CONFIG_KDB_KEYBOARD`` option is not
+used for anything in the gdb interface to kgdb. The ``CONFIG_KDB_KEYBOARD``
+option only works with kdb.
+
+Here is an example set of ``.config`` symbols to enable/disable kdb::
+
+ # CONFIG_STRICT_KERNEL_RWX is not set
+ CONFIG_FRAME_POINTER=y
+ CONFIG_KGDB=y
+ CONFIG_KGDB_SERIAL_CONSOLE=y
+ CONFIG_KGDB_KDB=y
+ CONFIG_KDB_KEYBOARD=y
+
+Kernel Debugger Boot Arguments
+==============================
+
+This section describes the various runtime kernel parameters that affect
+the configuration of the kernel debugger. The following chapter covers
+using kdb and kgdb as well as providing some examples of the
+configuration parameters.
+
+Kernel parameter: kgdboc
+------------------------
+
+The kgdboc driver was originally an abbreviation meant to stand for
+"kgdb over console". Today it is the primary mechanism to configure how
+to communicate from gdb to kgdb as well as the devices you want to use
+to interact with the kdb shell.
+
+For kgdb/gdb, kgdboc is designed to work with a single serial port. It
+is intended to cover the circumstance where you want to use a serial
+console as your primary console as well as using it to perform kernel
+debugging. It is also possible to use kgdb on a serial port which is not
+designated as a system console. Kgdboc may be configured as a kernel
+built-in or a kernel loadable module. You can only make use of
+``kgdbwait`` and early debugging if you build kgdboc into the kernel as
+a built-in.
+
+Optionally you can elect to activate kms (Kernel Mode Setting)
+integration. When you use kms with kgdboc and you have a video driver
+that has atomic mode setting hooks, it is possible to enter the debugger
+on the graphics console. When the kernel execution is resumed, the
+previous graphics mode will be restored. This integration can serve as a
+useful tool to aid in diagnosing crashes or doing analysis of memory
+with kdb while allowing the full graphics console applications to run.
+
+kgdboc arguments
+~~~~~~~~~~~~~~~~
+
+Usage::
+
+ kgdboc=[kms][[,]kbd][[,]serial_device][,baud]
+
+The order listed above must be observed if you use any of the optional
+configurations together.
+
+Abbreviations:
+
+- kms = Kernel Mode Setting
+
+- kbd = Keyboard
+
+You can configure kgdboc to use the keyboard, and/or a serial device
+depending on if you are using kdb and/or kgdb, in one of the following
+scenarios. The order listed above must be observed if you use any of the
+optional configurations together. Using kms + only gdb is generally not
+a useful combination.
+
+Using loadable module or built-in
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+1. As a kernel built-in:
+
+ Use the kernel boot argument::
+
+ kgdboc=<tty-device>,[baud]
+
+2. As a kernel loadable module:
+
+ Use the command::
+
+ modprobe kgdboc kgdboc=<tty-device>,[baud]
+
+ Here are two examples of how you might format the kgdboc string. The
+ first is for an x86 target using the first serial port. The second
+ example is for the ARM Versatile AB using the second serial port.
+
+ 1. ``kgdboc=ttyS0,115200``
+
+ 2. ``kgdboc=ttyAMA1,115200``
+
+Configure kgdboc at runtime with sysfs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+At run time you can enable or disable kgdboc by echoing a parameter
+into the sysfs. Here are two examples:
+
+1. Enable kgdboc on ttyS0::
+
+ echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc
+
+2. Disable kgdboc::
+
+ echo "" > /sys/module/kgdboc/parameters/kgdboc
+
+.. note::
+
+ You do not need to specify the baud if you are configuring the
+ console on tty which is already configured or open.
+
+More examples
+^^^^^^^^^^^^^
+
+You can configure kgdboc to use the keyboard, and/or a serial device
+depending on if you are using kdb and/or kgdb, in one of the following
+scenarios.
+
+1. kdb and kgdb over only a serial port::
+
+ kgdboc=<serial_device>[,baud]
+
+ Example::
+
+ kgdboc=ttyS0,115200
+
+2. kdb and kgdb with keyboard and a serial port::
+
+ kgdboc=kbd,<serial_device>[,baud]
+
+ Example::
+
+ kgdboc=kbd,ttyS0,115200
+
+3. kdb with a keyboard::
+
+ kgdboc=kbd
+
+4. kdb with kernel mode setting::
+
+ kgdboc=kms,kbd
+
+5. kdb with kernel mode setting and kgdb over a serial port::
+
+ kgdboc=kms,kbd,ttyS0,115200
+
+.. note::
+
+ Kgdboc does not support interrupting the target via the gdb remote
+ protocol. You must manually send a :kbd:`SysRq-G` unless you have a proxy
+ that splits console output to a terminal program. A console proxy has a
+ separate TCP port for the debugger and a separate TCP port for the
+ "human" console. The proxy can take care of sending the :kbd:`SysRq-G`
+ for you.
+
+When using kgdboc with no debugger proxy, you can end up connecting the
+debugger at one of two entry points. If an exception occurs after you
+have loaded kgdboc, a message should print on the console stating it is
+waiting for the debugger. In this case you disconnect your terminal
+program and then connect the debugger in its place. If you want to
+interrupt the target system and forcibly enter a debug session you have
+to issue a :kbd:`SysRq` sequence and then type the letter :kbd:`g`. Then you
+disconnect the terminal session and connect gdb. Your options if you
+don't like this are to hack gdb to send the :kbd:`SysRq-G` for you as well as
+on the initial connect, or to use a debugger proxy that allows an
+unmodified gdb to do the debugging.
+
+Kernel parameter: ``kgdbwait``
+------------------------------
+
+The Kernel command line option ``kgdbwait`` makes kgdb wait for a
+debugger connection during booting of a kernel. You can only use this
+option if you compiled a kgdb I/O driver into the kernel and you
+specified the I/O driver configuration as a kernel command line option.
+The kgdbwait parameter should always follow the configuration parameter
+for the kgdb I/O driver in the kernel command line; otherwise the I/O driver
+will not be configured prior to asking the kernel to use it to wait.
+
+The kernel will stop and wait as early as the I/O driver and
+architecture allow when you use this option. If you build the kgdb I/O
+driver as a loadable kernel module, kgdbwait will not do anything.
+
+Kernel parameter: ``kgdbcon``
+-----------------------------
+
+The ``kgdbcon`` feature allows you to see :c:func:`printk` messages inside gdb
+while gdb is connected to the kernel. Kdb does not make use of the kgdbcon
+feature.
+
+Kgdb supports using the gdb serial protocol to send console messages to
+the debugger when the debugger is connected and running. There are two
+ways to activate this feature.
+
+1. Activate with the kernel command line option::
+
+ kgdbcon
+
+2. Use sysfs before configuring an I/O driver::
+
+ echo 1 > /sys/module/kgdb/parameters/kgdb_use_con
+
+.. note::
+
+ If you do this after you configure the kgdb I/O driver, the
+ setting will not take effect until the next point the I/O is
+ reconfigured.
+
+.. important::
+
+ You cannot use kgdboc + kgdbcon on a tty that is an
+ active system console. An example of incorrect usage is::
+
+ console=ttyS0,115200 kgdboc=ttyS0 kgdbcon
+
+It is possible to use this option with kgdboc on a tty that is not a
+system console.
+
+Run time parameter: ``kgdbreboot``
+----------------------------------
+
+The kgdbreboot feature allows you to change how the debugger deals with
+the reboot notification. You have 3 choices for the behavior. The
+default behavior is always set to 0.
+
+.. tabularcolumns:: |p{0.4cm}|p{11.5cm}|p{5.6cm}|
+
+.. flat-table::
+ :widths: 1 10 8
+
+ * - 1
+ - ``echo -1 > /sys/module/debug_core/parameters/kgdbreboot``
+ - Ignore the reboot notification entirely.
+
+ * - 2
+ - ``echo 0 > /sys/module/debug_core/parameters/kgdbreboot``
+ - Send the detach message to any attached debugger client.
+
+ * - 3
+ - ``echo 1 > /sys/module/debug_core/parameters/kgdbreboot``
+ - Enter the debugger on reboot notify.
+
+Using kdb
+=========
+
+Quick start for kdb on a serial port
+------------------------------------
+
+This is a quick example of how to use kdb.
+
+1. Configure kgdboc at boot using kernel parameters::
+
+ console=ttyS0,115200 kgdboc=ttyS0,115200
+
+ OR
+
+ Configure kgdboc after the kernel has booted; assuming you are using
+ a serial port console::
+
+ echo ttyS0 > /sys/module/kgdboc/parameters/kgdboc
+
+2. Enter the kernel debugger manually or by waiting for an oops or
+ fault. There are several ways you can enter the kernel debugger
+ manually; all involve using the :kbd:`SysRq-G`, which means you must have
+ enabled ``CONFIG_MAGIC_SYSRQ=y`` in your kernel config.
+
+ - When logged in as root or with a super user session you can run::
+
+ echo g > /proc/sysrq-trigger
+
+ - Example using minicom 2.2
+
+ Press: :kbd:`CTRL-A` :kbd:`f` :kbd:`g`
+
+ - When you have telneted to a terminal server that supports sending
+ a remote break
+
+ Press: :kbd:`CTRL-]`
+
+ Type in: ``send break``
+
+ Press: :kbd:`Enter` :kbd:`g`
+
+3. From the kdb prompt you can run the ``help`` command to see a complete
+ list of the commands that are available.
+
+ Some useful commands in kdb include:
+
+ =========== =================================================================
+ ``lsmod`` Shows where kernel modules are loaded
+ ``ps`` Displays only the active processes
+ ``ps A`` Shows all the processes
+ ``summary`` Shows kernel version info and memory usage
+ ``bt`` Get a backtrace of the current process using :c:func:`dump_stack`
+ ``dmesg`` View the kernel syslog buffer
+ ``go`` Continue the system
+ =========== =================================================================
+
+4. When you are done using kdb you need to consider rebooting the system
+ or using the ``go`` command to resume normal kernel execution. If you
+ have paused the kernel for a lengthy period of time, applications
+ that rely on timely networking or anything to do with real wall clock
+ time could be adversely affected, so you should take this into
+ consideration when using the kernel debugger.
+
+Quick start for kdb using a keyboard connected console
+------------------------------------------------------
+
+This is a quick example of how to use kdb with a keyboard.
+
+1. Configure kgdboc at boot using kernel parameters::
+
+ kgdboc=kbd
+
+ OR
+
+ Configure kgdboc after the kernel has booted::
+
+ echo kbd > /sys/module/kgdboc/parameters/kgdboc
+
+2. Enter the kernel debugger manually or by waiting for an oops or
+ fault. There are several ways you can enter the kernel debugger
+ manually; all involve using the :kbd:`SysRq-G`, which means you must have
+ enabled ``CONFIG_MAGIC_SYSRQ=y`` in your kernel config.
+
+ - When logged in as root or with a super user session you can run::
+
+ echo g > /proc/sysrq-trigger
+
+ - Example using a laptop keyboard:
+
+ Press and hold down: :kbd:`Alt`
+
+ Press and hold down: :kbd:`Fn`
+
+ Press and release the key with the label: :kbd:`SysRq`
+
+ Release: :kbd:`Fn`
+
+ Press and release: :kbd:`g`
+
+ Release: :kbd:`Alt`
+
+ - Example using a PS/2 101-key keyboard
+
+ Press and hold down: :kbd:`Alt`
+
+ Press and release the key with the label: :kbd:`SysRq`
+
+ Press and release: :kbd:`g`
+
+ Release: :kbd:`Alt`
+
+3. Now type in a kdb command such as ``help``, ``dmesg``, ``bt`` or ``go`` to
+ continue kernel execution.
+
+Using kgdb / gdb
+================
+
+In order to use kgdb you must activate it by passing configuration
+information to one of the kgdb I/O drivers. If you do not pass any
+configuration information kgdb will not do anything at all. Kgdb will
+only actively hook up to the kernel trap hooks if a kgdb I/O driver is
+loaded and configured. If you unconfigure a kgdb I/O driver, kgdb will
+unregister all the kernel hook points.
+
+All kgdb I/O drivers can be reconfigured at run time, if
+``CONFIG_SYSFS`` and ``CONFIG_MODULES`` are enabled, by echo'ing a new
+config string to ``/sys/module//parameter/