http://linus.bkbits.net/linux-2.5 greg@kroah.com[torvalds]|ChangeSet|20040331005319|52424 greg # This is a BitKeeper generated diff -Nru style patch. # # ChangeSet # 2004/03/30 16:53:19-08:00 greg@kroah.com # [PATCH] back out sysfs reference count change # # This backs out Maneesh's sysfs patch that was recently added to the # kernel. # # In its defense, the original patch did fix some problems that could be # reproduced on SMP machines, but the side effect of the patch caused lots # of problems. Basically it caused kobjects to get their references # incremented when files that are not present in the kobject are asked for # (udev can easily trigger this when it looks for files called "dev" in # directories that do not have that file). This can cause easy oopses # when the VFS later ages out those old dentries and the kobject has its # reference finally released (usually after the module that the kobject # lived in was removed.) # # I will continue to work with Maneesh to try to solve the original bug, # but for now, this patch needs to be applied. # # fs/sysfs/dir.c # 2004/03/30 07:23:21-08:00 greg@kroah.com +1 -14 # back out sysfs reference count change # # ChangeSet # 2004/03/30 16:53:09-08:00 rth@twiddle.net # [PATCH] Alpha: UP1500 pci_mem fix # # From: Ivan Kokshaysky # # The memory reserved for PCI probe is not freed properly in some cases, # for instance with 3.5Gb of RAM. # # Forward port from 2.4. # # arch/alpha/kernel/sys_nautilus.c # 2003/08/20 04:30:07-07:00 rth@twiddle.net +5 -3 # Alpha: UP1500 pci_mem fix # # ChangeSet # 2004/03/30 10:52:10-08:00 benh@kernel.crashing.org # [PATCH] ppc64: More incorrect syscall error test # # Oops, there were two different code paths affected by this # bug (strace and normal) and I fixed only one. Here's the # other one: # # arch/ppc64/kernel/entry.S # 2004/03/30 07:59:43-08:00 benh@kernel.crashing.org +1 -1 # ppc64: More incorrect syscall error test # # ChangeSet # 2004/03/30 10:51:56-08:00 benh@kernel.crashing.org # [PATCH] ppc64: Add a sync in context switch on SMP # # For the same reason as ppc32, we need to ensure that all stores # done on a CPU have reached the coherency domain and are visible # to loads done by another CPU when context switching, as the same # thread may be rescheduled almost right away there. # # arch/ppc64/kernel/entry.S # 2004/03/29 20:55:47-08:00 benh@kernel.crashing.org +8 -0 # ppc64: Add a sync in context switch on SMP # # ChangeSet # 2004/03/30 10:51:44-08:00 benh@kernel.crashing.org # [PATCH] ppc32: PCI mmap update # # This updates the ppc32 PCI mmap facility to allow mmap'ing of space # outside of the actual devices, using the host bridge resources instead. # # This allows userland to map things like legacy IO space by either using # the bridge device itself, or simply any PCI device on the same bus # domain. # # arch/ppc/kernel/pci.c # 2004/03/28 19:56:09-08:00 benh@kernel.crashing.org +28 -37 # ppc32: PCI mmap update # # ChangeSet # 2004/03/30 10:51:30-08:00 benh@kernel.crashing.org # [PATCH] ppc32: Allow PREEMPT with SMP in KConfig # # On ppc32, CONFIG_PREEMPT wasn't settable along with CONFIG_SMP # for historical reasons (smp_processor_id() races). Those races have # been fixed since then (well, should have been at least) so it's now # safe to allow both options.
# # arch/ppc/Kconfig # 2004/03/30 07:39:41-08:00 benh@kernel.crashing.org +0 -4 # ppc32: Allow PREEMPT with SMP in KConfig # # ChangeSet # 2004/03/30 10:51:17-08:00 benh@kernel.crashing.org # [PATCH] ppc32: context switch fixes # # This fixes a few issues with context switch on ppc32: # # - Makes sure we properly flush out all stores to the coherency domain # when switching out, since the same thread could be switched back in # on another CPU right away; those stores must be visible to all other # CPUs. # # - Remove dssall in the assembly calls and do it now once in switch_mm # (stop vmx streams). Assume the G5 doesn't need a sync after dssall. # # - Remove bogus isync in the loop setting the userland segment registers # # - Do not switch the userland segments when the mm stays the same # # include/asm-ppc/mmu_context.h # 2004/03/29 20:58:44-08:00 benh@kernel.crashing.org +19 -6 # ppc32: context switch fixes # # include/asm-ppc/cputable.h # 2004/03/29 20:58:43-08:00 benh@kernel.crashing.org +16 -2 # ppc32: context switch fixes # # arch/ppc/kernel/head.S # 2004/03/29 20:55:41-08:00 benh@kernel.crashing.org +2 -5 # ppc32: context switch fixes # # arch/ppc/kernel/entry.S # 2004/03/29 20:55:41-08:00 benh@kernel.crashing.org +9 -0 # ppc32: context switch fixes # # ChangeSet # 2004/03/30 10:51:04-08:00 benh@kernel.crashing.org # [PATCH] ppc32: Remove duplicate export # # enable_kernel_fp is exported both in ppc_ksyms and near its # definition in process.c, remove the former. # # arch/ppc/kernel/ppc_ksyms.c # 2004/03/29 19:00:44-08:00 benh@kernel.crashing.org +0 -1 # ppc32: Remove duplicate export # # ChangeSet # 2004/03/30 10:50:51-08:00 benh@kernel.crashing.org # [PATCH] ppc32: Even more preempt fixes # # Add a warning if enable_kernel_{fp,altivec} is called with preempt # enabled since this is always an error, and make sure the alignment # exception handler properly disables preempt when doing FP operations. # # arch/ppc/kernel/process.c # 2004/03/29 20:55:42-08:00 benh@kernel.crashing.org +6 -4 # ppc32: Even more preempt fixes # # arch/ppc/kernel/align.c # 2004/03/29 19:00:44-08:00 benh@kernel.crashing.org +4 -0 # ppc32: Even more preempt fixes # # ChangeSet # 2004/03/30 10:49:13-08:00 vatsa@in.ibm.com # [PATCH] Fix obvious stupid race in do_stop # # We don't set the task state to TASK_INTERRUPTIBLE _before_ checking for # kthread_should_stop in do_stop. # # kernel/stop_machine.c # 2004/03/08 22:53:56-08:00 vatsa@in.ibm.com +3 -1 # Fix obvious stupid race in do_stop # # ChangeSet # 2004/03/30 10:47:17-08:00 marcelo.tosatti@cyclades.com # [PATCH] pc300 driver misplaced ; # # From Dave Jones. # # Oops. # # drivers/net/wan/pc300_drv.c # 2004/03/30 06:32:11-08:00 marcelo.tosatti@cyclades.com +1 -1 # pc300 driver misplaced ; # # ChangeSet # 2004/03/30 10:47:05-08:00 armin@melware.de # [PATCH] ISDN Eicon driver: NULL pointer check inside spinlock # # Check for valid application pointer inside api spinlock # in diva_send_message(). # # drivers/isdn/hardware/eicon/capifunc.c # 2004/03/30 06:17:51-08:00 armin@melware.de +4 -3 # ISDN Eicon driver: NULL pointer check inside spinlock # # ChangeSet # 2004/03/30 10:41:57-08:00 akpm@osdl.org # [PATCH] Make pdflush run at nice 0 # # Since pdflush was converted to be launched by the kthread infrastructure it # has inherited keventd's `nice -10' setting. That hurts interactivity when # pdflush is doing lots of work writing back through the dm-crypt layer. # # So set pdflush back to `nice 0'.
# # mm/pdflush.c # 2004/03/30 09:58:09-08:00 akpm@osdl.org +6 -0 # Make pdflush run at nice 0 # # ChangeSet # 2004/03/30 10:41:44-08:00 akpm@osdl.org # [PATCH] catch errors when completing bio pairs # # From: Mike Christie # # A couple of drivers can sometimes fail the first segments in a bio then # requeue the rest of the request. In this situation, if the last part of # the bio completes successfully, bio_pair_end_* will miss that the beginning # of the bio had failed because they just return one when bi_size is not yet # zero. The attached patch moves the error value test before the bi_size test to # catch the above case. # # fs/bio.c # 2004/03/23 07:05:19-08:00 akpm@osdl.org +6 -4 # catch errors when completing bio pairs # # ChangeSet # 2004/03/30 10:41:31-08:00 akpm@osdl.org # [PATCH] Fix BLKPREP_KILL # # From: Jens Axboe # # Samuel Rydh wrote: # # If a MODE_SENSE(6) command is sent to an IDE cd using the CDROM_SEND_PACKET # ioctl, then the kernel freezes solidly. To reproduce this, one can take the # SCSI cmd [1a 08 31 00 10 00] and a 16 byte data buffer. # # After some bug hunting, I found out that the following is what happens: # # - ide-cd recognizes that MODE_SENSE(6) isn't supported and tries # to abort the request from ide_cdrom_prep_pc by returning BLKPREP_KILL. # # - in elv_next_request(), the kill request is handled by # the following code: # # while (end_that_request_first(rq, 0, rq->nr_sectors)) # ; # end_that_request_last(rq); # # The while loop never exits. The end_that_request_first() doesn't do anything # since rq->nr_sectors is 0; it just returns "not-done" after handling those 0 # bytes (rq->bio->bi_size is 16). # # drivers/block/elevator.c # 2004/03/23 06:45:00-08:00 akpm@osdl.org +6 -2 # Fix BLKPREP_KILL # # ChangeSet # 2004/03/29 20:26:56-08:00 laforge@netfilter.org # [NETFILTER]: Fix DELETE_LIST oopses. # # We've now narrowed down the issue of kernel oopses in combination with # 'LIST_DELETE' syslog messages happening in certain setups. # # Apparently people who do not enable CONFIG_IP_NF_NAT_LOCAL and do # DNAT/REDIRECT and want to connect locally from the gateway via DNAT to # the DNAT'ed address experience the bug ;) # # Patch courtesy of KOVACS Krisztian and Henrik Nordstrom # # net/ipv4/netfilter/ip_nat_standalone.c # 2004/03/29 20:26:43-08:00 laforge@netfilter.org +10 -1 # [NETFILTER]: Fix DELETE_LIST oopses. # # We've now narrowed down the issue of kernel oopses in combination with # 'LIST_DELETE' syslog messages happening in certain setups. # # Apparently people who do not enable CONFIG_IP_NF_NAT_LOCAL and do # DNAT/REDIRECT and want to connect locally from the gateway via DNAT to # the DNAT'ed address experience the bug ;) # # Patch courtesy of KOVACS Krisztian and Henrik Nordstrom # # ChangeSet # 2004/03/29 20:19:57-08:00 laforge@netfilter.org # [NETFILTER]: Fix DEBUG compile in ipt_MASQUERADE. # # net/ipv4/netfilter/ipt_MASQUERADE.c # 2004/03/29 20:19:44-08:00 laforge@netfilter.org +1 -1 # [NETFILTER]: Fix DEBUG compile in ipt_MASQUERADE. # # ChangeSet # 2004/03/29 20:11:56-08:00 uaca@alumni.uv.es # [AF_PACKET]: Add PACKET_MMAP documentation. # # net/Kconfig # 2004/03/29 20:11:38-08:00 uaca@alumni.uv.es +0 -0 # [AF_PACKET]: Add PACKET_MMAP documentation. # # Documentation/networking/packet_mmap.txt # 2004/03/29 20:11:32-08:00 uaca@alumni.uv.es +412 -0 # [AF_PACKET]: Add PACKET_MMAP documentation.
# # Documentation/networking/packet_mmap.txt # 2004/03/29 20:11:32-08:00 uaca@alumni.uv.es +0 -0 # BitKeeper file /disk1/BK/net-2.6/Documentation/networking/packet_mmap.txt # # ChangeSet # 2004/03/28 21:51:55-08:00 niv@us.ibm.com # [TCP]: Use tcp_tw_put on time-wait sockets. # # net/ipv4/tcp_ipv4.c # 2004/03/28 21:51:37-08:00 niv@us.ibm.com +6 -3 # [TCP]: Use tcp_tw_put on time-wait sockets. # # ChangeSet # 2004/03/28 01:56:20-08:00 jmorris@redhat.com # [IPV6]: Link some packet walker helpers always statically. # # Put the extension header helper funcs always statically into # the kernel even if ipv6 is built as a module, this is needed # for things like SELinux. # # net/ipv6/ipv6_syms.c # 2004/03/28 01:55:27-08:00 jmorris@redhat.com +0 -2 # [IPV6]: Link some packet walker helpers always statically. # # net/ipv6/exthdrs.c # 2004/03/28 01:55:27-08:00 jmorris@redhat.com +0 -102 # [IPV6]: Link some packet walker helpers always statically. # # net/ipv6/Makefile # 2004/03/28 01:55:27-08:00 jmorris@redhat.com +2 -0 # [IPV6]: Link some packet walker helpers always statically. # # net/Makefile # 2004/03/28 01:55:27-08:00 jmorris@redhat.com +3 -1 # [IPV6]: Link some packet walker helpers always statically. # # net/ipv6/exthdrs_core.c # 2004/03/28 01:55:23-08:00 jmorris@redhat.com +108 -0 # [IPV6]: Link some packet walker helpers always statically. # # net/ipv6/exthdrs_core.c # 2004/03/28 01:55:23-08:00 jmorris@redhat.com +0 -0 # BitKeeper file /disk1/BK/net-2.6/net/ipv6/exthdrs_core.c # # ChangeSet # 2004/03/28 01:54:03-08:00 uaca@alumni.uv.es # [AF_PACKET]: Fix packet_set_ring memleak and remove num frame limit. # # net/packet/af_packet.c # 2004/03/28 01:50:58-08:00 uaca@alumni.uv.es +53 -36 # [AF_PACKET]: Fix packet_set_ring memleak and remove num frame limit. # diff -Nru a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/Documentation/networking/packet_mmap.txt Tue Mar 30 20:00:37 2004 @@ -0,0 +1,412 @@ + +DaveM: + +If you agree with it I will send two small patches to modify +kernel's configure help. + + Ulisses + +-------------------------------------------------------------------------------- ++ ABSTRACT +-------------------------------------------------------------------------------- + +This file documents the CONFIG_PACKET_MMAP option available with the PACKET +socket interface on 2.4 and 2.6 kernels. This type of socket is used to +capture network traffic with utilities like tcpdump or any other that uses +the libpcap library. + +You can find the latest version of this document at + + http://pusa.uv.es/~ulisses/packet_mmap/ + +Please send me your comments to + + Ulisses Alonso Camaró + +-------------------------------------------------------------------------------- ++ Why use PACKET_MMAP +-------------------------------------------------------------------------------- + +In Linux 2.4/2.6 if PACKET_MMAP is not enabled, the capture process is very +inefficient. It uses very limited buffers and requires one system call +to capture each packet; it requires two if you want to get the packet's +timestamp (like libpcap always does). + +On the other hand, PACKET_MMAP is very efficient. PACKET_MMAP provides a size +configurable circular buffer mapped in user space. This way reading packets just +requires waiting for them; most of the time there is no need to issue a single +system call. Using a shared buffer between the kernel and the user +also has the benefit of minimizing packet copies.
+ +It's fine to use PACKET_MMAP to improve the performance of the capture process, +but it isn't everything. At least, if you are capturing at high speeds (this +is relative to the cpu speed), you should check if the device driver of your +network interface card supports some sort of interrupt load mitigation or +(even better) if it supports NAPI; if so, also make sure it is enabled. + +-------------------------------------------------------------------------------- ++ How to use CONFIG_PACKET_MMAP +-------------------------------------------------------------------------------- + +From the user standpoint, you should use the higher level libpcap library, which +is a de facto standard, portable across nearly all operating systems +including Win32. + +That said, at the time of this writing, the official libpcap 0.8.1 is out and doesn't include +support for PACKET_MMAP, and most likely neither does the libpcap included in your distribution. + +I'm aware of two implementations of PACKET_MMAP in libpcap: + + http://pusa.uv.es/~ulisses/packet_mmap/ (by Simon Patarin, based on libpcap 0.6.2) + http://public.lanl.gov/cpw/ (by Phil Wood, based on latest libpcap) + +The rest of this document is intended for people who want to understand +the low level details or want to improve libpcap by including PACKET_MMAP +support. + +-------------------------------------------------------------------------------- ++ How to use CONFIG_PACKET_MMAP directly +-------------------------------------------------------------------------------- + +From the system call standpoint, the use of PACKET_MMAP involves +the following process: + + +[setup] socket() -------> creation of the capture socket + setsockopt() ---> allocation of the circular buffer (ring) + mmap() ---------> mapping of the allocated buffer to the + user process + +[capture] poll() ---------> to wait for incoming packets + +[shutdown] close() --------> destruction of the capture socket and + deallocation of all associated + resources. + + +Socket creation and destruction are straightforward, and are done +the same way with or without PACKET_MMAP: + +int fd; + +fd = socket(PF_PACKET, mode, htons(ETH_P_ALL)); + +where mode is SOCK_RAW for the raw interface, where link level +information can be captured, or SOCK_DGRAM for the cooked +interface where link level information capture is not +supported and a link level pseudo-header is provided +by the kernel. + +The destruction of the socket and all associated resources +is done by a simple call to close(fd). + +Next I will describe the PACKET_MMAP settings and their constraints, +as well as the mapping of the circular buffer in the user process and +the use of this buffer. + +-------------------------------------------------------------------------------- ++ PACKET_MMAP settings +-------------------------------------------------------------------------------- + + +Setting up PACKET_MMAP from user level code is done with a call like + + setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) + +The most significant argument in the previous call is the req parameter; +this parameter must have the following structure: + + struct tpacket_req + { + unsigned int tp_block_size; /* Minimal size of contiguous block */ + unsigned int tp_block_nr; /* Number of blocks */ + unsigned int tp_frame_size; /* Size of frame */ + unsigned int tp_frame_nr; /* Total number of frames */ + }; + +This structure is defined in /usr/include/linux/if_packet.h and establishes a +circular buffer (ring) of unswappable memory mapped in the capture process.
+Being mapped in the capture process allows reading the captured frames and +related meta-information like timestamps without requiring a system call. + +Captured frames are grouped in blocks. Each block is a physically contiguous +region of memory and holds tp_block_size/tp_frame_size frames. The total number +of blocks is tp_block_nr. Note that tp_frame_nr is a redundant parameter because + + frames_per_block = tp_block_size/tp_frame_size + +indeed, packet_set_ring checks that the following condition is true + + frames_per_block * tp_block_nr == tp_frame_nr + + +Let's see an example, with the following values: + + tp_block_size= 4096 + tp_frame_size= 2048 + tp_block_nr = 4 + tp_frame_nr = 8 + +we will get the following buffer structure: + + block #1 block #2 ++---------+---------+ +---------+---------+ +| frame 1 | frame 2 | | frame 3 | frame 4 | ++---------+---------+ +---------+---------+ + + block #3 block #4 ++---------+---------+ +---------+---------+ +| frame 5 | frame 6 | | frame 7 | frame 8 | ++---------+---------+ +---------+---------+ + +A frame can be of any size with the only condition that it fits in a block. A block +can only hold an integer number of frames, or in other words, a frame cannot +span across two blocks, so there are some details you have to take into +account when choosing the frame_size. See "Mapping and use of the circular +buffer (ring)". + + +-------------------------------------------------------------------------------- ++ PACKET_MMAP setting constraints +-------------------------------------------------------------------------------- + +In kernel versions prior to 2.4.26 (for the 2.4 branch) and 2.6.5 (2.6 branch), +the PACKET_MMAP buffer could hold only 32768 frames in a 32 bit architecture or +16384 in a 64 bit architecture. For information on these kernel versions +see http://pusa.uv.es/~ulisses/packet_mmap/packet_mmap.pre-2.4.26_2.6.5.txt + + Block size limit +------------------ + +As stated earlier, each block is a contiguous physical region of memory. These +memory regions are allocated with calls to the __get_free_pages() function. As +the name indicates, this function allocates pages of memory, and the second +argument is "order" or a power of two number of pages, that is +(for PAGE_SIZE == 4096) order=0 ==> 4096 bytes, order=1 ==> 8192 bytes, +order=2 ==> 16384 bytes, etc. The maximum size of a +region allocated by __get_free_pages is determined by the MAX_ORDER macro. More +precisely the limit can be calculated as: + + PAGE_SIZE << MAX_ORDER + + In an i386 architecture PAGE_SIZE is 4096 bytes + In a 2.4/i386 kernel MAX_ORDER is 10 + In a 2.6/i386 kernel MAX_ORDER is 11 + +So __get_free_pages can allocate as much as 4MB or 8MB in a 2.4/2.6 kernel +respectively, with an i386 architecture. + +User space programs can include /usr/include/sys/user.h and +/usr/include/linux/mmzone.h to get the PAGE_SIZE and MAX_ORDER declarations. + +The page size can also be determined dynamically with the getpagesize(2) +system call. + + + Block number limit +-------------------- + +To understand the constraints of PACKET_MMAP, we have to look at the structure +used to hold the pointers to each block. + +Currently, this structure is a vector dynamically allocated with kmalloc, +called pg_vec; its size limits the number of blocks that can be allocated.
+ + +---+---+---+---+ + | x | x | x | x | + +---+---+---+---+ + | | | | + | | | v + | | v block #4 + | v block #3 + v block #2 + block #1 + + +kmalloc allocates any number of bytes of physically contiguous memory from +a pool of pre-determined sizes. This pool of memory is maintained by the slab +allocator, which is in the end responsible for doing the allocation and +hence imposes the maximum memory that kmalloc can allocate. + +In a 2.4/2.6 kernel and the i386 architecture, the limit is 131072 bytes. The +predetermined sizes that kmalloc uses can be checked in the "size-<bytes>" +entries of /proc/slabinfo + +In a 32 bit architecture, pointers are 4 bytes long, so the total number of +pointers to blocks is + + 131072/4 = 32768 blocks + + + PACKET_MMAP buffer size calculator +------------------------------------ + +Definitions: + + <size-max> : is the maximum size allocable with kmalloc (see /proc/slabinfo) + <pointer size>: depends on the architecture -- sizeof(void *) + <page size> : depends on the architecture -- PAGE_SIZE or getpagesize (2) + <max-order> : is the value defined with MAX_ORDER + <frame size> : it's an upper bound of the frame's capture size (more on this later) + +from these definitions we will derive + + <block number> = <size-max>/<pointer size> + <block size> = <page size> << <max-order> + +so, the max buffer size is + + <block number> * <block size> + +and, the number of frames will be + + <block number> * <block size> / <frame size> + +Suppose the following parameters, which apply for a 2.6 kernel and an +i386 architecture: + + <size-max> = 131072 bytes + <pointer size> = 4 bytes + <page size> = 4096 bytes + <max-order> = 11 + +and a value for <frame size> of 2048 bytes. These parameters will yield + + <block number> = 131072/4 = 32768 blocks + <block size> = 4096 << 11 = 8 MiB. + +and hence the buffer will have a 262144 MiB size. So it can hold +262144 MiB / 2048 bytes = 134217728 frames + + +Actually, this buffer size is not possible with an i386 architecture. +Remember that the memory is allocated in kernel space; in the case of +an i386, the kernel's memory size is limited to 1GiB. + +Memory allocations are not freed until the socket is closed. The memory +allocations are done with GFP_KERNEL priority; this basically means that +the allocation can wait and swap out other processes' memory in order to allocate +the necessary memory, so normally the limits can be reached. + + Other constraints +------------------- + +If you check the source code you will see that what I draw here as a frame +is not only the link level frame. At the beginning of each frame there is a +header called struct tpacket_hdr, used in PACKET_MMAP to hold the link level frame's +meta information, like the timestamp. So what we draw here as a frame is really +the following (from include/linux/if_packet.h): + +/* + Frame structure: + + - Start. Frame must be aligned to TPACKET_ALIGNMENT=16 + - struct tpacket_hdr + - pad to TPACKET_ALIGNMENT=16 + - struct sockaddr_ll + - Gap, chosen so that packet data (Start+tp_net) aligns to + TPACKET_ALIGNMENT=16 + - Start+tp_mac: [ Optional MAC header ] + - Start+tp_net: Packet data, aligned to TPACKET_ALIGNMENT=16. + - Pad to align to TPACKET_ALIGNMENT=16 + */ + + + The following are conditions that are checked in packet_set_ring + + tp_block_size must be a multiple of PAGE_SIZE (1) + tp_frame_size must be greater than TPACKET_HDRLEN (obvious) + tp_frame_size must be a multiple of TPACKET_ALIGNMENT + tp_frame_nr must be exactly frames_per_block*tp_block_nr + +Note that tp_block_size should be chosen to be a power of two or there will +be a waste of memory.
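
As a quick illustration of the settings above (an editor's sketch, not part of the patch or of
packet_set_ring itself), the following user-space fragment creates a capture socket and requests
a ring whose parameters satisfy the listed constraints. The function name setup_rx_ring and the
block/frame sizes are arbitrary example values, and error handling is kept minimal:

    /* Hypothetical example: request a small RX ring. The sizes are
     * illustrative only; they just have to respect the constraints checked
     * by packet_set_ring (block size multiple of PAGE_SIZE, frame size
     * multiple of TPACKET_ALIGNMENT and > TPACKET_HDRLEN,
     * tp_frame_nr == frames_per_block * tp_block_nr). */
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <arpa/inet.h>          /* htons */
    #include <linux/if_ether.h>     /* ETH_P_ALL */
    #include <linux/if_packet.h>    /* struct tpacket_req, PACKET_RX_RING */

    int setup_rx_ring(void)
    {
            struct tpacket_req req;
            int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

            if (fd < 0) {
                    perror("socket");
                    return -1;
            }

            memset(&req, 0, sizeof(req));
            req.tp_block_size = 4096;   /* multiple of PAGE_SIZE */
            req.tp_frame_size = 2048;   /* multiple of TPACKET_ALIGNMENT */
            req.tp_block_nr   = 4;
            /* redundant, but packet_set_ring checks it */
            req.tp_frame_nr   = (req.tp_block_size / req.tp_frame_size) * req.tp_block_nr;

            if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req, sizeof(req)) < 0) {
                    perror("setsockopt(PACKET_RX_RING)");
                    close(fd);
                    return -1;
            }
            return fd;
    }

With the values shown, this requests the same 4-block/8-frame layout used in the example earlier
in this document.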
+ +-------------------------------------------------------------------------------- ++ Mapping and use of the circular buffer (ring) +-------------------------------------------------------------------------------- + +The mapping of the buffer in the user process is done with the conventional +mmap function. Even though the circular buffer is composed of several physically +discontiguous blocks of memory, they are contiguous in user space, hence +just one call to mmap is needed: + + mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + +If tp_frame_size is a divisor of tp_block_size, frames will be +contiguously spaced by tp_frame_size bytes. If not, every +tp_block_size/tp_frame_size frames there will be a gap between +the frames. This is because a frame cannot span across two +blocks. + +At the beginning of each frame there is a status field (see +struct tpacket_hdr). If this field is 0, it means that the frame is ready +to be used by the kernel; if not, there is a frame the user can read, +and the following flags apply: + + from include/linux/if_packet.h + + #define TP_STATUS_COPY 2 + #define TP_STATUS_LOSING 4 + #define TP_STATUS_CSUMNOTREADY 8 + + +TP_STATUS_COPY : This flag indicates that the frame (and associated + meta information) has been truncated because it's + larger than tp_frame_size. This packet can be + read entirely with recvfrom(). + + In order to make this work it must be + enabled previously with setsockopt() and + the PACKET_COPY_THRESH option. + + The number of frames that can be buffered to + be read with recvfrom is limited like a normal socket. + See the SO_RCVBUF option in the socket (7) man page. + +TP_STATUS_LOSING : indicates there were packet drops since the last time + statistics were checked with getsockopt() and + the PACKET_STATISTICS option. + +TP_STATUS_CSUMNOTREADY: currently it's used for outgoing IP packets whose + checksum will be done in hardware. So while + reading the packet we should not try to check the + checksum. + +For convenience there are also the following defines: + + #define TP_STATUS_KERNEL 0 + #define TP_STATUS_USER 1 + +The kernel initializes all frames to TP_STATUS_KERNEL. When the kernel +receives a packet it puts it in the buffer and updates the status with +at least the TP_STATUS_USER flag. Then the user can read the packet; +once the packet is read the user must zero the status field, so the kernel +can use that frame buffer again. + +The user can use poll (any other variant should apply too) to check if new +packets are in the ring: + + struct pollfd pfd; + + pfd.fd = fd; + pfd.revents = 0; + pfd.events = POLLIN|POLLRDNORM|POLLERR; + + if (status == TP_STATUS_KERNEL) + retval = poll(&pfd, 1, timeout); + +It doesn't incur a race condition to first check the status value and +then poll for frames.
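
Putting the pieces together, here is a sketch of the receive loop just described (again an
editor's illustration, not part of the patch): one mmap covers the whole ring, each frame's
status word is checked, and poll() is only called when nothing is ready. It assumes
tp_frame_size divides tp_block_size, as in the earlier example, so frames are evenly spaced;
rx_loop, frame_nr and frame_size are hypothetical names that must match the tpacket_req used
at setup time:

    /* Hypothetical example: consume frames from a ring created as above.
     * frame_nr and frame_size must match the tpacket_req passed to
     * PACKET_RX_RING; frames are assumed evenly spaced (tp_block_size is
     * a multiple of tp_frame_size). */
    #include <stddef.h>
    #include <poll.h>
    #include <sys/mman.h>
    #include <linux/if_packet.h>

    static void rx_loop(int fd, unsigned int frame_nr, unsigned int frame_size)
    {
            size_t ring_size = (size_t) frame_nr * frame_size;
            char *ring = mmap(0, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            unsigned int i = 0;

            if (ring == MAP_FAILED)
                    return;

            for (;;) {
                    struct tpacket_hdr *hdr =
                            (struct tpacket_hdr *) (ring + (size_t) i * frame_size);

                    if (hdr->tp_status == TP_STATUS_KERNEL) {
                            /* No frame ready: sleep until the kernel flips a status word. */
                            struct pollfd pfd;

                            pfd.fd = fd;
                            pfd.revents = 0;
                            pfd.events = POLLIN | POLLRDNORM | POLLERR;
                            poll(&pfd, 1, -1);
                            continue;
                    }

                    /* Frame is ours: packet data starts at Start+tp_mac / Start+tp_net. */
                    /* ... process hdr->tp_snaplen bytes at ring + i*frame_size + hdr->tp_mac ... */

                    hdr->tp_status = TP_STATUS_KERNEL;   /* hand the slot back to the kernel */
                    i = (i + 1) % frame_nr;
            }
    }

Checking the status word before calling poll() follows the note above: it is not a race, since
a frame that arrives between the check and the poll() simply makes poll() return immediately.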
+ +-------------------------------------------------------------------------------- ++ THANKS +-------------------------------------------------------------------------------- + + Jesse Brandeburg, for fixing my grammathical/spelling errors + +>>> EOF +- +To unsubscribe from this list: send the line "unsubscribe linux-net" in +the body of a message to majordomo@vger.kernel.org +More majordomo info at http://vger.kernel.org/majordomo-info.html \ No newline at end of file diff -Nru a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c --- a/arch/alpha/kernel/sys_nautilus.c Tue Mar 30 20:00:36 2004 +++ b/arch/alpha/kernel/sys_nautilus.c Tue Mar 30 20:00:36 2004 @@ -225,11 +225,13 @@ if (request_resource(&iomem_resource, bus->resource[1]) < 0) printk(KERN_ERR "Failed to request MEM on hose 0\n"); - if (pci_mem < memtop && pci_mem > alpha_mv.min_mem_address) { + if (pci_mem < memtop) + memtop = pci_mem; + if (memtop > alpha_mv.min_mem_address) { free_reserved_mem(__va(alpha_mv.min_mem_address), - __va(pci_mem)); + __va(memtop)); printk("nautilus_init_pci: %ldk freed\n", - (pci_mem - alpha_mv.min_mem_address) >> 10); + (memtop - alpha_mv.min_mem_address) >> 10); } if ((IRONGATE0->dev_vendor >> 16) > 0x7006) /* Albacore? */ diff -Nru a/arch/ppc/Kconfig b/arch/ppc/Kconfig --- a/arch/ppc/Kconfig Tue Mar 30 20:00:37 2004 +++ b/arch/ppc/Kconfig Tue Mar 30 20:00:37 2004 @@ -696,14 +696,10 @@ config PREEMPT bool "Preemptible Kernel" - depends on !SMP help This option reduces the latency of the kernel when reacting to real-time or interactive events by allowing a low priority process to be preempted even if it is in kernel mode executing a system call. - Unfortunately the kernel code has some race conditions if both - CONFIG_SMP and CONFIG_PREEMPT are enabled, so this option is - currently disabled if you are building an SMP kernel. Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. diff -Nru a/arch/ppc/kernel/align.c b/arch/ppc/kernel/align.c --- a/arch/ppc/kernel/align.c Tue Mar 30 20:00:36 2004 +++ b/arch/ppc/kernel/align.c Tue Mar 30 20:00:36 2004 @@ -325,14 +325,18 @@ * the kernel with -msoft-float so it doesn't use the * fp regs for copying 8-byte objects. */ case LD+F+S: + preempt_disable(); enable_kernel_fp(); cvt_fd(&data.f, ¤t->thread.fpr[reg], ¤t->thread.fpscr); /* current->thread.fpr[reg] = data.f; */ + preempt_enable(); break; case ST+F+S: + preempt_disable(); enable_kernel_fp(); cvt_df(¤t->thread.fpr[reg], &data.f, ¤t->thread.fpscr); /* data.f = current->thread.fpr[reg]; */ + preempt_enable(); break; default: printk("align: can't handle flags=%x\n", flags); diff -Nru a/arch/ppc/kernel/entry.S b/arch/ppc/kernel/entry.S --- a/arch/ppc/kernel/entry.S Tue Mar 30 20:00:37 2004 +++ b/arch/ppc/kernel/entry.S Tue Mar 30 20:00:37 2004 @@ -469,10 +469,19 @@ stw r10,_CCR(r1) stw r1,KSP(r3) /* Set old stack pointer */ +#ifdef CONFIG_SMP + /* We need a sync somewhere here to make sure that if the + * previous task gets rescheduled on another CPU, it sees all + * stores it has performed on this one. 
+ */ + sync +#endif /* CONFIG_SMP */ + tophys(r0,r4) CLR_TOP32(r0) mtspr SPRG3,r0 /* Update current THREAD phys addr */ lwz r1,KSP(r4) /* Load new stack pointer */ + /* save the old current 'last' for return value */ mr r3,r2 addi r2,r4,-THREAD /* Update current */ diff -Nru a/arch/ppc/kernel/head.S b/arch/ppc/kernel/head.S --- a/arch/ppc/kernel/head.S Tue Mar 30 20:00:36 2004 +++ b/arch/ppc/kernel/head.S Tue Mar 30 20:00:36 2004 @@ -1436,11 +1436,8 @@ stw r4, 0x4(r5) #endif li r4,0 -BEGIN_FTR_SECTION - dssall - sync -END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) -3: isync + isync +3: #ifdef CONFIG_PPC64BRIDGE slbie r4 #endif /* CONFIG_PPC64BRIDGE */ diff -Nru a/arch/ppc/kernel/pci.c b/arch/ppc/kernel/pci.c --- a/arch/ppc/kernel/pci.c Tue Mar 30 20:00:36 2004 +++ b/arch/ppc/kernel/pci.c Tue Mar 30 20:00:36 2004 @@ -159,7 +159,6 @@ ppc_md.pcibios_fixup_resources(dev); } - void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, struct resource *res) @@ -1522,51 +1521,43 @@ { struct pci_controller *hose = (struct pci_controller *) dev->sysdata; unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long io_offset = 0; - int i, res_bit; + unsigned long size = vma->vm_end - vma->vm_start; + unsigned long base; + struct resource *res; + int i; + int ret = -EINVAL; if (hose == 0) return -EINVAL; /* should never happen */ + if (offset + size <= offset) + return -EINVAL; - /* If memory, add on the PCI bridge address offset */ if (mmap_state == pci_mmap_mem) { + /* PCI memory space */ + base = hose->pci_mem_offset; + for (i = 0; i < 3; ++i) { + res = &hose->mem_resources[i]; + if (res->flags == 0) + continue; + if (offset >= res->start - base + && offset + size - 1 <= res->end - base) { + ret = 0; + break; + } + } offset += hose->pci_mem_offset; - res_bit = IORESOURCE_MEM; } else { - io_offset = (unsigned long)hose->io_base_virt - isa_io_base; - offset += io_offset; - res_bit = IORESOURCE_IO; - } - - /* - * Check that the offset requested corresponds to one of the - * resources of the device. - */ - for (i = 0; i <= PCI_ROM_RESOURCE; i++) { - struct resource *rp = &dev->resource[i]; - int flags = rp->flags; - - /* treat ROM as memory (should be already) */ - if (i == PCI_ROM_RESOURCE) - flags |= IORESOURCE_MEM; - - /* Active and same type? */ - if ((flags & res_bit) == 0) - continue; - - /* In the range of this resource? */ - if (offset < (rp->start & PAGE_MASK) || offset > rp->end) - continue; - - /* found it! 
construct the final physical address */ - if (mmap_state == pci_mmap_io) - offset += hose->io_base_phys - io_offset; - - vma->vm_pgoff = offset >> PAGE_SHIFT; - return 0; + /* PCI I/O space */ + base = (unsigned long)hose->io_base_virt - isa_io_base; + res = &hose->io_resource; + if (offset >= res->start - base + && offset + size - 1 <= res->end - base) + ret = 0; + offset += hose->io_base_phys; } - return -EINVAL; + vma->vm_pgoff = offset >> PAGE_SHIFT; + return ret; } /* diff -Nru a/arch/ppc/kernel/ppc_ksyms.c b/arch/ppc/kernel/ppc_ksyms.c --- a/arch/ppc/kernel/ppc_ksyms.c Tue Mar 30 20:00:37 2004 +++ b/arch/ppc/kernel/ppc_ksyms.c Tue Mar 30 20:00:37 2004 @@ -192,7 +192,6 @@ EXPORT_SYMBOL(flush_instruction_cache); EXPORT_SYMBOL(giveup_fpu); -EXPORT_SYMBOL(enable_kernel_fp); EXPORT_SYMBOL(flush_icache_range); EXPORT_SYMBOL(flush_dcache_range); EXPORT_SYMBOL(flush_icache_user_range); diff -Nru a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c --- a/arch/ppc/kernel/process.c Tue Mar 30 20:00:36 2004 +++ b/arch/ppc/kernel/process.c Tue Mar 30 20:00:36 2004 @@ -163,7 +163,8 @@ void enable_kernel_altivec(void) { - preempt_disable(); + WARN_ON(current_thread_info()->preempt_count == 0 && !irqs_disabled()); + #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) giveup_altivec(current); @@ -172,14 +173,15 @@ #else giveup_altivec(last_task_used_altivec); #endif /* __SMP __ */ - preempt_enable(); } +EXPORT_SYMBOL(enable_kernel_altivec); #endif /* CONFIG_ALTIVEC */ void enable_kernel_fp(void) { - preempt_disable(); + WARN_ON(current_thread_info()->preempt_count == 0 && !irqs_disabled()); + #ifdef CONFIG_SMP if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) giveup_fpu(current); @@ -188,8 +190,8 @@ #else giveup_fpu(last_task_used_math); #endif /* CONFIG_SMP */ - preempt_enable(); } +EXPORT_SYMBOL(enable_kernel_fp); int dump_task_fpu(struct task_struct *tsk, elf_fpregset_t *fpregs) diff -Nru a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S --- a/arch/ppc64/kernel/entry.S Tue Mar 30 20:00:36 2004 +++ b/arch/ppc64/kernel/entry.S Tue Mar 30 20:00:36 2004 @@ -194,7 +194,7 @@ _GLOBAL(ret_from_syscall_2) std r3,RESULT(r1) /* Save result */ li r10,-_LAST_ERRNO - cmpl 0,r3,r10 + cmpld 0,r3,r10 blt 60f neg r3,r3 57: ld r10,_CCR(r1) /* Set SO bit in CR */ @@ -288,6 +288,14 @@ mfcr r23 std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ + +#ifdef CONFIG_SMP + /* We need a sync somewhere here to make sure that if the + * previous task gets rescheduled on another CPU, it sees all + * stores it has performed on this one. 
+ */ + sync +#endif /* CONFIG_SMP */ addi r6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ diff -Nru a/drivers/block/elevator.c b/drivers/block/elevator.c --- a/drivers/block/elevator.c Tue Mar 30 20:00:37 2004 +++ b/drivers/block/elevator.c Tue Mar 30 20:00:37 2004 @@ -210,10 +210,14 @@ rq = NULL; break; } else if (ret == BLKPREP_KILL) { + int nr_bytes = rq->hard_nr_sectors << 9; + + if (!nr_bytes) + nr_bytes = rq->data_len; + blkdev_dequeue_request(rq); rq->flags |= REQ_QUIET; - while (end_that_request_first(rq, 0, rq->nr_sectors)) - ; + end_that_request_chunk(rq, 0, nr_bytes); end_that_request_last(rq); } else { printk("%s: bad return=%d\n", __FUNCTION__, ret); diff -Nru a/drivers/isdn/hardware/eicon/capifunc.c b/drivers/isdn/hardware/eicon/capifunc.c --- a/drivers/isdn/hardware/eicon/capifunc.c Tue Mar 30 20:00:37 2004 +++ b/drivers/isdn/hardware/eicon/capifunc.c Tue Mar 30 20:00:37 2004 @@ -1,4 +1,4 @@ -/* $Id: capifunc.c,v 1.60 2004/03/22 16:28:27 armin Exp $ +/* $Id: capifunc.c,v 1.61 2004/03/26 19:48:48 armin Exp $ * * ISDN interface module for Eicon active cards DIVA. * CAPI Interface common functions @@ -893,15 +893,16 @@ return CAPI_REGOSRESOURCEERR; } + diva_os_enter_spin_lock(&api_lock, &old_irql, "send message"); + if (!this->Id) { + diva_os_leave_spin_lock(&api_lock, &old_irql, "send message"); return CAPI_ILLAPPNR; } /* patch controller number */ msg->header.controller = ControllerMap[card->Id] | (msg->header.controller & 0x80); /* preserve external controller bit */ - - diva_os_enter_spin_lock(&api_lock, &old_irql, "send message"); switch (command) { default: diff -Nru a/drivers/net/wan/pc300_drv.c b/drivers/net/wan/pc300_drv.c --- a/drivers/net/wan/pc300_drv.c Tue Mar 30 20:00:37 2004 +++ b/drivers/net/wan/pc300_drv.c Tue Mar 30 20:00:37 2004 @@ -3661,7 +3661,7 @@ release_mem_region(card->hw.falcphys, card->hw.falcsize); } for (i = 0; i < card->hw.nchan; i++) - if (card->chan[i].d.dev); + if (card->chan[i].d.dev) free_netdev(card->chan[i].d.dev); if (card->hw.irq) free_irq(card->hw.irq, card); diff -Nru a/fs/bio.c b/fs/bio.c --- a/fs/bio.c Tue Mar 30 20:00:36 2004 +++ b/fs/bio.c Tue Mar 30 20:00:36 2004 @@ -701,11 +701,12 @@ { struct bio_pair *bp = container_of(bi, struct bio_pair, bio1); - if (bi->bi_size) - return 1; if (err) bp->error = err; + if (bi->bi_size) + return 1; + bio_pair_release(bp); return 0; } @@ -714,10 +715,11 @@ { struct bio_pair *bp = container_of(bi, struct bio_pair, bio2); - if (bi->bi_size) - return 1; if (err) bp->error = err; + + if (bi->bi_size) + return 1; bio_pair_release(bp); return 0; diff -Nru a/fs/sysfs/dir.c b/fs/sysfs/dir.c --- a/fs/sysfs/dir.c Tue Mar 30 20:00:36 2004 +++ b/fs/sysfs/dir.c Tue Mar 30 20:00:36 2004 @@ -20,18 +20,6 @@ return 0; } -static void sysfs_d_iput(struct dentry * dentry, struct inode * inode) -{ - struct kobject * kobj = dentry->d_fsdata; - - if (kobj) - kobject_put(kobj); - iput(inode); -} - -static struct dentry_operations sysfs_dentry_operations = { - .d_iput = &sysfs_d_iput, -}; static int create_dir(struct kobject * k, struct dentry * p, const char * n, struct dentry ** d) @@ -45,8 +33,7 @@ S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO, init_dir); if (!error) { - (*d)->d_op = &sysfs_dentry_operations; - (*d)->d_fsdata = kobject_get(k); + (*d)->d_fsdata = k; p->d_inode->i_nlink++; } dput(*d); diff -Nru a/include/asm-ppc/cputable.h b/include/asm-ppc/cputable.h --- a/include/asm-ppc/cputable.h Tue Mar 30 20:00:36 2004 +++ b/include/asm-ppc/cputable.h Tue Mar 30 20:00:36 2004 
@@ -90,10 +90,24 @@ .long 99b; \ .previous -#define END_FTR_SECTION_IFSET(msk) END_FTR_SECTION((msk), (msk)) -#define END_FTR_SECTION_IFCLR(msk) END_FTR_SECTION((msk), 0) +#else + +#define BEGIN_FTR_SECTION "98:\n" +#define END_FTR_SECTION(msk, val) \ +"99:\n" \ +" .section __ftr_fixup,\"a\";\n" \ +" .align 2;\n" \ +" .long "#msk";\n" \ +" .long "#val";\n" \ +" .long 98b;\n" \ +" .long 99b;\n" \ +" .previous\n" + #endif /* __ASSEMBLY__ */ + +#define END_FTR_SECTION_IFSET(msk) END_FTR_SECTION((msk), (msk)) +#define END_FTR_SECTION_IFCLR(msk) END_FTR_SECTION((msk), 0) #endif /* __ASM_PPC_CPUTABLE_H */ #endif /* __KERNEL__ */ diff -Nru a/include/asm-ppc/mmu_context.h b/include/asm-ppc/mmu_context.h --- a/include/asm-ppc/mmu_context.h Tue Mar 30 20:00:36 2004 +++ b/include/asm-ppc/mmu_context.h Tue Mar 30 20:00:36 2004 @@ -6,6 +6,7 @@ #include #include #include +#include /* * On 32-bit PowerPC 6xx/7xx/7xxx CPUs, we use a set of 16 VSIDs @@ -155,7 +156,24 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { +#ifdef CONFIG_ALTIVEC + asm volatile ( + BEGIN_FTR_SECTION + "dssall;\n" +#ifndef CONFIG_POWER4 + "sync;\n" /* G4 needs a sync here, G5 apparently not */ +#endif + END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) + : : ); +#endif /* CONFIG_ALTIVEC */ + tsk->thread.pgdir = next->pgd; + + /* No need to flush userspace segments if the mm doesnt change */ + if (prev == next) + return; + + /* Setup new userspace context */ get_mmu_context(next); set_context(next->context, next->pgd); } @@ -166,12 +184,7 @@ * After we have set current->mm to a new value, this activates * the context for the new mm so we see the new mappings. */ -static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm) -{ - current->thread.pgdir = mm->pgd; - get_mmu_context(mm); - set_context(mm->context, mm->pgd); -} +#define activate_mm(active_mm, mm) switch_mm(active_mm, mm, current) extern void mmu_context_init(void); diff -Nru a/kernel/stop_machine.c b/kernel/stop_machine.c --- a/kernel/stop_machine.c Tue Mar 30 20:00:36 2004 +++ b/kernel/stop_machine.c Tue Mar 30 20:00:36 2004 @@ -149,10 +149,12 @@ complete(&smdata->done); /* Wait for kthread_stop */ + __set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { - __set_current_state(TASK_INTERRUPTIBLE); schedule(); + __set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return ret; } diff -Nru a/mm/pdflush.c b/mm/pdflush.c --- a/mm/pdflush.c Tue Mar 30 20:00:37 2004 +++ b/mm/pdflush.c Tue Mar 30 20:00:37 2004 @@ -172,6 +172,12 @@ static int pdflush(void *dummy) { struct pdflush_work my_work; + + /* + * pdflush can spend a lot of time doing encryption via dm-crypt. We + * don't want to do that at keventd's priority. 
+ */ + set_user_nice(current, 0); return __pdflush(&my_work); } diff -Nru a/net/Makefile b/net/Makefile --- a/net/Makefile Tue Mar 30 20:00:36 2004 +++ b/net/Makefile Tue Mar 30 20:00:36 2004 @@ -16,7 +16,9 @@ obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ obj-$(CONFIG_INET) += ipv4/ xfrm/ obj-$(CONFIG_UNIX) += unix/ -obj-$(CONFIG_IPV6) += ipv6/ +ifneq ($(CONFIG_IPV6),) +obj-y += ipv6/ +endif obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_NET_SCHED) += sched/ diff -Nru a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c --- a/net/ipv4/netfilter/ip_nat_standalone.c Tue Mar 30 20:00:37 2004 +++ b/net/ipv4/netfilter/ip_nat_standalone.c Tue Mar 30 20:00:37 2004 @@ -124,7 +124,16 @@ WRITE_LOCK(&ip_nat_lock); /* Seen it before? This can happen for loopback, retrans, or local packets.. */ - if (!(info->initialized & (1 << maniptype))) { + if (!(info->initialized & (1 << maniptype)) +#ifndef CONFIG_IP_NF_NAT_LOCAL + /* If this session has already been confirmed we must not + * touch it again even if there is no mapping set up. + * Can only happen on local->local traffic with + * CONFIG_IP_NF_NAT_LOCAL disabled. + */ + && !(ct->status & IPS_CONFIRMED) +#endif + ) { unsigned int ret; if (ct->master diff -Nru a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c --- a/net/ipv4/netfilter/ipt_MASQUERADE.c Tue Mar 30 20:00:36 2004 +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c Tue Mar 30 20:00:36 2004 @@ -45,7 +45,7 @@ const struct ip_nat_multi_range *mr = targinfo; if (strcmp(tablename, "nat") != 0) { - DEBUGP("masquerade_check: bad table `%s'.\n", table); + DEBUGP("masquerade_check: bad table `%s'.\n", tablename); return 0; } if (targinfosize != IPT_ALIGN(sizeof(*mr))) { diff -Nru a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c --- a/net/ipv4/tcp_ipv4.c Tue Mar 30 20:00:36 2004 +++ b/net/ipv4/tcp_ipv4.c Tue Mar 30 20:00:36 2004 @@ -1825,12 +1825,15 @@ goto discard_it; do_time_wait: - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_and_relse; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TcpInErrs); - goto discard_and_relse; + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; } switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, skb, th, skb->len)) { diff -Nru a/net/ipv6/Makefile b/net/ipv6/Makefile --- a/net/ipv6/Makefile Tue Mar 30 20:00:37 2004 +++ b/net/ipv6/Makefile Tue Mar 30 20:00:37 2004 @@ -19,3 +19,5 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o + +obj-y += exthdrs_core.o diff -Nru a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c --- a/net/ipv6/exthdrs.c Tue Mar 30 20:00:36 2004 +++ b/net/ipv6/exthdrs.c Tue Mar 30 20:00:36 2004 @@ -633,105 +633,3 @@ } return opt2; } - - -/* - * find out if nexthdr is a well-known extension header or a protocol - */ - -int ipv6_ext_hdr(u8 nexthdr) -{ - /* - * find out if nexthdr is an extension header or a protocol - */ - return ( (nexthdr == NEXTHDR_HOP) || - (nexthdr == NEXTHDR_ROUTING) || - (nexthdr == NEXTHDR_FRAGMENT) || - (nexthdr == NEXTHDR_AUTH) || - (nexthdr == NEXTHDR_NONE) || - (nexthdr == NEXTHDR_DEST) ); -} - -/* - * Skip any extension headers. This is used by the ICMP module. 
- * - * Note that strictly speaking this conflicts with RFC 2460 4.0: - * ...The contents and semantics of each extension header determine whether - * or not to proceed to the next header. Therefore, extension headers must - * be processed strictly in the order they appear in the packet; a - * receiver must not, for example, scan through a packet looking for a - * particular kind of extension header and process that header prior to - * processing all preceding ones. - * - * We do exactly this. This is a protocol bug. We can't decide after a - * seeing an unknown discard-with-error flavour TLV option if it's a - * ICMP error message or not (errors should never be send in reply to - * ICMP error messages). - * - * But I see no other way to do this. This might need to be reexamined - * when Linux implements ESP (and maybe AUTH) headers. - * --AK - * - * This function parses (probably truncated) exthdr set "hdr" - * of length "len". "nexthdrp" initially points to some place, - * where type of the first header can be found. - * - * It skips all well-known exthdrs, and returns pointer to the start - * of unparsable area i.e. the first header with unknown type. - * If it is not NULL *nexthdr is updated by type/protocol of this header. - * - * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. - * - it may return pointer pointing beyond end of packet, - * if the last recognized header is truncated in the middle. - * - if packet is truncated, so that all parsed headers are skipped, - * it returns NULL. - * - First fragment header is skipped, not-first ones - * are considered as unparsable. - * - ESP is unparsable for now and considered like - * normal payload protocol. - * - Note also special handling of AUTH header. Thanks to IPsec wizards. - * - * --ANK (980726) - */ - -int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len) -{ - u8 nexthdr = *nexthdrp; - - while (ipv6_ext_hdr(nexthdr)) { - struct ipv6_opt_hdr hdr; - int hdrlen; - - if (len < (int)sizeof(struct ipv6_opt_hdr)) - return -1; - if (nexthdr == NEXTHDR_NONE) - return -1; - if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) - BUG(); - if (nexthdr == NEXTHDR_FRAGMENT) { - unsigned short frag_off; - if (skb_copy_bits(skb, - start+offsetof(struct frag_hdr, - frag_off), - &frag_off, - sizeof(frag_off))) { - return -1; - } - - if (ntohs(frag_off) & ~0x7) - break; - hdrlen = 8; - } else if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hdr.hdrlen+2)<<2; - else - hdrlen = ipv6_optlen(&hdr); - - nexthdr = hdr.nexthdr; - len -= hdrlen; - start += hdrlen; - } - - *nexthdrp = nexthdr; - return start; -} - diff -Nru a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c --- /dev/null Wed Dec 31 16:00:00 1969 +++ b/net/ipv6/exthdrs_core.c Tue Mar 30 20:00:37 2004 @@ -0,0 +1,108 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. + */ +#include + +/* + * find out if nexthdr is a well-known extension header or a protocol + */ + +int ipv6_ext_hdr(u8 nexthdr) +{ + /* + * find out if nexthdr is an extension header or a protocol + */ + return ( (nexthdr == NEXTHDR_HOP) || + (nexthdr == NEXTHDR_ROUTING) || + (nexthdr == NEXTHDR_FRAGMENT) || + (nexthdr == NEXTHDR_AUTH) || + (nexthdr == NEXTHDR_NONE) || + (nexthdr == NEXTHDR_DEST) ); +} + +/* + * Skip any extension headers. This is used by the ICMP module. 
+ * + * Note that strictly speaking this conflicts with RFC 2460 4.0: + * ...The contents and semantics of each extension header determine whether + * or not to proceed to the next header. Therefore, extension headers must + * be processed strictly in the order they appear in the packet; a + * receiver must not, for example, scan through a packet looking for a + * particular kind of extension header and process that header prior to + * processing all preceding ones. + * + * We do exactly this. This is a protocol bug. We can't decide after a + * seeing an unknown discard-with-error flavour TLV option if it's a + * ICMP error message or not (errors should never be send in reply to + * ICMP error messages). + * + * But I see no other way to do this. This might need to be reexamined + * when Linux implements ESP (and maybe AUTH) headers. + * --AK + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + * + * --ANK (980726) + */ + +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr hdr; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + return -1; + if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) + BUG(); + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short frag_off; + if (skb_copy_bits(skb, + start+offsetof(struct frag_hdr, + frag_off), + &frag_off, + sizeof(frag_off))) { + return -1; + } + + if (ntohs(frag_off) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hdr.hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(&hdr); + + nexthdr = hdr.nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +EXPORT_SYMBOL(ipv6_ext_hdr); +EXPORT_SYMBOL(ipv6_skip_exthdr); diff -Nru a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c --- a/net/ipv6/ipv6_syms.c Tue Mar 30 20:00:37 2004 +++ b/net/ipv6/ipv6_syms.c Tue Mar 30 20:00:37 2004 @@ -41,9 +41,7 @@ #endif EXPORT_SYMBOL(rt6_lookup); EXPORT_SYMBOL(fl6_sock_lookup); -EXPORT_SYMBOL(ipv6_ext_hdr); EXPORT_SYMBOL(ip6_append_data); EXPORT_SYMBOL(ip6_flush_pending_frames); EXPORT_SYMBOL(ip6_push_pending_frames); EXPORT_SYMBOL(ipv6_push_nfrag_opts); -EXPORT_SYMBOL(ipv6_skip_exthdr); diff -Nru a/net/packet/af_packet.c b/net/packet/af_packet.c --- a/net/packet/af_packet.c Tue Mar 30 20:00:37 2004 +++ b/net/packet/af_packet.c Tue Mar 30 20:00:37 2004 @@ -34,6 +34,8 @@ * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. + * Ulises Alonso : Frame number limit removal and + * packet_set_ring memory leak. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -168,30 +170,47 @@ struct packet_opt { + struct tpacket_stats stats; +#ifdef CONFIG_PACKET_MMAP + unsigned long *pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + int copy_thresh; +#endif struct packet_type prot_hook; spinlock_t bind_lock; char running; /* prot_hook is attached*/ int ifindex; /* bound device */ unsigned short num; - struct tpacket_stats stats; #ifdef CONFIG_PACKET_MULTICAST struct packet_mclist *mclist; #endif #ifdef CONFIG_PACKET_MMAP atomic_t mapped; - unsigned long *pg_vec; - unsigned int pg_vec_order; + unsigned int pg_vec_order; unsigned int pg_vec_pages; unsigned int pg_vec_len; - - struct tpacket_hdr **iovec; - unsigned int frame_size; - unsigned int iovmax; - unsigned int head; - int copy_thresh; #endif }; +#ifdef CONFIG_PACKET_MMAP + +static inline unsigned long packet_lookup_frame(struct packet_opt *po, unsigned int position) +{ + unsigned int pg_vec_pos, frame_offset; + unsigned long frame; + + pg_vec_pos = position / po->frames_per_block; + frame_offset = position % po->frames_per_block; + + frame = (unsigned long) (po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size)); + + return frame; +} +#endif + #define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo) void packet_sock_destruct(struct sock *sk) @@ -586,11 +605,11 @@ snaplen = skb->len-skb->data_len; spin_lock(&sk->sk_receive_queue.lock); - h = po->iovec[po->head]; - + h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head); + if (h->tp_status) goto ring_is_full; - po->head = po->head != po->iovmax ? po->head+1 : 0; + po->head = po->head != po->frame_max ? po->head+1 : 0; po->stats.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; @@ -1485,10 +1504,13 @@ unsigned int mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); - if (po->iovec) { - unsigned last = po->head ? po->head-1 : po->iovmax; + if (po->pg_vec) { + unsigned last = po->head ? po->head-1 : po->frame_max; + struct tpacket_hdr *h; - if (po->iovec[last]->tp_status) + h = (struct tpacket_hdr *)packet_lookup_frame(po, last); + + if (h->tp_status) mask |= POLLIN | POLLRDNORM; } spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1548,16 +1570,18 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing) { unsigned long *pg_vec = NULL; - struct tpacket_hdr **io_vec = NULL; struct packet_opt *po = pkt_sk(sk); int was_running, num, order = 0; int err = 0; - + if (req->tp_block_nr) { int i, l; - int frames_per_block; /* Sanity tests and some calculations */ + + if (po->pg_vec) + return -EBUSY; + if ((int)req->tp_block_size <= 0) return -EINVAL; if (req->tp_block_size&(PAGE_SIZE-1)) @@ -1566,10 +1590,11 @@ return -EINVAL; if (req->tp_frame_size&(TPACKET_ALIGNMENT-1)) return -EINVAL; - frames_per_block = req->tp_block_size/req->tp_frame_size; - if (frames_per_block <= 0) + + po->frames_per_block = req->tp_block_size/req->tp_frame_size; + if (po->frames_per_block <= 0) return -EINVAL; - if (frames_per_block*req->tp_block_nr != req->tp_frame_nr) + if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr) return -EINVAL; /* OK! 
*/ @@ -1596,20 +1621,16 @@ } /* Page vector is allocated */ - /* Draw frames */ - io_vec = kmalloc(req->tp_frame_nr*sizeof(struct tpacket_hdr*), GFP_KERNEL); - if (io_vec == NULL) - goto out_free_pgvec; - memset(io_vec, 0, req->tp_frame_nr*sizeof(struct tpacket_hdr*)); - l = 0; for (i=0; itp_block_nr; i++) { unsigned long ptr = pg_vec[i]; + struct tpacket_hdr *header; int k; - for (k=0; ktp_status = TP_STATUS_KERNEL; + for (k=0; kframes_per_block; k++) { + + header = (struct tpacket_hdr*)ptr; + header->tp_status = TP_STATUS_KERNEL; ptr += req->tp_frame_size; } } @@ -1642,8 +1663,7 @@ spin_lock_bh(&sk->sk_receive_queue.lock); pg_vec = XC(po->pg_vec, pg_vec); - io_vec = XC(po->iovec, io_vec); - po->iovmax = req->tp_frame_nr-1; + po->frame_max = req->tp_frame_nr-1; po->head = 0; po->frame_size = req->tp_frame_size; spin_unlock_bh(&sk->sk_receive_queue.lock); @@ -1652,7 +1672,7 @@ req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; - po->prot_hook.func = po->iovec ? tpacket_rcv : packet_rcv; + po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv; skb_queue_purge(&sk->sk_receive_queue); #undef XC if (atomic_read(&po->mapped)) @@ -1669,9 +1689,6 @@ spin_unlock(&po->bind_lock); release_sock(sk); - - if (io_vec) - kfree(io_vec); out_free_pgvec: if (pg_vec)