patch-2.4.20 linux-2.4.20/net/ipv4/netfilter/ip_queue.c

diff -urN linux-2.4.19/net/ipv4/netfilter/ip_queue.c linux-2.4.20/net/ipv4/netfilter/ip_queue.c
@@ -2,13 +2,14 @@
  * This is a module which is used for queueing IPv4 packets and
  * communicating with userspace via netlink.
  *
- * (C) 2000 James Morris, this code is GPL.
+ * (C) 2000-2002 James Morris, this code is GPL.
  *
  * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
  * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
  * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian 
  *             Zander).
  * 2000-08-01: Added Nick Williams' MAC support.
+ * 2002-06-25: Code cleanup.
  *
  */
 #include <linux/module.h>
@@ -18,205 +19,310 @@
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
 #include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_queue.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
 #include <linux/netlink.h>
 #include <linux/spinlock.h>
-#include <linux/rtnetlink.h>
+#include <linux/brlock.h>
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
 #include <net/sock.h>
 #include <net/route.h>
 
-#include <linux/netfilter_ipv4/ip_queue.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-
 #define IPQ_QMAX_DEFAULT 1024
 #define IPQ_PROC_FS_NAME "ip_queue"
 #define NET_IPQ_QMAX 2088
 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
 
-typedef struct ipq_rt_info {
+struct ipq_rt_info {
 	__u8 tos;
 	__u32 daddr;
 	__u32 saddr;
-} ipq_rt_info_t;
+};
 
-typedef struct ipq_queue_element {
-	struct list_head list;		/* Links element into queue */
-	int verdict;			/* Current verdict */
-	struct nf_info *info;		/* Extra info from netfilter */
-	struct sk_buff *skb;		/* Packet inside */
-	ipq_rt_info_t rt_info;		/* May need post-mangle routing */
-} ipq_queue_element_t;
-
-typedef int (*ipq_send_cb_t)(ipq_queue_element_t *e);
-
-typedef struct ipq_peer {
-	pid_t pid;			/* PID of userland peer */
-	unsigned char died;		/* We think the peer died */
-	unsigned char copy_mode;	/* Copy packet as well as metadata? */
-	size_t copy_range;		/* Range past metadata to copy */
-	ipq_send_cb_t send;		/* Callback for sending data to peer */
-} ipq_peer_t;
-
-typedef struct ipq_queue {
- 	int len;			/* Current queue len */
- 	int *maxlen;			/* Maximum queue len, via sysctl */
- 	unsigned char flushing;		/* If queue is being flushed */
- 	unsigned char terminate;	/* If the queue is being terminated */
- 	struct list_head list;		/* Head of packet queue */
- 	spinlock_t lock;		/* Queue spinlock */
- 	ipq_peer_t peer;		/* Userland peer */
-} ipq_queue_t;
+struct ipq_queue_entry {
+	struct list_head list;
+	struct nf_info *info;
+	struct sk_buff *skb;
+	struct ipq_rt_info rt_info;
+};
 
-/****************************************************************************
- *
- * Packet queue
- *
- ****************************************************************************/
-/* Dequeue a packet if matched by cmp, or the next available if cmp is NULL */
-static ipq_queue_element_t *
-ipq_dequeue(ipq_queue_t *q,
-            int (*cmp)(ipq_queue_element_t *, unsigned long),
-            unsigned long data)
-{
-	struct list_head *i;
-
-	spin_lock_bh(&q->lock);
-	for (i = q->list.prev; i != &q->list; i = i->prev) {
-		ipq_queue_element_t *e = (ipq_queue_element_t *)i;
-		
-		if (!cmp || cmp(e, data)) {
-			list_del(&e->list);
-			q->len--;
-			spin_unlock_bh(&q->lock);
-			return e;
-		}
+typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
+
+static unsigned char copy_mode = IPQ_COPY_NONE;
+static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
+static rwlock_t queue_lock = RW_LOCK_UNLOCKED;
+static int peer_pid;
+static unsigned int copy_range;
+static unsigned int queue_total;
+static struct sock *ipqnl;
+static LIST_HEAD(queue_list);
+static DECLARE_MUTEX(ipqnl_sem);
+
+static void
+ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
+{
+	nf_reinject(entry->skb, entry->info, verdict);
+	kfree(entry);
+}
+
+static inline int
+__ipq_enqueue_entry(struct ipq_queue_entry *entry)
+{
+       if (queue_total >= queue_maxlen) {
+               if (net_ratelimit()) 
+                       printk(KERN_WARNING "ip_queue: full at %d entries, "
+                              "dropping packet(s).\n", queue_total);
+               return -ENOSPC;
+       }
+       list_add(&entry->list, &queue_list);
+       queue_total++;
+       return 0;
+}
+
+/*
+ * Find and return a queued entry matched by cmpfn, or return the last
+ * entry if cmpfn is NULL.
+ */
+static inline struct ipq_queue_entry *
+__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct list_head *p;
+
+	list_for_each_prev(p, &queue_list) {
+		struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
+		
+		if (!cmpfn || cmpfn(entry, data))
+			return entry;
 	}
-	spin_unlock_bh(&q->lock);
 	return NULL;
 }
 
-/* Flush all packets */
-static void ipq_flush(ipq_queue_t *q)
+static inline void
+__ipq_dequeue_entry(struct ipq_queue_entry *entry)
 {
-	ipq_queue_element_t *e;
-	
-	spin_lock_bh(&q->lock);
-	q->flushing = 1;
-	spin_unlock_bh(&q->lock);
-	while ((e = ipq_dequeue(q, NULL, 0))) {
-		e->verdict = NF_DROP;
-		nf_reinject(e->skb, e->info, e->verdict);
-		kfree(e);
-	}
-	spin_lock_bh(&q->lock);
-	q->flushing = 0;
-	spin_unlock_bh(&q->lock);
-}
-
-static ipq_queue_t *ipq_create_queue(nf_queue_outfn_t outfn,
-                                     ipq_send_cb_t send_cb,
-                                     int *errp, int *sysctl_qmax)
+	list_del(&entry->list);
+	queue_total--;
+}
+
+static inline struct ipq_queue_entry *
+__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
 {
-	int status;
-	ipq_queue_t *q;
+	struct ipq_queue_entry *entry;
 
-	*errp = 0;
-	q = kmalloc(sizeof(ipq_queue_t), GFP_KERNEL);
-	if (q == NULL) {
-		*errp = -ENOMEM;
+	entry = __ipq_find_entry(cmpfn, data);
+	if (entry == NULL)
 		return NULL;
+
+	__ipq_dequeue_entry(entry);
+	return entry;
+}
+
+
+static inline void
+__ipq_flush(int verdict)
+{
+	struct ipq_queue_entry *entry;
+	
+	while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
+		ipq_issue_verdict(entry, verdict);
+}
+
+static inline int
+__ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	int status = 0;
+	
+	switch(mode) {
+	case IPQ_COPY_NONE:
+	case IPQ_COPY_META:
+		copy_mode = mode;
+		copy_range = 0;
+		break;
+		
+	case IPQ_COPY_PACKET:
+		copy_mode = mode;
+		copy_range = range;
+		if (copy_range > 0xFFFF)
+			copy_range = 0xFFFF;
+		break;
+		
+	default:
+		status = -EINVAL;
+
 	}
-	q->peer.pid = 0;
-	q->peer.died = 0;
-	q->peer.copy_mode = IPQ_COPY_NONE;
-	q->peer.copy_range = 0;
-	q->peer.send = send_cb;
-	q->len = 0;
-	q->maxlen = sysctl_qmax;
-	q->flushing = 0;
-	q->terminate = 0;
-	INIT_LIST_HEAD(&q->list);
-	spin_lock_init(&q->lock);
-	status = nf_register_queue_handler(PF_INET, outfn, q);
-	if (status < 0) {
-		*errp = -EBUSY;
-		kfree(q);
+	return status;
+}
+
+static inline void
+__ipq_reset(void)
+{
+	peer_pid = 0;
+	__ipq_set_mode(IPQ_COPY_NONE, 0);
+	__ipq_flush(NF_DROP);
+}
+
+static struct ipq_queue_entry *
+ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct ipq_queue_entry *entry;
+	
+	write_lock_bh(&queue_lock);
+	entry = __ipq_find_dequeue_entry(cmpfn, data);
+	write_unlock_bh(&queue_lock);
+	return entry;
+}
+
+static void
+ipq_flush(int verdict)
+{
+	write_lock_bh(&queue_lock);
+	__ipq_flush(verdict);
+	write_unlock_bh(&queue_lock);
+}
+
+static struct sk_buff *
+ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
+{
+	unsigned char *old_tail;
+	size_t size = 0;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct ipq_packet_msg *pmsg;
+	struct nlmsghdr *nlh;
+
+	read_lock_bh(&queue_lock);
+	
+	switch (copy_mode) {
+	case IPQ_COPY_META:
+	case IPQ_COPY_NONE:
+		size = NLMSG_SPACE(sizeof(*pmsg));
+		data_len = 0;
+		break;
+	
+	case IPQ_COPY_PACKET:
+		if (copy_range == 0 || copy_range > entry->skb->len)
+			data_len = entry->skb->len;
+		else
+			data_len = copy_range;
+		
+		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
+		break;
+	
+	default:
+		*errp = -EINVAL;
+		read_unlock_bh(&queue_lock);
 		return NULL;
 	}
-	return q;
+
+	read_unlock_bh(&queue_lock);
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+		
+	old_tail= skb->tail;
+	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
+	pmsg = NLMSG_DATA(nlh);
+	memset(pmsg, 0, sizeof(*pmsg));
+
+	pmsg->packet_id       = (unsigned long )entry;
+	pmsg->data_len        = data_len;
+	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
+	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
+	pmsg->mark            = entry->skb->nfmark;
+	pmsg->hook            = entry->info->hook;
+	pmsg->hw_protocol     = entry->skb->protocol;
+	
+	if (entry->info->indev)
+		strcpy(pmsg->indev_name, entry->info->indev->name);
+	else
+		pmsg->indev_name[0] = '\0';
+	
+	if (entry->info->outdev)
+		strcpy(pmsg->outdev_name, entry->info->outdev->name);
+	else
+		pmsg->outdev_name[0] = '\0';
+	
+	if (entry->info->indev && entry->skb->dev) {
+		pmsg->hw_type = entry->skb->dev->type;
+		if (entry->skb->dev->hard_header_parse)
+			pmsg->hw_addrlen =
+				entry->skb->dev->hard_header_parse(entry->skb,
+				                                   pmsg->hw_addr);
+	}
+	
+	if (data_len)
+		memcpy(pmsg->payload, entry->skb->data, data_len);
+		
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+	if (skb)
+		kfree_skb(skb);
+	*errp = -EINVAL;
+	printk(KERN_ERR "ip_queue: error creating packet message\n");
+	return NULL;
 }
 
-static int ipq_enqueue(ipq_queue_t *q,
-                       struct sk_buff *skb, struct nf_info *info)
+static int
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
 {
-	ipq_queue_element_t *e;
-	int status;
-	
-	e = kmalloc(sizeof(*e), GFP_ATOMIC);
-	if (e == NULL) {
-		printk(KERN_ERR "ip_queue: OOM in enqueue\n");
+	int status = -EINVAL;
+	struct sk_buff *nskb;
+	struct ipq_queue_entry *entry;
+
+	if (copy_mode == IPQ_COPY_NONE)
+		return -EAGAIN;
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
 		return -ENOMEM;
 	}
 
-	e->verdict = NF_DROP;
-	e->info = info;
-	e->skb = skb;
+	entry->info = info;
+	entry->skb = skb;
 
-	if (e->info->hook == NF_IP_LOCAL_OUT) {
+	if (entry->info->hook == NF_IP_LOCAL_OUT) {
 		struct iphdr *iph = skb->nh.iph;
 
-		e->rt_info.tos = iph->tos;
-		e->rt_info.daddr = iph->daddr;
-		e->rt_info.saddr = iph->saddr;
-	}
-
-	spin_lock_bh(&q->lock);
-	if (q->len >= *q->maxlen) {
-		spin_unlock_bh(&q->lock);
-		if (net_ratelimit()) 
-			printk(KERN_WARNING "ip_queue: full at %d entries, "
-			       "dropping packet(s).\n", q->len);
-		goto free_drop;
-	}
-	if (q->flushing || q->peer.copy_mode == IPQ_COPY_NONE
-	    || q->peer.pid == 0 || q->peer.died || q->terminate) {
-		spin_unlock_bh(&q->lock);
-		goto free_drop;
-	}
-	status = q->peer.send(e);
-	if (status > 0) {
-		list_add(&e->list, &q->list);
-		q->len++;
-		spin_unlock_bh(&q->lock);
-		return status;
-	}
-	spin_unlock_bh(&q->lock);
-	if (status == -ECONNREFUSED) {
-		printk(KERN_INFO "ip_queue: peer %d died, "
-		       "resetting state and flushing queue\n", q->peer.pid);
-			q->peer.died = 1;
-			q->peer.pid = 0;
-			q->peer.copy_mode = IPQ_COPY_NONE;
-			q->peer.copy_range = 0;
-			ipq_flush(q);
-	}
-free_drop:
-	kfree(e);
-	return -EBUSY;
-}
+		entry->rt_info.tos = iph->tos;
+		entry->rt_info.daddr = iph->daddr;
+		entry->rt_info.saddr = iph->saddr;
+	}
 
-static void ipq_destroy_queue(ipq_queue_t *q)
-{
-	nf_unregister_queue_handler(PF_INET);
-	spin_lock_bh(&q->lock);
-	q->terminate = 1;
-	spin_unlock_bh(&q->lock);
-	ipq_flush(q);
-	kfree(q);
+	nskb = ipq_build_packet_message(entry, &status);
+	if (nskb == NULL)
+		goto err_out_free;
+		
+	write_lock_bh(&queue_lock);
+	
+	if (!peer_pid)
+		goto err_out_unlock;
+
+	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
+	if (status < 0)
+		goto err_out_unlock;
+	
+	status = __ipq_enqueue_entry(entry);
+	if (status < 0)
+		goto err_out_unlock;
+
+	write_unlock_bh(&queue_lock);
+	return status;
+	
+err_out_unlock:
+	write_unlock_bh(&queue_lock);
+
+err_out_free:
+	kfree(entry);
+	return status;
 }
 
-static int ipq_mangle_ipv4(ipq_verdict_msg_t *v, ipq_queue_element_t *e)
+static int
+ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
 {
 	int diff;
 	struct iphdr *user_iph = (struct iphdr *)v->payload;
@@ -266,296 +372,216 @@
 	return 0;
 }
 
-static inline int id_cmp(ipq_queue_element_t *e, unsigned long id)
+static inline int
+id_cmp(struct ipq_queue_entry *e, unsigned long id)
 {
 	return (id == (unsigned long )e);
 }
 
-static int ipq_set_verdict(ipq_queue_t *q,
-                           ipq_verdict_msg_t *v, unsigned int len)
+static int
+ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
 {
-	ipq_queue_element_t *e;
+	struct ipq_queue_entry *entry;
 
-	if (v->value > NF_MAX_VERDICT)
+	if (vmsg->value > NF_MAX_VERDICT)
 		return -EINVAL;
-	e = ipq_dequeue(q, id_cmp, v->id);
-	if (e == NULL)
+
+	entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
+	if (entry == NULL)
 		return -ENOENT;
 	else {
-		e->verdict = v->value;
-		if (v->data_len && v->data_len == len)
-			if (ipq_mangle_ipv4(v, e) < 0)
-				e->verdict = NF_DROP;
-		nf_reinject(e->skb, e->info, e->verdict);
-		kfree(e);
+		int verdict = vmsg->value;
+		
+		if (vmsg->data_len && vmsg->data_len == len)
+			if (ipq_mangle_ipv4(vmsg, entry) < 0)
+				verdict = NF_DROP;
+		
+		ipq_issue_verdict(entry, verdict);
 		return 0;
 	}
 }
 
-static int ipq_receive_peer(ipq_queue_t *q, ipq_peer_msg_t *m,
-                            unsigned char type, unsigned int len)
+static int
+ipq_set_mode(unsigned char mode, unsigned int range)
 {
+	int status;
+
+	write_lock_bh(&queue_lock);
+	status = __ipq_set_mode(mode, range);
+	write_unlock_bh(&queue_lock);
+	return status;
+}
 
+static int
+ipq_receive_peer(struct ipq_peer_msg *pmsg,
+                 unsigned char type, unsigned int len)
+{
 	int status = 0;
-	int busy;
-		
-	spin_lock_bh(&q->lock);
-	busy = (q->terminate || q->flushing);
-	spin_unlock_bh(&q->lock);
-	if (busy)
-		return -EBUSY;
-	if (len < sizeof(ipq_peer_msg_t))
+
+	if (len < sizeof(*pmsg))
 		return -EINVAL;
+
 	switch (type) {
-		case IPQM_MODE:
-			switch (m->msg.mode.value) {
-				case IPQ_COPY_META:
-					q->peer.copy_mode = IPQ_COPY_META;
-					q->peer.copy_range = 0;
-					break;
-				case IPQ_COPY_PACKET:
-					q->peer.copy_mode = IPQ_COPY_PACKET;
-					q->peer.copy_range = m->msg.mode.range;
-					if (q->peer.copy_range > 0xFFFF)
-						q->peer.copy_range = 0xFFFF;
-					break;
-				default:
-					status = -EINVAL;
-			}
-			break;
-		case IPQM_VERDICT:
-			if (m->msg.verdict.value > NF_MAX_VERDICT)
-				status = -EINVAL;
-			else
-				status = ipq_set_verdict(q,
-				                         &m->msg.verdict,
-				                         len - sizeof(*m));
+	case IPQM_MODE:
+		status = ipq_set_mode(pmsg->msg.mode.value,
+		                      pmsg->msg.mode.range);
+		break;
+		
+	case IPQM_VERDICT:
+		if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
+			status = -EINVAL;
+		else
+			status = ipq_set_verdict(&pmsg->msg.verdict,
+			                         len - sizeof(*pmsg));
 			break;
-		default:
-			 status = -EINVAL;
+	default:
+		status = -EINVAL;
 	}
 	return status;
 }
 
-static inline int dev_cmp(ipq_queue_element_t *e, unsigned long ifindex)
+static int
+dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
 {
-	if (e->info->indev)
-		if (e->info->indev->ifindex == ifindex)
+	if (entry->info->indev)
+		if (entry->info->indev->ifindex == ifindex)
 			return 1;
-	if (e->info->outdev)
-		if (e->info->outdev->ifindex == ifindex)
+			
+	if (entry->info->outdev)
+		if (entry->info->outdev->ifindex == ifindex)
 			return 1;
+
 	return 0;
 }
 
-/* Drop any queued packets associated with device ifindex */
-static void ipq_dev_drop(ipq_queue_t *q, int ifindex)
+static void
+ipq_dev_drop(int ifindex)
 {
-	ipq_queue_element_t *e;
+	struct ipq_queue_entry *entry;
 	
-	while ((e = ipq_dequeue(q, dev_cmp, ifindex))) {
-		e->verdict = NF_DROP;
-		nf_reinject(e->skb, e->info, e->verdict);
-		kfree(e);
-	}
-}
-
-/****************************************************************************
- *
- * Netfilter interface
- *
- ****************************************************************************/
-
-/*
- * Packets arrive here from netfilter for queuing to userspace.
- * All of them must be fed back via nf_reinject() or Alexey will kill Rusty.
- */
-static int netfilter_receive(struct sk_buff *skb,
-                             struct nf_info *info, void *data)
-{
-	return ipq_enqueue((ipq_queue_t *)data, skb, info);
-}
-
-/****************************************************************************
- *
- * Netlink interface.
- *
- ****************************************************************************/
-
-static struct sock *nfnl = NULL;
-ipq_queue_t *nlq = NULL;
-
-static struct sk_buff *netlink_build_message(ipq_queue_element_t *e, int *errp)
-{
-	unsigned char *old_tail;
-	size_t size = 0;
-	size_t data_len = 0;
-	struct sk_buff *skb;
-	ipq_packet_msg_t *pm;
-	struct nlmsghdr *nlh;
-
-	switch (nlq->peer.copy_mode) {
-		size_t copy_range;
-
-		case IPQ_COPY_META:
-			size = NLMSG_SPACE(sizeof(*pm));
-			data_len = 0;
-			break;
-		case IPQ_COPY_PACKET:
-			copy_range = nlq->peer.copy_range;
-			if (copy_range == 0 || copy_range > e->skb->len)
-				data_len = e->skb->len;
-			else
-				data_len = copy_range;
-			size = NLMSG_SPACE(sizeof(*pm) + data_len);
-			
-			break;
-		case IPQ_COPY_NONE:
-		default:
-			*errp = -EINVAL;
-			return NULL;
-	}
-	skb = alloc_skb(size, GFP_ATOMIC);
-	if (!skb)
-		goto nlmsg_failure;
-	old_tail = skb->tail;
-	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
-	pm = NLMSG_DATA(nlh);
-	memset(pm, 0, sizeof(*pm));
-	pm->packet_id = (unsigned long )e;
-	pm->data_len = data_len;
-	pm->timestamp_sec = e->skb->stamp.tv_sec;
-	pm->timestamp_usec = e->skb->stamp.tv_usec;
-	pm->mark = e->skb->nfmark;
-	pm->hook = e->info->hook;
-	if (e->info->indev) strcpy(pm->indev_name, e->info->indev->name);
-	else pm->indev_name[0] = '\0';
-	if (e->info->outdev) strcpy(pm->outdev_name, e->info->outdev->name);
-	else pm->outdev_name[0] = '\0';
-	pm->hw_protocol = e->skb->protocol;
-	if (e->info->indev && e->skb->dev) {
-		pm->hw_type = e->skb->dev->type;
-		if (e->skb->dev->hard_header_parse)
-			pm->hw_addrlen =
-				e->skb->dev->hard_header_parse(e->skb,
-				                               pm->hw_addr);
-	}
-	if (data_len)
-		memcpy(pm->payload, e->skb->data, data_len);
-	nlh->nlmsg_len = skb->tail - old_tail;
-	NETLINK_CB(skb).dst_groups = 0;
-	return skb;
-nlmsg_failure:
-	if (skb)
-		kfree_skb(skb);
-	*errp = 0;
-	printk(KERN_ERR "ip_queue: error creating netlink message\n");
-	return NULL;
-}
-
-static int netlink_send_peer(ipq_queue_element_t *e)
-{
-	int status = 0;
-	struct sk_buff *skb;
-
-	skb = netlink_build_message(e, &status);
-	if (skb == NULL)
-		return status;
-	return netlink_unicast(nfnl, skb, nlq->peer.pid, MSG_DONTWAIT);
+	while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
+		ipq_issue_verdict(entry, NF_DROP);
 }
 
 #define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
 
-static __inline__ void netlink_receive_user_skb(struct sk_buff *skb)
+static inline void
+ipq_rcv_skb(struct sk_buff *skb)
 {
-	int status, type;
+	int status, type, pid, flags, nlmsglen, skblen;
 	struct nlmsghdr *nlh;
 
-	if (skb->len < sizeof(struct nlmsghdr))
+	skblen = skb->len;
+	if (skblen < sizeof(*nlh))
 		return;
 
 	nlh = (struct nlmsghdr *)skb->data;
-	if (nlh->nlmsg_len < sizeof(struct nlmsghdr)
-	    || skb->len < nlh->nlmsg_len)
-	    	return;
-
-	if(nlh->nlmsg_pid <= 0
-	    || !(nlh->nlmsg_flags & NLM_F_REQUEST)
-	    || nlh->nlmsg_flags & NLM_F_MULTI)
+	nlmsglen = nlh->nlmsg_len;
+	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
+		return;
+
+	pid = nlh->nlmsg_pid;
+	flags = nlh->nlmsg_flags;
+	
+	if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
 		RCV_SKB_FAIL(-EINVAL);
-	if (nlh->nlmsg_flags & MSG_TRUNC)
+		
+	if (flags & MSG_TRUNC)
 		RCV_SKB_FAIL(-ECOMM);
+		
 	type = nlh->nlmsg_type;
 	if (type < NLMSG_NOOP || type >= IPQM_MAX)
 		RCV_SKB_FAIL(-EINVAL);
+		
 	if (type <= IPQM_BASE)
 		return;
+		
 	if(!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN))
 		RCV_SKB_FAIL(-EPERM);
-	if (nlq->peer.pid && !nlq->peer.died
-	    && (nlq->peer.pid != nlh->nlmsg_pid)) {
-	    	printk(KERN_WARNING "ip_queue: peer pid changed from %d to "
-	    	      "%d, flushing queue\n", nlq->peer.pid, nlh->nlmsg_pid);
-		ipq_flush(nlq);
-	}	
-	nlq->peer.pid = nlh->nlmsg_pid;
-	nlq->peer.died = 0;
-	status = ipq_receive_peer(nlq, NLMSG_DATA(nlh),
-	                          type, skb->len - NLMSG_LENGTH(0));
+	
+	write_lock_bh(&queue_lock);
+	
+	if (peer_pid) {
+		if (peer_pid != pid) {
+			write_unlock_bh(&queue_lock);
+			RCV_SKB_FAIL(-EBUSY);
+		}
+	}
+	else
+		peer_pid = pid;
+		
+	write_unlock_bh(&queue_lock);
+	
+	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
+	                          skblen - NLMSG_LENGTH(0));
 	if (status < 0)
 		RCV_SKB_FAIL(status);
-	if (nlh->nlmsg_flags & NLM_F_ACK)
+		
+	if (flags & NLM_F_ACK)
 		netlink_ack(skb, nlh, 0);
         return;
 }
 
-/* Note: we are only dealing with single part messages at the moment. */
-static void netlink_receive_user_sk(struct sock *sk, int len)
+static void
+ipq_rcv_sk(struct sock *sk, int len)
 {
 	do {
 		struct sk_buff *skb;
 
-		if (rtnl_shlock_nowait())
+		if (down_trylock(&ipqnl_sem))
 			return;
+			
 		while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) {
-			netlink_receive_user_skb(skb);
+			ipq_rcv_skb(skb);
 			kfree_skb(skb);
 		}
-		up(&rtnl_sem);
-	} while (nfnl && nfnl->receive_queue.qlen);
-}
+		
+		up(&ipqnl_sem);
 
-/****************************************************************************
- *
- * System events
- *
- ****************************************************************************/
+	} while (ipqnl && ipqnl->receive_queue.qlen);
+}
 
-static int receive_event(struct notifier_block *this,
-                         unsigned long event, void *ptr)
+static int
+ipq_rcv_dev_event(struct notifier_block *this,
+                  unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
 
 	/* Drop any packets associated with the downed device */
 	if (event == NETDEV_DOWN)
-		ipq_dev_drop(nlq, dev->ifindex);
+		ipq_dev_drop(dev->ifindex);
 	return NOTIFY_DONE;
 }
 
-struct notifier_block ipq_dev_notifier = {
-	receive_event,
+static struct notifier_block ipq_dev_notifier = {
+	ipq_rcv_dev_event,
 	NULL,
 	0
 };
 
-/****************************************************************************
- *
- * Sysctl - queue tuning.
- *
- ****************************************************************************/
+static int
+ipq_rcv_nl_event(struct notifier_block *this,
+                 unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event == NETLINK_URELEASE &&
+	    n->protocol == NETLINK_FIREWALL && n->pid) {
+		write_lock_bh(&queue_lock);
+		if (n->pid == peer_pid)
+			__ipq_reset();
+		write_unlock_bh(&queue_lock);
+	}
+	return NOTIFY_DONE;
+}
 
-static int sysctl_maxlen = IPQ_QMAX_DEFAULT;
+static struct notifier_block ipq_nl_notifier = {
+	ipq_rcv_nl_event,
+	NULL,
+	0
+};
 
+static int sysctl_maxlen = IPQ_QMAX_DEFAULT;
 static struct ctl_table_header *ipq_sysctl_header;
 
 static ctl_table ipq_table[] = {
@@ -574,35 +600,27 @@
 	{ 0 }
 };
 
-/****************************************************************************
- *
- * Procfs - debugging info.
- *
- ****************************************************************************/
-
-static int ipq_get_info(char *buffer, char **start, off_t offset, int length)
+static int
+ipq_get_info(char *buffer, char **start, off_t offset, int length)
 {
 	int len;
 
-	spin_lock_bh(&nlq->lock);
+	read_lock_bh(&queue_lock);
+	
 	len = sprintf(buffer,
-	              "Peer pid            : %d\n"
-	              "Peer died           : %d\n"
-	              "Peer copy mode      : %d\n"
-	              "Peer copy range     : %Zu\n"
-	              "Queue length        : %d\n"
-	              "Queue max. length   : %d\n"
-	              "Queue flushing      : %d\n"
-	              "Queue terminate     : %d\n",
-	              nlq->peer.pid,
-	              nlq->peer.died,
-	              nlq->peer.copy_mode,
-	              nlq->peer.copy_range,
-	              nlq->len,
-	              *nlq->maxlen,
-	              nlq->flushing,
-	              nlq->terminate);
-	spin_unlock_bh(&nlq->lock);
+	              "Peer PID          : %d\n"
+	              "Copy mode         : %hu\n"
+	              "Copy range        : %u\n"
+	              "Queue length      : %u\n"
+	              "Queue max. length : %u\n",
+	              peer_pid,
+	              copy_mode,
+	              copy_range,
+	              queue_total,
+	              queue_maxlen);
+
+	read_unlock_bh(&queue_lock);
+	
 	*start = buffer + offset;
 	len -= offset;
 	if (len > length)
@@ -612,53 +630,74 @@
 	return len;
 }
 
-/****************************************************************************
- *
- * Module stuff.
- *
- ****************************************************************************/
-
-static int __init init(void)
+static int
+init_or_cleanup(int init)
 {
-	int status = 0;
+	int status = -ENOMEM;
 	struct proc_dir_entry *proc;
 	
-	nfnl = netlink_kernel_create(NETLINK_FIREWALL, netlink_receive_user_sk);
-	if (nfnl == NULL) {
-		printk(KERN_ERR "ip_queue: initialisation failed: unable to "
-		       "create kernel netlink socket\n");
-		return -ENOMEM;
-	}
-	nlq = ipq_create_queue(netfilter_receive,
-	                       netlink_send_peer, &status, &sysctl_maxlen);
-	if (nlq == NULL) {
-		printk(KERN_ERR "ip_queue: initialisation failed: unable to "
-		       "create queue\n");
-		sock_release(nfnl->socket);
-		return status;
+	if (!init)
+		goto cleanup;
+
+	netlink_register_notifier(&ipq_nl_notifier);
+	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+	if (ipqnl == NULL) {
+		printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
 	}
+
 	proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
-	if (proc) proc->owner = THIS_MODULE;
+	if (proc)
+		proc->owner = THIS_MODULE;
 	else {
-		ipq_destroy_queue(nlq);
-		sock_release(nfnl->socket);
-		return -ENOMEM;
+		printk(KERN_ERR "ip_queue: failed to create proc entry\n");
+		goto cleanup_ipqnl;
 	}
+	
 	register_netdevice_notifier(&ipq_dev_notifier);
 	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
+	
+	status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+	if (status < 0) {
+		printk(KERN_ERR "ip_queue: failed to register queue handler\n");
+		goto cleanup_sysctl;
+	}
+	return status;
+
+cleanup:
+	nf_unregister_queue_handler(PF_INET);
+	br_write_lock_bh(BR_NETPROTO_LOCK);
+	br_write_unlock_bh(BR_NETPROTO_LOCK);
+	ipq_flush(NF_DROP);
+	
+cleanup_sysctl:
+	unregister_sysctl_table(ipq_sysctl_header);
+	unregister_netdevice_notifier(&ipq_dev_notifier);
+	proc_net_remove(IPQ_PROC_FS_NAME);
+	
+cleanup_ipqnl:
+	sock_release(ipqnl->socket);
+	down(&ipqnl_sem);
+	up(&ipqnl_sem);
+	
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&ipq_nl_notifier);
 	return status;
 }
 
+static int __init init(void)
+{
+	
+	return init_or_cleanup(1);
+}
+
 static void __exit fini(void)
 {
-	unregister_sysctl_table(ipq_sysctl_header);
-	proc_net_remove(IPQ_PROC_FS_NAME);
-	unregister_netdevice_notifier(&ipq_dev_notifier);
-	ipq_destroy_queue(nlq);
-	sock_release(nfnl->socket);
+	init_or_cleanup(0);
 }
 
 MODULE_DESCRIPTION("IPv4 packet queue handler");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
 MODULE_LICENSE("GPL");
 
 module_init(init);
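
The rework above collapses the old per-queue ipq_peer state into the module-wide peer_pid, copy_mode and copy_range variables guarded by queue_lock, and ipq_rcv_skb() now simply records the first netlink sender as the peer. As a point of reference, here is a minimal userspace sketch of that handshake, written against the struct ipq_peer_msg and IPQM_MODE definitions exported by <linux/netfilter_ipv4/ip_queue.h>; the helper name ipq_request_copy_packet() is hypothetical and error handling is omitted.

/*
 * Hypothetical peer-side handshake (not part of this patch): send a
 * unicast NLM_F_REQUEST of type IPQM_MODE on a NETLINK_FIREWALL socket.
 * ipq_rcv_skb() above records the sender as peer_pid, and
 * __ipq_set_mode() switches the module to IPQ_COPY_PACKET.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/netfilter_ipv4/ip_queue.h>

int ipq_request_copy_packet(size_t range)
{
	int fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_FIREWALL);
	struct sockaddr_nl local  = { .nl_family = AF_NETLINK, .nl_pid = getpid() };
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct ipq_peer_msg pm;
	} req;

	bind(fd, (struct sockaddr *)&local, sizeof(local));

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len   = NLMSG_LENGTH(sizeof(req.pm));
	req.nlh.nlmsg_type  = IPQM_MODE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;	/* required by ipq_rcv_skb() */
	req.nlh.nlmsg_pid   = getpid();		/* becomes peer_pid in the kernel */
	req.pm.msg.mode.value = IPQ_COPY_PACKET;
	req.pm.msg.mode.range = range;		/* clamped to 0xFFFF by __ipq_set_mode() */

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	return fd;	/* queued IPQM_PACKET messages arrive on this socket */
}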

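In practice the same conversation is usually driven through libipq, the userspace library historically distributed with iptables rather than with this patch. A sketch of a complete peer loop follows, assuming the libipq API as documented in its man pages (exact signatures vary slightly between iptables releases); BUFSIZE and the accept-everything policy are purely illustrative.

/*
 * Illustrative libipq peer (assumption: link with -lipq from iptables).
 * Sets IPQ_COPY_PACKET mode, then reads IPQM_PACKET messages and
 * answers each with an IPQM_VERDICT of NF_ACCEPT, which the kernel
 * side above turns into ipq_issue_verdict()/nf_reinject().
 */
#include <stdlib.h>
#include <linux/netfilter.h>	/* NF_ACCEPT, NF_DROP */
#include <libipq.h>

#define BUFSIZE 2048		/* illustrative copy range and read buffer */

int main(void)
{
	unsigned char buf[BUFSIZE];
	struct ipq_handle *h;

	h = ipq_create_handle(0, PF_INET);	/* NETLINK_FIREWALL socket */
	if (h == NULL) {
		ipq_perror("ipq_create_handle");
		exit(1);
	}

	/* IPQM_MODE request: copy metadata plus up to BUFSIZE payload bytes */
	if (ipq_set_mode(h, IPQ_COPY_PACKET, BUFSIZE) < 0) {
		ipq_perror("ipq_set_mode");
		exit(1);
	}

	for (;;) {
		if (ipq_read(h, buf, BUFSIZE, 0) < 0)
			break;

		if (ipq_message_type(buf) == IPQM_PACKET) {
			ipq_packet_msg_t *m = ipq_get_packet(buf);

			/* IPQM_VERDICT: reinject the packet unmodified */
			ipq_set_verdict(h, m->packet_id, NF_ACCEPT, 0, NULL);
		}
	}

	ipq_destroy_handle(h);
	return 0;
}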