/* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */
/*
 * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is
 * independent of any other opened BPF devices.  We assume that each BPF
 * device is used by a single user process, and this implementation therefore
 * does not support multiple concurrent device calls on the same BPF device.
 *
 * Packet buffering basically follows the BSD model: each BPF device that is
 * configured (that is, it has been attached to an interface) has two buffers,
 * each of the configured size: a store buffer, where new packets are stored,
 * and a hold buffer, which is typically full and awaiting retrieval through a
 * read call from userland.  The buffers are swapped ("rotated") when the store
 * buffer is filled up and the hold buffer is empty - if the hold buffer is
 * not empty at that point, additional packets are dropped.
 *
 * These buffers are allocated when the BPF device is attached to an interface.
 * The interface may later disappear, in which case the BPF device is detached
 * from it, allowing any final packets to be read before read requests start
 * returning I/O errors.  The buffers are freed only when the device is closed.
 */
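
/*
 * For illustration only, a sketch of how a userland client typically drives
 * this device, following the standard NetBSD BPF usage pattern (error
 * handling omitted; "lo0" is just an example interface name):
 *
 *	int fd = open("/dev/bpf", O_RDWR);
 *	struct ifreq ifr;
 *	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 *	ioctl(fd, BIOCSETIF, &ifr);
 *	u_int bufsize;
 *	ioctl(fd, BIOCGBLEN, &bufsize);
 *	char *buf = malloc(bufsize);
 *	ssize_t len = read(fd, buf, bufsize);
 *
 * Note that the read size must equal the device's buffer size (see
 * bpfdev_read), and that the returned data consists of one or more bpf_hdr
 * structures, each followed by packet data.
 */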

#include "lwip.h"
#include "bpfdev.h"

#include <minix/chardriver.h>
#include <net/if.h>
#include <net/bpfdesc.h>
#include <minix/bpf.h>
#include <sys/mman.h>

/*
 * Make sure that our implementation matches the BPF version in the NetBSD
 * headers.  If they change the version number, we may have to make changes
 * here accordingly.
 */
#if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1
#error "NetBSD BPF version has changed"
#endif

/* The number of BPF devices. */
#define NR_BPFDEV		16

/* BPF receive buffer size: allowed range and default. */
#define BPF_BUF_MIN		BPF_WORDALIGN(sizeof(struct bpf_hdr))
#define BPF_BUF_DEF		32768
#define BPF_BUF_MAX		262144

/*
 * By opening /dev/bpf, one will obtain a cloned device with a different minor
 * number, which maps to one of the BPF devices.
 */
#define BPFDEV_MINOR		0	/* minor number of /dev/bpf */
#define BPFDEV_BASE_MINOR	1	/* base minor number for BPF devices */

static struct bpfdev {
	struct bpfdev_link bpf_link;	/* structure link, MUST be first */
	TAILQ_ENTRY(bpfdev) bpf_next;	/* next on free or interface list */
	struct ifdev *bpf_ifdev;	/* associated interface, or NULL */
	unsigned int bpf_flags;		/* flags (BPFF_) */
	size_t bpf_size;		/* size of packet buffers */
	char *bpf_sbuf;			/* store buffer (mmap'd, or NULL) */
	char *bpf_hbuf;			/* hold buffer (mmap'd, or NULL) */
	size_t bpf_slen;		/* used part of store buffer */
	size_t bpf_hlen;		/* used part of hold buffer */
	struct bpf_insn *bpf_filter;	/* verified BPF filter, or NULL */
	size_t bpf_filterlen;		/* length of filter, for munmap */
	pid_t bpf_pid;			/* process ID of last using process */
	clock_t bpf_timeout;		/* timeout for read calls (0 = none) */
	struct {			/* state for pending read request */
		endpoint_t br_endpt;	/* reading endpoint, or NONE */
		cp_grant_id_t br_grant;	/* grant for reader's buffer */
		cdev_id_t br_id;	/* read request identifier */
		minix_timer_t br_timer;	/* timer for read timeout */
	} bpf_read;
	struct {			/* state for pending select request */
		endpoint_t bs_endpt;	/* selecting endpoint, or NONE */
		unsigned int bs_selops;	/* pending select operations */
	} bpf_select;
	struct {			/* packet capture statistics */
		uint64_t bs_recv;	/* # of packets run through filter */
		uint64_t bs_drop;	/* # of packets dropped: buffer full */
		uint64_t bs_capt;	/* # of packets accepted by filter */
	} bpf_stat;
} bpf_array[NR_BPFDEV];

#define BPFF_IN_USE	0x01		/* this BPF device object is in use */
#define BPFF_PROMISC	0x02		/* promiscuous mode enabled */
#define BPFF_IMMEDIATE	0x04		/* immediate mode is enabled */
#define BPFF_SEESENT	0x08		/* also process host-sent packets */
#define BPFF_HDRCMPLT	0x10		/* do not fill in link-layer source */
#define BPFF_FEEDBACK	0x20		/* feed back written packet as input */

static TAILQ_HEAD(, bpfdev_link) bpfl_freelist;	/* list of free BPF devices */

static struct bpf_stat bpf_stat;

static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET NET_BPF subtree.  All nodes are dynamically numbered. */
static struct rmib_node net_bpf_table[] = {
	RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize",
	    "Maximum size for data capture buffer"), /* TODO: read-write */
	RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats",
	    "BPF stats"),
	RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers",
	    "BPF peers"),
};

static struct rmib_node net_bpf_node =
    RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options");
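
/*
 * Userland reaches these nodes as the "net.bpf" sysctl subtree; in
 * particular, netstat(1) retrieves "net.bpf.peers" (see bpfdev_peers below)
 * to list currently open BPF devices.
 */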

/*
 * Initialize the BPF module.
 */
void
bpfdev_init(void)
{
	const int mib[] = { CTL_NET, NET_BPF };
	unsigned int slot;
	int r;

	/* Initialize data structures. */
	TAILQ_INIT(&bpfl_freelist);

	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
		bpf_array[slot].bpf_flags = 0;

		TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link,
		    bpfl_next);
	}

	memset(&bpf_stat, 0, sizeof(bpf_stat));

	/* Register the "net.bpf" subtree with the MIB service. */
	if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK)
		panic("unable to register net.bpf RMIB tree: %d", r);
}

/*
 * Given a BPF device object, return the corresponding minor number.
 */
static devminor_t
bpfdev_get_minor(struct bpfdev * bpfdev)
{

	assert(bpfdev != NULL);

	return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array);
}

/*
 * Given a minor number, return the corresponding BPF device object, or NULL if
 * the minor number does not identify a BPF device.
 */
static struct bpfdev *
bpfdev_get_by_minor(devminor_t minor)
{

	if (minor < BPFDEV_BASE_MINOR ||
	    (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array))
		return NULL;

	return &bpf_array[minor - BPFDEV_BASE_MINOR];
}

/*
 * Open a BPF device, returning a cloned device instance.
 */
static int
bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt)
{
	struct bpfdev_link *bpfl;
	struct bpfdev *bpf;

	/* Disallow opening cloned devices through device nodes. */
	if (minor != BPFDEV_MINOR)
		return ENXIO;

	if (TAILQ_EMPTY(&bpfl_freelist))
		return ENOBUFS;

	bpfl = TAILQ_FIRST(&bpfl_freelist);
	TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next);

	bpf = (struct bpfdev *)bpfl;

	memset(bpf, 0, sizeof(*bpf));

	bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT;
	bpf->bpf_size = BPF_BUF_DEF;
	bpf->bpf_pid = getnpid(user_endpt);
	bpf->bpf_read.br_endpt = NONE;
	bpf->bpf_select.bs_endpt = NONE;

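	/*
	 * Return the minor number of the new BPF device, with the flag that
	 * tells VFS that this open call has created a cloned device.
	 */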
	return CDEV_CLONED | bpfdev_get_minor(bpf);
}

/*
 * Close a BPF device.
 */
static int
bpfdev_close(devminor_t minor)
{
	struct bpfdev *bpf;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/*
	 * There cannot possibly be a pending read request, so we never need to
	 * cancel the read timer from here either.
	 */
	assert(bpf->bpf_read.br_endpt == NONE);

	if (bpf->bpf_sbuf != NULL) {
		assert(bpf->bpf_hbuf != NULL);

		if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0)
			panic("munmap failed: %d", -errno);
		if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0)
			panic("munmap failed: %d", -errno);

		bpf->bpf_sbuf = NULL;
		bpf->bpf_hbuf = NULL;
	} else
		assert(bpf->bpf_hbuf == NULL);

	if (bpf->bpf_filter != NULL) {
		assert(bpf->bpf_filterlen > 0);

		if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0)
			panic("munmap failed: %d", -errno);

		bpf->bpf_filter = NULL;
	}

	/*
	 * If the BPF device was attached to an interface, and that interface
	 * has not disappeared in the meantime, detach from it now.
	 */
	if (bpf->bpf_ifdev != NULL) {
		if (bpf->bpf_flags & BPFF_PROMISC)
			ifdev_clear_promisc(bpf->bpf_ifdev);

		ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link);

		bpf->bpf_ifdev = NULL;
	}

	bpf->bpf_flags = 0;		/* mark as no longer in use */

	TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next);

	return OK;
}

/*
 * Rotate buffers for the BPF device, by swapping the store buffer and the hold
 * buffer.
 */
static void
bpfdev_rotate(struct bpfdev * bpf)
{
	char *buf;
	size_t len;

	/*
	 * When rotating, the store buffer may or may not be empty, but the
	 * hold buffer must always be empty.
	 */
	assert(bpf->bpf_hlen == 0);

	buf = bpf->bpf_sbuf;
	len = bpf->bpf_slen;
	bpf->bpf_sbuf = bpf->bpf_hbuf;
	bpf->bpf_slen = bpf->bpf_hlen;
	bpf->bpf_hbuf = buf;
	bpf->bpf_hlen = len;
}

/*
 * Test whether any of the given select operations are ready on the BPF device,
 * and return the set of ready operations.
 */
static unsigned int
bpfdev_test_select(struct bpfdev * bpf, unsigned int ops)
{
	unsigned int ready_ops;

	ready_ops = 0;

	/*
	 * The BPF device is ready for reading if the hold buffer is not empty
	 * (i.e.: the store buffer has been filled up completely and was
	 * therefore rotated) or if immediate mode is set and the store buffer
	 * is not empty (i.e.: any packet is available at all).  In the latter
	 * case, the buffers will be rotated during the read.  We do not
	 * support applying the read timeout to selects and maintaining state
	 * between the select and the following read, because even though
	 * libpcap claims that this is the right behavior, it is just insane.
	 */
	if (ops & CDEV_OP_RD) {
		if (bpf->bpf_ifdev == NULL)
			ready_ops |= CDEV_OP_RD;
		else if (bpf->bpf_hlen > 0)
			ready_ops |= CDEV_OP_RD;
		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
		    bpf->bpf_slen > 0)
			ready_ops |= CDEV_OP_RD;
	}

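	/* Writes never block on a BPF device, so they are always ready. */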
	if (ops & CDEV_OP_WR)
		ready_ops |= CDEV_OP_WR;

	return ready_ops;
}

/*
 * There has been a state change on the BPF device.  If now possible, resume a
 * pending select query, if any.
 */
static void
bpfdev_resume_select(struct bpfdev * bpf)
{
	unsigned int ops, ready_ops;
	endpoint_t endpt;

	/* First see if there is a pending select request at all. */
	if ((endpt = bpf->bpf_select.bs_endpt) == NONE)
		return;
	ops = bpf->bpf_select.bs_selops;

	assert(ops != 0);

	/* Then see if any of the pending operations are now ready. */
	if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0)
		return;

	/* If so, notify VFS about the ready operations. */
	chardriver_reply_select(bpf->bpf_select.bs_endpt,
	    bpfdev_get_minor(bpf), ready_ops);

	/*
	 * Forget about the ready operations.  If that leaves no pending
	 * operations, forget about the select request altogether.
	 */
	if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0)
		bpf->bpf_select.bs_endpt = NONE;
}

/*
 * There has been a state change on the BPF device.  If now possible, resume a
 * pending read request, if any.  If the call is a result of a timeout,
 * 'is_timeout' is set.  In that case, the read request must be resumed with an
 * EAGAIN error if no packets are available, and the running timer must be
 * canceled.  Otherwise, the resumption is due to a full buffer or a
 * disappeared interface, and 'is_timeout' is not set.  In this case, the read
 * request must be resumed with an I/O error if no packets are available.
 */
static void
bpfdev_resume_read(struct bpfdev * bpf, int is_timeout)
{
	ssize_t r;

	assert(bpf->bpf_read.br_endpt != NONE);

	/*
	 * If the hold buffer is still empty, see if the store buffer has
	 * any packets to copy out.
	 */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);

	/* Return any available packets, or otherwise an error. */
	if (bpf->bpf_hlen > 0) {
		assert(bpf->bpf_hlen <= bpf->bpf_size);

		r = sys_safecopyto(bpf->bpf_read.br_endpt,
		    bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf,
		    bpf->bpf_hlen);

		if (r == OK) {
			r = (ssize_t)bpf->bpf_hlen;

			bpf->bpf_hlen = 0;

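			/*
			 * The store buffer cannot be completely full here: a
			 * store buffer that fills up entirely is immediately
			 * rotated into an empty hold buffer (see
			 * bpfdev_input).
			 */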
			assert(bpf->bpf_slen != bpf->bpf_size);

			/*
			 * Allow readers to get the last packets after the
			 * interface has disappeared, before getting errors.
			 */
			if (bpf->bpf_ifdev == NULL)
				bpfdev_rotate(bpf);
		}
	} else
		r = (is_timeout) ? EAGAIN : EIO;

	chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r);

	bpf->bpf_read.br_endpt = NONE;

	/* Was there still a timer running?  Then cancel it now. */
	if (bpf->bpf_timeout > 0 && !is_timeout)
		cancel_timer(&bpf->bpf_read.br_timer);
}

/*
 * A read timeout has triggered for the BPF device.  Wake up the pending read
 * request.
 */
static void
bpfdev_timeout(int arg)
{
	struct bpfdev *bpf;

	assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array));

	bpf = &bpf_array[arg];

	assert(bpf->bpf_read.br_endpt != NONE);

	bpfdev_resume_read(bpf, TRUE /*is_timeout*/);
}

/*
 * Read from a BPF device.
 */
static ssize_t
bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt,
	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
	struct bpfdev *bpf;
	ssize_t r;
	int suspend;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/* Allow only one read call at a time. */
	if (bpf->bpf_read.br_endpt != NONE)
		return EIO;

	/* Has this BPF device been configured at all yet? */
	if (bpf->bpf_sbuf == NULL)
		return EINVAL;

	/*
	 * Does the read call size match the entire buffer size?  This is a
	 * ridiculous requirement but it makes our job quite a bit easier..
	 */
	if (size != bpf->bpf_size)
		return EINVAL;

	/*
	 * Following standard receive semantics, if the interface is gone,
	 * return all the packets that were pending before returning an error.
	 * This requires extra buffer rotations after read completion, too.
	 */
	if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0)
		return EIO;

	/*
	 * If immediate mode is not enabled, we should always suspend the read
	 * call if the hold buffer is empty.  If immediate mode is enabled, we
	 * should only suspend the read call if both buffers are empty, and
	 * return data from the hold buffer or otherwise the store buffer,
	 * whichever is not empty.  A non-blocking call behaves as though
	 * immediate mode is enabled, except it will return EAGAIN instead of
	 * suspending the read call if both buffers are empty.  Thus, we may
	 * have to rotate buffers for both immediate mode and non-blocking
	 * calls.  The latter is necessary for libpcap to behave correctly.
	 */
	if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE))
		suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0);
	else
		suspend = (bpf->bpf_hlen == 0);

	if (suspend) {
		if (flags & CDEV_NONBLOCK)
			return EAGAIN;

		/* Suspend the read call for later. */
		bpf->bpf_read.br_endpt = endpt;
		bpf->bpf_read.br_grant = grant;
		bpf->bpf_read.br_id = id;

		/* Set a timer if requested. */
		if (bpf->bpf_timeout > 0)
			set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout,
			    bpfdev_timeout, (int)(bpf - bpf_array));

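		/*
		 * Tell the chardriver framework not to send a reply now; the
		 * reply will be sent later, from bpfdev_resume_read() or as a
		 * result of bpfdev_cancel().
		 */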
		return EDONTREPLY;
	}

	/* If we get here, either buffer has data; rotate buffers if needed. */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);
	assert(bpf->bpf_hlen > 0);

	if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf,
	    bpf->bpf_hlen)) != OK)
		return r;

	r = (ssize_t)bpf->bpf_hlen;

	bpf->bpf_hlen = 0;

	/*
	 * If the store buffer is exactly full, rotate it now.  Also, if the
	 * interface has disappeared, the store buffer will never fill up.
	 * Rotate it so that the application will get any remaining data before
	 * getting errors about the interface being gone.
	 */
	if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL)
		bpfdev_rotate(bpf);

	return r;
}

/*
 * Write to a BPF device.
 */
static ssize_t
bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt,
	cp_grant_id_t grant, size_t size, int flags, cdev_id_t id)
{
	struct bpfdev *bpf;
	struct pbuf *pbuf, *pptr, *pcopy;
	size_t off;
	err_t err;
	int r;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	if (bpf->bpf_ifdev == NULL)
		return EINVAL;

	/* VFS skips zero-sized I/O calls right now, but that may change. */
	if (size == 0)
		return 0;	/* nothing to do */

	if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) +
	    ifdev_get_mtu(bpf->bpf_ifdev))
		return EMSGSIZE;

	if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL)
		return ENOMEM;

	/* TODO: turn this into a series of vector copies. */
	off = 0;
	for (pptr = pbuf; pptr != NULL; pptr = pptr->next) {
		if ((r = sys_safecopyfrom(endpt, grant, off,
		    (vir_bytes)pptr->payload, pptr->len)) != OK) {
			pbuf_free(pbuf);

			return r;
		}
		off += pptr->len;
	}
	assert(off == size);

	/*
	 * In feedback mode, we cannot use the same packet buffers for both
	 * output and input, so make a copy.  We do this before calling the
	 * output function, which may change part of the buffers, because the
	 * BSDs take this approach as well.
	 */
	if (bpf->bpf_flags & BPFF_FEEDBACK) {
		if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) {
			pbuf_free(pbuf);

			return ENOMEM;
		}

		if (pbuf_copy(pcopy, pbuf) != ERR_OK)
			panic("unexpected pbuf copy failure");
	} else
		pcopy = NULL;

	/* Pass in the packet as output, and free it again. */
	err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/,
	    TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT));

	pbuf_free(pbuf);

	/* In feedback mode, pass in the copy as input, if output succeeded. */
	if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK))
		ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/,
		    FALSE /*to_bpf*/);
	else if (pcopy != NULL)
		pbuf_free(pcopy);

	return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err);
}

/*
 * Attach a BPF device to a network interface, using the interface name given
 * in an ifreq structure.  As side effect, allocate hold and store buffers for
 * the device.  These buffers will stay allocated until the device is closed,
 * even though the interface may disappear before that.  Return OK if the BPF
 * device was successfully attached to the interface, or a negative error code
 * otherwise.
 */
static int
bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr)
{
	struct ifdev *ifdev;
	void *sbuf, *hbuf;

	/* Find the interface with the given name. */
	ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0';
	if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL)
		return ENXIO;

	/*
	 * Allocate a store buffer and a hold buffer.  Preallocate the memory,
	 * or we might get killed later during low-memory conditions.
	 */
	if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED)
		return ENOMEM;

	if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) {
		(void)munmap(sbuf, bpf->bpf_size);

		return ENOMEM;
	}

	bpf->bpf_ifdev = ifdev;
	bpf->bpf_sbuf = sbuf;
	bpf->bpf_hbuf = hbuf;
	assert(bpf->bpf_slen == 0);
	assert(bpf->bpf_hlen == 0);

	ifdev_attach_bpf(ifdev, &bpf->bpf_link);

	return OK;
}

/*
 * Detach the BPF device from its interface, which is about to disappear.
 */
void
bpfdev_detach(struct bpfdev_link * bpfl)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;

	assert(bpf->bpf_flags & BPFF_IN_USE);
	assert(bpf->bpf_ifdev != NULL);

	/*
	 * We deliberately leave the buffers allocated here, for two reasons:
	 *
	 * 1) it lets applications read any last packets in the buffers;
	 * 2) it prevents reattaching the BPF device to another interface.
	 */
	bpf->bpf_ifdev = NULL;

	/*
	 * Resume pending read and select requests, returning any data left,
	 * or an error if none.
	 */
	if (bpf->bpf_hlen == 0)
		bpfdev_rotate(bpf);

	if (bpf->bpf_read.br_endpt != NONE)
		bpfdev_resume_read(bpf, FALSE /*is_timeout*/);

	bpfdev_resume_select(bpf);
}

/*
 * Flush the given BPF device, resetting its buffer contents and statistics
 * counters.
 */
static void
bpfdev_flush(struct bpfdev * bpf)
{

	bpf->bpf_slen = 0;
	bpf->bpf_hlen = 0;

	bpf->bpf_stat.bs_recv = 0;
	bpf->bpf_stat.bs_drop = 0;
	bpf->bpf_stat.bs_capt = 0;
}

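/*
 * As an aside, the simplest valid filter that a client could install through
 * the call below is a single "accept everything" instruction, along these
 * lines (illustrative only, using the standard classic-BPF macros):
 *
 *	struct bpf_insn insns[] = {
 *		BPF_STMT(BPF_RET | BPF_K, (u_int)-1)
 *	};
 *
 * ..where the return value of the filter program is the number of bytes to
 * capture, with zero meaning that the packet is to be rejected (see
 * bpfdev_input).
 */
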
/*
 * Install a filter program on the BPF device.  A new filter replaces any old
 * one.  A zero-sized filter simply clears a previous filter.  On success,
 * perform a flush and return OK.  On failure, return a negative error code
 * without making any modifications to the current filter.
 */
static int
bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant)
{
	struct bpf_insn *filter;
	unsigned int count;
	size_t len;
	int r;

	if ((r = sys_safecopyfrom(endpt, grant,
	    offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count,
	    sizeof(count))) != OK)
		return r;

	if (count > BPF_MAXINSNS)
		return EINVAL;
	len = count * sizeof(struct bpf_insn);

	if (len > 0) {
		if ((filter = (struct bpf_insn *)mmap(NULL, len,
		    PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) ==
		    MAP_FAILED)
			return ENOMEM;

		if ((r = sys_safecopyfrom(endpt, grant,
		    offsetof(struct minix_bpf_program, mbf_insns),
		    (vir_bytes)filter, len)) != OK) {
			(void)munmap(filter, len);

			return r;
		}

		if (!bpf_validate(filter, count)) {
			(void)munmap(filter, len);

			return EINVAL;
		}
	} else
		filter = NULL;

	if (bpf->bpf_filter != NULL)
		(void)munmap(bpf->bpf_filter, bpf->bpf_filterlen);

	bpf->bpf_filter = filter;
	bpf->bpf_filterlen = len;

	bpfdev_flush(bpf);

	return OK;
}

/*
 * Process an I/O control request on the BPF device.
 */
static int
bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt,
	cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id)
{
	struct bpfdev *bpf;
	struct bpf_stat bs;
	struct bpf_version bv;
	struct bpf_dltlist bfl;
	struct timeval tv;
	struct ifreq ifr;
	unsigned int uval;
	int r, val;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	/*
	 * We do not support multiple concurrent requests in this module.  That
	 * not only means that we forbid a read(2) call on a BPF device object
	 * while another read(2) is already pending: we also disallow IOCTL
	 * calls while such a read(2) call is in progress.  This restriction
	 * should never be a problem for user programs, and allows us to rely
	 * on the fact that no settings can change between the
	 * start and end of any read call.  As a side note, pending select(2)
	 * queries may be similarly affected, and will also not be fully
	 * accurate if any options are changed while pending.
	 */
	if (bpf->bpf_read.br_endpt != NONE)
		return EIO;

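	/* Record the caller's PID for netstat(1); see bpfdev_get_info(). */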
	bpf->bpf_pid = getnpid(user_endpt);

	/* These are in order of the NetBSD BIOC.. IOCTL numbers. */
	switch (request) {
	case BIOCGBLEN:
		uval = bpf->bpf_size;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSBLEN:
		if (bpf->bpf_sbuf != NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval < BPF_BUF_MIN)
			uval = BPF_BUF_MIN;
		else if (uval > BPF_BUF_MAX)
			uval = BPF_BUF_MAX;

		/* Is this the right thing to do?  It doesn't matter for us. */
		uval = BPF_WORDALIGN(uval);

		if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		bpf->bpf_size = uval;

		return OK;

	case MINIX_BIOCSETF:
		return bpfdev_setfilter(bpf, endpt, grant);

	case BIOCPROMISC:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if (!(bpf->bpf_flags & BPFF_PROMISC)) {
			if (!ifdev_set_promisc(bpf->bpf_ifdev))
				return EINVAL;

			bpf->bpf_flags |= BPFF_PROMISC;
		}

		return OK;

	case BIOCFLUSH:
		bpfdev_flush(bpf);

		return OK;

	case BIOCGDLT:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		/* TODO: support for type configuration per BPF device. */
		uval = ifdev_get_dlt(bpf->bpf_ifdev);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCGETIF:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		memset(&ifr, 0, sizeof(ifr));
		strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev),
		    sizeof(ifr.ifr_name));

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr,
		    sizeof(ifr));

	case BIOCSETIF:
		/*
		 * Test on the presence of a buffer rather than on an interface
		 * since the latter may disappear and thus be reset to NULL, in
		 * which case we do not want to allow rebinding to another.
		 */
		if (bpf->bpf_sbuf != NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr,
		    sizeof(ifr))) != OK)
			return r;

		return bpfdev_attach(bpf, &ifr);

	case BIOCGSTATS:
		/*
		 * Why do we not embed a bpf_stat structure directly in the
		 * BPF device structure?  Well, bpf_stat has massive padding..
		 */
		memset(&bs, 0, sizeof(bs));
		bs.bs_recv = bpf->bpf_stat.bs_recv;
		bs.bs_drop = bpf->bpf_stat.bs_drop;
		bs.bs_capt = bpf->bpf_stat.bs_capt;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs,
		    sizeof(bs));

	case BIOCIMMEDIATE:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_IMMEDIATE;
		else
			bpf->bpf_flags &= ~BPFF_IMMEDIATE;

		return OK;

	case BIOCVERSION:
		memset(&bv, 0, sizeof(bv));
		bv.bv_major = BPF_MAJOR_VERSION;
		bv.bv_minor = BPF_MINOR_VERSION;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv,
		    sizeof(bv));

	case BIOCGHDRCMPLT:
		uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSHDRCMPLT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_HDRCMPLT;
		else
			bpf->bpf_flags &= ~BPFF_HDRCMPLT;

		return OK;

	case BIOCSDLT:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		/* TODO: support for type configuration per BPF device. */
		if (uval != ifdev_get_dlt(bpf->bpf_ifdev))
			return EINVAL;

		return OK;

	case MINIX_BIOCGDLTLIST:
		if (bpf->bpf_ifdev == NULL)
			return EINVAL;

		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl,
		    sizeof(bfl))) != OK)
			return r;

		if (bfl.bfl_list != NULL) {
			if (bfl.bfl_len < 1)
				return ENOMEM;

			/*
			 * Copy out the 'list', which consists of one entry.
			 * If we were to produce multiple entries, we would
			 * have to check against the MINIX_BPF_MAXDLT limit.
			 */
			uval = ifdev_get_dlt(bpf->bpf_ifdev);

			if ((r = sys_safecopyto(endpt, grant,
			    offsetof(struct minix_bpf_dltlist, mbfl_list),
			    (vir_bytes)&uval, sizeof(uval))) != OK)
				return r;
		}
		bfl.bfl_len = 1;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl,
		    sizeof(bfl));

	case BIOCGSEESENT:
		uval = !!(bpf->bpf_flags & BPFF_SEESENT);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSSEESENT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_SEESENT;
		else
			bpf->bpf_flags &= ~BPFF_SEESENT;

		return OK;

	case BIOCSRTIMEOUT:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv,
		    sizeof(tv))) != OK)
			return r;

		if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK)
			return r;

		return OK;

	case BIOCGRTIMEOUT:
		util_ticks_to_timeval(bpf->bpf_timeout, &tv);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv,
		    sizeof(tv));

	case BIOCGFEEDBACK:
		uval = !!(bpf->bpf_flags & BPFF_FEEDBACK);

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval));

	case BIOCSFEEDBACK:
		if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval,
		    sizeof(uval))) != OK)
			return r;

		if (uval)
			bpf->bpf_flags |= BPFF_FEEDBACK;
		else
			bpf->bpf_flags &= ~BPFF_FEEDBACK;

		return OK;

	case FIONREAD:
		val = 0;
		if (bpf->bpf_hlen > 0)
			val = bpf->bpf_hlen;
		else if ((bpf->bpf_flags & BPFF_IMMEDIATE) &&
		    bpf->bpf_slen > 0)
			val = bpf->bpf_slen;

		return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val,
		    sizeof(val));

	default:
		return ENOTTY;
	}
}

/*
 * Cancel a previously suspended request on a BPF device.  Since only read
 * requests may be suspended (select is handled differently), the cancel
 * request must be for a read request.  Note that character devices currently
 * (still) behave slightly differently from socket devices here: while socket
 * drivers are supposed to respond to the original request, character drivers
 * must respond to the original request from the cancel callback.
 */
static int
bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id)
{
	struct bpfdev *bpf;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EDONTREPLY;

	/* Is this a cancel request for the currently pending read request? */
	if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id)
		return EDONTREPLY;

	/* If so, cancel the read request. */
	if (bpf->bpf_timeout > 0)
		cancel_timer(&bpf->bpf_read.br_timer);

	bpf->bpf_read.br_endpt = NONE;

	return EINTR; /* the return value for the canceled read request */
}

/*
 * Perform a select query on a BPF device.
 */
static int
bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt)
{
	struct bpfdev *bpf;
	unsigned int r, notify;

	if ((bpf = bpfdev_get_by_minor(minor)) == NULL)
		return EINVAL;

	notify = (ops & CDEV_NOTIFY);
	ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR);

	r = bpfdev_test_select(bpf, ops);

	/*
	 * For the operations that were not immediately ready, if requested,
	 * save the select request for later.
	 */
	ops &= ~r;

	if (ops != 0 && notify) {
		if (bpf->bpf_select.bs_endpt != NONE) {
			/* Merge in the operations with any earlier request. */
			if (bpf->bpf_select.bs_endpt != endpt)
				return EIO;
			bpf->bpf_select.bs_selops |= ops;
		} else {
			bpf->bpf_select.bs_endpt = endpt;
			bpf->bpf_select.bs_selops = ops;
		}
	}

	return r;
}

/*
 * Process an incoming packet on the interface to which the given BPF device is
 * attached.  If the packet passes the filter (if any), store as much as
 * requested of it in the store buffer, rotating buffers if needed and resuming
 * suspended read and select requests as appropriate.  This function is also
 * called through bpfdev_output() below.
 */
void
bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;
	struct timespec ts;
	struct bpf_hdr bh;
	const struct pbuf *pptr;
	size_t caplen, hdrlen, totlen, off, chunk;
	int hfull;

	/*
	 * Apparently bs_recv is the counter of packets that were run through
	 * the filter, not the number of packets that were or could be received
	 * by the user (which is what I got from the manual page.. oh well).
	 */
	bpf->bpf_stat.bs_recv++;
	bpf_stat.bs_recv++;

	/*
	 * Run the packet through the BPF device's filter to see whether the
	 * packet should be stored and if so, how much of it.  If no filter is
	 * set, all packets will be stored in their entirety.
	 */
	caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload,
	    pbuf->tot_len, pbuf->len);

	if (caplen == 0)
		return;		/* no match; ignore packet */

	if (caplen > pbuf->tot_len)
		caplen = pbuf->tot_len;

	/* Truncate packet entries to the full size of the buffers. */
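	/*
	 * The BPF_WORDALIGN padding ensures that the packet data directly
	 * follows the header at an aligned offset, and that the next header
	 * stored in the buffer will be aligned as well.
	 */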
	hdrlen = BPF_WORDALIGN(sizeof(bh));
	totlen = BPF_WORDALIGN(hdrlen + caplen);

	if (totlen > bpf->bpf_size) {
		totlen = bpf->bpf_size;
		caplen = totlen - hdrlen;
	}
	assert(totlen >= hdrlen);

	bpf->bpf_stat.bs_capt++;
	bpf_stat.bs_capt++;

	assert(bpf->bpf_sbuf != NULL);
	if (totlen > bpf->bpf_size - bpf->bpf_slen) {
		/*
		 * If the store buffer is full and the hold buffer is not
		 * empty, we cannot swap the two buffers, and so we must drop
		 * the current packet.
		 */
		if (bpf->bpf_hlen > 0) {
			bpf->bpf_stat.bs_drop++;
			bpf_stat.bs_drop++;

			return;
		}

		/*
		 * Rotate the buffers: the hold buffer will now be "full" and
		 * ready to be read - it may not actually be entirely full, but
		 * we could not fit this packet and we are not going to deliver
		 * packets out of order..
		 */
		bpfdev_rotate(bpf);

		hfull = TRUE;
	} else
		hfull = FALSE;

	/*
	 * Retrieve the capture time for the packet.  Ideally this would be
	 * done only once per accepted packet, but we do not expect many BPF
	 * devices to be receiving the same packets often enough to make that
	 * worth it.
	 */
	clock_time(&ts);

	/*
	 * Copy the packet into the store buffer, including a newly generated
	 * header.  Zero any padding areas, even if strictly not necessary.
	 */
	memset(&bh, 0, sizeof(bh));
	bh.bh_tstamp.tv_sec = ts.tv_sec;
	bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000;
	bh.bh_caplen = caplen;
	bh.bh_datalen = pbuf->tot_len;
	bh.bh_hdrlen = hdrlen;

	assert(bpf->bpf_sbuf != NULL);
	off = bpf->bpf_slen;

	memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh));
	if (hdrlen > sizeof(bh))
		memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0,
		    hdrlen - sizeof(bh));
	off += hdrlen;

	for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) {
		chunk = pptr->len;
		if (chunk > caplen)
			chunk = caplen;

		memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk);

		off += chunk;
		caplen -= chunk;
	}

	assert(off <= bpf->bpf_slen + totlen);
	if (bpf->bpf_slen + totlen > off)
		memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off);

	bpf->bpf_slen += totlen;

	/*
	 * Edge case: if the hold buffer is empty and the store buffer is now
	 * exactly full, rotate buffers so that the packets can be read
	 * immediately, without waiting for the next packet to cause rotation.
	 */
	if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) {
		bpfdev_rotate(bpf);

		hfull = TRUE;
	}

	/*
	 * If the hold buffer is now full, or if immediate mode is enabled,
	 * then we now have data to deliver to userland.  See if we can wake up
	 * any read or select call (either but not both here).
	 */
	if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) {
		if (bpf->bpf_read.br_endpt != NONE)
			bpfdev_resume_read(bpf, FALSE /*is_timeout*/);
		else
			bpfdev_resume_select(bpf);
	}
}

/*
 * Process an outgoing packet on the interface to which the given BPF device is
 * attached.  If the BPF device is configured to capture outgoing packets as
 * well, attempt to capture the packet as per bpfdev_input().
 */
void
bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf)
{
	struct bpfdev *bpf = (struct bpfdev *)bpfl;

	if (bpf->bpf_flags & BPFF_SEESENT)
		bpfdev_input(bpfl, pbuf);
}

/*
 * Fill the given 'bde' structure with information about BPF device 'bpf'.
 */
static void
bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf)
{

	bde->bde_bufsize = bpf->bpf_size;
	bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC);
	bde->bde_state = BPF_IDLE;
	bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE);
	bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT);
	bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT);
	/*
	 * NetBSD updates the process ID upon device open, close, ioctl, and
	 * poll.  From those, only open and ioctl make sense for us.  Sadly
	 * there is no way to indicate "no known PID" to netstat(1), so we
	 * cannot even save just the endpoint and look up the corresponding PID
	 * later, since the user process may be gone by then.
	 */
	bde->bde_pid = bpf->bpf_pid;
	bde->bde_rcount = bpf->bpf_stat.bs_recv;
	bde->bde_dcount = bpf->bpf_stat.bs_drop;
	bde->bde_ccount = bpf->bpf_stat.bs_capt;
	if (bpf->bpf_ifdev != NULL)
		strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev),
		    sizeof(bde->bde_ifname));
}

/*
 * Obtain statistics about open BPF devices ("peers").  This node may be
 * accessed by the superuser only.  Used by netstat(1).
 */
static ssize_t
bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
	struct bpfdev *bpf;
	struct bpf_d_ext bde;
	unsigned int slot;
	ssize_t off;
	int r, size, max;

	if (!(call->call_flags & RMIB_FLAG_AUTH))
		return EPERM;

	if (call->call_namelen != 2)
		return EINVAL;

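	/*
	 * As in NetBSD, the last two name components give the size of the
	 * caller's bpf_d_ext structure and the maximum number of devices to
	 * return (zero meaning no limit).
	 */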
	size = call->call_name[0];
	if (size < 0 || (size_t)size > sizeof(bde))
		return EINVAL;
	if (size == 0)
		size = sizeof(bde);
	max = call->call_name[1];

	off = 0;

	for (slot = 0; slot < __arraycount(bpf_array); slot++) {
		bpf = &bpf_array[slot];

		if (!(bpf->bpf_flags & BPFF_IN_USE))
			continue;

		if (rmib_inrange(oldp, off)) {
			memset(&bde, 0, sizeof(bde));

			bpfdev_get_info(&bde, bpf);

			if ((r = rmib_copyout(oldp, off, &bde, size)) < 0)
				return r;
		}

		off += sizeof(bde);
		if (max > 0 && --max == 0)
			break;
	}

	/* No slack needed: netstat(1) resizes its buffer as needed. */
	return off;
}

static const struct chardriver bpfdev_tab = {
	.cdr_open		= bpfdev_open,
	.cdr_close		= bpfdev_close,
	.cdr_read		= bpfdev_read,
	.cdr_write		= bpfdev_write,
	.cdr_ioctl		= bpfdev_ioctl,
	.cdr_cancel		= bpfdev_cancel,
	.cdr_select		= bpfdev_select
};

/*
 * Process a character driver request.  Since the LWIP service offers character
 * devices for BPF only, it must be a request for a BPF device.
 */
void
bpfdev_process(message * m_ptr, int ipc_status)
{

	chardriver_process(&bpfdev_tab, m_ptr, ipc_status);
}