minix/net/lwip/tcpsock.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793

/* LWIP service - tcpsock.c - TCP sockets */
/*
 * This module implements support for TCP sockets based on lwIP's core TCP PCB
 * module, which is largely but not fully cooperative with exactly what we want
 * to achieve, with as a result that this module is rather complicated.
 *
 * Each socket has a send queue and a receive queue.  Both are using lwIP's own
 * (pbuf) buffers, which largely come out of the main 512-byte buffer pool.
 * The buffers on the send queue are allocated and freed by us--the latter only
 * once they are no longer in use by lwIP as well.  A bit counterintuitively,
 * we deliberately use a smaller lwIP per-PCB TCP send buffer limit
 * (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in order to more
 * easily trigger conditions where we cannot enqueue data (or the final FIN)
 * right away.  This way, we get to test the internal logic of this module a
 * lot more easily.  The small lwIP send queue size should not have any impact
 * on performance, as our own per-socket send queues can be much larger and we
 * enqueue more of that on the lwIP PCB as soon as we can in all cases.
 *
 * The receive queue consists of whatever buffers were given to us by lwIP, but
 * since those may be many buffers with small amounts of data each, we perform
 * fairly aggressive merging of consecutive buffers.  The intended result is
 * that we waste no more than 50% of memory within the receive queue.  Merging
 * requires memory copies, which makes it expensive, but we do not configure
 * lwIP with enough buffers to make running out of buffers a non-issue, so this
 * trade-off is necessary.  Practical experience and measurements of the merge
 * policy will have to show whether and how the current policy may be improved.
 *
 * As can be expected, the connection close semantics are by far the most
 * complicated part of this module.  We attempt to get rid of the lwIP PCB as
 * soon as we can, letting lwIP take care of the TIME_WAIT state for example.
 * However, there are various conditions that have to be met before we can
 * forget about the PCB here--most importantly, that none of our sent data
 * blocks are still referenced by lwIP because they have not yet been sent or
 * acknowledged.  We can only free the data blocks once lwIP is done with them.
 *
 * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating
 * full state tracking here.  However, we do not look at a socket's TCP state
 * while in a lwIP-generated event for that socket, because the state may not
 * necessarily reflect the (correct or new) TCP state of the connection, nor
 * may the PCB be available--this is the case for error events.  For these
 * reasons we use a few internal TCPF_ flags to perform partial state tracking.
 *
 * More generally, we tend to access lwIP PCB fields directly only when lwIP's
 * own BSD API implementation does that too and there is no better alternative.
 * One example of this is the check to see if our FIN was acknowledged, for
 * SO_LINGER support.  In terms of maintenance, our hope is that if lwIP's API
 * changes later, we can change our code to imitate whatever lwIP's BSD API
 * implementation does at that point.
 */

#include <sys/socketvar.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>

/*
 * Unfortunately, NetBSD and lwIP have different definitions of a few relevant
 * preprocessor variables.  Make sure we do not attempt to use the NetBSD one
 * where it matters.  We do need one of the NetBSD definitions though.
 */
static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY;
#undef TF_NODELAY
#undef TCP_MSS

#include "lwip.h"
#include "tcpisn.h"

#include "lwip/tcp.h"
#include "lwip/priv/tcp_priv.h" /* for tcp_pcb_lists */

/*
 * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration.
 */

/*
 * We fully control the send buffer, so we can let its size be set to whatever
 * we want.  The receive buffer is different: if it is smaller than the window
 * size, we may have to refuse data that lwIP hands us, at which point more
 * incoming data will cause lwIP to abort the TCP connection--even aside from
 * performance issues.  Therefore, we must make sure the receive buffer is
 * larger than the TCP window at all times.
 */
#define TCP_SNDBUF_MIN	1		/* minimum TCP send buffer size */
#define TCP_SNDBUF_DEF	32768		/* default TCP send buffer size */
#define TCP_SNDBUF_MAX	131072		/* maximum TCP send buffer size */
#define TCP_RCVBUF_MIN	TCP_WND		/* minimum TCP receive buffer size */
#define TCP_RCVBUF_DEF	MAX(TCP_WND, 32768) /* default TCP recv buffer size */
#define TCP_RCVBUF_MAX	MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */

/*
 * The total number of buffers that may in use for TCP socket send queues.  The
 * goal is to allow at least some progress to be made on receiving from TCP
 * sockets and on differently-typed sockets, at least as long as the LWIP
 * service can manage to allocate the memory it wants.  For the case that it
 * does not, we can only reactively kill off TCP sockets and/or free enqueued
 * ethernet packets, neither of which is currently implemented (TODO).
 */
#define TCP_MAX_SENDBUFS	(mempool_max_buffers() * 3 / 4)

/* Polling intervals, in 500-millsecond units. */
#define TCP_POLL_REG_INTERVAL	10	/* interval for reattempting sends */
#define TCP_POLL_CLOSE_INTERVAL	1	/* interval while closing connection */

static struct tcpsock {
	struct ipsock tcp_ipsock;		/* IP socket, MUST be first */
	struct tcp_pcb *tcp_pcb;		/* lwIP TCP control block */
	union pxfer_tcp_queue {			/* free/accept queue */
		TAILQ_ENTRY(tcpsock) tq_next;	/* next in queue */
		TAILQ_HEAD(, tcpsock) tq_head;	/* head of queue */
	} tcp_queue;
	struct tcpsock *tcp_listener;		/* listener if on accept q. */
	struct {				/* send queue */
		struct pbuf *ts_head;		/* first pbuf w/unacked data */
		struct pbuf *ts_unsent;		/* first pbuf w/unsent data */
		struct pbuf *ts_tail;		/* most recently added data */
		size_t ts_len;			/* total sent + unsent */
		unsigned short ts_head_off;	/* offset into head pbuf */
		unsigned short ts_unsent_off;	/* offset into unsent pbuf */
	} tcp_snd;
	struct {				/* receive queue */
		struct pbuf *tr_head;		/* first pbuf w/unrecvd data */
		struct pbuf **tr_pre_tailp;	/* ptr-ptr to newest pbuf */
		size_t tr_len;			/* bytes on receive queue */
		unsigned short tr_head_off;	/* offset into head pbuf */
		unsigned short tr_unacked;	/* current window reduction */
	} tcp_rcv;
} tcp_array[NR_TCPSOCK];

static TAILQ_HEAD(, tcpsock) tcp_freelist;	/* list of free TCP sockets */

static const struct sockevent_ops tcpsock_ops;

static unsigned int tcpsock_sendbufs;		/* # send buffers in use */
static unsigned int tcpsock_recvbufs;		/* # receive buffers in use */

/* A bunch of macros that are just for convenience. */
#define tcpsock_get_id(tcp)	(SOCKID_TCP | (sockid_t)((tcp) - tcp_array))
#define tcpsock_get_ipsock(tcp)	(&(tcp)->tcp_ipsock)
#define tcpsock_get_sock(tcp)	(ipsock_get_sock(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_sndbuf(tcp)	(ipsock_get_sndbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_get_rcvbuf(tcp)	(ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_ipv6(tcp)	(ipsock_is_ipv6(tcpsock_get_ipsock(tcp)))
#define tcpsock_is_shutdown(tcp,fl) \
	(sockevent_is_shutdown(tcpsock_get_sock(tcp), fl))
#define tcpsock_is_listening(tcp) \
	(sockevent_is_listening(tcpsock_get_sock(tcp)))
#define tcpsock_get_flags(tcp)	(ipsock_get_flags(tcpsock_get_ipsock(tcp)))
#define tcpsock_set_flag(tcp,fl) \
	(ipsock_set_flag(tcpsock_get_ipsock(tcp), fl))
#define tcpsock_clear_flag(tcp,fl) \
	(ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl))

static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *,
	struct rmib_oldp *, struct rmib_newp *);

/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_tcp_table[] = {
/* 2*/	[TCPCTL_SENDSPACE]	= RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF,
				    "sendspace",
				    "Default TCP send buffer size"),
/* 3*/	[TCPCTL_RECVSPACE]	= RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF,
				    "recvspace",
				    "Default TCP receive buffer size"),
/*29*/	[TCPCTL_LOOPBACKCKSUM]	= RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
				    loopif_cksum, "do_loopback_cksum",
				    "Perform TCP checksum on loopback"),
/*+0*/	[TCPCTL_MAXID]		= RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
				    tcpsock_pcblist, "pcblist",
				    "TCP protocol control block list"),
/*+1*/	[TCPCTL_MAXID + 1]	= RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE |
				    CTLFLAG_HIDDEN | CTLTYPE_STRING,
				    TCPISN_SECRET_HEX_LENGTH, tcpisn_secret,
				    "isn_secret",
				    "TCP ISN secret (MINIX 3 specific)")
};

static struct rmib_node net_inet_tcp_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings");
static struct rmib_node net_inet6_tcp6_node =
    RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings");

/*
 * Initialize the TCP sockets module.
 */
void
tcpsock_init(void)
{
	unsigned int slot;

	/* Initialize the list of free TCP sockets. */
	TAILQ_INIT(&tcp_freelist);

	for (slot = 0; slot < __arraycount(tcp_array); slot++)
		TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot],
		    tcp_queue.tq_next);

	/* Initialize other variables. */
	tcpsock_sendbufs = 0;

	/* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */
	mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node);
	mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node);
}

/*
 * Initialize the state of a TCP socket's send queue.
 */
static void
tcpsock_reset_send(struct tcpsock * tcp)
{

	tcp->tcp_snd.ts_tail = NULL;
	tcp->tcp_snd.ts_unsent = NULL;
	tcp->tcp_snd.ts_head = NULL;
	tcp->tcp_snd.ts_len = 0;
	tcp->tcp_snd.ts_unsent_off = 0;
	tcp->tcp_snd.ts_head_off = 0;
}

/*
 * Initialize the state of a TCP socket's receive queue.
 */
static void
tcpsock_reset_recv(struct tcpsock * tcp)
{

	tcp->tcp_rcv.tr_pre_tailp = NULL;
	tcp->tcp_rcv.tr_head = NULL;
	tcp->tcp_rcv.tr_len = 0;
	tcp->tcp_rcv.tr_head_off = 0;
	tcp->tcp_rcv.tr_unacked = 0;
}

/*
 * Create a TCP socket.
 */
sockid_t
tcpsock_socket(int domain, int protocol, struct sock ** sockp,
	const struct sockevent_ops ** ops)
{
	struct tcpsock *tcp;
	uint8_t ip_type;

	switch (protocol) {
	case 0:
	case IPPROTO_TCP:
		break;

	default:
		return EPROTONOSUPPORT;
	}

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  When
	 * adding new fields, make sure to change tcpsock_clone() accordingly.
	 */

	ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain,
	    TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp);

	if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL)
		return ENOBUFS;
	tcp_arg(tcp->tcp_pcb, tcp);

	tcp->tcp_listener = NULL;

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	*ops = &tcpsock_ops;
	return tcpsock_get_id(tcp);
}

/*
 * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection
 * incoming on listening socket 'listener'.  The new socket is essentially a
 * "clone" of the listening TCP socket, in that it should inherit any settings
 * from the listening socket.  The socket has not yet been accepted by userland
 * so add it to the queue of connetions pending for the listening socket.  On
 * success, return OK.  On failure, return a negative error code.
 */
static int
tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb)
{
	struct tcpsock *tcp;

	if (TAILQ_EMPTY(&tcp_freelist))
		return ENOBUFS;

	tcp = TAILQ_FIRST(&tcp_freelist);

	/*
	 * Initialize the structure.  Do not memset it to zero, as it is still
	 * part of the linked free list.  Initialization may still fail.  Most
	 * settings should be inherited from the listening socket here, rather
	 * than being initialized to their default state.
	 */

	ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp),
	    tcpsock_get_id(tcp));

	tcp->tcp_pcb = pcb;
	tcp_arg(pcb, tcp);

	tcpsock_reset_send(tcp);
	tcpsock_reset_recv(tcp);

	/*
	 * Remove the new socket from the free list, and add it to the queue of
	 * the listening socket--in this order, because the same next pointer
	 * is used for both.
	 */
	TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next);

	TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp,
	    tcp_queue.tq_next);
	tcp->tcp_listener = listener;

	return OK;
}

/*
 * Allocate a buffer from the pool, using the standard pool size.  The returned
 * buffer is a single element--never a chain.
 */
static struct pbuf *
tcpsock_alloc_buf(void)
{
	struct pbuf *pbuf;

	pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM);

	assert(pbuf == NULL || pbuf->len == pbuf->tot_len);

	return pbuf;
}

/*
 * Free the given buffer.  Ensure that pbuf_free() will not attempt to free the
 * next buffer(s) in the chain as well.  This may be called for pbufs other
 * than those allocated with tcpsock_alloc_buf().
 */
static void
tcpsock_free_buf(struct pbuf * pbuf)
{

	/*
	 * Resetting the length is currently not necessary, but better safe
	 * than sorry..
	 */
	pbuf->len = pbuf->tot_len;
	pbuf->next = NULL;

	pbuf_free(pbuf);
}

/*
 * Clear the send queue of a TCP socket.  The caller must ensure that lwIP will
 * no longer access any of data on the send queue.
 */
static void
tcpsock_clear_send(struct tcpsock * tcp)
{
	struct pbuf *phead;

	assert(tcp->tcp_pcb == NULL);

	while ((phead = tcp->tcp_snd.ts_head) != NULL) {
		tcp->tcp_snd.ts_head = phead->next;

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	tcpsock_reset_send(tcp);
}

/*
 * Clear the receive queue of a TCP socket.  If 'ack_data' is set, also
 * acknowledge the previous contents of the receive queue to lwIP.
 */
static size_t
tcpsock_clear_recv(struct tcpsock * tcp, int ack_data)
{
	struct pbuf *phead;
	size_t rlen;

	rlen = tcp->tcp_rcv.tr_len;

	while ((phead = tcp->tcp_rcv.tr_head) != NULL) {
		tcp->tcp_rcv.tr_head = phead->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * From now on, we will basically be discarding incoming data as fast
	 * as possible, to keep the full window open at all times.
	 */
	if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0)
		tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked);

	tcpsock_reset_recv(tcp);

	return rlen;
}

/*
 * The TCP socket's PCB has been detached from the socket, typically because
 * the connection was aborted, either by us or by lwIP.  Either way, any TCP
 * connection is gone.  Clear the socket's send queue, remove the socket from
 * a listening socket's queue, and if the socket itself is ready and allowed to
 * be freed, free it now.  The socket is ready to be freed if it was either on
 * a listening queue or being closed already.  The socket is allowed to be
 * freed only if 'may_free' is TRUE.  If the socket is not freed, its receive
 * queue is left as is, as it may still have data to be received by userland.
 */
static int
tcpsock_cleanup(struct tcpsock * tcp, int may_free)
{
	int destroy;

	assert(tcp->tcp_pcb == NULL);

	/*
	 * Free any data on the send queue.  This is safe to do right now,
	 * because the PCB has been aborted (or was already gone).  We must be
	 * very careful about clearing the send queue in all other situations.
	 */
	tcpsock_clear_send(tcp);

	/*
	 * If this was a socket pending acceptance, remove it from the
	 * corresponding listener socket's queue, and free it.  Otherwise, free
	 * the socket only if it suspended a graceful close operation.
	 */
	if (tcp->tcp_listener != NULL) {
		TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp,
		    tcp_queue.tq_next);
		tcp->tcp_listener = NULL;

		/*
		 * The listener socket's backlog count should be adjusted by
		 * lwIP whenever the PCB is freed up, so we need (and must) not
		 * attempt to do that here.
		 */

		destroy = TRUE;
	} else
		destroy = sockevent_is_closing(tcpsock_get_sock(tcp));

	/*
	 * Do not free the socket if 'may_free' is FALSE.  That flag may be set
	 * if we are currently in the second tcpsock_close() call on the
	 * socket, in which case sockevent_is_closing() is TRUE but we must
	 * still not free the socket now: doing so would derail libsockevent.
	 */
	if (destroy && may_free) {
		(void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);
	}

	return destroy;
}

/*
 * Abort the lwIP PCB for the given socket, using tcp_abort().  If the PCB is
 * connected, this will cause the connection to be reset.  The PCB, which must
 * have still been present before the call, will be gone after the call.
 */
static void
tcpsock_pcb_abort(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);
	assert(!tcpsock_is_listening(tcp));

	tcp_recv(tcp->tcp_pcb, NULL);
	tcp_sent(tcp->tcp_pcb, NULL);
	tcp_err(tcp->tcp_pcb, NULL);
	tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);

	tcp_arg(tcp->tcp_pcb, NULL);

	tcp_abort(tcp->tcp_pcb);

	tcp->tcp_pcb = NULL;
}

/*
 * Close the lwIP PCB for the given socket, using tcp_close().  If the PCB is
 * connected, its graceful close will be finished by lwIP in the background.
 * The PCB, which must have still been present before the call, will be gone
 * after the call.
 */
static void
tcpsock_pcb_close(struct tcpsock * tcp)
{
	err_t err;

	assert(tcp->tcp_pcb != NULL);
	assert(tcp->tcp_snd.ts_len == 0);

	if (!tcpsock_is_listening(tcp)) {
		tcp_recv(tcp->tcp_pcb, NULL);
		tcp_sent(tcp->tcp_pcb, NULL);
		tcp_err(tcp->tcp_pcb, NULL);
		tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL);
	}

	tcp_arg(tcp->tcp_pcb, NULL);

	if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK)
		panic("unexpected TCP close failure: %d", err);

	tcp->tcp_pcb = NULL;
}

/*
 * Return TRUE if all conditions are met for closing the TCP socket's PCB, or
 * FALSE if they are not.  Upon calling this function, the socket's PCB must
 * still be around.
 */
static int
tcpsock_may_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_pcb != NULL);

	/*
	 * Regular closing of the PCB requires three conditions to be met:
	 *
	 * 1. all our data has been transmitted AND acknowledged, so that we do
	 *    not risk corruption in case there are still unsent or unack'ed
	 *    data buffers that may otherwise be recycled too soon;
	 * 2. we have sent our FIN to the peer; and,
	 * 3. we have received a FIN from the peer.
	 */
	return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) ==
	    (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0);
}

/*
 * The given socket is ready to be closed as per the tcpsock_may_close() rules.
 * This implies that its send queue is already empty.  Gracefully close the
 * PCB.  In addition, if the socket is being closed gracefully, meaning we
 * suspended an earlier tcpsock_close() call (and as such already emptied the
 * receive queue as well), then tell libsockevent that the close is finished,
 * freeing the socket.  Return TRUE if the socket has indeed been freed this
 * way, or FALSE if the socket is still around.
 */
static int
tcpsock_finish_close(struct tcpsock * tcp)
{

	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_listener == NULL);

	/*
	 * If we get here, we have already shut down the sending side of the
	 * PCB.  Technically, we are interested only in shutting down the
	 * receiving side of the PCB here, so that lwIP may decide to recycle
	 * the socket later etcetera.  We call tcp_close() because we do not
	 * want to rely on tcp_shutdown(RX) doing the exact same thing.
	 * However, we do rely on the fact that the PCB is not immediately
	 * destroyed by the tcp_close() call: otherwise we may have to return
	 * ERR_ABRT if this function is called from a lwIP-generated event.
	 */
	tcpsock_pcb_close(tcp);

	/*
	 * If we suspended an earlier tcpsock_close() call, we have to tell
	 * libsockevent that the close operation is now complete.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		assert(tcp->tcp_rcv.tr_len == 0);

		sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Attempt to start or resume enqueuing data and/or a FIN to send on the given
 * TCP socket.  Return TRUE if anything at all could be newly enqueued on the
 * lwIP PCB, even if less than desired.  In that case, the caller should try to
 * send whatever was enqueued, and if applicable, check if the socket may now
 * be closed (due to the FIN being enqueued).  In particular, in any situation
 * where the socket may be in the process of being closed, the caller must use
 * tcpsock_may_close() if TRUE is returned.  Return FALSE if nothing new could
 * be enqueued, in which case no send attempt need to be made either.
 */
static int
tcpsock_pcb_enqueue(struct tcpsock * tcp)
{
	struct pbuf *punsent;
	size_t space, chunk;
	unsigned int flags;
	err_t err;
	int enqueued;

	assert(tcp->tcp_pcb != NULL);

	if (tcpsock_get_flags(tcp) & TCPF_FULL)
		return FALSE;

	/*
	 * Attempt to enqueue more unsent data, if any, on the PCB's send
	 * queue.
	 */
	enqueued = FALSE;

	while (tcp->tcp_snd.ts_unsent != NULL) {
		if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0)
			break;

		/*
		 * We may maintain a non-NULL unsent pointer even when there is
		 * nothing more to send right now, because the tail buffer may
		 * be filled up further later on.
		 */
		punsent = tcp->tcp_snd.ts_unsent;

		assert(punsent->len >= tcp->tcp_snd.ts_unsent_off);

		chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off;
		if (chunk == 0)
			break;

		if (chunk > space)
			chunk = space;

		/* Try to enqueue more data for sending. */
		if (chunk < punsent->len || punsent->next != NULL)
			flags = TCP_WRITE_FLAG_MORE;
		else
			flags = 0;

		err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload +
		    tcp->tcp_snd.ts_unsent_off, chunk, flags);

		/*
		 * Since tcp_write() enqueues data only, it should only return
		 * out-of-memory errors; no fatal ones.  In any case, stop.
		 */
		if (err != ERR_OK) {
			assert(err == ERR_MEM);

			break;
		}

		/* We have successfully enqueued data. */
		enqueued = TRUE;

		tcp->tcp_snd.ts_unsent_off += chunk;

		if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) {
			assert(tcp->tcp_snd.ts_unsent_off < punsent->len ||
			    punsent->next == NULL);

			break;
		}

		tcp->tcp_snd.ts_unsent = punsent->next;
		tcp->tcp_snd.ts_unsent_off = 0;
	}

	/*
	 * If all pending data has been enqueued for sending, and we should
	 * shut down the sending end of the socket, try that now.
	 */
	if ((tcp->tcp_snd.ts_unsent == NULL ||
	    tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) &&
	    tcpsock_is_shutdown(tcp, SFL_SHUT_WR) &&
	    !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) {
		err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/);

		if (err == ERR_OK) {
			/*
			 * We have successfully enqueued a FIN.  The caller is
			 * now responsible for checking whether the PCB and
			 * possibly even the socket object can now be freed.
			 */
			tcpsock_set_flag(tcp, TCPF_SENT_FIN);

			enqueued = TRUE;
		} else {
			assert(err == ERR_MEM);

			/*
			 * FIXME: the resolution for lwIP bug #47485 has taken
			 * away even more control over the closing process from
			 * us, making tracking sockets especially for SO_LINGER
			 * even harder.  For now, we simply effectively undo
			 * the patch by clearing TF_CLOSEPEND if tcp_shutdown()
			 * returns ERR_MEM.  This will not be sustainable in
			 * the long term, though.
			 */
			tcp->tcp_pcb->flags &= ~TF_CLOSEPEND;

			tcpsock_set_flag(tcp, TCPF_FULL);
		}
	}

	return enqueued;
}

/*
 * Request lwIP to start sending any enqueued data and/or FIN on the TCP
 * socket's lwIP PCB.  On success, return OK.  On failure, return a negative
 * error code, after cleaning up the socket, freeing the PCB.  If the socket
 * was already being closed, also free the socket object in that case; the
 * caller must then not touch the socket object anymore upon return.  If the
 * socket object is not freed, and if 'raise_error' is TRUE, raise the error
 * on the socket object.
 */
static int
tcpsock_pcb_send(struct tcpsock * tcp, int raise_error)
{
	err_t err;
	int r;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * If we have enqueued something, ask lwIP to send TCP packets now.
	 * This may result in a fatal error, in which case we clean up the
	 * socket and return the error to the caller.  Since cleaning up the
	 * socket may free the socket object, and the caller cannot tell
	 * whether that will happen or has happened, also possibly raise the
	 * error on the socket object if it is not gone.  As such, callers that
	 * set 'raise_error' to FALSE must know for sure that the socket was
	 * not being closed, for example because the caller is processing a
	 * (send) call from userland.
	 */
	err = tcp_output(tcp->tcp_pcb);

	if (err != ERR_OK && err != ERR_MEM) {
		tcpsock_pcb_abort(tcp);

		r = util_convert_err(err);

		if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
			if (raise_error)
				sockevent_set_error(tcpsock_get_sock(tcp), r);
		}
		/* Otherwise, do not touch the socket object anymore! */

		return r;
	} else
		return OK;
}

/*
 * Callback from lwIP.  The given number of data bytes have been acknowledged
 * as received by the remote end.  Dequeue and free data from the TCP socket's
 * send queue as appropriate.
 */
static err_t
tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *phead;
	size_t left;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(len > 0);

	assert(tcp->tcp_snd.ts_len >= len);
	assert(tcp->tcp_snd.ts_head != NULL);

	left = len;

	/*
	 * First see if we can free up whole buffers.  Check against the head
	 * buffer's 'len' rather than 'tot_len', or we may end up leaving an
	 * empty buffer on the chain.
	 */
	while ((phead = tcp->tcp_snd.ts_head) != NULL &&
	    left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) {
		left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off;

		tcp->tcp_snd.ts_head = phead->next;
		tcp->tcp_snd.ts_head_off = 0;

		if (phead == tcp->tcp_snd.ts_unsent) {
			assert(tcp->tcp_snd.ts_unsent_off == phead->len);

			tcp->tcp_snd.ts_unsent = phead->next;
			tcp->tcp_snd.ts_unsent_off = 0;
		}

		assert(tcpsock_sendbufs > 0);
		tcpsock_sendbufs--;

		tcpsock_free_buf(phead);
	}

	/*
	 * The rest of the given length is for less than the current head
	 * buffer.
	 */
	if (left > 0) {
		assert(tcp->tcp_snd.ts_head != NULL);
		assert((size_t)tcp->tcp_snd.ts_head->len -
		    tcp->tcp_snd.ts_head_off > left);

		tcp->tcp_snd.ts_head_off += left;
	}

	tcp->tcp_snd.ts_len -= (size_t)len;

	if (tcp->tcp_snd.ts_head == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);
		assert(tcp->tcp_snd.ts_unsent == NULL);
		tcp->tcp_snd.ts_tail = NULL;
	} else
		assert(tcp->tcp_snd.ts_len > 0);

	/*
	 * If we emptied the send queue, and we already managed to send a FIN
	 * earlier, we may now have met all requirements to close the socket's
	 * PCB.  Otherwise, we may also be able to send more now, so try to
	 * resume sending.  Since we are invoked from the "sent" event,
	 * tcp_output() will not actually process anything, and so we do not
	 * call it either.  If we did, we would have to deal with errors here.
	 */
	if (tcpsock_may_close(tcp)) {
		if (tcpsock_finish_close(tcp))
			return ERR_OK;
	} else {
		tcpsock_clear_flag(tcp, TCPF_FULL);

		/*
		 * If we now manage to enqueue a FIN, we may be ready to close
		 * the PCB after all.
		 */
		if (tcpsock_pcb_enqueue(tcp)) {
			if (tcpsock_may_close(tcp) &&
			    tcpsock_finish_close(tcp))
				return ERR_OK;
		}
	}

	/* The user may also be able to send more now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

	return ERR_OK;
}

/*
 * Check whether any (additional) data previously received on a TCP socket
 * should be acknowledged, possibly allowing the remote end to send additional
 * data as a result.
 */
static void
tcpsock_ack_recv(struct tcpsock * tcp)
{
	size_t rcvbuf, left, delta, ack;

	assert(tcp->tcp_pcb != NULL);

	/*
	 * We must make sure that at all times, we can still add an entire
	 * window's worth of data to the receive queue.  If the amount of free
	 * space drops below that threshold, we stop acknowledging received
	 * data.  The user may change the receive buffer size at all times; we
	 * update the window size lazily as appropriate.
	 */
	rcvbuf = tcpsock_get_rcvbuf(tcp);

	if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) {
		/*
		 * The number of bytes that lwIP can still give us at any time
		 * is represented as 'left'.  The number of bytes that we still
		 * allow to be stored in the receive queue is represented as
		 * 'delta'.  We must make sure that 'left' does not ever exceed
		 * 'delta' while acknowledging as many bytes as possible under
		 * that rule.
		 */
		left = TCP_WND - tcp->tcp_rcv.tr_unacked;
		delta = rcvbuf - tcp->tcp_rcv.tr_len;

		if (left < delta) {
			ack = delta - left;

			if (ack > tcp->tcp_rcv.tr_unacked)
				ack = tcp->tcp_rcv.tr_unacked;

			tcp_recved(tcp->tcp_pcb, ack);

			tcp->tcp_rcv.tr_unacked -= ack;

			assert(tcp->tcp_rcv.tr_len + TCP_WND -
			    tcp->tcp_rcv.tr_unacked <= rcvbuf);
		}
	}
}

/*
 * Attempt to merge two consecutive underfilled buffers in the receive queue of
 * a TCP socket, freeing up one of the two buffers as a result.  The first
 * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at
 * 'pnext'.  The second (new) buffer is 'pbuf', which is already attached to
 * the first buffer.  The second buffer may be followed by additional buffers
 * with even more new data.  Return TRUE if buffers have been merged, in which
 * case the pointer at 'pnext' may have changed, and no assumptions should be
 * made about whether 'ptail' and 'pbuf' still exist in any form.  Return FALSE
 * if no merging was necessary or if no new buffer could be allocated.
 */
static int
tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf)
{
	struct pbuf *pnew;

	assert(*pnext == ptail);
	assert(ptail->next == pbuf);

	/*
	 * Unfortunately, we cannot figure out what kind of pbuf we were given
	 * by the lower layers, so we cannot merge two buffers without first
	 * allocating a third.  Once we have done that, though, we can easily
	 * merge more into that new buffer.  For now we use the following
	 * policies:
	 *
	 * 1. if two consecutive lwIP-provided buffers are both used less than
	 *    half the size of a full buffer, try to allocate a new buffer and
	 *    copy both lwIP-provided buffers into that new buffer, freeing up
	 *    the pair afterwards;
	 * 2. if the tail buffer on the chain is allocated by us and not yet
	 *    full, and the next buffer's contents can be added to the tail
	 *    buffer in their entirety, do just that.
	 *
	 * Obviously there is a trade-off between the performance overhead of
	 * copying and the resource overhead of keeping less-than-full buffers
	 * on the receive queue, but this policy should both keep actual memory
	 * usage to no more than twice the receive queue length and prevent
	 * excessive copying.  The policy deliberately performs more aggressive
	 * merging into a buffer that we allocated ourselves.
	 */
	if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 &&
	    pbuf->len <= MEMPOOL_BUFSIZE / 2) {
		/*
		 * Case #1.
		 */
		assert(ptail->tot_len == ptail->len);
		assert(pbuf->tot_len == pbuf->len);

		pnew = tcpsock_alloc_buf();
		if (pnew == NULL)
			return FALSE;

		memcpy(pnew->payload, ptail->payload, ptail->len);
		memcpy((char *)pnew->payload + ptail->len, pbuf->payload,
		    pbuf->len);
		pnew->len = ptail->len + pbuf->len;
		assert(pnew->len <= pnew->tot_len);

		pnew->next = pbuf->next;
		/* For now, we need not inherit any flags from either pbuf. */

		*pnext = pnew;

		/* One allocated, two about to be deallocated. */
		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(ptail);
		tcpsock_free_buf(pbuf);

		return TRUE;
	} else if (ptail->tot_len - ptail->len >= pbuf->len) {
		/*
		 * Case #2.
		 */
		memcpy((char *)ptail->payload + ptail->len, pbuf->payload,
		    pbuf->len);

		ptail->len += pbuf->len;

		ptail->next = pbuf->next;

		assert(tcpsock_recvbufs > 0);
		tcpsock_recvbufs--;

		tcpsock_free_buf(pbuf);

		return TRUE;
	} else
		return FALSE;
}

/*
 * Callback from lwIP.  New data or flags have been received on a TCP socket.
 */
static err_t
tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused,
	struct pbuf * pbuf, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	struct pbuf *ptail, **pprevp;
	size_t len;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * lwIP should never provide anything other than ERR_OK in 'err', and
	 * it is not clear what we should do if it would.  If lwIP ever changes
	 * in this regard, we will likely have to change this code accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP receive event with error: %d", err);

	/* If the given buffer is NULL, we have received a FIN. */
	if (pbuf == NULL) {
		tcpsock_set_flag(tcp, TCPF_RCVD_FIN);

		/* Userland may now receive EOF. */
		if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD))
			sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

		/*
		 * If we were in the process of closing the socket, and we
		 * receive a FIN before our FIN got acknowledged, we close the
		 * socket anyway, as described in tcpsock_close().  However, if
		 * there is still unacknowledged outgoing data or we did not
		 * even manage to send our FIN yet, hold off closing the socket
		 * for now.
		 */
		if (tcpsock_may_close(tcp))
			(void)tcpsock_finish_close(tcp);

		return ERR_OK;
	}

	/*
	 * If the socket is being closed, receiving new data should cause a
	 * reset.
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp))) {
		tcpsock_pcb_abort(tcp);

		(void)tcpsock_cleanup(tcp, TRUE /*may_free*/);
		/* Do not touch the socket object anymore! */

		pbuf_free(pbuf);

		return ERR_ABRT;
	}

	/*
	 * If the socket has already been shut down for reading, discard the
	 * incoming data and do nothing else.
	 */
	if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) {
		tcp_recved(tcp->tcp_pcb, pbuf->tot_len);

		pbuf_free(pbuf);

		return ERR_OK;
	}

	/*
	 * We deliberately ignore the PBUF_FLAG_PUSH flag.  This flag would
	 * enable the receive functionality to delay delivering "un-pushed"
	 * data to applications.  The implementation of this scheme could track
	 * the amount of data up to and including the last-pushed segment using
	 * a "tr_push_len" field or so.  Deciding when to deliver "un-pushed"
	 * data after all is a bit tricker though.  As far as I can tell, the
	 * BSDs do not implement anything like that.  Windows does, and this
	 * results in interaction problems with even more lightweight TCP/IP
	 * stacks that do not send the TCP PSH flag.  Currently, there is no
	 * obvious benefit for us to support delaying data delivery like that.
	 * In addition, testing its implementation reliably would be difficult.
	 */

	len = (size_t)pbuf->tot_len;

	/*
	 * Count the number of buffers that are now owned by us.  The new total
	 * of buffers owned by us must not exceed the size of the memory pool.
	 * Any more would indicate an accounting error.  Note that
	 * tcpsock_recvbufs is currently used for debugging only!
	 */
	tcpsock_recvbufs += pbuf_clen(pbuf);
	assert(tcpsock_recvbufs < mempool_cur_buffers());

	/*
	 * The pre-tail pointer points to whatever is pointing to the tail
	 * buffer.  The latter pointer may be the 'tr_head' field in our
	 * tcpsock structure, or the 'next' field in the penultimate buffer,
	 * or NULL if there are currently no buffers on the receive queue.
	 */
	if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) {
		ptail = *pprevp;

		assert(ptail != NULL);
		assert(ptail->next == NULL);
		assert(tcp->tcp_rcv.tr_head != NULL);

		ptail->next = pbuf;
		pbuf->tot_len = pbuf->len;	/* to help freeing on merges */

		if (tcpsock_try_merge(pprevp, ptail, pbuf)) {
			ptail = *pprevp;
			pbuf = ptail->next;
		}

		if (pbuf != NULL)
			pprevp = &ptail->next;
	} else {
		assert(tcp->tcp_rcv.tr_head == NULL);
		assert(tcp->tcp_rcv.tr_head_off == 0);

		tcp->tcp_rcv.tr_head = pbuf;

		pprevp = &tcp->tcp_rcv.tr_head;
	}

	/*
	 * Chop up the chain into individual buffers.  This is necessary as we
	 * overload 'tot_len' to mean "space available in the buffer", as we
	 * want for buffers allocated by us as part of buffer merges.  Also get
	 * a pointer to the pointer to the new penultimate tail buffer.  Due to
	 * merging, the chain may already be empty by now, though.
	 */
	if (pbuf != NULL) {
		for (; pbuf->next != NULL; pbuf = pbuf->next) {
			pbuf->tot_len = pbuf->len;

			pprevp = &pbuf->next;
		}
		assert(pbuf->len == pbuf->tot_len);
	}

	assert(*pprevp != NULL);
	assert((*pprevp)->next == NULL);
	tcp->tcp_rcv.tr_pre_tailp = pprevp;

	tcp->tcp_rcv.tr_len += len;
	tcp->tcp_rcv.tr_unacked += len;

	assert(tcp->tcp_rcv.tr_unacked <= TCP_WND);

	/*
	 * Note that tr_len may now exceed the receive buffer size in the
	 * highly exceptional case that the user is adjusting the latter after
	 * the socket had already received data.
	 */

	/* See if we can immediately acknowledge some or all of the data. */
	tcpsock_ack_recv(tcp);

	/* Also wake up any receivers now. */
	sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV);

	return ERR_OK;
}

/*
 * Callback from lwIP.  The PCB corresponding to the socket identified by 'arg'
 * has been closed by lwIP, with the reason specified in 'err': either the
 * connection has been aborted locally (ERR_ABRT), it has been reset by the
 * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD).
 */
static void
tcpsock_event_err(void * arg, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	int r;

	assert(tcp != NULL);
	assert(tcp->tcp_pcb != NULL);
	assert(err != ERR_OK);

	/* The original PCB is now gone, or will be shortly. */
	tcp->tcp_pcb = NULL;

	/*
	 * Clean up the socket.  As a result it may be freed, in which case we
	 * must not touch it anymore.  No need to return ERR_ABRT from here, as
	 * the PCB has been aborted already.
	 */
	if (tcpsock_cleanup(tcp, TRUE /*may_free*/))
		return;

	if (err == ERR_CLSD) {
		/*
		 * We may get here if the socket is shut down for writing and
		 * we already received a FIN from the remote side, thus putting
		 * the socket in LAST_ACK state, and we receive that last
		 * acknowledgment.  There is nothing more we need to do.
		 *
		 * We will never get here in the other case that ERR_CLSD is
		 * raised, which is when the socket is reset because of
		 * unacknowledged data while closing: we handle the
		 * reset-on-ACK case ourselves in tcpsock_close(), and the
		 * socket is in closing state after that.
		 */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);
	} else {
		/*
		 * Anything else should be an error directly from lwIP;
		 * currently either ERR_ABRT and ERR_RST.  Covert it to a
		 * regular error and set it on the socket.  Doing so will also
		 * raise the appropriate events.
		 */
		/*
		 * Unfortunately, lwIP is not throwing accurate errors even
		 * when it can.  We convert some errors to reflect more
		 * accurately the most likely cause.
		 *
		 * TODO: fix lwIP in this regard..
		 */
		r = util_convert_err(err);

		if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) {
			switch (err) {
			case ERR_ABRT:	r = ETIMEDOUT;		break;
			case ERR_RST:	r = ECONNREFUSED;	break;
			}
		}

		sockevent_set_error(tcpsock_get_sock(tcp), r);
	}
}

/*
 * Callback from lwIP.  Perform regular checks on a TCP socket.  This function
 * is called one per five seconds on connected sockets, and twice per second on
 * closing sockets.
 */
static err_t
tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;
	err_t err;
	int r;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);

	/*
	 * If we ended up running out of buffers earlier, try resuming any send
	 * requests now, both for enqueuing TCP data with lwIP and for user
	 * requests.
	 */
	if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) {
		tcpsock_clear_flag(tcp, TCPF_FULL);
		tcpsock_clear_flag(tcp, TCPF_OOM);

		/* See if we can enqueue more data with lwIP. */
		if (tcpsock_pcb_enqueue(tcp)) {
			/* In some cases, we can now close the PCB. */
			if (tcpsock_may_close(tcp)) {
				(void)tcpsock_finish_close(tcp);
				/*
				 * The PCB is definitely gone here, and the
				 * entire socket object may be gone now too.
				 * Do not touch either anymore!
				 */

				return ERR_OK;
			}

			/*
			 * If actually sending the data fails, the PCB will be
			 * gone, and the socket object may be gone as well.  Do
			 * not touch either anymore in that case!
			 */
			if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK)
				return ERR_ABRT;
		}

		/*
		 * If we ran out of buffers earlier, it may be possible to take
		 * in more data from a user process now, even if we did not
		 * manage to enqueue any more pending data with lwIP.
		 */
		sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND);

		assert(tcp->tcp_pcb != NULL);
	} else if (tcp->tcp_snd.ts_unsent != NULL &&
	    tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) {
		/*
		 * If the send buffer is full, we will no longer call
		 * tcp_output(), which means we may also miss out on fatal
		 * errors that would otherwise kill the connection (e.g., no
		 * route).  As a result, the connection may erroneously
		 * continue to exist for a long time.  To avoid this, we call
		 * tcp_output() every once in a while when there are still
		 * unsent data.
		 */
		err = tcp_output(tcp->tcp_pcb);

		if (err != ERR_OK && err != ERR_MEM) {
			tcpsock_pcb_abort(tcp);

			if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) {
				r = util_convert_err(err);

				sockevent_set_error(tcpsock_get_sock(tcp), r);
			}
			/* Otherwise do not touch the socket object anymore! */

			return ERR_ABRT;
		}
	}

	/*
	 * If we are closing the socket, and we sent a FIN, see if the FIN got
	 * acknowledged.  If so, finish closing the socket.  Unfortunately, we
	 * can perform this check by polling only.  TODO: change lwIP..
	 */
	if (sockevent_is_closing(tcpsock_get_sock(tcp)) &&
	    (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) &&
	    tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) {
		assert(tcp->tcp_snd.ts_len == 0);

		tcpsock_finish_close(tcp);
	}

	return ERR_OK;
}

/*
 * Bind a TCP socket to a local address.
 */
static int
tcpsock_bind(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t ipaddr;
	uint16_t port;
	err_t err;
	int r;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)
		return EINVAL;

	if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port,
	    FALSE /*allow_mcast*/, &ipaddr, &port)) != OK)
		return r;

	err = tcp_bind(tcp->tcp_pcb, &ipaddr, port);

	return util_convert_err(err);
}

/*
 * Callback from lwIP.  A new connection 'pcb' has arrived on the listening
 * socket identified by 'arg'.  Note that 'pcb' may be NULL in the case that
 * lwIP could not accept the connection itself.
 */
static err_t
tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(tcpsock_is_listening(tcp));

	/*
	 * If the given PCB is NULL, then lwIP ran out of memory allocating a
	 * PCB for the new connection.  There is nothing we can do with that
	 * information.  Also check 'err' just to make sure.
	 */
	if (pcb == NULL || err != OK)
		return ERR_OK;

	/*
	 * The TCP socket is the listening socket, but the PCB is for the
	 * incoming connection.
	 */
	if (tcpsock_clone(tcp, pcb) != OK) {
		/*
		 * We could not allocate the resources necessary to accept the
		 * connection.  Abort it immediately.
		 */
		tcp_abort(pcb);

		return ERR_ABRT;
	}

	/*
	 * The connection has not yet been accepted, and thus should still be
	 * considered on the listen queue.
	 */
	tcp_backlog_delayed(pcb);

	/* Set the callback functions. */
	tcp_recv(pcb, tcpsock_event_recv);
	tcp_sent(pcb, tcpsock_event_sent);
	tcp_err(pcb, tcpsock_event_err);
	tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT);

	return ERR_OK;
}

/*
 * Put a TCP socket in listening mode.
 */
static int
tcpsock_listen(struct sock * sock, int backlog)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcp_pcb *pcb;
	err_t err;

	/* The maximum backlog value must not exceed its field size. */
	assert(SOMAXCONN <= UINT8_MAX);

	/*
	 * Allow only CLOSED sockets to enter listening mode.  If the socket
	 * was already in listening mode, allow its backlog value to be
	 * updated, even if it was shut down already (making this a no-op).
	 */
	if (!tcpsock_is_listening(tcp) &&
	    (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED))
		return EINVAL;

	/*
	 * If the socket was not already in listening mode, put it in that mode
	 * now.  That involves switching PCBs as lwIP attempts to save memory
	 * by replacing the original PCB with a smaller one.  If the socket was
	 * already in listening mode, simply update its backlog value--this has
	 * no effect on the sockets already in the backlog.
	 */
	if (!tcpsock_is_listening(tcp)) {
		assert(tcp->tcp_pcb != NULL);

		/*
		 * If the socket has not been bound to a port yet, do that
		 * first.  This does mean that the listen call may fail with
		 * side effects, but that is acceptable in this case.
		 */
		if (tcp->tcp_pcb->local_port == 0) {
			err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip,
			    0 /*port*/);

			if (err != ERR_OK)
				return util_convert_err(err);
		}

		/*
		 * Clear the argument on the PCB that is about to be replaced,
		 * because if we do not, once the PCB is reused (which does not
		 * clear the argument), we might get weird events.  Do this
		 * before the tcp_listen() call, because we should no longer
		 * access the old PCB afterwards (even if we can).
		 */
		tcp_arg(tcp->tcp_pcb, NULL);

		pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog,
		    &err);

		if (pcb == NULL) {
			tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */

			return util_convert_err(err);
		}

		tcp_arg(pcb, tcp);
		tcp->tcp_pcb = pcb;

		tcp_accept(pcb, tcpsock_event_accept);

		/* Initialize the queue head for sockets pending acceptance. */
		TAILQ_INIT(&tcp->tcp_queue.tq_head);
	} else if (tcp->tcp_pcb != NULL)
		tcp_backlog_set(tcp->tcp_pcb, backlog);

	return OK;
}

/*
 * Callback from lwIP.  A socket connection attempt has succeeded.  Note that
 * failed socket events will trigger the tcpsock_event_err() callback instead.
 */
static err_t
tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err)
{
	struct tcpsock *tcp = (struct tcpsock *)arg;

	assert(tcp != NULL);
	assert(pcb == tcp->tcp_pcb);
	assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING);

	/*
	 * If lwIP ever changes so that this callback is called for connect
	 * failures as well, then we need to change the code here accordingly.
	 */
	if (err != ERR_OK)
		panic("TCP connected event with error: %d", err);

	tcpsock_clear_flag(tcp, TCPF_CONNECTING);

	sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND);

	return ERR_OK;
}

/*
 * Connect a TCP socket to a remote address.
 */
static int
tcpsock_connect(struct sock * sock, const struct sockaddr * addr,
	socklen_t addr_len, endpoint_t user_endpt)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	ip_addr_t dst_addr;
	uint16_t dst_port;
	err_t err;
	int r;

	/*
	 * Listening sockets may not have a PCB, so we use higher-level flags
	 * to throw the correct error code for those instead.
	 */
	if (tcpsock_is_listening(tcp))
		return EOPNOTSUPP;

	/*
	 * If there is no longer any PCB, we obviously cannot perform the
	 * connection, but POSIX is not clear on which error to return.  We
	 * copy NetBSD's.
	 */
	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	/*
	 * The only state from which a connection can be initiated, is CLOSED.
	 * Some of the other states require distinct error codes, though.
	 */
	switch (tcp->tcp_pcb->state) {
	case CLOSED:
		break;
	case SYN_SENT:
		return EALREADY;
	case LISTEN:
		assert(0); /* we just checked.. */
	default:
		return EISCONN;
	}

	/*
	 * Get the destination address, and attempt to start connecting.  If
	 * the socket was not bound before, or it was bound to a port only,
	 * then lwIP will select a source address for us.  We cannot do this
	 * ourselves even if we wanted to: it is impossible to re-bind a TCP
	 * PCB in the case it was previously bound to a port only.
	 */
	if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
		return r;

	err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port,
	    tcpsock_event_connected);

	/*
	 * Note that various tcp_connect() error cases will leave the PCB with
	 * a newly set local and remote IP address anyway.  We should be
	 * careful not to rely on the addresses being as they were before.
	 */
	if (err != ERR_OK)
		return util_convert_err(err);

	/* Set the other callback functions. */
	tcp_recv(tcp->tcp_pcb, tcpsock_event_recv);
	tcp_sent(tcp->tcp_pcb, tcpsock_event_sent);
	tcp_err(tcp->tcp_pcb, tcpsock_event_err);
	tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL);

	/*
	 * Set a flag so that we can correct lwIP's error codes in case the
	 * connection fails.
	 */
	tcpsock_set_flag(tcp, TCPF_CONNECTING);

	return SUSPEND;
}

/*
 * Test whether any new connections are pending on a listening TCP socket.
 */
static int
tcpsock_test_accept(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/* Is this socket in listening mode at all? */
	if (!tcpsock_is_listening(tcp))
		return EINVAL;

	/* Are there any connections to accept right now? */
	if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head))
		return OK;

	/* If the socket has been shut down, we return ECONNABORTED. */
	if (tcp->tcp_pcb == NULL)
		return ECONNABORTED;

	/* Otherwise, wait for a new connection first. */
	return SUSPEND;
}

/*
 * Accept a connection on a listening TCP socket, creating a new TCP socket.
 */
static sockid_t
tcpsock_accept(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len, endpoint_t user_endpt __unused,
	struct sock ** newsockp)
{
	struct tcpsock *listener = (struct tcpsock *)sock;
	struct tcpsock *tcp;
	int r;

	if ((r = tcpsock_test_accept(sock)) != OK)
		return r;
	/* Below, we must not assume that the listener has a PCB. */

	tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head);
	assert(tcp->tcp_listener == listener);
	assert(tcp->tcp_pcb != NULL);

	TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next);
	tcp->tcp_listener = NULL;

	tcp_backlog_accepted(tcp->tcp_pcb);

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	/*
	 * Set 'newsockp' to NULL so that libsockevent knows we already cloned
	 * the socket, and it must not be reinitialized anymore.
	 */
	*newsockp = NULL;
	return tcpsock_get_id(tcp);
}

/*
 * Perform preliminary checks on a send request.
 */
static int
tcpsock_pre_send(struct sock * sock, size_t len __unused,
	socklen_t ctl_len __unused, const struct sockaddr * addr __unused,
	socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags)
{

	/*
	 * Reject calls with unknown flags.  Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we can
	 * not handle.  Currently, there are no send flags that we support.
	 */
	if (flags != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Test whether the given number of data bytes can be sent on a TCP socket.
 */
static int
tcpsock_test_send(struct sock * sock, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	size_t sndbuf;

	if (tcp->tcp_pcb == NULL)
		return EPIPE;

	switch (tcp->tcp_pcb->state) {
	case CLOSED:			/* new */
	case LISTEN:			/* listening */
		return ENOTCONN;
	case SYN_SENT:			/* connecting */
	case SYN_RCVD:			/* simultaneous open, maybe someday? */
		return SUSPEND;
	case ESTABLISHED:		/* connected */
	case CLOSE_WAIT:		/* closed remotely */
		break;
	default:			/* shut down locally */
		assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR));
		return EPIPE;
	}

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;

	if (tcp->tcp_snd.ts_len + min > sndbuf)
		return SUSPEND;
	else
		return OK;
}

/*
 * Send data on a TCP socket.
 */
static int
tcpsock_send(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	const struct sockaddr * addr __unused, socklen_t addr_len __unused,
	endpoint_t user_endpt __unused, int flags __unused, size_t min)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail, *pfirst, *pnext, *plast;
	size_t off, tail_off, chunk, left, sndbuf;
	int r;

	if ((r = tcpsock_test_send(sock, min)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	sndbuf = tcpsock_get_sndbuf(tcp);
	if (min > sndbuf)
		min = sndbuf;
	assert(min > 0);

	assert(sndbuf > tcp->tcp_snd.ts_len);
	left = sndbuf - tcp->tcp_snd.ts_len;
	if (left > len)
		left = len;

	/*
	 * First see if we can fit any more data in the current tail buffer.
	 * If so, we set 'ptail' to point to it and 'tail_off' to the previous
	 * length of the tail buffer, while optimistically extending it to
	 * include the new data.  If not, we set them to NULL/0.
	 */
	if ((ptail = tcp->tcp_snd.ts_tail) != NULL &&
	    ptail->len < ptail->tot_len) {
		assert(ptail->len > 0);
		tail_off = (size_t)ptail->len;

		/*
		 * Optimistically extend the head buffer to include whatever
		 * fits in it.  This is needed for util_copy_data().
		 */
		assert(ptail->tot_len > ptail->len);
		off = (size_t)ptail->tot_len - (size_t)ptail->len;
		if (off > left)
			off = left;
		ptail->len += off;
	} else {
		ptail = NULL;
		tail_off = 0;
		off = 0;
	}

	/*
	 * Then, if there is more to send, allocate new buffers as needed.  If
	 * we run out of memory, work with whatever we did manage to grab.
	 */
	pfirst = NULL;
	plast = NULL;
	while (off < left) {
		if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS ||
		    (pnext = tcpsock_alloc_buf()) == NULL) {
			/*
			 * Chances are that we will end up suspending this send
			 * request because of being out of buffers.  We try to
			 * resume such requests from the polling function.
			 */
			tcpsock_set_flag(tcp, TCPF_OOM);

			break;
		}

		tcpsock_sendbufs++;

		if (pfirst == NULL)
			pfirst = pnext;
		else
			plast->next = pnext;
		plast = pnext;

		chunk = (size_t)pnext->tot_len;
		if (chunk > left - off)
			chunk = left - off;
		pnext->len = chunk;
		off += chunk;
	}

	/*
	 * Copy in the data and continue, unless we did not manage to find
	 * enough space to even meet the low send watermark, in which case we
	 * undo any allocation and suspend the call until later.
	 */
	if (off >= min) {
		/*
		 * Optimistically attach the new buffers to the tail, also for
		 * util_copy_data().  We undo all this if the copy fails.
		 */
		if (ptail != NULL) {
			ptail->next = pfirst;

			pnext = ptail;
		} else
			pnext = pfirst;

		assert(pnext != NULL);

		r = util_copy_data(data, off, *offp, pnext, tail_off,
		    TRUE /*copy_in*/);
	} else
		r = SUSPEND;

	if (r != OK) {
		/* Undo the modifications made so far. */
		while (pfirst != NULL) {
			pnext = pfirst->next;

			assert(tcpsock_sendbufs > 0);
			tcpsock_sendbufs--;

			tcpsock_free_buf(pfirst);

			pfirst = pnext;
		}

		if (ptail != NULL) {
			ptail->next = NULL;

			ptail->len = tail_off;
		}

		return r;
	}

	/* Attach the new buffers, if any, to the buffer tail. */
	if (pfirst != NULL) {
		if ((ptail = tcp->tcp_snd.ts_tail) != NULL) {
			assert(ptail->len == ptail->tot_len);

			/*
			 * Due to our earlier optimistic modifications, this
			 * may or may not be redundant.
			 */
			ptail->next = pfirst;
		}

		assert(plast != NULL);
		tcp->tcp_snd.ts_tail = plast;

		if (tcp->tcp_snd.ts_head == NULL) {
			tcp->tcp_snd.ts_head = pfirst;
			assert(tcp->tcp_snd.ts_head_off == 0);
		}
		if (tcp->tcp_snd.ts_unsent == NULL) {
			tcp->tcp_snd.ts_unsent = pfirst;
			assert(tcp->tcp_snd.ts_unsent_off == 0);
		}
	}

	tcp->tcp_snd.ts_len += off;

	/*
	 * See if we can send any of the data we just enqueued.  The socket is
	 * still open as we are still processing a call from userland on it;
	 * this saves us from having to deal with the cases that the following
	 * calls end up freeing the socket object.
	 */
	if (tcpsock_pcb_enqueue(tcp) &&
	    (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) {
		/*
		 * That did not go well.  Return the error immediately if we
		 * had not made any progress earlier.  Otherwise, return our
		 * partial progress and leave the error to be picked up later.
		 */
		if (*offp > 0) {
			sockevent_set_error(tcpsock_get_sock(tcp), r);

			return OK;
		} else
			return r;
	}

	*offp += off;
	return (off < len) ? SUSPEND : OK;
}

/*
 * Perform preliminary checks on a receive request.
 */
static int
tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused,
	int flags)
{

	/*
	 * Reject calls with unknown flags.  Since libsockevent strips out the
	 * flags it handles itself here, we only have to test for ones we can
	 * not handle.
	 */
	if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0)
		return EOPNOTSUPP;

	return OK;
}

/*
 * Return TRUE if receive calls may wait for more data to come in on the
 * connection, or FALSE if we already know that that is not going to happen.
 */
static int
tcpsock_may_wait(struct tcpsock * tcp)
{

	return (tcp->tcp_pcb != NULL &&
	    !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN));
}

/*
 * Test whether data can be received on a TCP socket, and if so, how many bytes
 * of data.
 */
static int
tcpsock_test_recv(struct sock * sock, size_t min, size_t * size)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	int may_wait;

	/* If there is and never was a connection, refuse the call at all. */
	if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN))
		return ENOTCONN;

	/*
	 * If we are certain that no more data will come in later, ignore the
	 * low receive watermark.  Otherwise, bound it to the size of the
	 * receive buffer, or receive calls may block forever.
	 */
	if (!(may_wait = tcpsock_may_wait(tcp)))
		min = 1;
	else if (min > tcpsock_get_rcvbuf(tcp))
		min = tcpsock_get_rcvbuf(tcp);

	if (tcp->tcp_rcv.tr_len >= min) {
		if (size != NULL)
			*size = tcp->tcp_rcv.tr_len;

		return OK;
	}

	return (may_wait) ? SUSPEND : SOCKEVENT_EOF;
}

/*
 * Receive data on a TCP socket.
 */
static int
tcpsock_recv(struct sock * sock, const struct sockdriver_data * data,
	size_t len, size_t * offp, const struct sockdriver_data * ctl __unused,
	socklen_t ctl_len __unused, socklen_t * ctl_off __unused,
	struct sockaddr * addr __unused, socklen_t * addr_len __unused,
	endpoint_t user_endpt __unused, int flags, size_t min,
	int * rflags __unused)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct pbuf *ptail;
	size_t off, left;
	int r;

	/* See if we can receive at all, and if so, how much at most. */
	if ((r = tcpsock_test_recv(sock, min, NULL)) != OK)
		return r;

	if (len == 0)
		return OK;	/* nothing to do */

	off = tcp->tcp_rcv.tr_len;
	if (off > len)
		off = len;

	assert(tcp->tcp_rcv.tr_head != NULL);
	assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len);

	/* Copy out the data to the caller. */
	if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head,
	    tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK)
		return r;

	/* Unless peeking, remove the data from the receive queue. */
	if (!(flags & MSG_PEEK)) {
		left = off;

		/* Dequeue and free as many entire buffers as possible. */
		while ((ptail = tcp->tcp_rcv.tr_head) != NULL &&
		    left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) {
			left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off;

			tcp->tcp_rcv.tr_head = ptail->next;
			tcp->tcp_rcv.tr_head_off = 0;

			if (tcp->tcp_rcv.tr_head == NULL)
				tcp->tcp_rcv.tr_pre_tailp = NULL;
			else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next)
				tcp->tcp_rcv.tr_pre_tailp =
				    &tcp->tcp_rcv.tr_head;

			assert(tcpsock_recvbufs > 0);
			tcpsock_recvbufs--;

			tcpsock_free_buf(ptail);
		}

		/*
		 * If only part of the (new) head buffer is consumed, adjust
		 * the saved offset into that buffer.
		 */
		if (left > 0) {
			assert(tcp->tcp_rcv.tr_head != NULL);
			assert((size_t)tcp->tcp_rcv.tr_head->len -
			    tcp->tcp_rcv.tr_head_off > left);

			tcp->tcp_rcv.tr_head_off += left;
		}

		tcp->tcp_rcv.tr_len -= off;

		if (tcp->tcp_rcv.tr_head != NULL) {
			assert(tcp->tcp_rcv.tr_pre_tailp != NULL);
			assert(tcp->tcp_rcv.tr_len > 0);
		} else {
			assert(tcp->tcp_rcv.tr_pre_tailp == NULL);
			assert(tcp->tcp_rcv.tr_len == 0);
		}

		/*
		 * The receive buffer has shrunk, so there may now be space to
		 * receive more data.
		 */
		if (tcp->tcp_pcb != NULL)
			tcpsock_ack_recv(tcp);
	} else
		flags &= ~MSG_WAITALL; /* for the check below */

	/* Advance the current copy position, and see if we are done. */
	*offp += off;
	if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp))
		return SUSPEND;
	else
		return OK;
}

/*
 * Update the set of flag-type socket options on a TCP socket.
 */
static void
tcpsock_setsockmask(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return;

	if (mask & SO_REUSEADDR)
		ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR);

	if (mask & SO_KEEPALIVE)
		ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE);
	else
		ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE);
}

/*
 * Prepare a helper structure for IP-level option processing.
 */
static void
tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts)
{

	ipopts->local_ip = &tcp->tcp_pcb->local_ip;
	ipopts->remote_ip = &tcp->tcp_pcb->remote_ip;
	ipopts->tos = &tcp->tcp_pcb->tos;
	ipopts->ttl = &tcp->tcp_pcb->ttl;
	ipopts->sndmin = TCP_SNDBUF_MIN;
	ipopts->sndmax = TCP_SNDBUF_MAX;
	ipopts->rcvmin = TCP_RCVBUF_MIN;
	ipopts->rcvmax = TCP_RCVBUF_MAX;
}

/*
 * Set socket options on a TCP socket.
 */
static int
tcpsock_setsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	uint32_t uval;
	int r, val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported for TCP sockets; it
			 * would not even make sense.  However, named(8)
			 * insists on trying to set it anyway.  We accept the
			 * request but ignore the value, not even returning
			 * what was set through getsockopt(2).
			 */
			return OK;

		case IPV6_FAITH:
			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			/*
			 * This option is not supported at all, but to save
			 * ourselves from having to remember the current state
			 * for getsockopt(2), we also refuse to enable it.
			 */
			if (val != 0)
				return EINVAL;

			return OK;
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/*
			 * lwIP's listening TCP PCBs do not have this field.
			 * If this ever becomes an issue, we can create our own
			 * shadow flag and do the inheritance ourselves.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val)
				tcp_nagle_disable(tcp->tcp_pcb);
			else
				tcp_nagle_enable(tcp->tcp_pcb);

			return OK;

		case TCP_KEEPIDLE:
		case TCP_KEEPINTVL:
			/*
			 * lwIP's listening TCP PCBs do not have these fields.
			 */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			/*
			 * The given value is unsigned, but lwIP stores the
			 * value in milliseconds in a uint32_t field, so we
			 * have to limit large values to whatever fits in the
			 * field anyway.
			 */
			if (val < 0 || (uint32_t)val > UINT32_MAX / 1000)
				uval = UINT32_MAX;
			else
				uval = (uint32_t)val * 1000;

			if (name == TCP_KEEPIDLE)
				tcp->tcp_pcb->keep_idle = uval;
			else
				tcp->tcp_pcb->keep_intvl = uval;

			return OK;

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
			    len)) != OK)
				return r;

			if (val == 0)
				return EINVAL;

			tcp->tcp_pcb->keep_cnt = (uint32_t)val;

			return OK;
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}

/*
 * Retrieve socket options on a TCP socket.
 */
static int
tcpsock_getsockopt(struct sock * sock, int level, int name,
	const struct sockdriver_data * data, socklen_t * len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct ipopts ipopts;
	int val;

	if (tcp->tcp_pcb == NULL)
		return ECONNRESET;

	/* Handle TCP-level options. */
	switch (level) {
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_RECVTCLASS:
		case IPV6_FAITH:
			val = 0;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		break;

	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = tcp_nagle_disabled(tcp->tcp_pcb);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_MAXSEG:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			/* This option is read-only at this time. */
			val = tcp->tcp_pcb->mss;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPIDLE:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_idle / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPINTVL:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)(tcp->tcp_pcb->keep_intvl / 1000);

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);

		case TCP_KEEPCNT:
			/* lwIP's listening TCP PCBs do not have this field. */
			if (tcp->tcp_pcb->state == LISTEN)
				return EINVAL;

			val = (int)tcp->tcp_pcb->keep_cnt;

			return sockdriver_copyout_opt(data, &val, sizeof(val),
			    len);
		}

		return EOPNOTSUPP;
	}

	/* Handle all other options at the IP level. */
	tcpsock_get_ipopts(tcp, &ipopts);

	return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data,
	    len, &ipopts);
}

/*
 * Retrieve the local socket address of a TCP socket.
 */
static int
tcpsock_getsockname(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL)
		return EINVAL;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port);

	return OK;
}

/*
 * Retrieve the remote socket address of a TCP socket.
 */
static int
tcpsock_getpeername(struct sock * sock, struct sockaddr * addr,
	socklen_t * addr_len)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED ||
	    tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT)
		return ENOTCONN;

	ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len,
	    &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port);

	return OK;
}

/*
 * Perform a TCP half-close on a TCP socket.  This operation may not complete
 * immediately due to memory conditions, in which case it will be completed at
 * a later time.
 */
static void
tcpsock_send_fin(struct tcpsock * tcp)
{

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR);

	/*
	 * Attempt to send the FIN.  If a fatal error occurs as a result, raise
	 * it as an asynchronous error, because this function's callers cannot
	 * do much with it.  That happens to match the way these functions are
	 * used elsewhere.  In any case, as a result, the PCB may be closed.
	 * However, we are never called from a situation where the socket is
	 * being closed here, so the socket object will not be freed either.
	 */
	if (tcpsock_pcb_enqueue(tcp)) {
		assert(!sockevent_is_closing(tcpsock_get_sock(tcp)));

		if (tcpsock_may_close(tcp))
			tcpsock_finish_close(tcp);
		else
			(void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/);
	}
}

/*
 * Shut down a TCP socket for reading and/or writing.
 */
static int
tcpsock_shutdown(struct sock * sock, unsigned int mask)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	/*
	 * If the PCB is gone, we want to allow shutdowns for reading but not
	 * writing: shutting down for writing affects the PCB, shutting down
	 * for reading does not.  Also, if the PCB is in CLOSED state, we would
	 * not know how to deal with subsequent operations after a shutdown for
	 * writing, so forbid such calls altogether.
	 */
	if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) &&
	    (mask & SFL_SHUT_WR))
		return ENOTCONN;

	/*
	 * Handle listening sockets as a special case.  Shutting down a
	 * listening socket frees its PCB.  Sockets pending on the accept queue
	 * may still be accepted, but after that, accept(2) will start
	 * returning ECONNABORTED.  This feature allows multi-process server
	 * applications to shut down gracefully, supposedly..
	 */
	if (tcpsock_is_listening(tcp)) {
		if (tcp->tcp_pcb != NULL)
			tcpsock_pcb_close(tcp);

		return OK;
	}

	/*
	 * We control shutdown-for-reading locally, and intentially do not tell
	 * lwIP about it: if we do that and also shut down for writing, the PCB
	 * may disappear (now or eventually), which is not what we want.
	 * Instead, we only tell lwIP to shut down for reading once we actually
	 * want to get rid of the PCB, using tcp_close().  In the meantime, if
	 * the socket is shut down for reading by the user, we simply discard
	 * received data as fast as we can--one out of a number of possible
	 * design choices there, and (reportedly) the one used by the BSDs.
	 */
	if (mask & SFL_SHUT_RD)
		(void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/);

	/*
	 * Shutting down for writing a connecting socket simply closes its PCB.
	 * Closing a PCB in SYN_SENT state simply deallocates it, so this can
	 * not fail.  On the other hand, for connected sockets we want to send
	 * a FIN, which may fail due to memory shortage, in which case we have
	 * to try again later..
	 */
	if (mask & SFL_SHUT_WR) {
		if (tcp->tcp_pcb->state == SYN_SENT)
			tcpsock_pcb_close(tcp);
		else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
			tcpsock_send_fin(tcp);
	}

	return OK;
}

/*
 * Close a TCP socket.  Complete the operation immediately if possible, or
 * otherwise initiate the closing process and complete it later, notifying
 * libsockevent about that as well.  Depending on linger settings, this
 * function may be called twice on the same socket: the first time with the
 * 'force' flag cleared, and the second time with the 'force' flag set.
 */
static int
tcpsock_close(struct sock * sock, int force)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;
	struct tcpsock *queued;
	size_t rlen;

	assert(tcp->tcp_listener == NULL);

	/*
	 * If this was a listening socket, so abort and clean up any and all
	 * connections on its listener queue.  Note that the listening socket
	 * may or may not have a PCB at this point.
	 */
	if (tcpsock_is_listening(tcp)) {
		while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) {
			queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head);

			tcpsock_pcb_abort(queued);

			(void)tcpsock_cleanup(queued, TRUE /*may_free*/);
		}
	}

	/*
	 * Clear the receive queue, and make sure that we no longer add new
	 * data to it.  The latter is relevant only for the case that we end up
	 * returning SUSPEND below.  Remember whether there were bytes left,
	 * because we should reset the connection if there were.
	 */
	rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/);

	sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD);

	/*
	 * If the socket is connected, perform a graceful shutdown, unless 1)
	 * we are asked to force-close the socket, or 2) if the local side has
	 * not consumed all data, as per RFC 1122 Sec.4.2.2.13.  Normally lwIP
	 * would take care of the second point, but we may have data in our
	 * receive buffer of which lwIP is not aware.
	 *
	 * Implementing proper linger support is somewhat difficult with lwIP.
	 * In particular, we cannot reliably wait for our FIN to be ACK'ed by
	 * the other side in all cases:
	 *
	 * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not
	 *   trigger any event and once in the TIME_WAIT state, the poll event
	 *   no longer triggers either;
	 * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to
	 *   TIME_WAIT will trigger a receive event, but it is not clear
	 *   whether we can reliably check that our FIN was ACK'ed from there.
	 *
	 * That means we have to compromise.  Instead of the proper approach,
	 * we complete our side of the close operation whenever:
	 *
	 * 1. all of or data was acknowledged, AND,
	 * 2. our FIN was sent, AND,
	 * 3a. our FIN was acknowledged, OR,
	 * 3b. we received a FIN from the other side.
	 *
	 * With the addition of the rule 3b, we do not run into the above
	 * reliability problems, but we may return from SO_LINGER-blocked close
	 * calls too early and thus give callers a false impression of success.
	 * TODO: if lwIP ever gets improved on this point, the code in this
	 * module should be rewritten to make use of the improvements.
	 *
	 * The set of rules is basically the same as for closing the PCB early
	 * as per tcpsock_may_close(), except with the check for our FIN being
	 * acknowledged.  Unfortunately only the FIN_WAIT_2, TIME_WAIT, and
	 * (reentered) CLOSED TCP states guarantee that there are no
	 * unacknowledged data segments anymore, so we may have to wait for
	 * reaching any one of these before we can actually finish closing the
	 * socket with tcp_close().
	 *
	 * In addition, lwIP does not tell us when our FIN gets acknowledged,
	 * so we have to use polling and direct access to lwIP's PCB fields
	 * instead, just like lwIP's BSD API does.  There is no other way.
	 * Also, we may not even be able to send the FIN right away, in which
	 * case we must defer that until later.
	 */
	if (tcp->tcp_pcb != NULL) {
		switch (tcp->tcp_pcb->state) {
		case CLOSE_WAIT:
		case CLOSING:
		case LAST_ACK:
			assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN);

			/* FALLTHROUGH */
		case SYN_RCVD:
		case ESTABLISHED:
		case FIN_WAIT_1:
			/* First check if we should abort the connection. */
			if (force || rlen > 0)
				break;

			/*
			 * If we have not sent a FIN yet, try sending it now;
			 * if all other conditions are met for closing the
			 * socket, successful FIN transmission will complete
			 * the close.  Otherwise, perform the close check
			 * explicitly.
			 */
			if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR))
				tcpsock_send_fin(tcp);
			else if (tcpsock_may_close(tcp))
				tcpsock_pcb_close(tcp);

			/*
			 * If at this point the PCB is gone, we managed to
			 * close the connection immediately, and the socket has
			 * already been cleaned up by now.  This may occur if
			 * there is no unacknowledged data and we already
			 * received a FIN earlier on.
			 */
			if (tcp->tcp_pcb == NULL)
				return OK;

			/*
			 * Complete the close operation at a later time.
			 * Adjust the polling interval, so that we can detect
			 * completion of the close as quickly as possible.
			 */
			tcp_poll(tcp->tcp_pcb, tcpsock_event_poll,
			    TCP_POLL_CLOSE_INTERVAL);

			return SUSPEND;

		default:
			/*
			 * The connection is either not yet established, or
			 * already in a state where we can close it right now.
			 */
			tcpsock_pcb_close(tcp);
		}
	}

	/*
	 * Abort the connection is the PCB is still around, and clean up the
	 * socket.  We cannot let tcpsock_cleanup() free the socket object yet,
	 * because we are still in the callback from libsockevent, and the
	 * latter cannot handle the socket object being freed from here.
	 */
	if (tcp->tcp_pcb != NULL)
		tcpsock_pcb_abort(tcp);

	(void)tcpsock_cleanup(tcp, FALSE /*may_free*/);

	return OK;
}

/*
 * Free up a closed TCP socket.
 */
static void
tcpsock_free(struct sock * sock)
{
	struct tcpsock *tcp = (struct tcpsock *)sock;

	assert(tcp->tcp_pcb == NULL);
	assert(tcp->tcp_snd.ts_len == 0);
	assert(tcp->tcp_snd.ts_head == NULL);
	assert(tcp->tcp_rcv.tr_len == 0);
	assert(tcp->tcp_rcv.tr_head == NULL);

	TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next);
}

/* This table maps TCP states from lwIP numbers to NetBSD numbers. */
static const struct {
	int tsm_tstate;
	int tsm_sostate;
} tcpsock_statemap[] = {
	[CLOSED]	= { TCPS_CLOSED,	SS_ISDISCONNECTED	},
	[LISTEN]	= { TCPS_LISTEN,	0			},
	[SYN_SENT]	= { TCPS_SYN_SENT,	SS_ISCONNECTING		},
	[SYN_RCVD]	= { TCPS_SYN_RECEIVED,	SS_ISCONNECTING		},
	[ESTABLISHED]	= { TCPS_ESTABLISHED,	SS_ISCONNECTED		},
	[FIN_WAIT_1]	= { TCPS_FIN_WAIT_1,	SS_ISDISCONNECTING	},
	[FIN_WAIT_2]	= { TCPS_FIN_WAIT_2,	SS_ISDISCONNECTING	},
	[CLOSE_WAIT]	= { TCPS_CLOSE_WAIT,	SS_ISCONNECTED		},
	[CLOSING]	= { TCPS_CLOSING,	SS_ISDISCONNECTING	},
	[LAST_ACK]	= { TCPS_LAST_ACK,	SS_ISDISCONNECTING	},
	[TIME_WAIT]	= { TCPS_TIME_WAIT,	SS_ISDISCONNECTED	},
};

/*
 * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP
 * PCB identified by the given pointer.
 */
static void
tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
	const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr;
	struct tcpsock *tcp;

	/*
	 * Not all TCP PCBs have an associated tcpsock structure.  We are
	 * careful enough clearing the callback argument for PCBs on any of the
	 * TCP lists that we can use that callback argument to determine
	 * whether there is an associated tcpsock structure, although with one
	 * exception: PCBs for incoming connections that have not yet been
	 * fully established (i.e., in SYN_RCVD state).  These will have the
	 * callback argument of the listening socket (which itself may already
	 * have been deallocated at this point) but should not be considered as
	 * associated with the listening socket's tcpsock structure.
	 */
	if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) {
		tcp = (struct tcpsock *)pcb->callback_arg;
		assert(tcp >= tcp_array &&
		    tcp < &tcp_array[__arraycount(tcp_array)]);

		/* TODO: change this so that sockstat(1) may work one day. */
		ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp);
	} else {
		/* No tcpsock.  Could also be in TIME_WAIT state etc. */
		tcp = NULL;

		ki->ki_sostate = SS_NOFDREF;
	}

	ki->ki_type = SOCK_STREAM;

	if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) {
		ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate;
		/* TODO: this needs work, but does anything rely on it? */
		ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate;
	}

	/* Careful with the LISTEN state here (see below). */
	ipsock_get_info(ki, &pcb->local_ip, pcb->local_port,
	    &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0);

	/*
	 * The PCBs for listening sockets are actually smaller.  Thus, for
	 * listening sockets, do not attempt to access any of the fields beyond
	 * those provided in the smaller structure.
	 */
	if (pcb->state == LISTEN) {
		assert(tcp != NULL);
		ki->ki_refs =
		    (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head);
	} else {
		if (tcp_nagle_disabled(pcb))
			ki->ki_tflags |= NETBSD_TF_NODELAY;

		if (tcp != NULL) {
			ki->ki_rcvq = tcp->tcp_rcv.tr_len;
			ki->ki_sndq = tcp->tcp_snd.ts_len;

			if (tcp->tcp_listener != NULL)
				ki->ki_nextref = (uint64_t)(uintptr_t)
				    TAILQ_NEXT(tcp, tcp_queue.tq_next);
		}
	}
}

/*
 * Given either NULL or a previously returned TCP PCB pointer, return the first
 * or next TCP PCB pointer, or NULL if there are no more.  The current
 * implementation supports only one concurrent iteration at once.
 */
static const void *
tcpsock_enum(const void * last)
{
	static struct {
		unsigned int i;
		const struct tcp_pcb *pcb;
	} iter;

	if (last != NULL && (iter.pcb = iter.pcb->next) != NULL)
		return (const void *)iter.pcb;

	for (iter.i = (last != NULL) ? iter.i + 1 : 0;
	    iter.i < __arraycount(tcp_pcb_lists); iter.i++) {
		if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL)
			return (const void *)iter.pcb;
	}

	return NULL;
}

/*
 * Obtain the list of TCP protocol control blocks, for sysctl(7).
 */
static ssize_t
tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
	struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{

	return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info);
}

static const struct sockevent_ops tcpsock_ops = {
	.sop_bind		= tcpsock_bind,
	.sop_listen		= tcpsock_listen,
	.sop_connect		= tcpsock_connect,
	.sop_accept		= tcpsock_accept,
	.sop_test_accept	= tcpsock_test_accept,
	.sop_pre_send		= tcpsock_pre_send,
	.sop_send		= tcpsock_send,
	.sop_test_send		= tcpsock_test_send,
	.sop_pre_recv		= tcpsock_pre_recv,
	.sop_recv		= tcpsock_recv,
	.sop_test_recv		= tcpsock_test_recv,
	.sop_ioctl		= ifconf_ioctl,
	.sop_setsockmask	= tcpsock_setsockmask,
	.sop_setsockopt		= tcpsock_setsockopt,
	.sop_getsockopt		= tcpsock_getsockopt,
	.sop_getsockname	= tcpsock_getsockname,
	.sop_getpeername	= tcpsock_getpeername,
	.sop_shutdown		= tcpsock_shutdown,
	.sop_close		= tcpsock_close,
	.sop_free		= tcpsock_free
};