head 1.23; access; symbols netbsd-10-0-RELEASE:1.23 netbsd-10-0-RC6:1.23 netbsd-10-0-RC5:1.23 netbsd-10-0-RC4:1.23 netbsd-10-0-RC3:1.23 netbsd-10-0-RC2:1.23 thorpej-ifq:1.23.0.8 thorpej-ifq-base:1.23 thorpej-altq-separation:1.23.0.6 thorpej-altq-separation-base:1.23 netbsd-10-0-RC1:1.23 netbsd-10:1.23.0.4 netbsd-10-base:1.23 bouyer-sunxi-drm:1.23.0.2 bouyer-sunxi-drm-base:1.23 netbsd-9-3-RELEASE:1.21 thorpej-i2c-spi-conf2:1.21.0.26 thorpej-i2c-spi-conf2-base:1.21 thorpej-futex2:1.21.0.24 thorpej-futex2-base:1.21 thorpej-cfargs2:1.21.0.22 thorpej-cfargs2-base:1.21 cjep_sun2x-base1:1.21 cjep_sun2x:1.21.0.20 cjep_sun2x-base:1.21 cjep_staticlib_x-base1:1.21 netbsd-9-2-RELEASE:1.21 cjep_staticlib_x:1.21.0.18 cjep_staticlib_x-base:1.21 thorpej-i2c-spi-conf:1.21.0.16 thorpej-i2c-spi-conf-base:1.21 thorpej-cfargs:1.21.0.14 thorpej-cfargs-base:1.21 thorpej-futex:1.21.0.12 thorpej-futex-base:1.21 netbsd-9-1-RELEASE:1.21 bouyer-xenpvh-base2:1.21 phil-wifi-20200421:1.21 bouyer-xenpvh-base1:1.21 phil-wifi-20200411:1.21 bouyer-xenpvh:1.21.0.10 bouyer-xenpvh-base:1.21 is-mlppp:1.21.0.8 is-mlppp-base:1.21 phil-wifi-20200406:1.21 netbsd-8-2-RELEASE:1.11.8.7 ad-namecache-base3:1.21 netbsd-9-0-RELEASE:1.21 netbsd-9-0-RC2:1.21 ad-namecache-base2:1.21 ad-namecache-base1:1.21 ad-namecache:1.21.0.6 ad-namecache-base:1.21 netbsd-9-0-RC1:1.21 phil-wifi-20191119:1.21 netbsd-9:1.21.0.4 netbsd-9-base:1.21 phil-wifi-20190609:1.21 netbsd-8-1-RELEASE:1.11.8.7 netbsd-8-1-RC1:1.11.8.7 isaki-audio2:1.21.0.2 isaki-audio2-base:1.21 pgoyette-compat-merge-20190127:1.13.2.6 pgoyette-compat-20190127:1.21 pgoyette-compat-20190118:1.21 pgoyette-compat-1226:1.21 pgoyette-compat-1126:1.21 pgoyette-compat-1020:1.21 pgoyette-compat-0930:1.20 pgoyette-compat-0906:1.18 netbsd-7-2-RELEASE:1.9.4.1 pgoyette-compat-0728:1.18 netbsd-8-0-RELEASE:1.11.8.3 phil-wifi:1.17.0.2 phil-wifi-base:1.17 pgoyette-compat-0625:1.17 netbsd-8-0-RC2:1.11.8.3 pgoyette-compat-0521:1.17 pgoyette-compat-0502:1.15 pgoyette-compat-0422:1.15 netbsd-8-0-RC1:1.11.8.3 pgoyette-compat-0415:1.15 pgoyette-compat-0407:1.14 pgoyette-compat-0330:1.14 pgoyette-compat-0322:1.14 pgoyette-compat-0315:1.14 netbsd-7-1-2-RELEASE:1.9 pgoyette-compat:1.13.0.2 pgoyette-compat-base:1.13 netbsd-7-1-1-RELEASE:1.9 tls-maxphys-base-20171202:1.11 matt-nb8-mediatek:1.11.0.12 matt-nb8-mediatek-base:1.11 nick-nhusb-base-20170825:1.11 perseant-stdc-iso10646:1.11.0.10 perseant-stdc-iso10646-base:1.11 netbsd-8:1.11.0.8 netbsd-8-base:1.11 prg-localcount2-base3:1.11 prg-localcount2-base2:1.11 prg-localcount2-base1:1.11 prg-localcount2:1.11.0.6 prg-localcount2-base:1.11 pgoyette-localcount-20170426:1.11 bouyer-socketcan-base1:1.11 jdolecek-ncq:1.11.0.4 jdolecek-ncq-base:1.11 pgoyette-localcount-20170320:1.11 netbsd-7-1:1.9.0.12 netbsd-7-1-RELEASE:1.9 netbsd-7-1-RC2:1.9 nick-nhusb-base-20170204:1.11 netbsd-7-nhusb-base-20170116:1.9 bouyer-socketcan:1.11.0.2 bouyer-socketcan-base:1.11 pgoyette-localcount-20170107:1.10 netbsd-7-1-RC1:1.9 nick-nhusb-base-20161204:1.10 pgoyette-localcount-20161104:1.10 netbsd-7-0-2-RELEASE:1.9 nick-nhusb-base-20161004:1.10 localcount-20160914:1.10 netbsd-7-nhusb:1.9.0.10 netbsd-7-nhusb-base:1.9 pgoyette-localcount-20160806:1.10 pgoyette-localcount-20160726:1.10 pgoyette-localcount:1.10.0.2 pgoyette-localcount-base:1.10 nick-nhusb-base-20160907:1.10 nick-nhusb-base-20160529:1.10 netbsd-7-0-1-RELEASE:1.9 nick-nhusb-base-20160422:1.9 nick-nhusb-base-20160319:1.9 nick-nhusb-base-20151226:1.9 netbsd-7-0:1.9.0.8 netbsd-7-0-RELEASE:1.9 nick-nhusb-base-20150921:1.9 netbsd-7-0-RC3:1.9 netbsd-7-0-RC2:1.9 netbsd-7-0-RC1:1.9 nick-nhusb-base-20150606:1.9 nick-nhusb-base-20150406:1.9 nick-nhusb:1.9.0.6 nick-nhusb-base:1.9 netbsd-6-0-6-RELEASE:1.8 netbsd-6-1-5-RELEASE:1.8 netbsd-7:1.9.0.4 netbsd-7-base:1.9 yamt-pagecache-base9:1.9 yamt-pagecache-tag8:1.8 netbsd-6-1-4-RELEASE:1.8 netbsd-6-0-5-RELEASE:1.8 tls-earlyentropy:1.9.0.2 tls-earlyentropy-base:1.9 riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.9 riastradh-drm2-base3:1.9 netbsd-6-1-3-RELEASE:1.8 netbsd-6-0-4-RELEASE:1.8 netbsd-6-1-2-RELEASE:1.8 netbsd-6-0-3-RELEASE:1.8 rmind-smpnet-nbase:1.9 netbsd-6-1-1-RELEASE:1.8 riastradh-drm2-base2:1.8 riastradh-drm2-base1:1.8 riastradh-drm2:1.8.0.24 riastradh-drm2-base:1.8 rmind-smpnet:1.8.0.16 rmind-smpnet-base:1.9 netbsd-6-1:1.8.0.22 netbsd-6-0-2-RELEASE:1.8 netbsd-6-1-RELEASE:1.8 khorben-n900:1.8.0.20 netbsd-6-1-RC4:1.8 netbsd-6-1-RC3:1.8 agc-symver:1.8.0.18 agc-symver-base:1.8 netbsd-6-1-RC2:1.8 netbsd-6-1-RC1:1.8 yamt-pagecache-base8:1.8 netbsd-6-0-1-RELEASE:1.8 yamt-pagecache-base7:1.8 matt-nb6-plus-nbase:1.8 yamt-pagecache-base6:1.8 netbsd-6-0:1.8.0.14 netbsd-6-0-RELEASE:1.8 netbsd-6-0-RC2:1.8 tls-maxphys:1.8.0.12 tls-maxphys-base:1.9 matt-nb6-plus:1.8.0.10 matt-nb6-plus-base:1.8 netbsd-6-0-RC1:1.8 jmcneill-usbmp-base10:1.8 yamt-pagecache-base5:1.8 jmcneill-usbmp-base9:1.8 yamt-pagecache-base4:1.8 jmcneill-usbmp-base8:1.8 jmcneill-usbmp-base7:1.8 jmcneill-usbmp-base6:1.8 jmcneill-usbmp-base5:1.8 jmcneill-usbmp-base4:1.8 jmcneill-usbmp-base3:1.8 jmcneill-usbmp-pre-base2:1.8 jmcneill-usbmp-base2:1.8 netbsd-6:1.8.0.8 netbsd-6-base:1.8 jmcneill-usbmp:1.8.0.6 jmcneill-usbmp-base:1.8 jmcneill-audiomp3:1.8.0.4 jmcneill-audiomp3-base:1.8 yamt-pagecache-base3:1.8 yamt-pagecache-base2:1.8 yamt-pagecache:1.8.0.2 yamt-pagecache-base:1.8 rmind-uvmplock-nbase:1.7 cherry-xenmp:1.7.0.8 cherry-xenmp-base:1.7 rmind-uvmplock-base:1.7 rmind-uvmplock:1.7.0.6 bouyer-quota2-nbase:1.7 bouyer-quota2:1.7.0.4 bouyer-quota2-base:1.7 jruoho-x86intr:1.7.0.2 jruoho-x86intr-base:1.7 matt-mips64-premerge-20101231:1.7 uebayasi-xip-base4:1.7 uebayasi-xip-base3:1.6 yamt-nfs-mp-base11:1.6 uebayasi-xip:1.2.0.4 uebayasi-xip-base2:1.2 yamt-nfs-mp:1.2.0.2 yamt-nfs-mp-base10:1.2; locks; strict; comment @ * @; 1.23 date 2022.05.31.08.43.16; author andvar; state Exp; branches; next 1.22; commitid yPbXHl19O9q2zbGD; 1.22 date 2022.02.16.22.00.56; author andvar; state Exp; branches; next 1.21; commitid ZVPSrUG1o7c1kTsD; 1.21 date 2018.10.12.05.41.18; author maxv; state Exp; branches; next 1.20; commitid pfEHfpjgcufndDVA; 1.20 date 2018.09.17.08.11.27; author maxv; state Exp; branches; next 1.19; commitid VD9ZPnXuQlBTQqSA; 1.19 date 2018.09.17.06.01.36; author maxv; state Exp; branches; next 1.18; commitid NWbPMX1oFfKj8qSA; 1.18 date 2018.07.10.15.46.58; author maxv; state Exp; branches; next 1.17; commitid 5XLU9pNUB8cBABJA; 1.17 date 2018.05.15.19.16.38; author maxv; state Exp; branches 1.17.2.1; next 1.16; commitid os1oNWzbEjtjxqCA; 1.16 date 2018.05.03.07.25.49; author maxv; state Exp; branches; next 1.15; commitid nVO5yGjWhwad0PAA; 1.15 date 2018.04.11.07.15.12; author maxv; state Exp; branches; next 1.14; commitid 0WDS1Rayw3IqEZxA; 1.14 date 2018.03.09.11.57.38; author maxv; state Exp; branches; next 1.13; commitid 0fQa1TQUpuPFgMtA; 1.13 date 2018.02.08.10.03.52; author maxv; state Exp; branches 1.13.2.1; next 1.12; commitid aUittp5bdLgAz2qA; 1.12 date 2018.02.06.15.48.02; author maxv; state Exp; branches; next 1.11; commitid 2MU0mBEbjGrPvOpA; 1.11 date 2017.01.11.13.08.29; author ozaki-r; state Exp; branches 1.11.8.1; next 1.10; commitid IQEstuoluehfeyBz; 1.10 date 2016.04.26.08.44.44; author ozaki-r; state Exp; branches 1.10.2.1; next 1.9; commitid 8nnDQKLFZ1ep474z; 1.9 date 2014.02.25.18.30.12; author pooka; state Exp; branches 1.9.4.1 1.9.6.1 1.9.8.1 1.9.12.1; next 1.8; commitid j3yFpIze9zIWKvqx; 1.8 date 2011.06.27.00.45.50; author enami; state Exp; branches 1.8.2.1 1.8.12.1 1.8.16.1; next 1.7; 1.7 date 2010.11.05.00.21.51; author rmind; state Exp; branches 1.7.6.1; next 1.6; 1.6 date 2010.10.07.03.15.49; author yamt; state Exp; branches; next 1.5; 1.5 date 2010.10.06.07.39.37; author enami; state Exp; branches; next 1.4; 1.4 date 2010.10.03.19.44.47; author rmind; state Exp; branches; next 1.3; 1.3 date 2010.08.25.00.05.14; author rmind; state Exp; branches; next 1.2; 1.2 date 2010.07.19.14.09.45; author rmind; state Exp; branches 1.2.2.1 1.2.4.1; next 1.1; 1.1 date 2010.07.13.22.16.10; author rmind; state Exp; branches; next ; 1.17.2.1 date 2019.06.10.22.09.47; author christos; state Exp; branches; next ; commitid jtc8rnCzWiEEHGqB; 1.13.2.1 date 2018.03.15.09.12.06; author pgoyette; state Exp; branches; next 1.13.2.2; commitid lb7w3QtkrVH4axuA; 1.13.2.2 date 2018.04.16.02.00.08; author pgoyette; state Exp; branches; next 1.13.2.3; commitid qk3nktk0szmTIByA; 1.13.2.3 date 2018.05.21.04.36.16; author pgoyette; state Exp; branches; next 1.13.2.4; commitid X5L8kSrBWQcDt7DA; 1.13.2.4 date 2018.07.28.04.38.10; author pgoyette; state Exp; branches; next 1.13.2.5; commitid 1UP1xAIUxv1ZgRLA; 1.13.2.5 date 2018.09.30.01.45.56; author pgoyette; state Exp; branches; next 1.13.2.6; commitid SQ44grEPCeKPh4UA; 1.13.2.6 date 2018.10.20.06.58.46; author pgoyette; state Exp; branches; next ; commitid mTSoqZEZ4arHnFWA; 1.11.8.1 date 2018.03.30.11.10.14; author martin; state Exp; branches; next 1.11.8.2; commitid W9fU5ftVgZw2ltwA; 1.11.8.2 date 2018.04.05.14.33.41; author martin; state Exp; branches; next 1.11.8.3; commitid wXAqxlJ9VP3SggxA; 1.11.8.3 date 2018.04.09.16.40.07; author martin; state Exp; branches; next 1.11.8.4; commitid pDqFOREENW0SPMxA; 1.11.8.4 date 2018.09.27.15.07.34; author martin; state Exp; branches; next 1.11.8.5; commitid xRgeuyxdwDSJPKTA; 1.11.8.5 date 2018.10.03.17.53.56; author martin; state Exp; branches; next 1.11.8.6; commitid NVvpjfySjAyRyxUA; 1.11.8.6 date 2018.10.09.09.44.31; author martin; state Exp; branches; next 1.11.8.7; commitid JgrqvtoAGVywEgVA; 1.11.8.7 date 2018.10.17.13.38.04; author martin; state Exp; branches; next ; commitid TrP3xomOhLebHjWA; 1.10.2.1 date 2017.03.20.06.57.50; author pgoyette; state Exp; branches; next ; commitid jjw7cAwgyKq7RfKz; 1.9.4.1 date 2018.04.05.11.48.13; author martin; state Exp; branches; next ; commitid HnaOWeQySxz6mfxA; 1.9.6.1 date 2016.05.29.08.44.38; author skrll; state Exp; branches; next 1.9.6.2; commitid 8mlnPW6uSaJU1m8z; 1.9.6.2 date 2017.02.05.13.40.59; author skrll; state Exp; branches; next ; commitid 8hwpk1aHl2UuyLEz; 1.9.8.1 date 2018.04.05.11.53.02; author martin; state Exp; branches; next ; commitid gmmkfxrcvEZKnfxA; 1.9.12.1 date 2018.04.05.11.50.17; author martin; state Exp; branches; next ; commitid 2pXwqo4MhNAOmfxA; 1.8.2.1 date 2014.05.22.11.41.09; author yamt; state Exp; branches; next ; commitid VUUXuyNWnt3AKwBx; 1.8.12.1 date 2014.08.20.00.04.35; author tls; state Exp; branches; next 1.8.12.2; commitid jTnpym9Qu0o4R1Nx; 1.8.12.2 date 2017.12.03.11.39.04; author jdolecek; state Exp; branches; next ; commitid XcIYRZTAh1LmerhA; 1.8.16.1 date 2014.05.18.17.46.13; author rmind; state Exp; branches; next ; commitid mL5ZYSzpqK6QS2Bx; 1.7.6.1 date 2010.11.05.00.21.51; author rmind; state dead; branches; next 1.7.6.2; 1.7.6.2 date 2011.03.05.20.55.58; author rmind; state Exp; branches; next ; 1.2.2.1 date 2010.07.19.14.09.45; author yamt; state dead; branches; next 1.2.2.2; 1.2.2.2 date 2010.08.11.22.54.56; author yamt; state Exp; branches; next 1.2.2.3; 1.2.2.3 date 2010.10.09.03.32.38; author yamt; state Exp; branches; next ; 1.2.4.1 date 2010.07.19.14.09.45; author uebayasi; state dead; branches; next 1.2.4.2; 1.2.4.2 date 2010.08.17.06.47.46; author uebayasi; state Exp; branches; next 1.2.4.3; 1.2.4.3 date 2010.10.22.07.22.39; author uebayasi; state Exp; branches; next 1.2.4.4; 1.2.4.4 date 2010.11.06.08.08.50; author uebayasi; state Exp; branches; next ; desc @@ 1.23 log @fix various typos in comments, documentation and messages. @ text @/* $NetBSD: ip_reass.c,v 1.22 2022/02/16 22:00:56 andvar Exp $ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @@(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ /* * IP reassembly. * * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP * reassembly queue buffer management. * * We keep a count of total IP fragments (NB: not fragmented packets), * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments. * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the total * fragments in reassembly queues. This AIMD policy avoids repeatedly * deleting single packets under heavy fragmentation load (e.g., from lossy * NFS peers). */ #include __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.22 2022/02/16 22:00:56 andvar Exp $"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * IP reassembly queue structures. Each fragment being reassembled is * attached to one of these structures. They are timed out after TTL * drops to 0, and may also be reclaimed if memory becomes tight. */ typedef struct ipfr_qent { TAILQ_ENTRY(ipfr_qent) ipqe_q; struct ip * ipqe_ip; struct mbuf * ipqe_m; bool ipqe_mff; uint16_t ipqe_off; uint16_t ipqe_len; } ipfr_qent_t; TAILQ_HEAD(ipfr_qent_head, ipfr_qent); typedef struct ipfr_queue { LIST_ENTRY(ipfr_queue) ipq_q; /* to other reass headers */ struct ipfr_qent_head ipq_fragq; /* queue of fragment entries */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ int ipq_ipsec; /* IPsec flags */ } ipfr_queue_t; /* * Hash table of IP reassembly queues. */ #define IPREASS_HASH_SHIFT 6 #define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT) #define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1) #define IPREASS_HASH(x, y) \ (((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK) static LIST_HEAD(, ipfr_queue) ip_frags[IPREASS_HASH_SIZE]; static pool_cache_t ipfren_cache; static kmutex_t ipfr_lock; /* Number of packets in reassembly queue and total number of fragments. */ static int ip_nfragpackets; static int ip_nfrags; /* Limits on packet and fragments. */ static int ip_maxfragpackets; static int ip_maxfrags; /* * Cached copy of nmbclusters. If nbclusters is different, recalculate * IP parameters derived from nmbclusters. */ static int ip_nmbclusters; /* * IP reassembly TTL machinery for multiplicative drop. */ static u_int fragttl_histo[IPFRAGTTL + 1]; static struct sysctllog *ip_reass_sysctllog; void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); static struct mbuf * ip_reass(ipfr_qent_t *, ipfr_queue_t *, u_int); static u_int ip_reass_ttl_decr(u_int ticks); static void ip_reass_drophalf(void); static void ip_freef(ipfr_queue_t *); /* * ip_reass_init: * * Initialization of IP reassembly mechanism. */ void ip_reass_init(void) { int i; ipfren_cache = pool_cache_init(sizeof(ipfr_qent_t), coherency_unit, 0, 0, "ipfrenpl", NULL, IPL_NET, NULL, NULL, NULL); mutex_init(&ipfr_lock, MUTEX_DEFAULT, IPL_VM); for (i = 0; i < IPREASS_HASH_SIZE; i++) { LIST_INIT(&ip_frags[i]); } ip_maxfragpackets = 200; ip_maxfrags = 0; ip_nmbclusters_changed(); sysctl_ip_reass_setup(); } void sysctl_ip_reass_setup(void) { sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", SYSCTL_DESCR("PF_INET related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ip", SYSCTL_DESCR("IPv4 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfragpackets", SYSCTL_DESCR("Maximum number of fragments to retain for " "possible reassembly"), NULL, 0, &ip_maxfragpackets, 0, CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL); } #define CHECK_NMBCLUSTER_PARAMS() \ do { \ if (__predict_false(ip_nmbclusters != nmbclusters)) \ ip_nmbclusters_changed(); \ } while (/*CONSTCOND*/0) /* * Compute IP limits derived from the value of nmbclusters. */ static void ip_nmbclusters_changed(void) { ip_maxfrags = nmbclusters / 4; ip_nmbclusters = nmbclusters; } /* * ip_reass: * * Take incoming datagram fragment and try to reassemble it into whole * datagram. If a chain for reassembly of this datagram already exists, * then it is given as 'fp'; otherwise have to make a chain. */ static struct mbuf * ip_reass(ipfr_qent_t *ipqe, ipfr_queue_t *fp, const u_int hash) { struct ip *ip = ipqe->ipqe_ip; const int hlen = ip->ip_hl << 2; struct mbuf *m = ipqe->ipqe_m, *t; int ipsecflags = m->m_flags & (M_DECRYPTED|M_AUTHIPHDR); ipfr_qent_t *nq, *p, *q; int i, next; KASSERT(mutex_owned(&ipfr_lock)); /* * Presence of header sizes in mbufs would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; /* * We are about to add a fragment; increment frag count. */ ip_nfrags++; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly: a) if maxfrag is 0, * never accept fragments b) if maxfrag is -1, accept * all fragments without limitation. */ if (ip_maxfragpackets < 0) { /* no limit */ } else if (ip_nfragpackets >= ip_maxfragpackets) { goto dropfrag; } fp = malloc(sizeof(ipfr_queue_t), M_FTABLE, M_NOWAIT); if (fp == NULL) { goto dropfrag; } ip_nfragpackets++; TAILQ_INIT(&fp->ipq_fragq); fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_tos = ip->ip_tos; fp->ipq_ipsec = ipsecflags; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; LIST_INSERT_HEAD(&ip_frags[hash], fp, ipq_q); p = NULL; goto insert; } else { fp->ipq_nfrags++; } /* * Find a segment which begins after this one does. */ TAILQ_FOREACH(q, &fp->ipq_fragq, ipqe_q) { if (q->ipqe_off > ipqe->ipqe_off) break; } if (q != NULL) { p = TAILQ_PREV(q, ipfr_qent_head, ipqe_q); } else { p = TAILQ_LAST(&fp->ipq_fragq, ipfr_qent_head); } /* * Look at the preceding segment. * * If it provides some of our data already, in part or entirely, trim * us or drop us. * * If a preceding segment exists, and was marked as the last segment, * drop us. */ if (p != NULL) { i = p->ipqe_off + p->ipqe_len - ipqe->ipqe_off; if (i > 0) { if (i >= ipqe->ipqe_len) { goto dropfrag; } m_adj(ipqe->ipqe_m, i); ipqe->ipqe_off = ipqe->ipqe_off + i; ipqe->ipqe_len = ipqe->ipqe_len - i; } } if (p != NULL && !p->ipqe_mff) { goto dropfrag; } /* * Look at the segments that follow. * * If we cover them, in part or entirely, trim them or dequeue them. * * If a following segment exists, and we are marked as the last * segment, drop us. */ while (q != NULL) { i = ipqe->ipqe_off + ipqe->ipqe_len - q->ipqe_off; if (i <= 0) { break; } if (i < q->ipqe_len) { q->ipqe_off = q->ipqe_off + i; q->ipqe_len = q->ipqe_len - i; m_adj(q->ipqe_m, i); break; } nq = TAILQ_NEXT(q, ipqe_q); m_freem(q->ipqe_m); TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); pool_cache_put(ipfren_cache, q); fp->ipq_nfrags--; ip_nfrags--; q = nq; } if (q != NULL && !ipqe->ipqe_mff) { goto dropfrag; } insert: /* * Stick new segment in its place; check for complete reassembly. */ if (p == NULL) { TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); } else { TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q); } next = 0; TAILQ_FOREACH(q, &fp->ipq_fragq, ipqe_q) { if (q->ipqe_off != next) { mutex_exit(&ipfr_lock); return NULL; } next += q->ipqe_len; } p = TAILQ_LAST(&fp->ipq_fragq, ipfr_qent_head); if (p->ipqe_mff) { mutex_exit(&ipfr_lock); return NULL; } /* * Reassembly is complete. Check for a bogus message size. */ q = TAILQ_FIRST(&fp->ipq_fragq); ip = q->ipqe_ip; if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { IP_STATINC(IP_STAT_TOOLONG); ip_freef(fp); mutex_exit(&ipfr_lock); return NULL; } LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; mutex_exit(&ipfr_lock); /* Concatenate all fragments. */ m = q->ipqe_m; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = TAILQ_NEXT(q, ipqe_q); pool_cache_put(ipfren_cache, q); for (q = nq; q != NULL; q = nq) { t = q->ipqe_m; nq = TAILQ_NEXT(q, ipqe_q); pool_cache_put(ipfren_cache, q); m_remove_pkthdr(t); m_cat(m, t); } /* * Create header for new packet by modifying header of first * packet. Dequeue and discard fragment reassembly header. Make * header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_off = htons(0); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; free(fp, M_FTABLE); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* Fix up mbuf. XXX This should be done elsewhere. */ { KASSERT(m->m_flags & M_PKTHDR); int plen = 0; for (t = m; t; t = t->m_next) { plen += t->m_len; } m->m_pkthdr.len = plen; m->m_pkthdr.csum_flags = 0; } return m; dropfrag: if (fp != NULL) { fp->ipq_nfrags--; } ip_nfrags--; IP_STATINC(IP_STAT_FRAGDROPPED); mutex_exit(&ipfr_lock); pool_cache_put(ipfren_cache, ipqe); m_freem(m); return NULL; } /* * ip_freef: * * Free a fragment reassembly header and all associated datagrams. */ static void ip_freef(ipfr_queue_t *fp) { ipfr_qent_t *q; KASSERT(mutex_owned(&ipfr_lock)); LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; while ((q = TAILQ_FIRST(&fp->ipq_fragq)) != NULL) { TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); m_freem(q->ipqe_m); pool_cache_put(ipfren_cache, q); } free(fp, M_FTABLE); } /* * ip_reass_ttl_decr: * * Decrement TTL of all reasembly queue entries by `ticks'. Count * number of distinct fragments (as opposed to partial, fragmented * datagrams) in the reassembly queue. While we traverse the entire * reassembly queue, compute and return the median TTL over all * fragments. */ static u_int ip_reass_ttl_decr(u_int ticks) { u_int nfrags, median, dropfraction, keepfraction; ipfr_queue_t *fp, *nfp; int i; nfrags = 0; memset(fragttl_histo, 0, sizeof(fragttl_histo)); for (i = 0; i < IPREASS_HASH_SIZE; i++) { for (fp = LIST_FIRST(&ip_frags[i]); fp != NULL; fp = nfp) { fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ? 0 : fp->ipq_ttl - ticks); nfp = LIST_NEXT(fp, ipq_q); if (fp->ipq_ttl == 0) { IP_STATINC(IP_STAT_FRAGTIMEOUT); ip_freef(fp); } else { nfrags += fp->ipq_nfrags; fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags; } } } KASSERT(ip_nfrags == nfrags); /* Find median (or other drop fraction) in histogram. */ dropfraction = (ip_nfrags / 2); keepfraction = ip_nfrags - dropfraction; for (i = IPFRAGTTL, median = 0; i >= 0; i--) { median += fragttl_histo[i]; if (median >= keepfraction) break; } /* Return TTL of median (or other fraction). */ return (u_int)i; } static void ip_reass_drophalf(void) { u_int median_ticks; KASSERT(mutex_owned(&ipfr_lock)); /* * Compute median TTL of all fragments, and count frags * with that TTL or lower (roughly half of all fragments). */ median_ticks = ip_reass_ttl_decr(0); /* Drop half. */ median_ticks = ip_reass_ttl_decr(median_ticks); } /* * ip_reass_drain: drain off all datagram fragments. Do not acquire * softnet_lock as can be called from hardware interrupt context. */ void ip_reass_drain(void) { /* * We may be called from a device's interrupt context. If * the ipq is already busy, just bail out now. */ if (mutex_tryenter(&ipfr_lock)) { /* * Drop half the total fragments now. If more mbufs are * needed, we will be called again soon. */ ip_reass_drophalf(); mutex_exit(&ipfr_lock); } } /* * ip_reass_slowtimo: * * If a timer expires on a reassembly queue, discard it. */ void ip_reass_slowtimo(void) { static u_int dropscanidx = 0; u_int i, median_ttl; mutex_enter(&ipfr_lock); /* Age TTL of all fragments by 1 tick .*/ median_ttl = ip_reass_ttl_decr(1); /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags > ip_maxfrags) { ip_reass_ttl_decr(median_ttl); } /* * If we are over the maximum number of fragmented packets (due to * the limit being lowered), drain off enough to get down to the * new limit. Start draining from the reassembly hashqueue most * recently drained. */ if (ip_maxfragpackets < 0) ; else { int wrapped = 0; i = dropscanidx; while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) { while (LIST_FIRST(&ip_frags[i]) != NULL) { ip_freef(LIST_FIRST(&ip_frags[i])); } if (++i >= IPREASS_HASH_SIZE) { i = 0; } /* * Do not scan forever even if fragment counters are * wrong: stop after scanning entire reassembly queue. */ if (i == dropscanidx) { wrapped = 1; } } dropscanidx = i; } mutex_exit(&ipfr_lock); } /* * ip_reass_packet: generic routine to perform IP reassembly. * * => Passed fragment should have IP_MF flag and/or offset set. * => Fragment should not have other than IP_MF flags set. * * => Returns 0 on success or error otherwise. * => On complete, m0 represents a constructed final packet. */ int ip_reass_packet(struct mbuf **m0) { struct mbuf *m = *m0; struct ip *ip = mtod(m, struct ip *); const int hlen = ip->ip_hl << 2; const int len = ntohs(ip->ip_len); int ipsecflags = m->m_flags & (M_DECRYPTED|M_AUTHIPHDR); ipfr_queue_t *fp; ipfr_qent_t *ipqe; u_int hash, off, flen; bool mff; /* * Prevent TCP blind data attacks by not allowing non-initial * fragments to start at less than 68 bytes (minimal fragment * size) and making sure the first fragment is at least 68 * bytes. */ off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; if ((off > 0 ? off + hlen : len) < IP_MINFRAGSIZE - 1) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } if (off + len > IP_MAXPACKET) { IP_STATINC(IP_STAT_TOOLONG); return EINVAL; } /* * Fragment length and MF flag. Make sure that fragments have * a data length which is non-zero and multiple of 8 bytes. */ flen = ntohs(ip->ip_len) - hlen; mff = (ip->ip_off & htons(IP_MF)) != 0; if (mff && (flen == 0 || (flen & 0x7) != 0)) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } /* Look for queue of fragments of this datagram. */ mutex_enter(&ipfr_lock); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ip_frags[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } if (fp) { /* All fragments must have the same IPsec flags. */ if (fp->ipq_ipsec != ipsecflags) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } /* Make sure that TOS matches previous fragments. */ if (fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } } /* * Create new entry and attempt to reassembly. */ IP_STATINC(IP_STAT_FRAGMENTS); ipqe = pool_cache_get(ipfren_cache, PR_NOWAIT); if (ipqe == NULL) { IP_STATINC(IP_STAT_RCVMEMDROP); mutex_exit(&ipfr_lock); return ENOMEM; } ipqe->ipqe_mff = mff; ipqe->ipqe_m = m; ipqe->ipqe_ip = ip; ipqe->ipqe_off = off; ipqe->ipqe_len = flen; *m0 = ip_reass(ipqe, fp, hash); if (*m0) { /* Note that finally reassembled. */ IP_STATINC(IP_STAT_REASSEMBLED); } return 0; } @ 1.22 log @fix various typos, mainly in comments. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.21 2018/10/12 05:41:18 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.21 2018/10/12 05:41:18 maxv Exp $"); d467 1 a467 1 * datagrams) inthe reassembly queue. While we traverse the entire @ 1.21 log @Force ip_off to zero when the reassembly is complete. This was lost in my rev1.19 - before that the IP struct was clobbered for the reassembly, but it actually implicitly guaranteed that the first fragment of the packet would end up with ip_off = 0, and this was a desired behavior. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.20 2018/09/17 08:11:27 maxv Exp $ */ d38 1 a38 1 * reassembly queue buffer managment. d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.20 2018/09/17 08:11:27 maxv Exp $"); @ 1.20 log @Kick fragments that would introduce several !MFFs in a reassembly chain. The problem arises if we receive three fragments of the kind 3. A -> has MFF 1. B -> doesn't have MFF 2. C -> doesn't have MFF Because of the received order B->C->A, we don't see that B is !MFF, and therefore that there is a problem in this chain. Now we do two checks, and drop us if: * there is a fragment preceding us, and this fragment is !MFF, or * there is a fragment following us, and we are !MFF Spotted a long time ago. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.19 2018/09/17 06:01:36 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.19 2018/09/17 06:01:36 maxv Exp $"); d405 1 @ 1.19 log @Hold ip_off and ip_len in the fragment entry, instead of always reading the associated mbuf (and converting to host order). This reduces the cache/TLB misses when processing long lists. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.18 2018/07/10 15:46:58 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.18 2018/07/10 15:46:58 maxv Exp $"); d290 7 a296 3 * If there is a preceding segment, it may provide some of our * data already. If so, drop the data from the incoming segment. * If it provides all of our data, drop us. d309 3 d314 6 a319 2 * While we overlap succeeding segments trim them or, if they are * completely covered, dequeue them. d340 3 @ 1.18 log @Remove the second argument from ip_reass_packet(). We want the IP header on the mbuf, not elsewhere. Simplifies the NPF reassembly code a little. No real functional change. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.17 2018/05/15 19:16:38 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.17 2018/05/15 19:16:38 maxv Exp $"); d83 2 d220 1 a220 1 struct ip *ip = ipqe->ipqe_ip, *qip; a234 10 #ifdef notyet /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags >= ip_maxfrags) { ip_reass_drophalf(void); } #endif d250 3 a252 3 if (ip_maxfragpackets < 0) ; else if (ip_nfragpackets >= ip_maxfragpackets) { d280 1 a280 1 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ip->ip_off)) d295 1 a295 2 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ip->ip_off); d297 1 a297 1 if (i >= ntohs(ip->ip_len)) { d301 2 a302 2 ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); d311 2 a312 5 size_t end; qip = q->ipqe_ip; end = ntohs(ip->ip_off) + ntohs(ip->ip_len); if (end <= ntohs(qip->ip_off)) { d315 3 a317 4 i = end - ntohs(qip->ip_off); if (i < ntohs(qip->ip_len)) { qip->ip_len = htons(ntohs(qip->ip_len) - i); qip->ip_off = htons(ntohs(qip->ip_off) + i); d341 1 a341 2 qip = q->ipqe_ip; if (ntohs(qip->ip_off) != next) { d345 1 a345 1 next += ntohs(qip->ip_len); a640 7 /* * Adjust total IP length to not reflect header and convert * offset of this to bytes. XXX: clobbers struct ip. */ ip->ip_len = htons(flen); ip->ip_off = htons(off); d685 2 @ 1.17 log @When reassembling IPv4/IPv6 packets, ensure each fragment has been subject to the same IPsec processing. That is to say, that all fragments are ESP, or AH, or AH+ESP, or none. The reassembly mechanism can be used both on the wire and inside an IPsec tunnel, so we need to make sure all fragments of a packet were received on only one side. Even though I haven't tried, I believe there are configurations where it would be possible for an attacker to inject an unencrypted fragment into a legitimate stream of already-decrypted-and-authenticated fragments. Typically on IPsec gateways with ESP tunnels, where we can encapsulate fragments (as opposed to the general case, where we fragment encapsulated data). Note, for the record: a funnier thing, under IPv4, would be to send a zero-sized !MFF fragment at the head of the packet, and manage to trigger an ICMP error; M_DECRYPTED gets lost by the reassembly, and ICMP will reply with the packet in clear (not encrypted). @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.16 2018/05/03 07:25:49 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.16 2018/05/03 07:25:49 maxv Exp $"); d615 1 a615 1 ip_reass_packet(struct mbuf **m0, struct ip *ip) d617 2 a620 1 struct mbuf *m = *m0; @ 1.17.2.1 log @Sync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.21 2018/10/12 05:41:18 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.21 2018/10/12 05:41:18 maxv Exp $"); a82 2 uint16_t ipqe_off; uint16_t ipqe_len; d218 1 a218 1 struct ip *ip = ipqe->ipqe_ip; d233 10 d258 3 a260 3 if (ip_maxfragpackets < 0) { /* no limit */ } else if (ip_nfragpackets >= ip_maxfragpackets) { d288 1 a288 1 if (q->ipqe_off > ipqe->ipqe_off) d298 3 a300 7 * Look at the preceding segment. * * If it provides some of our data already, in part or entirely, trim * us or drop us. * * If a preceding segment exists, and was marked as the last segment, * drop us. d303 2 a304 1 i = p->ipqe_off + p->ipqe_len - ipqe->ipqe_off; d306 1 a306 1 if (i >= ipqe->ipqe_len) { d310 2 a311 2 ipqe->ipqe_off = ipqe->ipqe_off + i; ipqe->ipqe_len = ipqe->ipqe_len - i; a313 3 if (p != NULL && !p->ipqe_mff) { goto dropfrag; } d316 2 a317 6 * Look at the segments that follow. * * If we cover them, in part or entirely, trim them or dequeue them. * * If a following segment exists, and we are marked as the last * segment, drop us. d320 5 a324 2 i = ipqe->ipqe_off + ipqe->ipqe_len - q->ipqe_off; if (i <= 0) { d327 4 a330 3 if (i < q->ipqe_len) { q->ipqe_off = q->ipqe_off + i; q->ipqe_len = q->ipqe_len - i; a341 3 if (q != NULL && !ipqe->ipqe_mff) { goto dropfrag; } d354 2 a355 1 if (q->ipqe_off != next) { d359 1 a359 1 next += q->ipqe_len; a404 1 ip->ip_off = htons(0); d615 1 a615 1 ip_reass_packet(struct mbuf **m0) a616 2 struct mbuf *m = *m0; struct ip *ip = mtod(m, struct ip *); d619 1 d654 7 a704 2 ipqe->ipqe_off = off; ipqe->ipqe_len = flen; @ 1.16 log @Rename m_pkthdr_remove -> m_remove_pkthdr, to match the existing naming convention, eg m_copy_pkthdr and m_move_pkthdr. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.15 2018/04/11 07:15:12 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.15 2018/04/11 07:15:12 maxv Exp $"); d96 2 a97 1 uint8_t ipq_tos; /* TOS of this fragment */ d221 1 d274 1 d620 1 d676 14 a689 5 /* Make sure that TOS matches previous fragments. */ if (fp && fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; @ 1.15 log @Add 'static', like the prototype. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.14 2018/03/09 11:57:38 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.14 2018/03/09 11:57:38 maxv Exp $"); d392 1 a392 1 m_pkthdr_remove(t); @ 1.14 log @Remove M_PKTHDR from secondary mbufs when reassembling packets. This is a real problem, because I found at least one component that relies on the fact that only the first mbuf has M_PKTHDR: far from here, in m_splithdr, we don't update m->m_pkthdr.len if M_PKTHDR is found in a secondary mbuf. (The initial intention there was to avoid updating m_pkthdr.len twice, the assumption was that if M_PKTHDR is set then we're dealing with the first mbuf.) Therefore, when handling fragmented IPsec packets (in particular IPv6, IPv4 is a bit more complicated), we may end up with an incorrect m_pkthdr.len after authentication or decryption. In the case of ESP, this can lead to a remote crash on this instruction: m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); m_pkthdr.len is bigger than the actual mbuf chain. It seems possible to me to trigger this bug even if you don't have the ESP key, because the fragmentation part is outside of the encrypted ESP payload. So if you MITM the target, and intercept an incoming ESP packet (which you can't decrypt), you should be able to forge a new specially-crafted, fragmented packet and stuff the ESP payload (still encrypted, as you intercepted it) into it. The decryption succeeds and the target crashes. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.13 2018/02/08 10:03:52 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.13 2018/02/08 10:03:52 maxv Exp $"); d214 1 a214 1 struct mbuf * @ 1.13 log @Change the error stat from IP_STAT_BADFRAGS to IP_STAT_TOOLONG. The ping_of_death ATF test expects this counter to get increased. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.12 2018/02/06 15:48:02 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.12 2018/02/06 15:48:02 maxv Exp $"); d392 1 d410 2 a411 1 if (m->m_flags & M_PKTHDR) { @ 1.13.2.1 log @Synch with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.14 2018/03/09 11:57:38 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.14 2018/03/09 11:57:38 maxv Exp $"); a391 1 m_pkthdr_remove(t); d409 1 a409 2 { KASSERT(m->m_flags & M_PKTHDR); @ 1.13.2.2 log @Sync with HEAD, resolve some conflicts @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.15 2018/04/11 07:15:12 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.15 2018/04/11 07:15:12 maxv Exp $"); d214 1 a214 1 static struct mbuf * @ 1.13.2.3 log @Sync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.17 2018/05/15 19:16:38 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.17 2018/05/15 19:16:38 maxv Exp $"); d96 1 a96 2 uint8_t ipq_tos; /* TOS of this fragment */ int ipq_ipsec; /* IPsec flags */ a219 1 int ipsecflags = m->m_flags & (M_DECRYPTED|M_AUTHIPHDR); a271 1 fp->ipq_ipsec = ipsecflags; d392 1 a392 1 m_remove_pkthdr(t); a616 1 int ipsecflags = m->m_flags & (M_DECRYPTED|M_AUTHIPHDR); d672 5 a676 14 if (fp) { /* All fragments must have the same IPsec flags. */ if (fp->ipq_ipsec != ipsecflags) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } /* Make sure that TOS matches previous fragments. */ if (fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } @ 1.13.2.4 log @Sync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.18 2018/07/10 15:46:58 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.18 2018/07/10 15:46:58 maxv Exp $"); d615 1 a615 1 ip_reass_packet(struct mbuf **m0) a616 2 struct mbuf *m = *m0; struct ip *ip = mtod(m, struct ip *); d619 1 @ 1.13.2.5 log @Ssync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.20 2018/09/17 08:11:27 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.20 2018/09/17 08:11:27 maxv Exp $"); a82 2 uint16_t ipqe_off; uint16_t ipqe_len; d218 1 a218 1 struct ip *ip = ipqe->ipqe_ip; d233 10 d258 3 a260 3 if (ip_maxfragpackets < 0) { /* no limit */ } else if (ip_nfragpackets >= ip_maxfragpackets) { d288 1 a288 1 if (q->ipqe_off > ipqe->ipqe_off) d298 3 a300 7 * Look at the preceding segment. * * If it provides some of our data already, in part or entirely, trim * us or drop us. * * If a preceding segment exists, and was marked as the last segment, * drop us. d303 2 a304 1 i = p->ipqe_off + p->ipqe_len - ipqe->ipqe_off; d306 1 a306 1 if (i >= ipqe->ipqe_len) { d310 2 a311 2 ipqe->ipqe_off = ipqe->ipqe_off + i; ipqe->ipqe_len = ipqe->ipqe_len - i; a313 3 if (p != NULL && !p->ipqe_mff) { goto dropfrag; } d316 2 a317 6 * Look at the segments that follow. * * If we cover them, in part or entirely, trim them or dequeue them. * * If a following segment exists, and we are marked as the last * segment, drop us. d320 5 a324 2 i = ipqe->ipqe_off + ipqe->ipqe_len - q->ipqe_off; if (i <= 0) { d327 4 a330 3 if (i < q->ipqe_len) { q->ipqe_off = q->ipqe_off + i; q->ipqe_len = q->ipqe_len - i; a341 3 if (q != NULL && !ipqe->ipqe_mff) { goto dropfrag; } d354 2 a355 1 if (q->ipqe_off != next) { d359 1 a359 1 next += q->ipqe_len; d655 7 a705 2 ipqe->ipqe_off = off; ipqe->ipqe_len = flen; @ 1.13.2.6 log @Sync with head @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.21 2018/10/12 05:41:18 maxv Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.21 2018/10/12 05:41:18 maxv Exp $"); a404 1 ip->ip_off = htons(0); @ 1.12 log @Add one more check in ip_reass_packet(): make sure that the end of each fragment does not exceed IP_MAXPACKET. In ip_reass(), we only check the final length of the reassembled packet against IP_MAXPACKET. But there is an integer overflow that can happen a little earlier. We are doing: i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ip->ip_off); [...] ip->ip_off = htons(ntohs(ip->ip_off) + i); It is possible that ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) > 65535 so the computation of ip_off wraps to zero. This breaks an assumption in the reassembler - it expects the list of fragments to be ordered by offset, and here it's not ordered anymore. (Un)Fortunately I couldn't turn this into anything exploitable. With the new check, it is guaranteed that ip_off+ip_len<=65535. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11 2017/01/11 13:08:29 ozaki-r Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11 2017/01/11 13:08:29 ozaki-r Exp $"); d633 1 a633 1 IP_STATINC(IP_STAT_BADFRAGS); @ 1.11 log @Get rid of unnecessary header inclusions @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.10 2016/04/26 08:44:44 ozaki-r Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.10 2016/04/26 08:44:44 ozaki-r Exp $"); d632 5 @ 1.11.8.1 log @Pull up following revision(s) (requested by maxv in ticket #668): sys/netinet/ip_reass.c: revision 1.12 Add one more check in ip_reass_packet(): make sure that the end of each fragment does not exceed IP_MAXPACKET. In ip_reass(), we only check the final length of the reassembled packet against IP_MAXPACKET. But there is an integer overflow that can happen a little earlier. We are doing: i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ip->ip_off); [...] ip->ip_off = htons(ntohs(ip->ip_off) + i); It is possible that ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) > 65535 so the computation of ip_off wraps to zero. This breaks an assumption in the reassembler - it expects the list of fragments to be ordered by offset, and here it's not ordered anymore. (Un)Fortunately I couldn't turn this into anything exploitable. With the new check, it is guaranteed that ip_off+ip_len<=65535. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11 2017/01/11 13:08:29 ozaki-r Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11 2017/01/11 13:08:29 ozaki-r Exp $"); a631 5 if (off + len > IP_MAXPACKET) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } @ 1.11.8.2 log @Pull up following revision(s) (requested by maxv in ticket #695): sys/kern/uipc_mbuf.c: revision 1.182 sys/netinet6/frag6.c: revision 1.67 sys/netinet/ip_reass.c: revision 1.14 sys/sys/mbuf.h: revision 1.179 Remove M_PKTHDR from secondary mbufs when reassembling packets. This is a real problem, because I found at least one component that relies on the fact that only the first mbuf has M_PKTHDR: far from here, in m_splithdr, we don't update m->m_pkthdr.len if M_PKTHDR is found in a secondary mbuf. (The initial intention there was to avoid updating m_pkthdr.len twice, the assumption was that if M_PKTHDR is set then we're dealing with the first mbuf.) Therefore, when handling fragmented IPsec packets (in particular IPv6, IPv4 is a bit more complicated), we may end up with an incorrect m_pkthdr.len after authentication or decryption. In the case of ESP, this can lead to a remote crash on this instruction: m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); m_pkthdr.len is bigger than the actual mbuf chain. It seems possible to me to trigger this bug even if you don't have the ESP key, because the fragmentation part is outside of the encrypted ESP payload. So if you MITM the target, and intercept an incoming ESP packet (which you can't decrypt), you should be able to forge a new specially-crafted, fragmented packet and stuff the ESP payload (still encrypted, as you intercepted it) into it. The decryption succeeds and the target crashes. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11.8.1 2018/03/30 11:10:14 martin Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11.8.1 2018/03/30 11:10:14 martin Exp $"); a391 1 m_pkthdr_remove(t); d409 1 a409 2 { KASSERT(m->m_flags & M_PKTHDR); @ 1.11.8.3 log @Additionally pull up the following revision for ticket #668, requested by ozaki-r: sys/netinet/ip_reass.c 1.13 Change the error stat from IP_STAT_BADFRAGS to IP_STAT_TOOLONG. The ping_of_death ATF test expects this counter to get increased. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11.8.2 2018/04/05 14:33:41 martin Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11.8.2 2018/04/05 14:33:41 martin Exp $"); d635 1 a635 1 IP_STATINC(IP_STAT_TOOLONG); @ 1.11.8.4 log @Pull up following revision(s) (requested by maxv in ticket #1041): sys/netinet/ip_reass.c: revision 1.17 (patch) sys/netinet6/frag6.c: revision 1.74 (patch) When reassembling IPv4/IPv6 packets, ensure each fragment has been subject to the same IPsec processing. That is to say, that all fragments are ESP, or AH, or AH+ESP, or none. The reassembly mechanism can be used both on the wire and inside an IPsec tunnel, so we need to make sure all fragments of a packet were received on only one side. Even though I haven't tried, I believe there are configurations where it would be possible for an attacker to inject an unencrypted fragment into a legitimate stream of already-decrypted-and-authenticated fragments. Typically on IPsec gateways with ESP tunnels, where we can encapsulate fragments (as opposed to the general case, where we fragment encapsulated data). Note, for the record: a funnier thing, under IPv4, would be to send a zero-sized !MFF fragment at the head of the packet, and manage to trigger an ICMP error; M_DECRYPTED gets lost by the reassembly, and ICMP will reply with the packet in clear (not encrypted). @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11.8.3 2018/04/09 16:40:07 martin Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11.8.3 2018/04/09 16:40:07 martin Exp $"); d96 1 a96 2 uint8_t ipq_tos; /* TOS of this fragment */ int ipq_ipsec; /* IPsec flags */ a219 1 int ipsecflags = m->m_flags & (M_DECRYPTED|M_AUTHIPHDR); a271 1 fp->ipq_ipsec = ipsecflags; a616 1 int ipsecflags = m->m_flags & (M_DECRYPTED|M_AUTHIPHDR); d672 5 a676 14 if (fp) { /* All fragments must have the same IPsec flags. */ if (fp->ipq_ipsec != ipsecflags) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } /* Make sure that TOS matches previous fragments. */ if (fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } @ 1.11.8.5 log @Pull up following revision(s) (requested by maxv in ticket #1045): sys/netinet/ip_reass.c: revision 1.19 Hold ip_off and ip_len in the fragment entry, instead of always reading the associated mbuf (and converting to host order). This reduces the cache/TLB misses when processing long lists. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11.8.4 2018/09/27 15:07:34 martin Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11.8.4 2018/09/27 15:07:34 martin Exp $"); a82 2 uint16_t ipqe_off; uint16_t ipqe_len; d218 1 a218 1 struct ip *ip = ipqe->ipqe_ip; d233 10 d258 3 a260 3 if (ip_maxfragpackets < 0) { /* no limit */ } else if (ip_nfragpackets >= ip_maxfragpackets) { d288 1 a288 1 if (q->ipqe_off > ipqe->ipqe_off) d303 2 a304 1 i = p->ipqe_off + p->ipqe_len - ipqe->ipqe_off; d306 1 a306 1 if (i >= ipqe->ipqe_len) { d310 2 a311 2 ipqe->ipqe_off = ipqe->ipqe_off + i; ipqe->ipqe_len = ipqe->ipqe_len - i; d320 5 a324 2 i = ipqe->ipqe_off + ipqe->ipqe_len - q->ipqe_off; if (i <= 0) { d327 4 a330 3 if (i < q->ipqe_len) { q->ipqe_off = q->ipqe_off + i; q->ipqe_len = q->ipqe_len - i; d354 2 a355 1 if (q->ipqe_off != next) { d359 1 a359 1 next += q->ipqe_len; d654 7 a704 2 ipqe->ipqe_off = off; ipqe->ipqe_len = flen; @ 1.11.8.6 log @Back out the following from ticket #1045 by maxv: sys/netinet/ip_reass.c 1.19 Faster IPv4 packet reassembly - causes fallout, needs further investigation (see PR kern/53664) @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11.8.5 2018/10/03 17:53:56 martin Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11.8.5 2018/10/03 17:53:56 martin Exp $"); d83 2 d220 1 a220 1 struct ip *ip = ipqe->ipqe_ip, *qip; a234 10 #ifdef notyet /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags >= ip_maxfrags) { ip_reass_drophalf(void); } #endif d250 3 a252 3 if (ip_maxfragpackets < 0) ; else if (ip_nfragpackets >= ip_maxfragpackets) { d280 1 a280 1 if (ntohs(q->ipqe_ip->ip_off) > ntohs(ip->ip_off)) d295 1 a295 2 i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ip->ip_off); d297 1 a297 1 if (i >= ntohs(ip->ip_len)) { d301 2 a302 2 ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); d311 2 a312 5 size_t end; qip = q->ipqe_ip; end = ntohs(ip->ip_off) + ntohs(ip->ip_len); if (end <= ntohs(qip->ip_off)) { d315 3 a317 4 i = end - ntohs(qip->ip_off); if (i < ntohs(qip->ip_len)) { qip->ip_len = htons(ntohs(qip->ip_len) - i); qip->ip_off = htons(ntohs(qip->ip_off) + i); d341 1 a341 2 qip = q->ipqe_ip; if (ntohs(qip->ip_off) != next) { d345 1 a345 1 next += ntohs(qip->ip_len); a639 7 /* * Adjust total IP length to not reflect header and convert * offset of this to bytes. XXX: clobbers struct ip. */ ip->ip_len = htons(flen); ip->ip_off = htons(off); d684 2 @ 1.11.8.7 log @Pull up following revision(s) (requested by maxv in ticket #1045): sys/netinet/ip_reass.c: revision 1.19-1.21 Hold ip_off and ip_len in the fragment entry, instead of always reading the associated mbuf (and converting to host order). This reduces the cache/TLB misses when processing long lists. - Kick fragments that would introduce several !MFFs in a reassembly chain. The problem arises if we receive three fragments of the kind 3. A -> has MFF 1. B -> doesn't have MFF 2. C -> doesn't have MFF Because of the received order B->C->A, we don't see that B is !MFF, and therefore that there is a problem in this chain. Now we do two checks, and drop us if: * there is a fragment preceding us, and this fragment is !MFF, or * there is a fragment following us, and we are !MFF Spotted a long time ago. - Force ip_off to zero when the reassembly is complete. This was lost in my rev1.19 - before that the IP struct was clobbered for the reassembly, but it actually implicitly guaranteed that the first fragment of the packet would end up with ip_off = 0, and this was a desired behavior. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11.8.6 2018/10/09 09:44:31 martin Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11.8.6 2018/10/09 09:44:31 martin Exp $"); a82 2 uint16_t ipqe_off; uint16_t ipqe_len; d218 1 a218 1 struct ip *ip = ipqe->ipqe_ip; d233 10 d258 3 a260 3 if (ip_maxfragpackets < 0) { /* no limit */ } else if (ip_nfragpackets >= ip_maxfragpackets) { d288 1 a288 1 if (q->ipqe_off > ipqe->ipqe_off) d298 3 a300 7 * Look at the preceding segment. * * If it provides some of our data already, in part or entirely, trim * us or drop us. * * If a preceding segment exists, and was marked as the last segment, * drop us. d303 2 a304 1 i = p->ipqe_off + p->ipqe_len - ipqe->ipqe_off; d306 1 a306 1 if (i >= ipqe->ipqe_len) { d310 2 a311 2 ipqe->ipqe_off = ipqe->ipqe_off + i; ipqe->ipqe_len = ipqe->ipqe_len - i; a313 3 if (p != NULL && !p->ipqe_mff) { goto dropfrag; } d316 2 a317 6 * Look at the segments that follow. * * If we cover them, in part or entirely, trim them or dequeue them. * * If a following segment exists, and we are marked as the last * segment, drop us. d320 5 a324 2 i = ipqe->ipqe_off + ipqe->ipqe_len - q->ipqe_off; if (i <= 0) { d327 4 a330 3 if (i < q->ipqe_len) { q->ipqe_off = q->ipqe_off + i; q->ipqe_len = q->ipqe_len - i; a341 3 if (q != NULL && !ipqe->ipqe_mff) { goto dropfrag; } d354 2 a355 1 if (q->ipqe_off != next) { d359 1 a359 1 next += q->ipqe_len; a404 1 ip->ip_off = htons(0); d654 7 a704 2 ipqe->ipqe_off = off; ipqe->ipqe_len = flen; @ 1.10 log @Sweep unnecessary route.h inclusions @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $"); a56 2 #include #include a68 1 #include @ 1.10.2.1 log @Sync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.11 2017/01/11 13:08:29 ozaki-r Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.11 2017/01/11 13:08:29 ozaki-r Exp $"); d57 2 d71 1 @ 1.9 log @Ensure that the top level sysctl nodes (kern, vfs, net, ...) exist before the sysctl link sets are processed, and remove redundancy. Shaves >13kB off of an amd64 GENERIC, not to mention >1k duplicate lines of code. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.8 2011/06/27 00:45:50 enami Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.8 2011/06/27 00:45:50 enami Exp $"); a64 1 #include @ 1.9.8.1 log @Pull up following revision(s) (requested by maxv in ticket #1594): sys/kern/uipc_mbuf.c: revision 1.182 sys/netinet6/frag6.c: revision 1.67 sys/netinet/ip_reass.c: revision 1.14 sys/sys/mbuf.h: revision 1.179 Remove M_PKTHDR from secondary mbufs when reassembling packets. This is a real problem, because I found at least one component that relies on the fact that only the first mbuf has M_PKTHDR: far from here, in m_splithdr, we don't update m->m_pkthdr.len if M_PKTHDR is found in a secondary mbuf. (The initial intention there was to avoid updating m_pkthdr.len twice, the assumption was that if M_PKTHDR is set then we're dealing with the first mbuf.) Therefore, when handling fragmented IPsec packets (in particular IPv6, IPv4 is a bit more complicated), we may end up with an incorrect m_pkthdr.len after authentication or decryption. In the case of ESP, this can lead to a remote crash on this instruction: m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); m_pkthdr.len is bigger than the actual mbuf chain. It seems possible to me to trigger this bug even if you don't have the ESP key, because the fragmentation part is outside of the encrypted ESP payload. So if you MITM the target, and intercept an incoming ESP packet (which you can't decrypt), you should be able to forge a new specially-crafted, fragmented packet and stuff the ESP payload (still encrypted, as you intercepted it) into it. The decryption succeeds and the target crashes. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $"); a395 1 m_pkthdr_remove(t); d413 1 a413 2 { KASSERT(m->m_flags & M_PKTHDR); @ 1.9.12.1 log @Pull up following revision(s) (requested by maxv in ticket #1594): sys/kern/uipc_mbuf.c: revision 1.182 sys/netinet6/frag6.c: revision 1.67 sys/netinet/ip_reass.c: revision 1.14 sys/sys/mbuf.h: revision 1.179 Remove M_PKTHDR from secondary mbufs when reassembling packets. This is a real problem, because I found at least one component that relies on the fact that only the first mbuf has M_PKTHDR: far from here, in m_splithdr, we don't update m->m_pkthdr.len if M_PKTHDR is found in a secondary mbuf. (The initial intention there was to avoid updating m_pkthdr.len twice, the assumption was that if M_PKTHDR is set then we're dealing with the first mbuf.) Therefore, when handling fragmented IPsec packets (in particular IPv6, IPv4 is a bit more complicated), we may end up with an incorrect m_pkthdr.len after authentication or decryption. In the case of ESP, this can lead to a remote crash on this instruction: m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); m_pkthdr.len is bigger than the actual mbuf chain. It seems possible to me to trigger this bug even if you don't have the ESP key, because the fragmentation part is outside of the encrypted ESP payload. So if you MITM the target, and intercept an incoming ESP packet (which you can't decrypt), you should be able to forge a new specially-crafted, fragmented packet and stuff the ESP payload (still encrypted, as you intercepted it) into it. The decryption succeeds and the target crashes. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $"); a395 1 m_pkthdr_remove(t); d413 1 a413 2 { KASSERT(m->m_flags & M_PKTHDR); @ 1.9.4.1 log @Pull up following revision(s) (requested by maxv in ticket #1594): sys/kern/uipc_mbuf.c: revision 1.182 sys/netinet6/frag6.c: revision 1.67 sys/netinet/ip_reass.c: revision 1.14 sys/sys/mbuf.h: revision 1.179 Remove M_PKTHDR from secondary mbufs when reassembling packets. This is a real problem, because I found at least one component that relies on the fact that only the first mbuf has M_PKTHDR: far from here, in m_splithdr, we don't update m->m_pkthdr.len if M_PKTHDR is found in a secondary mbuf. (The initial intention there was to avoid updating m_pkthdr.len twice, the assumption was that if M_PKTHDR is set then we're dealing with the first mbuf.) Therefore, when handling fragmented IPsec packets (in particular IPv6, IPv4 is a bit more complicated), we may end up with an incorrect m_pkthdr.len after authentication or decryption. In the case of ESP, this can lead to a remote crash on this instruction: m_copydata(m, m->m_pkthdr.len - 3, 3, lastthree); m_pkthdr.len is bigger than the actual mbuf chain. It seems possible to me to trigger this bug even if you don't have the ESP key, because the fragmentation part is outside of the encrypted ESP payload. So if you MITM the target, and intercept an incoming ESP packet (which you can't decrypt), you should be able to forge a new specially-crafted, fragmented packet and stuff the ESP payload (still encrypted, as you intercepted it) into it. The decryption succeeds and the target crashes. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.9 2014/02/25 18:30:12 pooka Exp $"); a395 1 m_pkthdr_remove(t); d413 1 a413 2 { KASSERT(m->m_flags & M_PKTHDR); @ 1.9.6.1 log @Sync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.10 2016/04/26 08:44:44 ozaki-r Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.10 2016/04/26 08:44:44 ozaki-r Exp $"); d65 1 @ 1.9.6.2 log @Sync with HEAD @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.9.6.1 2016/05/29 08:44:38 skrll Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.9.6.1 2016/05/29 08:44:38 skrll Exp $"); d57 2 d71 1 @ 1.8 log @Don't increment ip_nfragpackets when failed to allocate fragment queue. No one will decrement it on such case. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.7 2010/11/05 00:21:51 rmind Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.7 2010/11/05 00:21:51 rmind Exp $"); a174 5 CTLTYPE_NODE, "net", NULL, NULL, 0, NULL, 0, CTL_NET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, @ 1.8.12.1 log @Rebase to HEAD as of a few days ago. @ text @d1 1 a1 1 /* $NetBSD$ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD$"); d175 5 @ 1.8.12.2 log @update from HEAD @ text @d57 2 d65 1 d72 1 @ 1.8.2.1 log @sync with head. for a reference, the tree before this commit was tagged as yamt-pagecache-tag8. this commit was splitted into small chunks to avoid a limitation of cvs. ("Protocol error: too many arguments") @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.8 2011/06/27 00:45:50 enami Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.8 2011/06/27 00:45:50 enami Exp $"); d175 5 @ 1.8.16.1 log @sync with head @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.8 2011/06/27 00:45:50 enami Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.8 2011/06/27 00:45:50 enami Exp $"); d175 5 @ 1.7 log @ip_reass_packet: finish abstraction; some clean-up. Discussed some time ago with matt@@. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.6 2010/10/07 03:15:49 yamt Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.6 2010/10/07 03:15:49 yamt Exp $"); a269 1 ip_nfragpackets++; d274 1 @ 1.7.6.1 log @file ip_reass.c was added on branch rmind-uvmplock on 2011-03-05 20:55:58 +0000 @ text @d1 701 @ 1.7.6.2 log @sync with head @ text @a0 701 /* $NetBSD$ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @@(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ /* * IP reassembly. * * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP * reassembly queue buffer managment. * * We keep a count of total IP fragments (NB: not fragmented packets), * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments. * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the total * fragments in reassembly queues. This AIMD policy avoids repeatedly * deleting single packets under heavy fragmentation load (e.g., from lossy * NFS peers). */ #include __KERNEL_RCSID(0, "$NetBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * IP reassembly queue structures. Each fragment being reassembled is * attached to one of these structures. They are timed out after TTL * drops to 0, and may also be reclaimed if memory becomes tight. */ typedef struct ipfr_qent { TAILQ_ENTRY(ipfr_qent) ipqe_q; struct ip * ipqe_ip; struct mbuf * ipqe_m; bool ipqe_mff; } ipfr_qent_t; TAILQ_HEAD(ipfr_qent_head, ipfr_qent); typedef struct ipfr_queue { LIST_ENTRY(ipfr_queue) ipq_q; /* to other reass headers */ struct ipfr_qent_head ipq_fragq; /* queue of fragment entries */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ } ipfr_queue_t; /* * Hash table of IP reassembly queues. */ #define IPREASS_HASH_SHIFT 6 #define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT) #define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1) #define IPREASS_HASH(x, y) \ (((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK) static LIST_HEAD(, ipfr_queue) ip_frags[IPREASS_HASH_SIZE]; static pool_cache_t ipfren_cache; static kmutex_t ipfr_lock; /* Number of packets in reassembly queue and total number of fragments. */ static int ip_nfragpackets; static int ip_nfrags; /* Limits on packet and fragments. */ static int ip_maxfragpackets; static int ip_maxfrags; /* * Cached copy of nmbclusters. If nbclusters is different, recalculate * IP parameters derived from nmbclusters. */ static int ip_nmbclusters; /* * IP reassembly TTL machinery for multiplicative drop. */ static u_int fragttl_histo[IPFRAGTTL + 1]; static struct sysctllog *ip_reass_sysctllog; void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); static struct mbuf * ip_reass(ipfr_qent_t *, ipfr_queue_t *, u_int); static u_int ip_reass_ttl_decr(u_int ticks); static void ip_reass_drophalf(void); static void ip_freef(ipfr_queue_t *); /* * ip_reass_init: * * Initialization of IP reassembly mechanism. */ void ip_reass_init(void) { int i; ipfren_cache = pool_cache_init(sizeof(ipfr_qent_t), coherency_unit, 0, 0, "ipfrenpl", NULL, IPL_NET, NULL, NULL, NULL); mutex_init(&ipfr_lock, MUTEX_DEFAULT, IPL_VM); for (i = 0; i < IPREASS_HASH_SIZE; i++) { LIST_INIT(&ip_frags[i]); } ip_maxfragpackets = 200; ip_maxfrags = 0; ip_nmbclusters_changed(); sysctl_ip_reass_setup(); } void sysctl_ip_reass_setup(void) { sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "net", NULL, NULL, 0, NULL, 0, CTL_NET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", SYSCTL_DESCR("PF_INET related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ip", SYSCTL_DESCR("IPv4 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfragpackets", SYSCTL_DESCR("Maximum number of fragments to retain for " "possible reassembly"), NULL, 0, &ip_maxfragpackets, 0, CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL); } #define CHECK_NMBCLUSTER_PARAMS() \ do { \ if (__predict_false(ip_nmbclusters != nmbclusters)) \ ip_nmbclusters_changed(); \ } while (/*CONSTCOND*/0) /* * Compute IP limits derived from the value of nmbclusters. */ static void ip_nmbclusters_changed(void) { ip_maxfrags = nmbclusters / 4; ip_nmbclusters = nmbclusters; } /* * ip_reass: * * Take incoming datagram fragment and try to reassemble it into whole * datagram. If a chain for reassembly of this datagram already exists, * then it is given as 'fp'; otherwise have to make a chain. */ struct mbuf * ip_reass(ipfr_qent_t *ipqe, ipfr_queue_t *fp, const u_int hash) { struct ip *ip = ipqe->ipqe_ip, *qip; const int hlen = ip->ip_hl << 2; struct mbuf *m = ipqe->ipqe_m, *t; ipfr_qent_t *nq, *p, *q; int i, next; KASSERT(mutex_owned(&ipfr_lock)); /* * Presence of header sizes in mbufs would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; #ifdef notyet /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags >= ip_maxfrags) { ip_reass_drophalf(void); } #endif /* * We are about to add a fragment; increment frag count. */ ip_nfrags++; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly: a) if maxfrag is 0, * never accept fragments b) if maxfrag is -1, accept * all fragments without limitation. */ if (ip_maxfragpackets < 0) ; else if (ip_nfragpackets >= ip_maxfragpackets) { goto dropfrag; } ip_nfragpackets++; fp = malloc(sizeof(ipfr_queue_t), M_FTABLE, M_NOWAIT); if (fp == NULL) { goto dropfrag; } TAILQ_INIT(&fp->ipq_fragq); fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_tos = ip->ip_tos; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; LIST_INSERT_HEAD(&ip_frags[hash], fp, ipq_q); p = NULL; goto insert; } else { fp->ipq_nfrags++; } /* * Find a segment which begins after this one does. */ TAILQ_FOREACH(q, &fp->ipq_fragq, ipqe_q) { if (ntohs(q->ipqe_ip->ip_off) > ntohs(ip->ip_off)) break; } if (q != NULL) { p = TAILQ_PREV(q, ipfr_qent_head, ipqe_q); } else { p = TAILQ_LAST(&fp->ipq_fragq, ipfr_qent_head); } /* * If there is a preceding segment, it may provide some of our * data already. If so, drop the data from the incoming segment. * If it provides all of our data, drop us. */ if (p != NULL) { i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ip->ip_off); if (i > 0) { if (i >= ntohs(ip->ip_len)) { goto dropfrag; } m_adj(ipqe->ipqe_m, i); ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); } } /* * While we overlap succeeding segments trim them or, if they are * completely covered, dequeue them. */ while (q != NULL) { size_t end; qip = q->ipqe_ip; end = ntohs(ip->ip_off) + ntohs(ip->ip_len); if (end <= ntohs(qip->ip_off)) { break; } i = end - ntohs(qip->ip_off); if (i < ntohs(qip->ip_len)) { qip->ip_len = htons(ntohs(qip->ip_len) - i); qip->ip_off = htons(ntohs(qip->ip_off) + i); m_adj(q->ipqe_m, i); break; } nq = TAILQ_NEXT(q, ipqe_q); m_freem(q->ipqe_m); TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); pool_cache_put(ipfren_cache, q); fp->ipq_nfrags--; ip_nfrags--; q = nq; } insert: /* * Stick new segment in its place; check for complete reassembly. */ if (p == NULL) { TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); } else { TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q); } next = 0; TAILQ_FOREACH(q, &fp->ipq_fragq, ipqe_q) { qip = q->ipqe_ip; if (ntohs(qip->ip_off) != next) { mutex_exit(&ipfr_lock); return NULL; } next += ntohs(qip->ip_len); } p = TAILQ_LAST(&fp->ipq_fragq, ipfr_qent_head); if (p->ipqe_mff) { mutex_exit(&ipfr_lock); return NULL; } /* * Reassembly is complete. Check for a bogus message size. */ q = TAILQ_FIRST(&fp->ipq_fragq); ip = q->ipqe_ip; if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { IP_STATINC(IP_STAT_TOOLONG); ip_freef(fp); mutex_exit(&ipfr_lock); return NULL; } LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; mutex_exit(&ipfr_lock); /* Concatenate all fragments. */ m = q->ipqe_m; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = TAILQ_NEXT(q, ipqe_q); pool_cache_put(ipfren_cache, q); for (q = nq; q != NULL; q = nq) { t = q->ipqe_m; nq = TAILQ_NEXT(q, ipqe_q); pool_cache_put(ipfren_cache, q); m_cat(m, t); } /* * Create header for new packet by modifying header of first * packet. Dequeue and discard fragment reassembly header. Make * header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; free(fp, M_FTABLE); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* Fix up mbuf. XXX This should be done elsewhere. */ if (m->m_flags & M_PKTHDR) { int plen = 0; for (t = m; t; t = t->m_next) { plen += t->m_len; } m->m_pkthdr.len = plen; m->m_pkthdr.csum_flags = 0; } return m; dropfrag: if (fp != NULL) { fp->ipq_nfrags--; } ip_nfrags--; IP_STATINC(IP_STAT_FRAGDROPPED); mutex_exit(&ipfr_lock); pool_cache_put(ipfren_cache, ipqe); m_freem(m); return NULL; } /* * ip_freef: * * Free a fragment reassembly header and all associated datagrams. */ static void ip_freef(ipfr_queue_t *fp) { ipfr_qent_t *q; KASSERT(mutex_owned(&ipfr_lock)); LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; while ((q = TAILQ_FIRST(&fp->ipq_fragq)) != NULL) { TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); m_freem(q->ipqe_m); pool_cache_put(ipfren_cache, q); } free(fp, M_FTABLE); } /* * ip_reass_ttl_decr: * * Decrement TTL of all reasembly queue entries by `ticks'. Count * number of distinct fragments (as opposed to partial, fragmented * datagrams) inthe reassembly queue. While we traverse the entire * reassembly queue, compute and return the median TTL over all * fragments. */ static u_int ip_reass_ttl_decr(u_int ticks) { u_int nfrags, median, dropfraction, keepfraction; ipfr_queue_t *fp, *nfp; int i; nfrags = 0; memset(fragttl_histo, 0, sizeof(fragttl_histo)); for (i = 0; i < IPREASS_HASH_SIZE; i++) { for (fp = LIST_FIRST(&ip_frags[i]); fp != NULL; fp = nfp) { fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ? 0 : fp->ipq_ttl - ticks); nfp = LIST_NEXT(fp, ipq_q); if (fp->ipq_ttl == 0) { IP_STATINC(IP_STAT_FRAGTIMEOUT); ip_freef(fp); } else { nfrags += fp->ipq_nfrags; fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags; } } } KASSERT(ip_nfrags == nfrags); /* Find median (or other drop fraction) in histogram. */ dropfraction = (ip_nfrags / 2); keepfraction = ip_nfrags - dropfraction; for (i = IPFRAGTTL, median = 0; i >= 0; i--) { median += fragttl_histo[i]; if (median >= keepfraction) break; } /* Return TTL of median (or other fraction). */ return (u_int)i; } static void ip_reass_drophalf(void) { u_int median_ticks; KASSERT(mutex_owned(&ipfr_lock)); /* * Compute median TTL of all fragments, and count frags * with that TTL or lower (roughly half of all fragments). */ median_ticks = ip_reass_ttl_decr(0); /* Drop half. */ median_ticks = ip_reass_ttl_decr(median_ticks); } /* * ip_reass_drain: drain off all datagram fragments. Do not acquire * softnet_lock as can be called from hardware interrupt context. */ void ip_reass_drain(void) { /* * We may be called from a device's interrupt context. If * the ipq is already busy, just bail out now. */ if (mutex_tryenter(&ipfr_lock)) { /* * Drop half the total fragments now. If more mbufs are * needed, we will be called again soon. */ ip_reass_drophalf(); mutex_exit(&ipfr_lock); } } /* * ip_reass_slowtimo: * * If a timer expires on a reassembly queue, discard it. */ void ip_reass_slowtimo(void) { static u_int dropscanidx = 0; u_int i, median_ttl; mutex_enter(&ipfr_lock); /* Age TTL of all fragments by 1 tick .*/ median_ttl = ip_reass_ttl_decr(1); /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags > ip_maxfrags) { ip_reass_ttl_decr(median_ttl); } /* * If we are over the maximum number of fragmented packets (due to * the limit being lowered), drain off enough to get down to the * new limit. Start draining from the reassembly hashqueue most * recently drained. */ if (ip_maxfragpackets < 0) ; else { int wrapped = 0; i = dropscanidx; while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) { while (LIST_FIRST(&ip_frags[i]) != NULL) { ip_freef(LIST_FIRST(&ip_frags[i])); } if (++i >= IPREASS_HASH_SIZE) { i = 0; } /* * Do not scan forever even if fragment counters are * wrong: stop after scanning entire reassembly queue. */ if (i == dropscanidx) { wrapped = 1; } } dropscanidx = i; } mutex_exit(&ipfr_lock); } /* * ip_reass_packet: generic routine to perform IP reassembly. * * => Passed fragment should have IP_MF flag and/or offset set. * => Fragment should not have other than IP_MF flags set. * * => Returns 0 on success or error otherwise. * => On complete, m0 represents a constructed final packet. */ int ip_reass_packet(struct mbuf **m0, struct ip *ip) { const int hlen = ip->ip_hl << 2; const int len = ntohs(ip->ip_len); struct mbuf *m = *m0; ipfr_queue_t *fp; ipfr_qent_t *ipqe; u_int hash, off, flen; bool mff; /* * Prevent TCP blind data attacks by not allowing non-initial * fragments to start at less than 68 bytes (minimal fragment * size) and making sure the first fragment is at least 68 * bytes. */ off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; if ((off > 0 ? off + hlen : len) < IP_MINFRAGSIZE - 1) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } /* * Fragment length and MF flag. Make sure that fragments have * a data length which is non-zero and multiple of 8 bytes. */ flen = ntohs(ip->ip_len) - hlen; mff = (ip->ip_off & htons(IP_MF)) != 0; if (mff && (flen == 0 || (flen & 0x7) != 0)) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } /* * Adjust total IP length to not reflect header and convert * offset of this to bytes. XXX: clobbers struct ip. */ ip->ip_len = htons(flen); ip->ip_off = htons(off); /* Look for queue of fragments of this datagram. */ mutex_enter(&ipfr_lock); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ip_frags[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } /* Make sure that TOS matches previous fragments. */ if (fp && fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); mutex_exit(&ipfr_lock); return EINVAL; } /* * Create new entry and attempt to reassembly. */ IP_STATINC(IP_STAT_FRAGMENTS); ipqe = pool_cache_get(ipfren_cache, PR_NOWAIT); if (ipqe == NULL) { IP_STATINC(IP_STAT_RCVMEMDROP); mutex_exit(&ipfr_lock); return ENOMEM; } ipqe->ipqe_mff = mff; ipqe->ipqe_m = m; ipqe->ipqe_ip = ip; *m0 = ip_reass(ipqe, fp, hash); if (*m0) { /* Note that finally reassembled. */ IP_STATINC(IP_STAT_REASSEMBLED); } return 0; } @ 1.6 log @make ipfr_lock IPL_VM as ip_reass_drain is called in interrupts via the drain hook for mbuf pools. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.5 2010/10/06 07:39:37 enami Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.5 2010/10/06 07:39:37 enami Exp $"); d89 2 d93 1 a93 1 TAILQ_HEAD(, ipfr_qent) ipq_fragq; /* queue of fragment entries */ d226 2 a227 1 const int hlen = ipqe->ipqe_ip->ip_hl << 2; a229 1 struct ip *ip; d275 1 a275 1 LIST_INSERT_HEAD(&ip_frags[hash], fp, ipq_q); d278 6 a283 6 fp->ipq_p = ipqe->ipqe_ip->ip_p; fp->ipq_id = ipqe->ipqe_ip->ip_id; fp->ipq_tos = ipqe->ipqe_ip->ip_tos; TAILQ_INIT(&fp->ipq_fragq); fp->ipq_src = ipqe->ipqe_ip->ip_src; fp->ipq_dst = ipqe->ipqe_ip->ip_dst; d293 2 a294 3 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; p = q, q = TAILQ_NEXT(q, ipqe_q)) if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) d296 6 d310 1 a310 1 ntohs(ipqe->ipqe_ip->ip_off); d312 1 a312 1 if (i >= ntohs(ipqe->ipqe_ip->ip_len)) { d316 2 a317 4 ipqe->ipqe_ip->ip_off = htons(ntohs(ipqe->ipqe_ip->ip_off) + i); ipqe->ipqe_ip->ip_len = htons(ntohs(ipqe->ipqe_ip->ip_len) - i); d325 12 a336 10 for (; q != NULL && ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > ntohs(q->ipqe_ip->ip_off); q = nq) { i = (ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); if (i < ntohs(q->ipqe_ip->ip_len)) { q->ipqe_ip->ip_len = htons(ntohs(q->ipqe_ip->ip_len) - i); q->ipqe_ip->ip_off = htons(ntohs(q->ipqe_ip->ip_off) + i); d346 1 d359 3 a361 3 for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; p = q, q = TAILQ_NEXT(q, ipqe_q)) { if (ntohs(q->ipqe_ip->ip_off) != next) { d365 1 a365 1 next += ntohs(q->ipqe_ip->ip_len); d367 1 d372 1 d615 2 a616 2 * => Returns 0 on success or error otherwise. When reassembly is complete, * m_final representing a constructed final packet is set. d619 1 a619 1 ip_reass_packet(struct mbuf *m, struct ip *ip, bool mff, struct mbuf **m_final) d621 3 d626 32 a657 1 u_int hash; d695 3 a697 3 *m_final = ip_reass(ipqe, fp, hash); if (*m_final) { /* Note if finally reassembled. */ @ 1.5 log @Don't free memory still in use. Fixes nfs root problem reported by Christoph Egger on source-changes-d. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.4 2010/10/03 19:44:47 rmind Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.4 2010/10/03 19:44:47 rmind Exp $"); d155 1 a155 1 mutex_init(&ipfr_lock, MUTEX_DEFAULT, IPL_SOFTNET); @ 1.4 log @Re-structure IPv4 reassembly code to make it more MP-friendly and simplify some code fragments while here. Also, use pool_cache(9) and mutex(9). IPv4 reassembly mechanism is MP-safe now. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.3 2010/08/25 00:05:14 rmind Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.3 2010/08/25 00:05:14 rmind Exp $"); a392 1 free(fp, M_FTABLE); d402 1 @ 1.3 log @Use own IPv4 reassembly queue entry structure and leave struct ipqent only for TCP. Now both struct ipfr_qent, struct ipfr_queue and hashed fragment queue are abstracted and no longer public. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.2 2010/07/19 14:09:45 rmind Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.2 2010/07/19 14:09:45 rmind Exp $"); d56 1 d111 2 a112 2 static struct pool ipqent_pool; static int ipq_locked; d133 2 d153 3 a155 2 pool_init(&ipqent_pool, sizeof(ipfr_qent_t), 0, 0, 0, "ipqepl", NULL, IPL_VM); a166 2 static struct sysctllog *ip_reass_sysctllog; a213 54 static inline int ipq_lock_try(void); static inline void ipq_unlock(void); static inline int ipq_lock_try(void) { int s; /* * Use splvm() -- we're blocking things that would cause * mbuf allocation. */ s = splvm(); if (ipq_locked) { splx(s); return (0); } ipq_locked = 1; splx(s); return (1); } static inline void ipq_unlock(void) { int s; s = splvm(); ipq_locked = 0; splx(s); } #ifdef DIAGNOSTIC #define IPQ_LOCK() \ do { \ if (ipq_lock_try() == 0) { \ printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \ panic("ipq_lock"); \ } \ } while (/*CONSTCOND*/ 0) #define IPQ_LOCK_CHECK() \ do { \ if (ipq_locked == 0) { \ printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \ panic("ipq lock check"); \ } \ } while (/*CONSTCOND*/ 0) #else #define IPQ_LOCK() (void) ipq_lock_try() #define IPQ_LOCK_CHECK() /* nothing */ #endif #define IPQ_UNLOCK() ipq_unlock() d228 1 a228 1 int i, next, s; d230 1 a230 1 IPQ_LOCK_CHECK(); d336 1 a336 3 s = splvm(); pool_put(&ipqent_pool, q); splx(s); d354 1 a354 1 IPQ_UNLOCK(); d360 1 a360 1 IPQ_UNLOCK(); d364 1 a364 2 * Reassembly is complete. Check for a bogus message size and * concatenate fragments. d371 1 a371 1 IPQ_UNLOCK(); d374 6 d385 2 a386 3 s = splvm(); pool_put(&ipqent_pool, q); splx(s); d390 1 a390 3 s = splvm(); pool_put(&ipqent_pool, q); splx(s); d393 1 a393 1 ip_nfrags -= fp->ipq_nfrags; a403 3 LIST_REMOVE(fp, ipq_q); free(fp, M_FTABLE); ip_nfragpackets--; d406 3 a408 2 /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ a415 1 IPQ_UNLOCK(); d424 3 a427 4 s = splvm(); pool_put(&ipqent_pool, ipqe); splx(s); IPQ_UNLOCK(); d439 1 a439 3 ipfr_qent_t *q, *p; u_int nfrags = 0; int s; d441 1 a441 1 IPQ_LOCK_CHECK(); d443 6 a448 2 for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) { p = TAILQ_NEXT(q, ipqe_q); d450 1 a450 5 nfrags++; TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); a451 6 if (nfrags != fp->ipq_nfrags) { printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags); } ip_nfrags -= nfrags; LIST_REMOVE(fp, ipq_q); a452 1 ip_nfragpackets--; d509 2 d533 1 a533 1 if (ipq_lock_try() != 0) { d539 1 a539 1 IPQ_UNLOCK(); d554 1 a554 1 IPQ_LOCK(); d596 1 a596 1 IPQ_UNLOCK(); d616 1 a616 1 IPQ_LOCK(); d633 1 a633 1 IPQ_UNLOCK(); d641 1 a641 3 int s = splvm(); ipqe = pool_get(&ipqent_pool, PR_NOWAIT); splx(s); d644 1 a644 1 IPQ_UNLOCK(); @ 1.2 log @Abstract IP reassembly into single generic routine - ip_reass_packet(). Make struct ipq private and struct ipqent not visible to userland. Push ip_len adjustment into reassembly layer. OK matt@@ @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $"); d76 26 a101 1 * IP datagram reassembly hashed queues, pool, lock and counters. d109 3 a111 3 struct ipqhead ipq[IPREASS_HASH_SIZE]; struct pool ipqent_pool; static int ipq_locked; d113 3 a115 2 static int ip_nfragpackets; /* packets in reass queue */ static int ip_nfrags; /* total fragments in reass queues */ d117 3 a119 2 static int ip_maxfragpackets; /* limit on packets. XXX sysctl */ static int ip_maxfrags; /* limit on fragments. XXX sysctl */ d122 2 a123 3 * IP reassembly queue structure. Each fragment being reassembled is * attached to one of these structures. They are timed out after ipq_ttl * drops to 0, and may also be reclaimed if memory becomes tight. d125 1 a125 17 struct ipq { LIST_ENTRY(ipq) ipq_q; /* to other reass headers */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct ipqehead ipq_fragq; /* to ip fragment queue */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ }; /* * Cached copy of nmbclusters. If nbclusters is different, * recalculate IP parameters derived from nmbclusters. */ static int ip_nmbclusters; /* copy of nmbclusters */ d130 1 a130 1 static u_int fragttl_histo[IPFRAGTTL + 1]; d132 2 a133 2 void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); d135 1 a135 2 static struct ipq * ip_reass_lookup(struct ip *, u_int *); static struct mbuf * ip_reass(struct ipqent *, struct ipq *, u_int); d138 1 a138 1 static void ip_freef(struct ipq *); d150 1 a150 1 pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl", d154 1 a154 1 LIST_INIT(&ipq[i]); a266 28 * ip_reass_lookup: * * Look for queue of fragments of this datagram. */ static struct ipq * ip_reass_lookup(struct ip *ip, u_int *hashp) { struct ipq *fp; u_int hash; IPQ_LOCK(); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ipq[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } *hashp = hash; return fp; } /* d274 1 a274 1 ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash) a275 1 struct ipqhead *ipqhead = &ipq[hash]; d278 1 a278 1 struct ipqent *nq, *p, *q; d321 1 a321 1 fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT); d325 1 a325 1 LIST_INSERT_HEAD(ipqhead, fp, ipq_q); d493 1 a493 1 ip_freef(struct ipq *fp) d495 1 a495 1 struct ipqent *q, *p; d533 1 a533 1 struct ipq *fp, *nfp; d540 1 a540 1 for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) { d643 2 a644 2 while (LIST_FIRST(&ipq[i]) != NULL) { ip_freef(LIST_FIRST(&ipq[i])); d674 2 a675 2 struct ipq *fp; struct ipqent *ipqe; d679 13 a691 1 fp = ip_reass_lookup(ip, &hash); @ 1.2.4.1 log @file ip_reass.c was added on branch uebayasi-xip on 2010-08-17 06:47:46 +0000 @ text @d1 730 @ 1.2.4.2 log @Sync with HEAD. @ text @a0 730 /* $NetBSD$ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @@(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ /* * IP reassembly. * * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP * reassembly queue buffer managment. * * We keep a count of total IP fragments (NB: not fragmented packets), * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments. * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the total * fragments in reassembly queues. This AIMD policy avoids repeatedly * deleting single packets under heavy fragmentation load (e.g., from lossy * NFS peers). */ #include __KERNEL_RCSID(0, "$NetBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * IP datagram reassembly hashed queues, pool, lock and counters. */ #define IPREASS_HASH_SHIFT 6 #define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT) #define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1) #define IPREASS_HASH(x, y) \ (((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK) struct ipqhead ipq[IPREASS_HASH_SIZE]; struct pool ipqent_pool; static int ipq_locked; static int ip_nfragpackets; /* packets in reass queue */ static int ip_nfrags; /* total fragments in reass queues */ static int ip_maxfragpackets; /* limit on packets. XXX sysctl */ static int ip_maxfrags; /* limit on fragments. XXX sysctl */ /* * IP reassembly queue structure. Each fragment being reassembled is * attached to one of these structures. They are timed out after ipq_ttl * drops to 0, and may also be reclaimed if memory becomes tight. */ struct ipq { LIST_ENTRY(ipq) ipq_q; /* to other reass headers */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct ipqehead ipq_fragq; /* to ip fragment queue */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ }; /* * Cached copy of nmbclusters. If nbclusters is different, * recalculate IP parameters derived from nmbclusters. */ static int ip_nmbclusters; /* copy of nmbclusters */ /* * IP reassembly TTL machinery for multiplicative drop. */ static u_int fragttl_histo[IPFRAGTTL + 1]; void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); static struct ipq * ip_reass_lookup(struct ip *, u_int *); static struct mbuf * ip_reass(struct ipqent *, struct ipq *, u_int); static u_int ip_reass_ttl_decr(u_int ticks); static void ip_reass_drophalf(void); static void ip_freef(struct ipq *); /* * ip_reass_init: * * Initialization of IP reassembly mechanism. */ void ip_reass_init(void) { int i; pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl", NULL, IPL_VM); for (i = 0; i < IPREASS_HASH_SIZE; i++) { LIST_INIT(&ipq[i]); } ip_maxfragpackets = 200; ip_maxfrags = 0; ip_nmbclusters_changed(); sysctl_ip_reass_setup(); } static struct sysctllog *ip_reass_sysctllog; void sysctl_ip_reass_setup(void) { sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "net", NULL, NULL, 0, NULL, 0, CTL_NET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", SYSCTL_DESCR("PF_INET related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ip", SYSCTL_DESCR("IPv4 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfragpackets", SYSCTL_DESCR("Maximum number of fragments to retain for " "possible reassembly"), NULL, 0, &ip_maxfragpackets, 0, CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL); } #define CHECK_NMBCLUSTER_PARAMS() \ do { \ if (__predict_false(ip_nmbclusters != nmbclusters)) \ ip_nmbclusters_changed(); \ } while (/*CONSTCOND*/0) /* * Compute IP limits derived from the value of nmbclusters. */ static void ip_nmbclusters_changed(void) { ip_maxfrags = nmbclusters / 4; ip_nmbclusters = nmbclusters; } static inline int ipq_lock_try(void); static inline void ipq_unlock(void); static inline int ipq_lock_try(void) { int s; /* * Use splvm() -- we're blocking things that would cause * mbuf allocation. */ s = splvm(); if (ipq_locked) { splx(s); return (0); } ipq_locked = 1; splx(s); return (1); } static inline void ipq_unlock(void) { int s; s = splvm(); ipq_locked = 0; splx(s); } #ifdef DIAGNOSTIC #define IPQ_LOCK() \ do { \ if (ipq_lock_try() == 0) { \ printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \ panic("ipq_lock"); \ } \ } while (/*CONSTCOND*/ 0) #define IPQ_LOCK_CHECK() \ do { \ if (ipq_locked == 0) { \ printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \ panic("ipq lock check"); \ } \ } while (/*CONSTCOND*/ 0) #else #define IPQ_LOCK() (void) ipq_lock_try() #define IPQ_LOCK_CHECK() /* nothing */ #endif #define IPQ_UNLOCK() ipq_unlock() /* * ip_reass_lookup: * * Look for queue of fragments of this datagram. */ static struct ipq * ip_reass_lookup(struct ip *ip, u_int *hashp) { struct ipq *fp; u_int hash; IPQ_LOCK(); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ipq[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } *hashp = hash; return fp; } /* * ip_reass: * * Take incoming datagram fragment and try to reassemble it into whole * datagram. If a chain for reassembly of this datagram already exists, * then it is given as 'fp'; otherwise have to make a chain. */ struct mbuf * ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash) { struct ipqhead *ipqhead = &ipq[hash]; const int hlen = ipqe->ipqe_ip->ip_hl << 2; struct mbuf *m = ipqe->ipqe_m, *t; struct ipqent *nq, *p, *q; struct ip *ip; int i, next, s; IPQ_LOCK_CHECK(); /* * Presence of header sizes in mbufs would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; #ifdef notyet /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags >= ip_maxfrags) { ip_reass_drophalf(void); } #endif /* * We are about to add a fragment; increment frag count. */ ip_nfrags++; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly: a) if maxfrag is 0, * never accept fragments b) if maxfrag is -1, accept * all fragments without limitation. */ if (ip_maxfragpackets < 0) ; else if (ip_nfragpackets >= ip_maxfragpackets) { goto dropfrag; } ip_nfragpackets++; fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT); if (fp == NULL) { goto dropfrag; } LIST_INSERT_HEAD(ipqhead, fp, ipq_q); fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ipqe->ipqe_ip->ip_p; fp->ipq_id = ipqe->ipqe_ip->ip_id; fp->ipq_tos = ipqe->ipqe_ip->ip_tos; TAILQ_INIT(&fp->ipq_fragq); fp->ipq_src = ipqe->ipqe_ip->ip_src; fp->ipq_dst = ipqe->ipqe_ip->ip_dst; p = NULL; goto insert; } else { fp->ipq_nfrags++; } /* * Find a segment which begins after this one does. */ for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; p = q, q = TAILQ_NEXT(q, ipqe_q)) if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) break; /* * If there is a preceding segment, it may provide some of our * data already. If so, drop the data from the incoming segment. * If it provides all of our data, drop us. */ if (p != NULL) { i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ipqe->ipqe_ip->ip_off); if (i > 0) { if (i >= ntohs(ipqe->ipqe_ip->ip_len)) { goto dropfrag; } m_adj(ipqe->ipqe_m, i); ipqe->ipqe_ip->ip_off = htons(ntohs(ipqe->ipqe_ip->ip_off) + i); ipqe->ipqe_ip->ip_len = htons(ntohs(ipqe->ipqe_ip->ip_len) - i); } } /* * While we overlap succeeding segments trim them or, if they are * completely covered, dequeue them. */ for (; q != NULL && ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > ntohs(q->ipqe_ip->ip_off); q = nq) { i = (ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); if (i < ntohs(q->ipqe_ip->ip_len)) { q->ipqe_ip->ip_len = htons(ntohs(q->ipqe_ip->ip_len) - i); q->ipqe_ip->ip_off = htons(ntohs(q->ipqe_ip->ip_off) + i); m_adj(q->ipqe_m, i); break; } nq = TAILQ_NEXT(q, ipqe_q); m_freem(q->ipqe_m); TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); fp->ipq_nfrags--; ip_nfrags--; } insert: /* * Stick new segment in its place; check for complete reassembly. */ if (p == NULL) { TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); } else { TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q); } next = 0; for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; p = q, q = TAILQ_NEXT(q, ipqe_q)) { if (ntohs(q->ipqe_ip->ip_off) != next) { IPQ_UNLOCK(); return NULL; } next += ntohs(q->ipqe_ip->ip_len); } if (p->ipqe_mff) { IPQ_UNLOCK(); return NULL; } /* * Reassembly is complete. Check for a bogus message size and * concatenate fragments. */ q = TAILQ_FIRST(&fp->ipq_fragq); ip = q->ipqe_ip; if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { IP_STATINC(IP_STAT_TOOLONG); ip_freef(fp); IPQ_UNLOCK(); return NULL; } m = q->ipqe_m; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = TAILQ_NEXT(q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); for (q = nq; q != NULL; q = nq) { t = q->ipqe_m; nq = TAILQ_NEXT(q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); m_cat(m, t); } ip_nfrags -= fp->ipq_nfrags; /* * Create header for new packet by modifying header of first * packet. Dequeue and discard fragment reassembly header. Make * header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; LIST_REMOVE(fp, ipq_q); free(fp, M_FTABLE); ip_nfragpackets--; m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ int plen = 0; for (t = m; t; t = t->m_next) { plen += t->m_len; } m->m_pkthdr.len = plen; m->m_pkthdr.csum_flags = 0; } IPQ_UNLOCK(); return m; dropfrag: if (fp != NULL) { fp->ipq_nfrags--; } ip_nfrags--; IP_STATINC(IP_STAT_FRAGDROPPED); m_freem(m); s = splvm(); pool_put(&ipqent_pool, ipqe); splx(s); IPQ_UNLOCK(); return NULL; } /* * ip_freef: * * Free a fragment reassembly header and all associated datagrams. */ static void ip_freef(struct ipq *fp) { struct ipqent *q, *p; u_int nfrags = 0; int s; IPQ_LOCK_CHECK(); for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) { p = TAILQ_NEXT(q, ipqe_q); m_freem(q->ipqe_m); nfrags++; TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); } if (nfrags != fp->ipq_nfrags) { printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags); } ip_nfrags -= nfrags; LIST_REMOVE(fp, ipq_q); free(fp, M_FTABLE); ip_nfragpackets--; } /* * ip_reass_ttl_decr: * * Decrement TTL of all reasembly queue entries by `ticks'. Count * number of distinct fragments (as opposed to partial, fragmented * datagrams) inthe reassembly queue. While we traverse the entire * reassembly queue, compute and return the median TTL over all * fragments. */ static u_int ip_reass_ttl_decr(u_int ticks) { u_int nfrags, median, dropfraction, keepfraction; struct ipq *fp, *nfp; int i; nfrags = 0; memset(fragttl_histo, 0, sizeof(fragttl_histo)); for (i = 0; i < IPREASS_HASH_SIZE; i++) { for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) { fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ? 0 : fp->ipq_ttl - ticks); nfp = LIST_NEXT(fp, ipq_q); if (fp->ipq_ttl == 0) { IP_STATINC(IP_STAT_FRAGTIMEOUT); ip_freef(fp); } else { nfrags += fp->ipq_nfrags; fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags; } } } KASSERT(ip_nfrags == nfrags); /* Find median (or other drop fraction) in histogram. */ dropfraction = (ip_nfrags / 2); keepfraction = ip_nfrags - dropfraction; for (i = IPFRAGTTL, median = 0; i >= 0; i--) { median += fragttl_histo[i]; if (median >= keepfraction) break; } /* Return TTL of median (or other fraction). */ return (u_int)i; } static void ip_reass_drophalf(void) { u_int median_ticks; /* * Compute median TTL of all fragments, and count frags * with that TTL or lower (roughly half of all fragments). */ median_ticks = ip_reass_ttl_decr(0); /* Drop half. */ median_ticks = ip_reass_ttl_decr(median_ticks); } /* * ip_reass_drain: drain off all datagram fragments. Do not acquire * softnet_lock as can be called from hardware interrupt context. */ void ip_reass_drain(void) { /* * We may be called from a device's interrupt context. If * the ipq is already busy, just bail out now. */ if (ipq_lock_try() != 0) { /* * Drop half the total fragments now. If more mbufs are * needed, we will be called again soon. */ ip_reass_drophalf(); IPQ_UNLOCK(); } } /* * ip_reass_slowtimo: * * If a timer expires on a reassembly queue, discard it. */ void ip_reass_slowtimo(void) { static u_int dropscanidx = 0; u_int i, median_ttl; IPQ_LOCK(); /* Age TTL of all fragments by 1 tick .*/ median_ttl = ip_reass_ttl_decr(1); /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags > ip_maxfrags) { ip_reass_ttl_decr(median_ttl); } /* * If we are over the maximum number of fragmented packets (due to * the limit being lowered), drain off enough to get down to the * new limit. Start draining from the reassembly hashqueue most * recently drained. */ if (ip_maxfragpackets < 0) ; else { int wrapped = 0; i = dropscanidx; while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) { while (LIST_FIRST(&ipq[i]) != NULL) { ip_freef(LIST_FIRST(&ipq[i])); } if (++i >= IPREASS_HASH_SIZE) { i = 0; } /* * Do not scan forever even if fragment counters are * wrong: stop after scanning entire reassembly queue. */ if (i == dropscanidx) { wrapped = 1; } } dropscanidx = i; } IPQ_UNLOCK(); } /* * ip_reass_packet: generic routine to perform IP reassembly. * * => Passed fragment should have IP_MF flag and/or offset set. * => Fragment should not have other than IP_MF flags set. * * => Returns 0 on success or error otherwise. When reassembly is complete, * m_final representing a constructed final packet is set. */ int ip_reass_packet(struct mbuf *m, struct ip *ip, bool mff, struct mbuf **m_final) { struct ipq *fp; struct ipqent *ipqe; u_int hash; /* Look for queue of fragments of this datagram. */ fp = ip_reass_lookup(ip, &hash); /* Make sure that TOS matches previous fragments. */ if (fp && fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); IPQ_UNLOCK(); return EINVAL; } /* * Create new entry and attempt to reassembly. */ IP_STATINC(IP_STAT_FRAGMENTS); int s = splvm(); ipqe = pool_get(&ipqent_pool, PR_NOWAIT); splx(s); if (ipqe == NULL) { IP_STATINC(IP_STAT_RCVMEMDROP); IPQ_UNLOCK(); return ENOMEM; } ipqe->ipqe_mff = mff; ipqe->ipqe_m = m; ipqe->ipqe_ip = ip; *m_final = ip_reass(ipqe, fp, hash); if (*m_final) { /* Note if finally reassembled. */ IP_STATINC(IP_STAT_REASSEMBLED); } return 0; } @ 1.2.4.3 log @Sync with HEAD (-D20101022). @ text @a55 1 #include d76 1 a76 26 * IP reassembly queue structures. Each fragment being reassembled is * attached to one of these structures. They are timed out after TTL * drops to 0, and may also be reclaimed if memory becomes tight. */ typedef struct ipfr_qent { TAILQ_ENTRY(ipfr_qent) ipqe_q; struct ip * ipqe_ip; struct mbuf * ipqe_m; bool ipqe_mff; } ipfr_qent_t; typedef struct ipfr_queue { LIST_ENTRY(ipfr_queue) ipq_q; /* to other reass headers */ TAILQ_HEAD(, ipfr_qent) ipq_fragq; /* queue of fragment entries */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ } ipfr_queue_t; /* * Hash table of IP reassembly queues. d84 3 a86 3 static LIST_HEAD(, ipfr_queue) ip_frags[IPREASS_HASH_SIZE]; static pool_cache_t ipfren_cache; static kmutex_t ipfr_lock; d88 2 a89 3 /* Number of packets in reassembly queue and total number of fragments. */ static int ip_nfragpackets; static int ip_nfrags; d91 2 a92 3 /* Limits on packet and fragments. */ static int ip_maxfragpackets; static int ip_maxfrags; d95 3 a97 2 * Cached copy of nmbclusters. If nbclusters is different, recalculate * IP parameters derived from nmbclusters. d99 17 a115 1 static int ip_nmbclusters; d120 1 a120 3 static u_int fragttl_histo[IPFRAGTTL + 1]; static struct sysctllog *ip_reass_sysctllog; d122 2 a123 2 void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); d125 2 a126 1 static struct mbuf * ip_reass(ipfr_qent_t *, ipfr_queue_t *, u_int); d129 1 a129 1 static void ip_freef(ipfr_queue_t *); d141 2 a142 3 ipfren_cache = pool_cache_init(sizeof(ipfr_qent_t), coherency_unit, 0, 0, "ipfrenpl", NULL, IPL_NET, NULL, NULL, NULL); mutex_init(&ipfr_lock, MUTEX_DEFAULT, IPL_VM); d145 1 a145 1 LIST_INIT(&ip_frags[i]); d154 2 d203 82 d293 1 a293 1 ip_reass(ipfr_qent_t *ipqe, ipfr_queue_t *fp, const u_int hash) d295 1 d298 1 a298 1 ipfr_qent_t *nq, *p, *q; d300 1 a300 1 int i, next; d302 1 a302 1 KASSERT(mutex_owned(&ipfr_lock)); d341 1 a341 1 fp = malloc(sizeof(ipfr_queue_t), M_FTABLE, M_NOWAIT); d345 1 a345 1 LIST_INSERT_HEAD(&ip_frags[hash], fp, ipq_q); d408 3 a410 1 pool_cache_put(ipfren_cache, q); d428 1 a428 1 mutex_exit(&ipfr_lock); d434 1 a434 1 mutex_exit(&ipfr_lock); d438 2 a439 1 * Reassembly is complete. Check for a bogus message size. d446 1 a446 1 mutex_exit(&ipfr_lock); a448 6 LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; mutex_exit(&ipfr_lock); /* Concatenate all fragments. */ d454 3 a456 2 pool_cache_put(ipfren_cache, q); d460 3 a462 1 pool_cache_put(ipfren_cache, q); d465 1 d475 2 d478 1 a478 1 d481 2 a482 3 /* Fix up mbuf. XXX This should be done elsewhere. */ if (m->m_flags & M_PKTHDR) { d490 1 a498 3 mutex_exit(&ipfr_lock); pool_cache_put(ipfren_cache, ipqe); d500 4 d513 1 a513 1 ip_freef(ipfr_queue_t *fp) d515 3 a517 1 ipfr_qent_t *q; d519 1 a519 1 KASSERT(mutex_owned(&ipfr_lock)); d521 9 a529 3 LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; d531 2 a532 4 while ((q = TAILQ_FIRST(&fp->ipq_fragq)) != NULL) { TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); m_freem(q->ipqe_m); pool_cache_put(ipfren_cache, q); d534 2 d537 1 d553 1 a553 1 ipfr_queue_t *fp, *nfp; d560 1 a560 1 for (fp = LIST_FIRST(&ip_frags[i]); fp != NULL; fp = nfp) { a593 2 KASSERT(mutex_owned(&ipfr_lock)); d616 1 a616 1 if (mutex_tryenter(&ipfr_lock)) { d622 1 a622 1 mutex_exit(&ipfr_lock); d637 1 a637 1 mutex_enter(&ipfr_lock); d663 2 a664 2 while (LIST_FIRST(&ip_frags[i]) != NULL) { ip_freef(LIST_FIRST(&ip_frags[i])); d679 1 a679 1 mutex_exit(&ipfr_lock); d694 2 a695 2 ipfr_queue_t *fp; ipfr_qent_t *ipqe; d699 1 a699 13 mutex_enter(&ipfr_lock); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ip_frags[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } d704 1 a704 1 mutex_exit(&ipfr_lock); d712 3 a714 1 ipqe = pool_cache_get(ipfren_cache, PR_NOWAIT); d717 1 a717 1 mutex_exit(&ipfr_lock); @ 1.2.4.4 log @Sync with HEAD. @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.2.4.3 2010/10/22 07:22:39 uebayasi Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.2.4.3 2010/10/22 07:22:39 uebayasi Exp $"); a88 2 TAILQ_HEAD(ipfr_qent_head, ipfr_qent); d91 1 a91 1 struct ipfr_qent_head ipq_fragq; /* queue of fragment entries */ d224 1 a224 2 struct ip *ip = ipqe->ipqe_ip, *qip; const int hlen = ip->ip_hl << 2; d227 1 d273 1 a273 1 TAILQ_INIT(&fp->ipq_fragq); d276 6 a281 6 fp->ipq_p = ip->ip_p; fp->ipq_id = ip->ip_id; fp->ipq_tos = ip->ip_tos; fp->ipq_src = ip->ip_src; fp->ipq_dst = ip->ip_dst; LIST_INSERT_HEAD(&ip_frags[hash], fp, ipq_q); d291 3 a293 2 TAILQ_FOREACH(q, &fp->ipq_fragq, ipqe_q) { if (ntohs(q->ipqe_ip->ip_off) > ntohs(ip->ip_off)) a294 6 } if (q != NULL) { p = TAILQ_PREV(q, ipfr_qent_head, ipqe_q); } else { p = TAILQ_LAST(&fp->ipq_fragq, ipfr_qent_head); } d303 1 a303 1 ntohs(ip->ip_off); d305 1 a305 1 if (i >= ntohs(ip->ip_len)) { d309 4 a312 2 ip->ip_off = htons(ntohs(ip->ip_off) + i); ip->ip_len = htons(ntohs(ip->ip_len) - i); d320 10 a329 12 while (q != NULL) { size_t end; qip = q->ipqe_ip; end = ntohs(ip->ip_off) + ntohs(ip->ip_len); if (end <= ntohs(qip->ip_off)) { break; } i = end - ntohs(qip->ip_off); if (i < ntohs(qip->ip_len)) { qip->ip_len = htons(ntohs(qip->ip_len) - i); qip->ip_off = htons(ntohs(qip->ip_off) + i); a338 1 q = nq; d351 3 a353 3 TAILQ_FOREACH(q, &fp->ipq_fragq, ipqe_q) { qip = q->ipqe_ip; if (ntohs(qip->ip_off) != next) { d357 1 a357 1 next += ntohs(qip->ip_len); a358 1 p = TAILQ_LAST(&fp->ipq_fragq, ipfr_qent_head); a362 1 d605 2 a606 2 * => Returns 0 on success or error otherwise. * => On complete, m0 represents a constructed final packet. d609 1 a609 1 ip_reass_packet(struct mbuf **m0, struct ip *ip) a610 3 const int hlen = ip->ip_hl << 2; const int len = ntohs(ip->ip_len); struct mbuf *m = *m0; d613 1 a613 32 u_int hash, off, flen; bool mff; /* * Prevent TCP blind data attacks by not allowing non-initial * fragments to start at less than 68 bytes (minimal fragment * size) and making sure the first fragment is at least 68 * bytes. */ off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3; if ((off > 0 ? off + hlen : len) < IP_MINFRAGSIZE - 1) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } /* * Fragment length and MF flag. Make sure that fragments have * a data length which is non-zero and multiple of 8 bytes. */ flen = ntohs(ip->ip_len) - hlen; mff = (ip->ip_off & htons(IP_MF)) != 0; if (mff && (flen == 0 || (flen & 0x7) != 0)) { IP_STATINC(IP_STAT_BADFRAGS); return EINVAL; } /* * Adjust total IP length to not reflect header and convert * offset of this to bytes. XXX: clobbers struct ip. */ ip->ip_len = htons(flen); ip->ip_off = htons(off); d651 3 a653 3 *m0 = ip_reass(ipqe, fp, hash); if (*m0) { /* Note that finally reassembled. */ @ 1.2.2.1 log @file ip_reass.c was added on branch yamt-nfs-mp on 2010-08-11 22:54:56 +0000 @ text @d1 730 @ 1.2.2.2 log @sync with head. @ text @a0 730 /* $NetBSD$ */ /* * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @@(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ /* * IP reassembly. * * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP * reassembly queue buffer managment. * * We keep a count of total IP fragments (NB: not fragmented packets), * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments. * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the total * fragments in reassembly queues. This AIMD policy avoids repeatedly * deleting single packets under heavy fragmentation load (e.g., from lossy * NFS peers). */ #include __KERNEL_RCSID(0, "$NetBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * IP datagram reassembly hashed queues, pool, lock and counters. */ #define IPREASS_HASH_SHIFT 6 #define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT) #define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1) #define IPREASS_HASH(x, y) \ (((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK) struct ipqhead ipq[IPREASS_HASH_SIZE]; struct pool ipqent_pool; static int ipq_locked; static int ip_nfragpackets; /* packets in reass queue */ static int ip_nfrags; /* total fragments in reass queues */ static int ip_maxfragpackets; /* limit on packets. XXX sysctl */ static int ip_maxfrags; /* limit on fragments. XXX sysctl */ /* * IP reassembly queue structure. Each fragment being reassembled is * attached to one of these structures. They are timed out after ipq_ttl * drops to 0, and may also be reclaimed if memory becomes tight. */ struct ipq { LIST_ENTRY(ipq) ipq_q; /* to other reass headers */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct ipqehead ipq_fragq; /* to ip fragment queue */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ }; /* * Cached copy of nmbclusters. If nbclusters is different, * recalculate IP parameters derived from nmbclusters. */ static int ip_nmbclusters; /* copy of nmbclusters */ /* * IP reassembly TTL machinery for multiplicative drop. */ static u_int fragttl_histo[IPFRAGTTL + 1]; void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); static struct ipq * ip_reass_lookup(struct ip *, u_int *); static struct mbuf * ip_reass(struct ipqent *, struct ipq *, u_int); static u_int ip_reass_ttl_decr(u_int ticks); static void ip_reass_drophalf(void); static void ip_freef(struct ipq *); /* * ip_reass_init: * * Initialization of IP reassembly mechanism. */ void ip_reass_init(void) { int i; pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl", NULL, IPL_VM); for (i = 0; i < IPREASS_HASH_SIZE; i++) { LIST_INIT(&ipq[i]); } ip_maxfragpackets = 200; ip_maxfrags = 0; ip_nmbclusters_changed(); sysctl_ip_reass_setup(); } static struct sysctllog *ip_reass_sysctllog; void sysctl_ip_reass_setup(void) { sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "net", NULL, NULL, 0, NULL, 0, CTL_NET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "inet", SYSCTL_DESCR("PF_INET related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT, CTLTYPE_NODE, "ip", SYSCTL_DESCR("IPv4 related settings"), NULL, 0, NULL, 0, CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL); sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL, CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "maxfragpackets", SYSCTL_DESCR("Maximum number of fragments to retain for " "possible reassembly"), NULL, 0, &ip_maxfragpackets, 0, CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL); } #define CHECK_NMBCLUSTER_PARAMS() \ do { \ if (__predict_false(ip_nmbclusters != nmbclusters)) \ ip_nmbclusters_changed(); \ } while (/*CONSTCOND*/0) /* * Compute IP limits derived from the value of nmbclusters. */ static void ip_nmbclusters_changed(void) { ip_maxfrags = nmbclusters / 4; ip_nmbclusters = nmbclusters; } static inline int ipq_lock_try(void); static inline void ipq_unlock(void); static inline int ipq_lock_try(void) { int s; /* * Use splvm() -- we're blocking things that would cause * mbuf allocation. */ s = splvm(); if (ipq_locked) { splx(s); return (0); } ipq_locked = 1; splx(s); return (1); } static inline void ipq_unlock(void) { int s; s = splvm(); ipq_locked = 0; splx(s); } #ifdef DIAGNOSTIC #define IPQ_LOCK() \ do { \ if (ipq_lock_try() == 0) { \ printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \ panic("ipq_lock"); \ } \ } while (/*CONSTCOND*/ 0) #define IPQ_LOCK_CHECK() \ do { \ if (ipq_locked == 0) { \ printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \ panic("ipq lock check"); \ } \ } while (/*CONSTCOND*/ 0) #else #define IPQ_LOCK() (void) ipq_lock_try() #define IPQ_LOCK_CHECK() /* nothing */ #endif #define IPQ_UNLOCK() ipq_unlock() /* * ip_reass_lookup: * * Look for queue of fragments of this datagram. */ static struct ipq * ip_reass_lookup(struct ip *ip, u_int *hashp) { struct ipq *fp; u_int hash; IPQ_LOCK(); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ipq[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } *hashp = hash; return fp; } /* * ip_reass: * * Take incoming datagram fragment and try to reassemble it into whole * datagram. If a chain for reassembly of this datagram already exists, * then it is given as 'fp'; otherwise have to make a chain. */ struct mbuf * ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash) { struct ipqhead *ipqhead = &ipq[hash]; const int hlen = ipqe->ipqe_ip->ip_hl << 2; struct mbuf *m = ipqe->ipqe_m, *t; struct ipqent *nq, *p, *q; struct ip *ip; int i, next, s; IPQ_LOCK_CHECK(); /* * Presence of header sizes in mbufs would confuse code below. */ m->m_data += hlen; m->m_len -= hlen; #ifdef notyet /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags >= ip_maxfrags) { ip_reass_drophalf(void); } #endif /* * We are about to add a fragment; increment frag count. */ ip_nfrags++; /* * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { /* * Enforce upper bound on number of fragmented packets * for which we attempt reassembly: a) if maxfrag is 0, * never accept fragments b) if maxfrag is -1, accept * all fragments without limitation. */ if (ip_maxfragpackets < 0) ; else if (ip_nfragpackets >= ip_maxfragpackets) { goto dropfrag; } ip_nfragpackets++; fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT); if (fp == NULL) { goto dropfrag; } LIST_INSERT_HEAD(ipqhead, fp, ipq_q); fp->ipq_nfrags = 1; fp->ipq_ttl = IPFRAGTTL; fp->ipq_p = ipqe->ipqe_ip->ip_p; fp->ipq_id = ipqe->ipqe_ip->ip_id; fp->ipq_tos = ipqe->ipqe_ip->ip_tos; TAILQ_INIT(&fp->ipq_fragq); fp->ipq_src = ipqe->ipqe_ip->ip_src; fp->ipq_dst = ipqe->ipqe_ip->ip_dst; p = NULL; goto insert; } else { fp->ipq_nfrags++; } /* * Find a segment which begins after this one does. */ for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; p = q, q = TAILQ_NEXT(q, ipqe_q)) if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off)) break; /* * If there is a preceding segment, it may provide some of our * data already. If so, drop the data from the incoming segment. * If it provides all of our data, drop us. */ if (p != NULL) { i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) - ntohs(ipqe->ipqe_ip->ip_off); if (i > 0) { if (i >= ntohs(ipqe->ipqe_ip->ip_len)) { goto dropfrag; } m_adj(ipqe->ipqe_m, i); ipqe->ipqe_ip->ip_off = htons(ntohs(ipqe->ipqe_ip->ip_off) + i); ipqe->ipqe_ip->ip_len = htons(ntohs(ipqe->ipqe_ip->ip_len) - i); } } /* * While we overlap succeeding segments trim them or, if they are * completely covered, dequeue them. */ for (; q != NULL && ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) > ntohs(q->ipqe_ip->ip_off); q = nq) { i = (ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off); if (i < ntohs(q->ipqe_ip->ip_len)) { q->ipqe_ip->ip_len = htons(ntohs(q->ipqe_ip->ip_len) - i); q->ipqe_ip->ip_off = htons(ntohs(q->ipqe_ip->ip_off) + i); m_adj(q->ipqe_m, i); break; } nq = TAILQ_NEXT(q, ipqe_q); m_freem(q->ipqe_m); TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); fp->ipq_nfrags--; ip_nfrags--; } insert: /* * Stick new segment in its place; check for complete reassembly. */ if (p == NULL) { TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q); } else { TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q); } next = 0; for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; p = q, q = TAILQ_NEXT(q, ipqe_q)) { if (ntohs(q->ipqe_ip->ip_off) != next) { IPQ_UNLOCK(); return NULL; } next += ntohs(q->ipqe_ip->ip_len); } if (p->ipqe_mff) { IPQ_UNLOCK(); return NULL; } /* * Reassembly is complete. Check for a bogus message size and * concatenate fragments. */ q = TAILQ_FIRST(&fp->ipq_fragq); ip = q->ipqe_ip; if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) { IP_STATINC(IP_STAT_TOOLONG); ip_freef(fp); IPQ_UNLOCK(); return NULL; } m = q->ipqe_m; t = m->m_next; m->m_next = NULL; m_cat(m, t); nq = TAILQ_NEXT(q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); for (q = nq; q != NULL; q = nq) { t = q->ipqe_m; nq = TAILQ_NEXT(q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); m_cat(m, t); } ip_nfrags -= fp->ipq_nfrags; /* * Create header for new packet by modifying header of first * packet. Dequeue and discard fragment reassembly header. Make * header visible. */ ip->ip_len = htons((ip->ip_hl << 2) + next); ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; LIST_REMOVE(fp, ipq_q); free(fp, M_FTABLE); ip_nfragpackets--; m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); /* some debugging cruft by sklower, below, will go away soon */ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */ int plen = 0; for (t = m; t; t = t->m_next) { plen += t->m_len; } m->m_pkthdr.len = plen; m->m_pkthdr.csum_flags = 0; } IPQ_UNLOCK(); return m; dropfrag: if (fp != NULL) { fp->ipq_nfrags--; } ip_nfrags--; IP_STATINC(IP_STAT_FRAGDROPPED); m_freem(m); s = splvm(); pool_put(&ipqent_pool, ipqe); splx(s); IPQ_UNLOCK(); return NULL; } /* * ip_freef: * * Free a fragment reassembly header and all associated datagrams. */ static void ip_freef(struct ipq *fp) { struct ipqent *q, *p; u_int nfrags = 0; int s; IPQ_LOCK_CHECK(); for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) { p = TAILQ_NEXT(q, ipqe_q); m_freem(q->ipqe_m); nfrags++; TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); s = splvm(); pool_put(&ipqent_pool, q); splx(s); } if (nfrags != fp->ipq_nfrags) { printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags); } ip_nfrags -= nfrags; LIST_REMOVE(fp, ipq_q); free(fp, M_FTABLE); ip_nfragpackets--; } /* * ip_reass_ttl_decr: * * Decrement TTL of all reasembly queue entries by `ticks'. Count * number of distinct fragments (as opposed to partial, fragmented * datagrams) inthe reassembly queue. While we traverse the entire * reassembly queue, compute and return the median TTL over all * fragments. */ static u_int ip_reass_ttl_decr(u_int ticks) { u_int nfrags, median, dropfraction, keepfraction; struct ipq *fp, *nfp; int i; nfrags = 0; memset(fragttl_histo, 0, sizeof(fragttl_histo)); for (i = 0; i < IPREASS_HASH_SIZE; i++) { for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) { fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ? 0 : fp->ipq_ttl - ticks); nfp = LIST_NEXT(fp, ipq_q); if (fp->ipq_ttl == 0) { IP_STATINC(IP_STAT_FRAGTIMEOUT); ip_freef(fp); } else { nfrags += fp->ipq_nfrags; fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags; } } } KASSERT(ip_nfrags == nfrags); /* Find median (or other drop fraction) in histogram. */ dropfraction = (ip_nfrags / 2); keepfraction = ip_nfrags - dropfraction; for (i = IPFRAGTTL, median = 0; i >= 0; i--) { median += fragttl_histo[i]; if (median >= keepfraction) break; } /* Return TTL of median (or other fraction). */ return (u_int)i; } static void ip_reass_drophalf(void) { u_int median_ticks; /* * Compute median TTL of all fragments, and count frags * with that TTL or lower (roughly half of all fragments). */ median_ticks = ip_reass_ttl_decr(0); /* Drop half. */ median_ticks = ip_reass_ttl_decr(median_ticks); } /* * ip_reass_drain: drain off all datagram fragments. Do not acquire * softnet_lock as can be called from hardware interrupt context. */ void ip_reass_drain(void) { /* * We may be called from a device's interrupt context. If * the ipq is already busy, just bail out now. */ if (ipq_lock_try() != 0) { /* * Drop half the total fragments now. If more mbufs are * needed, we will be called again soon. */ ip_reass_drophalf(); IPQ_UNLOCK(); } } /* * ip_reass_slowtimo: * * If a timer expires on a reassembly queue, discard it. */ void ip_reass_slowtimo(void) { static u_int dropscanidx = 0; u_int i, median_ttl; IPQ_LOCK(); /* Age TTL of all fragments by 1 tick .*/ median_ttl = ip_reass_ttl_decr(1); /* Make sure fragment limit is up-to-date. */ CHECK_NMBCLUSTER_PARAMS(); /* If we have too many fragments, drop the older half. */ if (ip_nfrags > ip_maxfrags) { ip_reass_ttl_decr(median_ttl); } /* * If we are over the maximum number of fragmented packets (due to * the limit being lowered), drain off enough to get down to the * new limit. Start draining from the reassembly hashqueue most * recently drained. */ if (ip_maxfragpackets < 0) ; else { int wrapped = 0; i = dropscanidx; while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) { while (LIST_FIRST(&ipq[i]) != NULL) { ip_freef(LIST_FIRST(&ipq[i])); } if (++i >= IPREASS_HASH_SIZE) { i = 0; } /* * Do not scan forever even if fragment counters are * wrong: stop after scanning entire reassembly queue. */ if (i == dropscanidx) { wrapped = 1; } } dropscanidx = i; } IPQ_UNLOCK(); } /* * ip_reass_packet: generic routine to perform IP reassembly. * * => Passed fragment should have IP_MF flag and/or offset set. * => Fragment should not have other than IP_MF flags set. * * => Returns 0 on success or error otherwise. When reassembly is complete, * m_final representing a constructed final packet is set. */ int ip_reass_packet(struct mbuf *m, struct ip *ip, bool mff, struct mbuf **m_final) { struct ipq *fp; struct ipqent *ipqe; u_int hash; /* Look for queue of fragments of this datagram. */ fp = ip_reass_lookup(ip, &hash); /* Make sure that TOS matches previous fragments. */ if (fp && fp->ipq_tos != ip->ip_tos) { IP_STATINC(IP_STAT_BADFRAGS); IPQ_UNLOCK(); return EINVAL; } /* * Create new entry and attempt to reassembly. */ IP_STATINC(IP_STAT_FRAGMENTS); int s = splvm(); ipqe = pool_get(&ipqent_pool, PR_NOWAIT); splx(s); if (ipqe == NULL) { IP_STATINC(IP_STAT_RCVMEMDROP); IPQ_UNLOCK(); return ENOMEM; } ipqe->ipqe_mff = mff; ipqe->ipqe_m = m; ipqe->ipqe_ip = ip; *m_final = ip_reass(ipqe, fp, hash); if (*m_final) { /* Note if finally reassembled. */ IP_STATINC(IP_STAT_REASSEMBLED); } return 0; } @ 1.2.2.3 log @sync with head @ text @d1 1 a1 1 /* $NetBSD: ip_reass.c,v 1.2.2.2 2010/08/11 22:54:56 yamt Exp $ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.2.2.2 2010/08/11 22:54:56 yamt Exp $"); a55 1 #include d76 1 a76 26 * IP reassembly queue structures. Each fragment being reassembled is * attached to one of these structures. They are timed out after TTL * drops to 0, and may also be reclaimed if memory becomes tight. */ typedef struct ipfr_qent { TAILQ_ENTRY(ipfr_qent) ipqe_q; struct ip * ipqe_ip; struct mbuf * ipqe_m; bool ipqe_mff; } ipfr_qent_t; typedef struct ipfr_queue { LIST_ENTRY(ipfr_queue) ipq_q; /* to other reass headers */ TAILQ_HEAD(, ipfr_qent) ipq_fragq; /* queue of fragment entries */ uint8_t ipq_ttl; /* time for reass q to live */ uint8_t ipq_p; /* protocol of this fragment */ uint16_t ipq_id; /* sequence id for reassembly */ struct in_addr ipq_src; struct in_addr ipq_dst; uint16_t ipq_nfrags; /* frags in this queue entry */ uint8_t ipq_tos; /* TOS of this fragment */ } ipfr_queue_t; /* * Hash table of IP reassembly queues. d84 3 a86 3 static LIST_HEAD(, ipfr_queue) ip_frags[IPREASS_HASH_SIZE]; static pool_cache_t ipfren_cache; static kmutex_t ipfr_lock; d88 2 a89 3 /* Number of packets in reassembly queue and total number of fragments. */ static int ip_nfragpackets; static int ip_nfrags; d91 2 a92 3 /* Limits on packet and fragments. */ static int ip_maxfragpackets; static int ip_maxfrags; d95 3 a97 2 * Cached copy of nmbclusters. If nbclusters is different, recalculate * IP parameters derived from nmbclusters. d99 17 a115 1 static int ip_nmbclusters; d120 1 a120 3 static u_int fragttl_histo[IPFRAGTTL + 1]; static struct sysctllog *ip_reass_sysctllog; d122 2 a123 2 void sysctl_ip_reass_setup(void); static void ip_nmbclusters_changed(void); d125 2 a126 1 static struct mbuf * ip_reass(ipfr_qent_t *, ipfr_queue_t *, u_int); d129 1 a129 1 static void ip_freef(ipfr_queue_t *); d141 2 a142 3 ipfren_cache = pool_cache_init(sizeof(ipfr_qent_t), coherency_unit, 0, 0, "ipfrenpl", NULL, IPL_NET, NULL, NULL, NULL); mutex_init(&ipfr_lock, MUTEX_DEFAULT, IPL_VM); d145 1 a145 1 LIST_INIT(&ip_frags[i]); d154 2 d203 82 d293 1 a293 1 ip_reass(ipfr_qent_t *ipqe, ipfr_queue_t *fp, const u_int hash) d295 1 d298 1 a298 1 ipfr_qent_t *nq, *p, *q; d300 1 a300 1 int i, next; d302 1 a302 1 KASSERT(mutex_owned(&ipfr_lock)); d341 1 a341 1 fp = malloc(sizeof(ipfr_queue_t), M_FTABLE, M_NOWAIT); d345 1 a345 1 LIST_INSERT_HEAD(&ip_frags[hash], fp, ipq_q); d408 3 a410 1 pool_cache_put(ipfren_cache, q); d428 1 a428 1 mutex_exit(&ipfr_lock); d434 1 a434 1 mutex_exit(&ipfr_lock); d438 2 a439 1 * Reassembly is complete. Check for a bogus message size. d446 1 a446 1 mutex_exit(&ipfr_lock); a448 6 LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; mutex_exit(&ipfr_lock); /* Concatenate all fragments. */ d454 3 a456 2 pool_cache_put(ipfren_cache, q); d460 3 a462 1 pool_cache_put(ipfren_cache, q); d465 1 d475 2 d478 1 a478 1 d481 2 a482 3 /* Fix up mbuf. XXX This should be done elsewhere. */ if (m->m_flags & M_PKTHDR) { d490 1 a498 3 mutex_exit(&ipfr_lock); pool_cache_put(ipfren_cache, ipqe); d500 4 d513 1 a513 1 ip_freef(ipfr_queue_t *fp) d515 3 a517 1 ipfr_qent_t *q; d519 1 a519 1 KASSERT(mutex_owned(&ipfr_lock)); d521 9 a529 3 LIST_REMOVE(fp, ipq_q); ip_nfrags -= fp->ipq_nfrags; ip_nfragpackets--; d531 2 a532 4 while ((q = TAILQ_FIRST(&fp->ipq_fragq)) != NULL) { TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q); m_freem(q->ipqe_m); pool_cache_put(ipfren_cache, q); d534 2 d537 1 d553 1 a553 1 ipfr_queue_t *fp, *nfp; d560 1 a560 1 for (fp = LIST_FIRST(&ip_frags[i]); fp != NULL; fp = nfp) { a593 2 KASSERT(mutex_owned(&ipfr_lock)); d616 1 a616 1 if (mutex_tryenter(&ipfr_lock)) { d622 1 a622 1 mutex_exit(&ipfr_lock); d637 1 a637 1 mutex_enter(&ipfr_lock); d663 2 a664 2 while (LIST_FIRST(&ip_frags[i]) != NULL) { ip_freef(LIST_FIRST(&ip_frags[i])); d679 1 a679 1 mutex_exit(&ipfr_lock); d694 2 a695 2 ipfr_queue_t *fp; ipfr_qent_t *ipqe; d699 1 a699 13 mutex_enter(&ipfr_lock); hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id); LIST_FOREACH(fp, &ip_frags[hash], ipq_q) { if (ip->ip_id != fp->ipq_id) continue; if (!in_hosteq(ip->ip_src, fp->ipq_src)) continue; if (!in_hosteq(ip->ip_dst, fp->ipq_dst)) continue; if (ip->ip_p != fp->ipq_p) continue; break; } d704 1 a704 1 mutex_exit(&ipfr_lock); d712 3 a714 1 ipqe = pool_cache_get(ipfren_cache, PR_NOWAIT); d717 1 a717 1 mutex_exit(&ipfr_lock); @ 1.1 log @Split-off IPv4 re-assembly mechanism into a separate module. Abstract into ip_reass_init(), ip_reass_lookup(), etc (note: abstraction is not yet complete). No functional changes to the actual mechanism. OK matt@@ @ text @d1 1 a1 1 /* $NetBSD$ */ d49 1 a49 1 __KERNEL_RCSID(0, "$NetBSD$"); d52 1 a52 1 #include d59 1 d61 1 d70 1 a73 1 #include d95 17 d124 6 a129 2 static u_int ip_reass_ttl_decr(u_int ticks); static void ip_reass_drophalf(void); d262 1 a262 1 struct ipq * a284 21 void ip_reass_unlock(void) { IPQ_UNLOCK(); } struct ipqent * ip_reass_getent(void) { struct ipqent *ipqe; int s; IP_STATINC(IP_STAT_FRAGMENTS); s = splvm(); ipqe = pool_get(&ipqent_pool, PR_NOWAIT); splx(s); return ipqe; } d472 1 a472 1 ip->ip_len = htons(next); d475 1 d512 1 a512 1 void d681 50 @