head 1.2; access; symbols netbsd-10-0-RC6:1.2 netbsd-10-0-RC5:1.2 netbsd-10-0-RC4:1.2 netbsd-10-0-RC3:1.2 netbsd-10-0-RC2:1.2 thorpej-ifq:1.2.0.34 thorpej-ifq-base:1.2 thorpej-altq-separation:1.2.0.32 thorpej-altq-separation-base:1.2 netbsd-10-0-RC1:1.2 netbsd-10:1.2.0.30 netbsd-10-base:1.2 bouyer-sunxi-drm:1.2.0.28 bouyer-sunxi-drm-base:1.2 netbsd-9-3-RELEASE:1.2 thorpej-i2c-spi-conf2:1.2.0.26 thorpej-i2c-spi-conf2-base:1.2 thorpej-futex2:1.2.0.24 thorpej-futex2-base:1.2 thorpej-cfargs2:1.2.0.22 thorpej-cfargs2-base:1.2 cjep_sun2x-base1:1.2 cjep_sun2x:1.2.0.20 cjep_sun2x-base:1.2 cjep_staticlib_x-base1:1.2 netbsd-9-2-RELEASE:1.2 cjep_staticlib_x:1.2.0.18 cjep_staticlib_x-base:1.2 thorpej-i2c-spi-conf:1.2.0.16 thorpej-i2c-spi-conf-base:1.2 thorpej-cfargs:1.2.0.14 thorpej-cfargs-base:1.2 thorpej-futex:1.2.0.12 thorpej-futex-base:1.2 netbsd-9-1-RELEASE:1.2 bouyer-xenpvh-base2:1.2 phil-wifi-20200421:1.2 bouyer-xenpvh-base1:1.2 phil-wifi-20200411:1.2 bouyer-xenpvh:1.2.0.10 bouyer-xenpvh-base:1.2 is-mlppp:1.2.0.8 is-mlppp-base:1.2 phil-wifi-20200406:1.2 netbsd-8-2-RELEASE:1.1 ad-namecache-base3:1.2 netbsd-9-0-RELEASE:1.2 netbsd-9-0-RC2:1.2 ad-namecache-base2:1.2 ad-namecache-base1:1.2 ad-namecache:1.2.0.6 ad-namecache-base:1.2 netbsd-9-0-RC1:1.2 phil-wifi-20191119:1.2 netbsd-9:1.2.0.4 netbsd-9-base:1.2 phil-wifi-20190609:1.2 netbsd-8-1-RELEASE:1.1 netbsd-8-1-RC1:1.1 isaki-audio2:1.2.0.2 isaki-audio2-base:1.2 pgoyette-compat-merge-20190127:1.1.102.1 pgoyette-compat-20190127:1.2 pgoyette-compat-20190118:1.2 pgoyette-compat-1226:1.2 pgoyette-compat-1126:1.2 pgoyette-compat-1020:1.2 pgoyette-compat-0930:1.2 pgoyette-compat-0906:1.2 netbsd-7-2-RELEASE:1.1 pgoyette-compat-0728:1.1 netbsd-8-0-RELEASE:1.1 phil-wifi:1.1.0.104 phil-wifi-base:1.1 pgoyette-compat-0625:1.1 netbsd-8-0-RC2:1.1 pgoyette-compat-0521:1.1 pgoyette-compat-0502:1.1 pgoyette-compat-0422:1.1 netbsd-8-0-RC1:1.1 pgoyette-compat-0415:1.1 pgoyette-compat-0407:1.1 pgoyette-compat-0330:1.1 pgoyette-compat-0322:1.1 pgoyette-compat-0315:1.1 netbsd-7-1-2-RELEASE:1.1 pgoyette-compat:1.1.0.102 pgoyette-compat-base:1.1 netbsd-7-1-1-RELEASE:1.1 tls-maxphys-base-20171202:1.1 matt-nb8-mediatek:1.1.0.100 matt-nb8-mediatek-base:1.1 nick-nhusb-base-20170825:1.1 perseant-stdc-iso10646:1.1.0.98 perseant-stdc-iso10646-base:1.1 netbsd-8:1.1.0.96 netbsd-8-base:1.1 prg-localcount2-base3:1.1 prg-localcount2-base2:1.1 prg-localcount2-base1:1.1 prg-localcount2:1.1.0.94 prg-localcount2-base:1.1 pgoyette-localcount-20170426:1.1 bouyer-socketcan-base1:1.1 jdolecek-ncq:1.1.0.92 jdolecek-ncq-base:1.1 pgoyette-localcount-20170320:1.1 netbsd-7-1:1.1.0.90 netbsd-7-1-RELEASE:1.1 netbsd-7-1-RC2:1.1 nick-nhusb-base-20170204:1.1 netbsd-7-nhusb-base-20170116:1.1 bouyer-socketcan:1.1.0.88 bouyer-socketcan-base:1.1 pgoyette-localcount-20170107:1.1 netbsd-7-1-RC1:1.1 nick-nhusb-base-20161204:1.1 pgoyette-localcount-20161104:1.1 netbsd-7-0-2-RELEASE:1.1 nick-nhusb-base-20161004:1.1 localcount-20160914:1.1 netbsd-7-nhusb:1.1.0.86 netbsd-7-nhusb-base:1.1 pgoyette-localcount-20160806:1.1 pgoyette-localcount-20160726:1.1 pgoyette-localcount:1.1.0.84 pgoyette-localcount-base:1.1 nick-nhusb-base-20160907:1.1 nick-nhusb-base-20160529:1.1 netbsd-7-0-1-RELEASE:1.1 nick-nhusb-base-20160422:1.1 nick-nhusb-base-20160319:1.1 nick-nhusb-base-20151226:1.1 netbsd-7-0:1.1.0.82 netbsd-7-0-RELEASE:1.1 nick-nhusb-base-20150921:1.1 netbsd-7-0-RC3:1.1 netbsd-7-0-RC2:1.1 netbsd-7-0-RC1:1.1 nick-nhusb-base-20150606:1.1 nick-nhusb-base-20150406:1.1 nick-nhusb:1.1.0.80 nick-nhusb-base:1.1 
netbsd-5-2-3-RELEASE:1.1 netbsd-5-1-5-RELEASE:1.1 netbsd-6-0-6-RELEASE:1.1 netbsd-6-1-5-RELEASE:1.1 netbsd-7:1.1.0.78 netbsd-7-base:1.1 yamt-pagecache-base9:1.1 yamt-pagecache-tag8:1.1 netbsd-6-1-4-RELEASE:1.1 netbsd-6-0-5-RELEASE:1.1 tls-earlyentropy:1.1.0.76 tls-earlyentropy-base:1.1 riastradh-xf86-video-intel-2-7-1-pre-2-21-15:1.1 riastradh-drm2-base3:1.1 netbsd-6-1-3-RELEASE:1.1 netbsd-6-0-4-RELEASE:1.1 netbsd-5-2-2-RELEASE:1.1 netbsd-5-1-4-RELEASE:1.1 netbsd-6-1-2-RELEASE:1.1 netbsd-6-0-3-RELEASE:1.1 netbsd-5-2-1-RELEASE:1.1 netbsd-5-1-3-RELEASE:1.1 rmind-smpnet-nbase:1.1 netbsd-6-1-1-RELEASE:1.1 riastradh-drm2-base2:1.1 riastradh-drm2-base1:1.1 riastradh-drm2:1.1.0.74 riastradh-drm2-base:1.1 rmind-smpnet:1.1.0.66 rmind-smpnet-base:1.1 netbsd-6-1:1.1.0.72 netbsd-6-0-2-RELEASE:1.1 netbsd-6-1-RELEASE:1.1 khorben-n900:1.1.0.70 netbsd-6-1-RC4:1.1 netbsd-6-1-RC3:1.1 agc-symver:1.1.0.68 agc-symver-base:1.1 netbsd-6-1-RC2:1.1 netbsd-6-1-RC1:1.1 yamt-pagecache-base8:1.1 netbsd-5-2:1.1.0.64 netbsd-6-0-1-RELEASE:1.1 yamt-pagecache-base7:1.1 netbsd-5-2-RELEASE:1.1 netbsd-5-2-RC1:1.1 matt-nb6-plus-nbase:1.1 yamt-pagecache-base6:1.1 netbsd-6-0:1.1.0.62 netbsd-6-0-RELEASE:1.1 netbsd-6-0-RC2:1.1 tls-maxphys:1.1.0.60 tls-maxphys-base:1.1 matt-nb6-plus:1.1.0.58 matt-nb6-plus-base:1.1 netbsd-6-0-RC1:1.1 jmcneill-usbmp-base10:1.1 yamt-pagecache-base5:1.1 jmcneill-usbmp-base9:1.1 yamt-pagecache-base4:1.1 jmcneill-usbmp-base8:1.1 jmcneill-usbmp-base7:1.1 jmcneill-usbmp-base6:1.1 jmcneill-usbmp-base5:1.1 jmcneill-usbmp-base4:1.1 jmcneill-usbmp-base3:1.1 jmcneill-usbmp-pre-base2:1.1 jmcneill-usbmp-base2:1.1 netbsd-6:1.1.0.56 netbsd-6-base:1.1 netbsd-5-1-2-RELEASE:1.1 netbsd-5-1-1-RELEASE:1.1 jmcneill-usbmp:1.1.0.54 jmcneill-usbmp-base:1.1 jmcneill-audiomp3:1.1.0.52 jmcneill-audiomp3-base:1.1 yamt-pagecache-base3:1.1 yamt-pagecache-base2:1.1 yamt-pagecache:1.1.0.50 yamt-pagecache-base:1.1 rmind-uvmplock-nbase:1.1 cherry-xenmp:1.1.0.48 cherry-xenmp-base:1.1 bouyer-quota2-nbase:1.1 bouyer-quota2:1.1.0.46 bouyer-quota2-base:1.1 jruoho-x86intr:1.1.0.44 jruoho-x86intr-base:1.1 matt-mips64-premerge-20101231:1.1 matt-nb5-mips64-premerge-20101231:1.1 matt-nb5-pq3:1.1.0.42 matt-nb5-pq3-base:1.1 netbsd-5-1:1.1.0.40 netbsd-5-1-RELEASE:1.1 uebayasi-xip-base4:1.1 uebayasi-xip-base3:1.1 yamt-nfs-mp-base11:1.1 netbsd-5-1-RC4:1.1 matt-nb5-mips64-k15:1.1 uebayasi-xip-base2:1.1 yamt-nfs-mp-base10:1.1 netbsd-5-1-RC3:1.1 netbsd-5-1-RC2:1.1 uebayasi-xip-base1:1.1 netbsd-5-1-RC1:1.1 rmind-uvmplock:1.1.0.38 rmind-uvmplock-base:1.1 yamt-nfs-mp-base9:1.1 uebayasi-xip:1.1.0.36 uebayasi-xip-base:1.1 netbsd-5-0-2-RELEASE:1.1 matt-nb5-mips64-premerge-20091211:1.1 matt-premerge-20091211:1.1 yamt-nfs-mp-base8:1.1 matt-nb5-mips64-u2-k2-k4-k7-k8-k9:1.1 matt-nb4-mips64-k7-u2a-k9b:1.1 matt-nb5-mips64-u1-k1-k5:1.1 yamt-nfs-mp-base7:1.1 matt-nb5-mips64:1.1.0.34 netbsd-5-0-1-RELEASE:1.1 jymxensuspend-base:1.1 yamt-nfs-mp-base6:1.1 yamt-nfs-mp-base5:1.1 yamt-nfs-mp-base4:1.1 jym-xensuspend-nbase:1.1 yamt-nfs-mp-base3:1.1 nick-hppapmap-base4:1.1 nick-hppapmap-base3:1.1 netbsd-5-0:1.1.0.32 netbsd-5-0-RELEASE:1.1 netbsd-5-0-RC4:1.1 netbsd-5-0-RC3:1.1 nick-hppapmap-base2:1.1 netbsd-5-0-RC2:1.1 jym-xensuspend:1.1.0.30 jym-xensuspend-base:1.1 netbsd-5-0-RC1:1.1 haad-dm-base2:1.1 haad-nbase2:1.1 ad-audiomp2:1.1.0.28 ad-audiomp2-base:1.1 netbsd-5:1.1.0.26 netbsd-5-base:1.1 nick-hppapmap:1.1.0.24 nick-hppapmap-base:1.1 matt-mips64-base2:1.1 haad-dm-base1:1.1 wrstuden-revivesa-base-4:1.1 wrstuden-revivesa-base-3:1.1 wrstuden-revivesa-base-2:1.1 
haad-dm:1.1.0.22 haad-dm-base:1.1 wrstuden-revivesa-base-1:1.1 simonb-wapbl-nbase:1.1 yamt-pf42-base4:1.1 simonb-wapbl:1.1.0.20 simonb-wapbl-base:1.1 yamt-pf42-base3:1.1 hpcarm-cleanup-nbase:1.1 hpcarm-cleanup-base:1.1 yamt-pf42-baseX:1.1 yamt-pf42-base2:1.1 yamt-nfs-mp-base2:1.1 wrstuden-revivesa:1.1.0.18 wrstuden-revivesa-base:1.1 yamt-nfs-mp:1.1.0.16 yamt-nfs-mp-base:1.1 yamt-pf42:1.1.0.14 yamt-pf42-base:1.1 ad-socklock-base1:1.1 yamt-lazymbuf-base15:1.1 yamt-lazymbuf-base14:1.1 matt-armv6:1.1.0.12 matt-armv6-nbase:1.1 keiichi-mipv6-nbase:1.1 mjf-devfs2:1.1.0.10 mjf-devfs2-base:1.1 nick-net80211-sync:1.1.0.8 nick-net80211-sync-base:1.1 keiichi-mipv6:1.1.0.6 keiichi-mipv6-base:1.1 mjf-devfs:1.1.0.4 mjf-devfs-base:1.1 yamt-lazymbuf:1.1.0.2;
locks;
strict;
comment	@ * @;


1.2
date	2018.08.28.07.28.01;	author rin;	state Exp;
branches;
next	1.1;
commitid	wZNLiZ7kUP07fRPA;

1.1
date	2008.01.25.21.12.14;	author joerg;	state Exp;
branches
	1.1.2.1
	1.1.4.1
	1.1.12.1
	1.1.102.1
	1.1.104.1;
next	;

1.1.2.1
date	2008.01.25.21.12.14;	author yamt;	state dead;
branches;
next	1.1.2.2;

1.1.2.2
date	2008.02.04.09.24.39;	author yamt;	state Exp;
branches;
next	;

1.1.4.1
date	2008.01.25.21.12.14;	author mjf;	state dead;
branches;
next	1.1.4.2;

1.1.4.2
date	2008.02.18.21.07.08;	author mjf;	state Exp;
branches;
next	;

1.1.12.1
date	2008.01.25.21.12.14;	author matt;	state dead;
branches;
next	1.1.12.2;

1.1.12.2
date	2008.03.23.02.05.06;	author matt;	state Exp;
branches;
next	;

1.1.102.1
date	2018.09.06.06.56.44;	author pgoyette;	state Exp;
branches;
next	;
commitid	HCi1bXD317XIK0RA;

1.1.104.1
date	2019.06.10.22.09.47;	author christos;	state Exp;
branches;
next	;
commitid	jtc8rnCzWiEEHGqB;


desc
@@


1.2
log
@No need to update mlen also in the case of (mlen & 16) != 0.
@
text
@/* $NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $ */

/*-
 * Copyright (c) 2008 Joerg Sonnenberger .
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $");

#include <sys/param.h>
#include <sys/endian.h>
#include <sys/mbuf.h>
#ifdef _KERNEL
#include <sys/systm.h>
#else
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#define KASSERT(x) assert(x)
#endif

#include <machine/limits.h>

#include <netinet/in.h>

#ifndef _KERNEL
int cpu_in_cksum(struct mbuf*, int, int, uint32_t);
#endif

/*
 * Checksum routine for Internet Protocol family headers (Portable Version).
 *
 * This routine is very heavily used in the network
 * code and should be modified for each CPU to be as fast as possible.
 *
 * A discussion of different implementation techniques can be found in
 * RFC 1071.
 *
 * The default implementation for 32bit architectures is using
 * a 32bit accumulator and operating on 16bit operands.
 *
 * The default implementation for 64bit architectures is using
 * a 64bit accumulator and operating on 32bit operands.
 *
 * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
 * of the inner loop. After each iteration of the inner loop, a partial
 * reduction is done to avoid carry in long packets.
 */

#if ULONG_MAX == 0xffffffffUL
/* 32bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
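		 * After the 32-byte loop above, 0 <= mlen < 32, so each of
		 * the (mlen & 16) ... (mlen & 1) tests below examines one
		 * fixed bit of the residual length; decrementing mlen would
		 * only clear bits that are never looked at again (rev. 1.2
		 * dropped the last such update from the 16-byte case).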
		 */
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}

#else
/* 64bit version */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	sum = initial_sum;

	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/* Align on word boundary */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		needs_swap = started_on_odd;
		if ((uintptr_t)data & 2) {
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
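		 * Likewise, 0 <= mlen < 64 holds after the 64-byte loop, and
		 * the (mlen & 32) ... (mlen & 1) tests below each examine a
		 * single fixed bit of the residue, so mlen can stay as is.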
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
trailing_bytes:
		if (mlen & 1) {
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
#endif
@


1.1
log
@Refactor in_cksum/in4_cksum/in6_cksum implementations:
- All three functions are included in the kernel by default. They call
a backend function cpu_in_cksum after possibly computing the checksum
of the pseudo header.
- cpu_in_cksum is the core to implement the one-complement sum. The
default implementation is moderate fast on most platforms and provides
a 32bit accumulator with 16bit addends for L32 platforms and a 64bit
accumulator with 32bit addends for L64 platforms. It handles edge cases
like very large mbuf chains (could happen with native IPv6 in the
future) and provides a good base for new native implementations.
- Modify i386 and amd64 assembly to use the new interface. This
disables the MD implementations on !x86 until the conversion is done.
For Alpha, the portable version is faster.
@
text
@d1 1
a1 1
/* $NetBSD$ */
d32 1
a32 1
__KERNEL_RCSID(0, "$NetBSD$");
d165 4
a178 1
			mlen -= 16;
a179 4
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
@


1.1.104.1
log
@Sync with HEAD
@
text
@d1 1
a1 1
/* $NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $ */
d32 1
a32 1
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $");
a164 4
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
d175 1
d177 4
@


1.1.102.1
log
@Sync with HEAD

Resolve a couple of conflicts (result of the uimin/uimax changes)
@
text
@d1 1
a1 1
/* $NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $ */
d32 1
a32 1
__KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $");
a164 4
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
d175 1
d177 4
@


1.1.12.1
log
@file cpu_in_cksum.c was added on branch matt-armv6 on 2008-03-23 02:05:06 +0000
@
text
@d1 378
@


1.1.12.2
log
@sync with HEAD
@
text
@a0 378
/* $NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $ */ /*- * Copyright (c) 2008 Joerg Sonnenberger . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1.
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $"); #include #include #include #ifdef _KERNEL #include #else #include #include #include #define KASSERT(x) assert(x) #endif #include #include #ifndef _KERNEL int cpu_in_cksum(struct mbuf*, int, int, uint32_t); #endif /* * Checksum routine for Internet Protocol family headers (Portable Version). * * This routine is very heavily used in the network * code and should be modified for each CPU to be as fast as possible. * * A discussion of different implementation techniques can be found in * RFC 1071. * * The default implementation for 32bit architectures is using * a 32bit accumulator and operating on 16bit operands. * * The default implementation for 64bit architectures is using * a 64bit accumulator and operating on 32bit operands. * * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core * of the inner loop. After each iteration of the inner loop, a partial * reduction is done to avoid carry in long packets. 
*/ #if ULONG_MAX == 0xffffffffUL /* 32bit version */ int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum) { int mlen; uint32_t sum, partial; unsigned int final_acc; uint8_t *data; bool needs_swap, started_on_odd; KASSERT(len >= 0); KASSERT(off >= 0); needs_swap = false; started_on_odd = false; sum = (initial_sum >> 16) + (initial_sum & 0xffff); for (;;) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; if (mlen > off) { mlen -= off; data = mtod(m, uint8_t *) + off; goto post_initial_offset; } off -= mlen; if (len == 0) break; m = m->m_next; } for (; len > 0; m = m->m_next) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; data = mtod(m, uint8_t *); post_initial_offset: if (mlen == 0) continue; if (mlen > len) mlen = len; len -= mlen; partial = 0; if ((uintptr_t)data & 1) { /* Align on word boundary */ started_on_odd = !started_on_odd; #if _BYTE_ORDER == _LITTLE_ENDIAN partial = *data << 8; #else partial = *data; #endif ++data; --mlen; } needs_swap = started_on_odd; while (mlen >= 32) { __builtin_prefetch(data + 32); partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); partial += *(uint16_t *)(data + 8); partial += *(uint16_t *)(data + 10); partial += *(uint16_t *)(data + 12); partial += *(uint16_t *)(data + 14); partial += *(uint16_t *)(data + 16); partial += *(uint16_t *)(data + 18); partial += *(uint16_t *)(data + 20); partial += *(uint16_t *)(data + 22); partial += *(uint16_t *)(data + 24); partial += *(uint16_t *)(data + 26); partial += *(uint16_t *)(data + 28); partial += *(uint16_t *)(data + 30); data += 32; mlen -= 32; if (__predict_false(partial & 0xc0000000)) { if (needs_swap) partial = (partial << 8) + (partial >> 24); sum += (partial >> 16); sum += (partial & 0xffff); partial = 0; } } if (mlen & 16) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); partial += *(uint16_t *)(data + 8); partial += *(uint16_t *)(data + 10); partial += *(uint16_t *)(data + 12); partial += *(uint16_t *)(data + 14); data += 16; mlen -= 16; } /* * mlen is not updated below as the remaining tests * are using bit masks, which are not affected. */ if (mlen & 8) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); data += 8; } if (mlen & 4) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); data += 4; } if (mlen & 2) { partial += *(uint16_t *)data; data += 2; } if (mlen & 1) { #if _BYTE_ORDER == _LITTLE_ENDIAN partial += *data; #else partial += *data << 8; #endif started_on_odd = !started_on_odd; } if (needs_swap) partial = (partial << 8) + (partial >> 24); sum += (partial >> 16) + (partial & 0xffff); /* * Reduce sum to allow potential byte swap * in the next iteration without carry. 
*/ sum = (sum >> 16) + (sum & 0xffff); } final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); return ~final_acc & 0xffff; } #else /* 64bit version */ int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum) { int mlen; uint64_t sum, partial; unsigned int final_acc; uint8_t *data; bool needs_swap, started_on_odd; KASSERT(len >= 0); KASSERT(off >= 0); needs_swap = false; started_on_odd = false; sum = initial_sum; for (;;) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; if (mlen > off) { mlen -= off; data = mtod(m, uint8_t *) + off; goto post_initial_offset; } off -= mlen; if (len == 0) break; m = m->m_next; } for (; len > 0; m = m->m_next) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; data = mtod(m, uint8_t *); post_initial_offset: if (mlen == 0) continue; if (mlen > len) mlen = len; len -= mlen; partial = 0; if ((uintptr_t)data & 1) { /* Align on word boundary */ started_on_odd = !started_on_odd; #if _BYTE_ORDER == _LITTLE_ENDIAN partial = *data << 8; #else partial = *data; #endif ++data; --mlen; } needs_swap = started_on_odd; if ((uintptr_t)data & 2) { if (mlen < 2) goto trailing_bytes; partial += *(uint16_t *)data; data += 2; mlen -= 2; } while (mlen >= 64) { __builtin_prefetch(data + 32); __builtin_prefetch(data + 64); partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); partial += *(uint32_t *)(data + 16); partial += *(uint32_t *)(data + 20); partial += *(uint32_t *)(data + 24); partial += *(uint32_t *)(data + 28); partial += *(uint32_t *)(data + 32); partial += *(uint32_t *)(data + 36); partial += *(uint32_t *)(data + 40); partial += *(uint32_t *)(data + 44); partial += *(uint32_t *)(data + 48); partial += *(uint32_t *)(data + 52); partial += *(uint32_t *)(data + 56); partial += *(uint32_t *)(data + 60); data += 64; mlen -= 64; if (__predict_false(partial & (3ULL << 62))) { if (needs_swap) partial = (partial << 8) + (partial >> 56); sum += (partial >> 32); sum += (partial & 0xffffffff); partial = 0; } } /* * mlen is not updated below as the remaining tests * are using bit masks, which are not affected. */ if (mlen & 32) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); partial += *(uint32_t *)(data + 16); partial += *(uint32_t *)(data + 20); partial += *(uint32_t *)(data + 24); partial += *(uint32_t *)(data + 28); data += 32; } if (mlen & 16) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); data += 16; } if (mlen & 8) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); data += 8; } if (mlen & 4) { partial += *(uint32_t *)data; data += 4; } if (mlen & 2) { partial += *(uint16_t *)data; data += 2; } trailing_bytes: if (mlen & 1) { #if _BYTE_ORDER == _LITTLE_ENDIAN partial += *data; #else partial += *data << 8; #endif started_on_odd = !started_on_odd; } if (needs_swap) partial = (partial << 8) + (partial >> 56); sum += (partial >> 32) + (partial & 0xffffffff); /* * Reduce sum to allow potential byte swap * in the next iteration without carry. 
*/ sum = (sum >> 32) + (sum & 0xffffffff); } final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + ((sum >> 16) & 0xffff) + (sum & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); return ~final_acc & 0xffff; } #endif @ 1.1.4.1 log @file cpu_in_cksum.c was added on branch mjf-devfs on 2008-02-18 21:07:08 +0000 @ text @d1 378 @ 1.1.4.2 log @Sync with HEAD. @ text @a0 378 /* $NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $ */ /*- * Copyright (c) 2008 Joerg Sonnenberger . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.1 2008/01/25 21:12:14 joerg Exp $"); #include #include #include #ifdef _KERNEL #include #else #include #include #include #define KASSERT(x) assert(x) #endif #include #include #ifndef _KERNEL int cpu_in_cksum(struct mbuf*, int, int, uint32_t); #endif /* * Checksum routine for Internet Protocol family headers (Portable Version). * * This routine is very heavily used in the network * code and should be modified for each CPU to be as fast as possible. * * A discussion of different implementation techniques can be found in * RFC 1071. * * The default implementation for 32bit architectures is using * a 32bit accumulator and operating on 16bit operands. * * The default implementation for 64bit architectures is using * a 64bit accumulator and operating on 32bit operands. * * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core * of the inner loop. After each iteration of the inner loop, a partial * reduction is done to avoid carry in long packets. 
*/ #if ULONG_MAX == 0xffffffffUL /* 32bit version */ int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum) { int mlen; uint32_t sum, partial; unsigned int final_acc; uint8_t *data; bool needs_swap, started_on_odd; KASSERT(len >= 0); KASSERT(off >= 0); needs_swap = false; started_on_odd = false; sum = (initial_sum >> 16) + (initial_sum & 0xffff); for (;;) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; if (mlen > off) { mlen -= off; data = mtod(m, uint8_t *) + off; goto post_initial_offset; } off -= mlen; if (len == 0) break; m = m->m_next; } for (; len > 0; m = m->m_next) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; data = mtod(m, uint8_t *); post_initial_offset: if (mlen == 0) continue; if (mlen > len) mlen = len; len -= mlen; partial = 0; if ((uintptr_t)data & 1) { /* Align on word boundary */ started_on_odd = !started_on_odd; #if _BYTE_ORDER == _LITTLE_ENDIAN partial = *data << 8; #else partial = *data; #endif ++data; --mlen; } needs_swap = started_on_odd; while (mlen >= 32) { __builtin_prefetch(data + 32); partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); partial += *(uint16_t *)(data + 8); partial += *(uint16_t *)(data + 10); partial += *(uint16_t *)(data + 12); partial += *(uint16_t *)(data + 14); partial += *(uint16_t *)(data + 16); partial += *(uint16_t *)(data + 18); partial += *(uint16_t *)(data + 20); partial += *(uint16_t *)(data + 22); partial += *(uint16_t *)(data + 24); partial += *(uint16_t *)(data + 26); partial += *(uint16_t *)(data + 28); partial += *(uint16_t *)(data + 30); data += 32; mlen -= 32; if (__predict_false(partial & 0xc0000000)) { if (needs_swap) partial = (partial << 8) + (partial >> 24); sum += (partial >> 16); sum += (partial & 0xffff); partial = 0; } } if (mlen & 16) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); partial += *(uint16_t *)(data + 8); partial += *(uint16_t *)(data + 10); partial += *(uint16_t *)(data + 12); partial += *(uint16_t *)(data + 14); data += 16; mlen -= 16; } /* * mlen is not updated below as the remaining tests * are using bit masks, which are not affected. */ if (mlen & 8) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); data += 8; } if (mlen & 4) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); data += 4; } if (mlen & 2) { partial += *(uint16_t *)data; data += 2; } if (mlen & 1) { #if _BYTE_ORDER == _LITTLE_ENDIAN partial += *data; #else partial += *data << 8; #endif started_on_odd = !started_on_odd; } if (needs_swap) partial = (partial << 8) + (partial >> 24); sum += (partial >> 16) + (partial & 0xffff); /* * Reduce sum to allow potential byte swap * in the next iteration without carry. 
*/ sum = (sum >> 16) + (sum & 0xffff); } final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); return ~final_acc & 0xffff; } #else /* 64bit version */ int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum) { int mlen; uint64_t sum, partial; unsigned int final_acc; uint8_t *data; bool needs_swap, started_on_odd; KASSERT(len >= 0); KASSERT(off >= 0); needs_swap = false; started_on_odd = false; sum = initial_sum; for (;;) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; if (mlen > off) { mlen -= off; data = mtod(m, uint8_t *) + off; goto post_initial_offset; } off -= mlen; if (len == 0) break; m = m->m_next; } for (; len > 0; m = m->m_next) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; data = mtod(m, uint8_t *); post_initial_offset: if (mlen == 0) continue; if (mlen > len) mlen = len; len -= mlen; partial = 0; if ((uintptr_t)data & 1) { /* Align on word boundary */ started_on_odd = !started_on_odd; #if _BYTE_ORDER == _LITTLE_ENDIAN partial = *data << 8; #else partial = *data; #endif ++data; --mlen; } needs_swap = started_on_odd; if ((uintptr_t)data & 2) { if (mlen < 2) goto trailing_bytes; partial += *(uint16_t *)data; data += 2; mlen -= 2; } while (mlen >= 64) { __builtin_prefetch(data + 32); __builtin_prefetch(data + 64); partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); partial += *(uint32_t *)(data + 16); partial += *(uint32_t *)(data + 20); partial += *(uint32_t *)(data + 24); partial += *(uint32_t *)(data + 28); partial += *(uint32_t *)(data + 32); partial += *(uint32_t *)(data + 36); partial += *(uint32_t *)(data + 40); partial += *(uint32_t *)(data + 44); partial += *(uint32_t *)(data + 48); partial += *(uint32_t *)(data + 52); partial += *(uint32_t *)(data + 56); partial += *(uint32_t *)(data + 60); data += 64; mlen -= 64; if (__predict_false(partial & (3ULL << 62))) { if (needs_swap) partial = (partial << 8) + (partial >> 56); sum += (partial >> 32); sum += (partial & 0xffffffff); partial = 0; } } /* * mlen is not updated below as the remaining tests * are using bit masks, which are not affected. */ if (mlen & 32) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); partial += *(uint32_t *)(data + 16); partial += *(uint32_t *)(data + 20); partial += *(uint32_t *)(data + 24); partial += *(uint32_t *)(data + 28); data += 32; } if (mlen & 16) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); data += 16; } if (mlen & 8) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); data += 8; } if (mlen & 4) { partial += *(uint32_t *)data; data += 4; } if (mlen & 2) { partial += *(uint16_t *)data; data += 2; } trailing_bytes: if (mlen & 1) { #if _BYTE_ORDER == _LITTLE_ENDIAN partial += *data; #else partial += *data << 8; #endif started_on_odd = !started_on_odd; } if (needs_swap) partial = (partial << 8) + (partial >> 56); sum += (partial >> 32) + (partial & 0xffffffff); /* * Reduce sum to allow potential byte swap * in the next iteration without carry. 
*/ sum = (sum >> 32) + (sum & 0xffffffff); } final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + ((sum >> 16) & 0xffff) + (sum & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); return ~final_acc & 0xffff; } #endif @ 1.1.2.1 log @file cpu_in_cksum.c was added on branch yamt-lazymbuf on 2008-02-04 09:24:39 +0000 @ text @d1 378 @ 1.1.2.2 log @sync with head. @ text @a0 378 /* $NetBSD$ */ /*- * Copyright (c) 2008 Joerg Sonnenberger . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __KERNEL_RCSID(0, "$NetBSD$"); #include #include #include #ifdef _KERNEL #include #else #include #include #include #define KASSERT(x) assert(x) #endif #include #include #ifndef _KERNEL int cpu_in_cksum(struct mbuf*, int, int, uint32_t); #endif /* * Checksum routine for Internet Protocol family headers (Portable Version). * * This routine is very heavily used in the network * code and should be modified for each CPU to be as fast as possible. * * A discussion of different implementation techniques can be found in * RFC 1071. * * The default implementation for 32bit architectures is using * a 32bit accumulator and operating on 16bit operands. * * The default implementation for 64bit architectures is using * a 64bit accumulator and operating on 32bit operands. * * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core * of the inner loop. After each iteration of the inner loop, a partial * reduction is done to avoid carry in long packets. 
*/ #if ULONG_MAX == 0xffffffffUL /* 32bit version */ int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum) { int mlen; uint32_t sum, partial; unsigned int final_acc; uint8_t *data; bool needs_swap, started_on_odd; KASSERT(len >= 0); KASSERT(off >= 0); needs_swap = false; started_on_odd = false; sum = (initial_sum >> 16) + (initial_sum & 0xffff); for (;;) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; if (mlen > off) { mlen -= off; data = mtod(m, uint8_t *) + off; goto post_initial_offset; } off -= mlen; if (len == 0) break; m = m->m_next; } for (; len > 0; m = m->m_next) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; data = mtod(m, uint8_t *); post_initial_offset: if (mlen == 0) continue; if (mlen > len) mlen = len; len -= mlen; partial = 0; if ((uintptr_t)data & 1) { /* Align on word boundary */ started_on_odd = !started_on_odd; #if _BYTE_ORDER == _LITTLE_ENDIAN partial = *data << 8; #else partial = *data; #endif ++data; --mlen; } needs_swap = started_on_odd; while (mlen >= 32) { __builtin_prefetch(data + 32); partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); partial += *(uint16_t *)(data + 8); partial += *(uint16_t *)(data + 10); partial += *(uint16_t *)(data + 12); partial += *(uint16_t *)(data + 14); partial += *(uint16_t *)(data + 16); partial += *(uint16_t *)(data + 18); partial += *(uint16_t *)(data + 20); partial += *(uint16_t *)(data + 22); partial += *(uint16_t *)(data + 24); partial += *(uint16_t *)(data + 26); partial += *(uint16_t *)(data + 28); partial += *(uint16_t *)(data + 30); data += 32; mlen -= 32; if (__predict_false(partial & 0xc0000000)) { if (needs_swap) partial = (partial << 8) + (partial >> 24); sum += (partial >> 16); sum += (partial & 0xffff); partial = 0; } } if (mlen & 16) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); partial += *(uint16_t *)(data + 8); partial += *(uint16_t *)(data + 10); partial += *(uint16_t *)(data + 12); partial += *(uint16_t *)(data + 14); data += 16; mlen -= 16; } /* * mlen is not updated below as the remaining tests * are using bit masks, which are not affected. */ if (mlen & 8) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); partial += *(uint16_t *)(data + 4); partial += *(uint16_t *)(data + 6); data += 8; } if (mlen & 4) { partial += *(uint16_t *)data; partial += *(uint16_t *)(data + 2); data += 4; } if (mlen & 2) { partial += *(uint16_t *)data; data += 2; } if (mlen & 1) { #if _BYTE_ORDER == _LITTLE_ENDIAN partial += *data; #else partial += *data << 8; #endif started_on_odd = !started_on_odd; } if (needs_swap) partial = (partial << 8) + (partial >> 24); sum += (partial >> 16) + (partial & 0xffff); /* * Reduce sum to allow potential byte swap * in the next iteration without carry. 
*/ sum = (sum >> 16) + (sum & 0xffff); } final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); return ~final_acc & 0xffff; } #else /* 64bit version */ int cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum) { int mlen; uint64_t sum, partial; unsigned int final_acc; uint8_t *data; bool needs_swap, started_on_odd; KASSERT(len >= 0); KASSERT(off >= 0); needs_swap = false; started_on_odd = false; sum = initial_sum; for (;;) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; if (mlen > off) { mlen -= off; data = mtod(m, uint8_t *) + off; goto post_initial_offset; } off -= mlen; if (len == 0) break; m = m->m_next; } for (; len > 0; m = m->m_next) { if (__predict_false(m == NULL)) { printf("in_cksum: out of data\n"); return -1; } mlen = m->m_len; data = mtod(m, uint8_t *); post_initial_offset: if (mlen == 0) continue; if (mlen > len) mlen = len; len -= mlen; partial = 0; if ((uintptr_t)data & 1) { /* Align on word boundary */ started_on_odd = !started_on_odd; #if _BYTE_ORDER == _LITTLE_ENDIAN partial = *data << 8; #else partial = *data; #endif ++data; --mlen; } needs_swap = started_on_odd; if ((uintptr_t)data & 2) { if (mlen < 2) goto trailing_bytes; partial += *(uint16_t *)data; data += 2; mlen -= 2; } while (mlen >= 64) { __builtin_prefetch(data + 32); __builtin_prefetch(data + 64); partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); partial += *(uint32_t *)(data + 16); partial += *(uint32_t *)(data + 20); partial += *(uint32_t *)(data + 24); partial += *(uint32_t *)(data + 28); partial += *(uint32_t *)(data + 32); partial += *(uint32_t *)(data + 36); partial += *(uint32_t *)(data + 40); partial += *(uint32_t *)(data + 44); partial += *(uint32_t *)(data + 48); partial += *(uint32_t *)(data + 52); partial += *(uint32_t *)(data + 56); partial += *(uint32_t *)(data + 60); data += 64; mlen -= 64; if (__predict_false(partial & (3ULL << 62))) { if (needs_swap) partial = (partial << 8) + (partial >> 56); sum += (partial >> 32); sum += (partial & 0xffffffff); partial = 0; } } /* * mlen is not updated below as the remaining tests * are using bit masks, which are not affected. */ if (mlen & 32) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); partial += *(uint32_t *)(data + 16); partial += *(uint32_t *)(data + 20); partial += *(uint32_t *)(data + 24); partial += *(uint32_t *)(data + 28); data += 32; } if (mlen & 16) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); partial += *(uint32_t *)(data + 8); partial += *(uint32_t *)(data + 12); data += 16; } if (mlen & 8) { partial += *(uint32_t *)data; partial += *(uint32_t *)(data + 4); data += 8; } if (mlen & 4) { partial += *(uint32_t *)data; data += 4; } if (mlen & 2) { partial += *(uint16_t *)data; data += 2; } trailing_bytes: if (mlen & 1) { #if _BYTE_ORDER == _LITTLE_ENDIAN partial += *data; #else partial += *data << 8; #endif started_on_odd = !started_on_odd; } if (needs_swap) partial = (partial << 8) + (partial >> 56); sum += (partial >> 32) + (partial & 0xffffffff); /* * Reduce sum to allow potential byte swap * in the next iteration without carry. 
*/ sum = (sum >> 32) + (sum & 0xffffffff); } final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + ((sum >> 16) & 0xffff) + (sum & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); final_acc = (final_acc >> 16) + (final_acc & 0xffff); return ~final_acc & 0xffff; } #endif @
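
Editorial addendum (not part of the RCS archive above): the head revision's
technique (accumulate 16-bit words into a wider register, fold the carries
back in, then complement the result) can be tried outside the kernel in a
few lines of C. The sketch below assumes a flat buffer instead of an mbuf
chain and omits the alignment and byte-swap handling of cpu_in_cksum(); the
name cksum16() is illustrative only, not a NetBSD API. As in cpu_in_cksum(),
the 16-bit result is produced in host word order.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sum 16-bit words into a 32-bit accumulator, fold, complement. */
static uint16_t
cksum16(const void *buf, size_t len, uint32_t sum)
{
	const uint8_t *data = buf;
	uint16_t word;

	while (len >= 2) {
		memcpy(&word, data, 2);	/* memcpy avoids unaligned loads */
		sum += word;
		data += 2;
		len -= 2;
	}
	if (len == 1) {			/* odd trailing byte: zero-pad (RFC 1071) */
		word = 0;
		memcpy(&word, data, 1);
		sum += word;
	}
	while (sum >> 16)		/* fold carries into the low 16 bits */
		sum = (sum >> 16) + (sum & 0xffff);
	return ~sum & 0xffff;
}

int
main(void)
{
	/* IPv4-style header bytes with the checksum field zeroed. */
	uint8_t pkt[20] = {
		0x45, 0x00, 0x00, 0x14, 0x00, 0x01, 0x00, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
		0xc0, 0xa8, 0x00, 0xc7,
	};

	printf("checksum: 0x%04x\n", cksum16(pkt, sizeof(pkt), 0));
	return 0;
}

Verification works the same way in reverse: summing a header that already
contains its correct checksum folds to 0xffff, so the complement is zero.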