summaryrefslogtreecommitdiff
path: root/common/lib/libc/arch/i386/string/strlen.S
blob: a41c2a0defe6ef00003c83d8e2fb1f52d70e8da3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
/*
 * Written by J.T. Conklin <jtc@acorntoolworks.com>
 * Public domain.
 */

#include <machine/asm.h>

#if defined(LIBC_SCCS)
	RCSID("$NetBSD: strlen.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $")
#endif

ENTRY(strlen)
	movl	4(%esp),%eax

.Lalign:
	/* Consider unrolling loop? */
	testb	$3,%al
	je	.Lword_aligned
	cmpb	$0,(%eax)
	je	.Ldone
	incl	%eax
	jmp	.Lalign

	/*
	 * There are many well known branch-free sequences which are used
	 * for determining whether a zero-byte is contained within a word.
	 * These sequences are generally much more efficent than loading
	 * and comparing each byte individually.
	 *
	 * The expression [1,2]:
	 *
	 * (1)  ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | (x | 0x7f7f7f7f))
	 *
	 * evaluates to a non-zero value if any of the bytes in the
	 * original word is zero.
	 *
	 * It also has the useful property that bytes in the result word
	 * that correspond to non-zero bytes in the original word have
	 * the value 0x00, while bytes corresponding to zero bytes have
	 * the value 0x80. This allows calculation of the first (and
	 * last) occurrence of a zero byte within the word (useful for C's
	 * str* primitives) by counting the number of leading (or
	 * trailing) zeros and dividing the result by 8.  On machines
	 * without (or with slow) clz() / ctz() instructions, testing
	 * each byte in the result word for zero is necessary.
	 *
	 * This typically takes 4 instructions (5 on machines without
	 * "not-or") not including those needed to load the constant.
	 *
	 *
	 * The expression:
	 *
	 * (2)  ((x - 0x01010101) & ~x & 0x80808080)
	 *
	 * evaluates to a non-zero value if any of the bytes in the
	 * original word is zero.
	 *
	 * On little endian machines, the first byte in the result word
	 * that corresponds to a zero byte in the original byte is 0x80,
	 * so clz() can be used as above.  On big endian machines, and
	 * little endian machines without (or with a slow) clz() insn,
	 * testing each byte in the original for zero is necessary.
	 *
	 * This typically takes 3 instructions (4 on machines without
	 * "and with complement") not including those needed to load
	 * constants.
	 *
	 *
	 * The expression:
	 *
	 * (3)  ((x - 0x01010101) & 0x80808080)
	 *
	 * always evaluates to a non-zero value if any of the bytes in
	 * the original word is zero.  However, in rare cases, it also
	 * evaluates to a non-zero value when none of the bytes in the
	 * original word is zero.
	 *
	 * To account for possible false positives, each byte of the
	 * original word must be checked when the expression evaluates to
	 * a non-zero value.  However, because it is simpler than those
	 * presented above, code that uses it will be faster as long as
	 * the rate of false positives is low.
	 *
	 * This is likely, because the the false positive can only occur
	 * if the most siginificant bit of a byte within the word is set.
	 * The expression will never fail for typical 7-bit ASCII strings.
	 *
	 * This typically takes 2 instructions not including those needed
	 * to load constants.
	 *
	 *
	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Westley 2003
	 *
	 * [2] International Business Machines, "The PowerPC Compiler Writer's
	 *     Guide", Warthman Associates, 1996
	 */

	_ALIGN_TEXT
.Lword_aligned:
.Lloop:
	movl	(%eax),%ecx
	addl	$4,%eax
	leal	-0x01010101(%ecx),%edx
	testl	$0x80808080,%edx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely. We must
	 * return to the loop if none of the bytes in the word equal 0.
	 */

	/*
	 * The optimal code for determining whether each byte is zero
	 * differs by processor.  This space-optimized code should be
	 * acceptable on all, especially since we don't expect it to
	 * be run frequently,
	 */

	testb	%cl,%cl		/* 1st byte == 0? */
	jne	1f
	subl	$4,%eax
	jmp	.Ldone

1:	testb	%ch,%ch		/* 2nd byte == 0? */
	jne	1f
	subl	$3,%eax
	jmp	.Ldone

1:	shrl	$16,%ecx
	testb	%cl,%cl		/* 3rd byte == 0? */
	jne	1f
	subl	$2,%eax
	jmp	.Ldone

1:	testb	%ch,%ch		/* 4th byte == 0? */
	jne	.Lloop
	decl	%eax

.Ldone:
	subl	4(%esp),%eax
	ret
END(strlen)